### Requirements

In [3]:
# %pip install numpy pandas requests

Note: you may need to restart the kernel to use updated packages.


### Data preprocessing
We first define auxiliary functions for ease of use.

In [62]:
import os
import json
import requests
import zipfile
import pandas as pd

In [5]:
# Fail fast when the index of GDELT master-file links is missing; the loaders
# below read it from this exact path.
if not os.path.exists(r'./data/03cleaned_events.json'):
    # FileNotFoundError is the specific built-in for this condition and is still
    # caught by any existing `except Exception` handler.
    raise FileNotFoundError("There is no data to preprocess. Please run the notebook 01data_scraping.ipynb.")

In [112]:
# Display configuration: show every column of wide frames, but cap the
# number of rendered rows at 10.
pd.options.display.max_columns = None
pd.options.display.max_rows = 10

In [89]:
# Column labels assigned positionally to the header-less, tab-separated GDELT
# events export (61 columns). Order matters: entry i labels column i.
# NOTE(review): in the sample output, Actor_1_Country_Code holds values such as
# "BUS" and ST_PR_CNTRY holds a full place name, so some labels may not describe
# their column's content -- confirm against the GDELT codebook before renaming.
col_names = [
    # Event identifier and date fields
    *("Global_Event_ID", "Day", "YYYYMM", "YYYY", "Day_Time"),
    # Actor 1 attributes
    *(
        "Actor_1_Country_Code", "Actor_1_Name", "Actor_1_Country_ABBR",
        "Actor_1_Known_Group_Code", "Actor_1_Ethnic_Code",
        "Actor_1_Religion_Code", "Actor_1_Religion_2_Code",
        "Actor_1_Role", "Actor_1_Role2", "Actor_1_Role3",
    ),
    # Actor 2 attributes
    *(
        "Actor_2_Country_Code", "Actor_2_Name", "Actor_2_Country_ABBR",
        "Actor_2_Know_Group_Code", "Actor_2_Ethnic_Code",
        "Actor_2_Religion_Code", "Actor_2_Religion_2_Code",
        "Actor_2_Role", "Actor_2_Role2", "Actor_2_Role3",
    ),
    # Event classification and coverage metrics
    *(
        "Is_Root_Event", "Event_Code", "Event_Base_Code", "Event_Root_Code",
        "Quad_Class", "Goldstein_Scale",
        "Num_Mentions", "Num_Sources", "Num_Articles", "AVG_TONE",
    ),
    # Actor 1 geography
    *(
        "Actor_1_Geo_Type", "Actor_1_Geo_FullName", "Actor_1_Geo_Country_Code",
        "Actor1Geo_ADM1Code", "Actor1Geo_ADM2Code",
        "Actor1Geo_Lat", "Actor1Geo_Long", "Actor1Geo_FeatureID",
    ),
    # Actor 2 geography
    *(
        "Actor_2_Geo_Type", "Actor_2_Geo_FullName", "Actor_2_Geo_Country_Code",
        "Actor2Geo_ADM1Code", "Actor2Geo_ADM2Code",
        "Actor2Geo_Lat", "Actor2Geo_Long", "Actor2Geo_FeatureID",
    ),
    # Third geography group
    *(
        "Mention_Type", "ST_PR_CNTRY", "Country",
        "ADM1Code_Extra", "ADM2Code_Extra",
        "Lat_Extra", "Long_Extra", "ActorGeo_FeaturID_Extra",
    ),
    # Record bookkeeping
    *("Date_Added", "Source_URL"),
]

In [108]:
def find_gdelt_url_by_yyyymmddhhmm(
        year: str|int, month: str|int, day: str|int, hours: str|int, minutes: str|int
) -> str|None:
    """
    Find the masterfile link for gdelt events logged on yyyymmddhhmm. 
    """
    with open(r'./data/03cleaned_events.json', 'r') as file:
        data = json.load(file)
    
    year, month, day, hours, minutes = map(str, [year, month, day, hours, minutes])
    times = [year, month, day, hours, minutes]
    for i in range(len(times)):
        if len(times[i]) == 1:
            times[i] = "0" + times[i]
    
    year, month, day, hours, minutes = (*times,)

    if day_row := data.get(year + month + day):
        key = hours + minutes
        for row in day_row:
            if row_url := row.get(key):
                return row_url
            
    return None


def load_gdelt_by_yyyymmddhhmm(
        year: str|int, month: str|int, day: str|int, hours: str|int, minutes: str|int
) -> pd.DataFrame|None:
    """
    Load the gdelt events dataset logged on yyyymmddhhmm.

    Looks up the archive URL with find_gdelt_url_by_yyyymmddhhmm, downloads the
    zip to ./data/04temp_15min_data.zip, extracts the events export into
    ./data/, parses it as a header-less tab-separated table and labels the
    columns with the module-level ``col_names``. Both scratch files are deleted
    before returning, including when a step fails.

    Args:
        year, month, day, hours, minutes: Timestamp components as ints or
            strings, forwarded unchanged to the URL lookup.

    Returns:
        The events as a DataFrame, or None when no URL is known for the
        timestamp or the server answers with a status other than 200 (a
        message is printed in the latter case).

    Raises:
        FileNotFoundError: If the downloaded archive holds no member whose name
            ends in ".export.csv" (compared case-insensitively).
    """
    url = find_gdelt_url_by_yyyymmddhhmm(year, month, day, hours, minutes)
    if not url:
        return None

    zip_path = r'./data/04temp_15min_data.zip'
    csv_path = None

    try:
        # The context manager releases the streamed connection once the body
        # has been written (or on error).
        with requests.get(url, stream=True) as response:
            if response.status_code != 200:
                print(f'Failed to retrieve gdelt data. Status code: {response.status_code}')
                return None

            with open(zip_path, 'wb') as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)

        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            # Locate the export inside the archive instead of rebuilding its
            # name from the timestamp: this does not depend on the member's
            # letter case, which matters on case-sensitive filesystems.
            member = next(
                (name for name in zip_ref.namelist() if name.lower().endswith('.export.csv')),
                None,
            )
            if member is None:
                raise FileNotFoundError(f'No .export.csv file found in the archive downloaded from {url}')
            # extract() returns the path of the file it created.
            csv_path = zip_ref.extract(member, r'./data/')

        data = pd.read_csv(csv_path, sep=r'\t', engine='python', header=None)
        # Assigning (rather than passing names=) keeps pandas' length check, so
        # a file with an unexpected column count fails loudly.
        data.columns = col_names
    finally:
        # Never leave scratch files behind, even when download or parsing fails.
        for scratch in (zip_path, csv_path):
            if scratch and os.path.exists(scratch):
                os.remove(scratch)

    return data

In [109]:
# Fetch the events export logged on 2015-11-13 at 02:00.
dataset = load_gdelt_by_yyyymmddhhmm(year=2015, month=11, day=13, hours=2, minutes=0)

In [111]:
# Preview the first five rows of the loaded events.
dataset.head(n=5)

Unnamed: 0,Global_Event_ID,Day,YYYYMM,YYYY,Day_Time,Actor_1_Country_Code,Actor_1_Name,Actor_1_Country_ABBR,Actor_1_Known_Group_Code,Actor_1_Ethnic_Code,Actor_1_Religion_Code,Actor_1_Religion_2_Code,Actor_1_Role,Actor_1_Role2,Actor_1_Role3,Actor_2_Country_Code,Actor_2_Name,Actor_2_Country_ABBR,Actor_2_Know_Group_Code,Actor_2_Ethnic_Code,Actor_2_Religion_Code,Actor_2_Religion_2_Code,Actor_2_Role,Actor_2_Role2,Actor_2_Role3,Is_Root_Event,Event_Code,Event_Base_Code,Event_Root_Code,Quad_Class,Goldstein_Scale,Num_Mentions,Num_Sources,Num_Articles,AVG_TONE,Actor_1_Geo_Type,Actor_1_Geo_FullName,Actor_1_Geo_Country_Code,Actor1Geo_ADM1Code,Actor1Geo_ADM2Code,Actor1Geo_Lat,Actor1Geo_Long,Actor1Geo_FeatureID,Actor_2_Geo_Type,Actor_2_Geo_FullName,Actor_2_Geo_Country_Code,Actor2Geo_ADM1Code,Actor2Geo_ADM2Code,Actor2Geo_Lat,Actor2Geo_Long,Actor2Geo_FeatureID,Mention_Type,ST_PR_CNTRY,Country,ADM1Code_Extra,ADM2Code_Extra,Lat_Extra,Long_Extra,ActorGeo_FeaturID_Extra,Date_Added,Source_URL
0,484431843,20141113,201411,2014,2014.8575,BUS,COMPANY,,,,,,BUS,,,,,,,,,,,,,0,10,10,1,1,0.0,10,1,10,1.641791,3,"Phoenix, Arizona, United States",US,USAZ,AZ013,33.4484,-112.074,44784,0,,,,,,,,3,"Phoenix, Arizona, United States",US,USAZ,AZ013,33.4484,-112.074,44784,20151113020000,http://www.proactiveinvestors.com.au/companies...
1,484431844,20141113,201411,2014,2014.8575,BUS,COMPANY,,,,,,BUS,,,,,,,,,,,,,0,60,60,6,2,6.0,10,1,10,1.641791,3,"Phoenix, Arizona, United States",US,USAZ,AZ013,33.4484,-112.074,44784,0,,,,,,,,3,"Phoenix, Arizona, United States",US,USAZ,AZ013,33.4484,-112.074,44784,20151113020000,http://www.proactiveinvestors.com.au/companies...
2,484431845,20141113,201411,2014,2014.8575,COP,POLICE,,,,,,COP,,,,,,,,,,,,,1,173,173,17,4,-5.0,10,1,10,-11.458333,0,,,,,,,,0,,,,,,,,0,,,,,,,,20151113020000,http://www.fox8live.com/story/30504959/serial-...
3,484431846,20141113,201411,2014,2014.8575,COP,PRISON,,,,,,COP,,,BRA,BRAZIL,BRA,,,,,,,,1,80,80,8,2,5.0,4,1,4,-8.411215,1,Brazil,BR,BR,,-10.0,-55.0,BR,1,Brazil,BR,BR,,-10.0,-55.0,BR,1,Brazil,BR,BR,,-10.0,-55.0,BR,20151113020000,http://home.nzcity.co.nz/news/article.aspx?id=...
4,484431847,20141113,201411,2014,2014.8575,COP,PRISON,,,,,,COP,,,BRA,BRAZIL,BRA,,,,,,,,1,112,112,11,3,-2.0,4,1,4,-8.411215,1,Brazil,BR,BR,,-10.0,-55.0,BR,1,Brazil,BR,BR,,-10.0,-55.0,BR,1,Brazil,BR,BR,,-10.0,-55.0,BR,20151113020000,http://home.nzcity.co.nz/news/article.aspx?id=...
