In [1]:
import pandas as pd

In [6]:
# Column labels for the tab-separated GDELT event export, listed in file order.
# The export has no header row; these 58 labels are assigned positionally after loading.
# NOTE(review): presumably the GDELT 1.0 daily event schema — confirm against the codebook.
column_names = ['global_id',
 'day', # Date the event took place in YYYYMMDD format
 'month_year', # Alternative formatting YYYYMM
 'year', # Year
 'fraction_date', # Alternative formatting YYYY.FFFF, where FFFF is the percentage of the year completed by that day
# actor 1
 'actor1_code',
 'actor1_name', # Name of Actor 1
 'actor1_country_code',
 'actor1_known_group_code', # Which group the actor belongs to NGO/ IGO/ rebel group. Ex: United Nations
 'actor1_ethnic_code',
 'actor1_religion1_code',
 'actor1_religion2_code',
 'actor1_type1_code', # Type codes talk about roles, for example police forces
 'actor1_type2_code', # government, military, education, elites, media, etc
 'actor1_type3_code', # -
# actor 2
 'actor2_code',
 'actor2_name', # Name of actor 2
 'actor2_country_code',
 'actor2_known_group_code',
 'actor2_ethnic_code',
 'actor2_religion1_code',
 'actor2_religion2_code',
 'actor2_type1_code', # Same as in actor 1
 'actor2_type2_code', # -
 'actor2_type3_code', # -
# ----------------
 'is_root_event', # Binary. Says if it is the root event. Can give insight into importance
 'event_code',
 'event_base_code',
 'event_root_code',
 'quad_class', # Event taxonomy: 1. Verbal cooperation, 2. Material Cooperation, 3. Verbal Conflict, 4. Material Conflict
 'goldstein_scale', # Numeric score from -10 to +10 capturing potential impact that the event will have in countries stability
 'num_mentions', # Number of mentions of the event across all documents. Can be seen as importance measure
 'num_sources', # Number of information sources containing mentions of the event
 'num_articles',# Number of source documents containing mentions of this event
 'avg_tone', # Avg tone of documents that mention the event. Goes from -100 (extremely negative) to 100 (extremely positive)
# actor 1 geo
 'actor1_geo_type', # Maps to: 1.Country, 2. US State, 3. US City, 4. World city, 5. World State
 'actor1_geo_full_name', # Name of location
 'actor1_geo_country_code',
 'actor1_geo_adm1_code',
 'actor1_geo_lat', # Latitude
 'actor1_geo_long', # Longitude
 'actor1_geo_feature_id',
# actor 2 geo
 'actor2_geo_type', # Check actor 1
 'actor2_geo_fullname', # NOTE(review): spelled without the underscore used in 'actor1_geo_full_name'; `other_important` uses this exact spelling, so rename both together
 'actor2_geo_countrycode', # NOTE(review): inconsistent with 'actor1_geo_country_code' — confirm before renaming
 'actor2_geo_adm1_code',
 'actor2_geo_lat',
 'actor2_geo_long',
 'actor2_geo_feature_id',
# action geo
 'action_geo_type', # Check actor 1
 'action2_geo_full_name', # NOTE(review): 'action2' looks like a typo for 'action' (siblings use 'action_geo_*') — confirm before renaming
 'action_geo_country_code',
 'action_geo_adm1_code',
 'action_geo_lat',
 'action_geo_long',
 'action_geo_feature_id',
# date and url
 'date_added', # Date the event was added to master database
 'source_url'] # URL

In [34]:
# Actor code columns to aggregate (actor 1 first, then actor 2).
codes_letters = [
    'actor1_code',
    'actor1_country_code',
    'actor1_known_group_code',
    'actor1_type1_code',
    'actor2_code',
    'actor2_country_code',
    'actor2_known_group_code',
    'actor2_type1_code',
]

# Event code columns.
codes_nums = ['event_code']

# Name columns of interest besides the codes.
other_important = [
    'actor1_name',
    'actor2_name',
    'actor1_geo_full_name',
    'actor2_geo_fullname',
]

In [56]:
# Load a small leading sample of the GDELT event export for prototyping.
EVENTS_PATH = './Data_Storage/GDELT Event Files/20240813.export.CSV'
SAMPLE_ROWS = 100  # number of leading rows to keep while developing

# `nrows` stops parsing after the sample, instead of reading the whole file
# and then discarding everything but the first rows with head().
df = pd.read_csv(EVENTS_PATH, sep='\t', header=None, nrows=SAMPLE_ROWS)

# The file has no header row. Assigning the labels afterwards (rather than
# passing `names=`) keeps the loud failure when the file's column count does
# not match the expected schema.
df.columns = column_names

# NOTE(review): event codes may carry leading zeros that integer parsing drops;
# if they matter downstream, read the event_* code columns with dtype=str — confirm.

  df = pd.read_csv('./Data_Storage/GDELT Event Files/20240813.export.CSV', sep = '\t', header = None)


In [57]:
# Define a function for concatenation
def concatenate_values(series):
    """Join the non-missing values of ``series`` into one space-separated string.

    Missing entries are dropped, the remaining values are converted to ``str``
    and joined with single spaces. A series with nothing left yields ``''``.
    """
    present = series.dropna()
    as_text = present.astype(str)
    return ' '.join(as_text)

def remove_duplicates_from_string(s):
    """Return the distinct whitespace-separated tokens of ``s`` in first-seen order.

    Despite the name, the result is a ``list`` of tokens, not a string.

    Parameters
    ----------
    s : str
        Text to split on whitespace.

    Returns
    -------
    list of str
        Each token once, ordered by its first occurrence; ``[]`` for an
        empty or all-whitespace string.
    """
    # dict keys keep insertion order, so this de-duplicates in a single pass
    # instead of sorting a set with a linear index lookup per token.
    return list(dict.fromkeys(s.split()))

def process_dataframe(df, columns):
    """Replace each listed column's strings with their de-duplicated token lists.

    Every value in the given columns is passed through
    ``remove_duplicates_from_string``. ``df`` is modified in place and the
    same object is returned.
    """
    for name in columns:
        tokenised = df[name].apply(remove_duplicates_from_string)
        df[name] = tokenised
    return df

#define a function to remove duplicates and join codes
def remove_duplicates_concatenate(df, columns=None):
    """Collapse rows to one per ``source_url`` with de-duplicated code lists.

    For each group of rows sharing a ``source_url``, the non-missing values of
    every column in ``columns`` are joined into one string and then reduced to
    a list of distinct tokens in first-seen order.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``source_url`` and every column named in ``columns``.
    columns : iterable of str, optional
        Columns to aggregate. Defaults to the notebook-level ``codes_letters``
        list, which is what the function always used before this parameter
        existed.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``source_url`` plus the aggregated columns; the input
        frame is not modified.
    """
    if columns is None:
        # Looked up at call time so edits to the notebook-level list are picked up.
        columns = codes_letters
    columns = list(columns)

    grouped = df.groupby('source_url', as_index=False).agg(
        {col: concatenate_values for col in columns}
    )

    # Reuse the shared helper instead of repeating its per-column loop here.
    return process_dataframe(grouped, columns)


# Collapse the event rows to one row per source_url.
# NOTE: this rebinds `df`, so the cell is not idempotent — re-running it without
# first re-running the load cell feeds the already-aggregated lists back in.
df = remove_duplicates_concatenate(df)
df


Unnamed: 0,source_url,actor1_code,actor1_country_code,actor1_known_group_code,actor1_type1_code,actor2_code,actor2_country_code,actor2_known_group_code,actor2_type1_code
0,http://www.ourmidland.com/news/article/30-year...,[],[],[],[],[COP],[],[],[COP]
1,https://107jamz.com/ixp/159/p/gordon-mckernan-...,[],[],[],[],[CVL],[],[],[CVL]
2,https://alaskapublic.org/2024/08/12/kodiak-cel...,[],[],[],[],[COP],[],[],[COP]
3,https://citynews.com.au/2024/fatal-hotel-chopp...,[],[],[],[],[COP],[],[],[COP]
4,https://citynews.com.au/2024/scott-morrison-to...,[],[],[],[],[AUS],[AUS],[],[]
5,https://perezhilton.com/passengers-complained-...,[],[],[],[],[BRA],[BRA],[],[]
6,https://wbt.com/1095438/deacon-darren-bitter-d...,[],[],[],[],[CHRCTH],[],[],[]
7,https://wishtv.com/news/business/biggest-shake...,[],[],[],[],[BUS],[],[],[BUS]
8,https://wishtv.com/news/crime-watch-8/suspect-...,[],[],[],[],[COP],[],[],[COP]
9,https://wpde.com/news/nation-world/one-year-la...,[],[],[],[],[COP],[],[],[COP]


In [63]:
# Build the lookup from actor code to its description.
actors = pd.read_csv("Entity_Codes/actor_codes.csv")
code_dictionary = {
    code: description
    for code, description in zip(actors['Code'], actors['Description'])
}

def map_codes_to_description(codes_list, code_dictionary):
    """Translate a list of actor codes into a comma-separated description string.

    Each code is resolved as follows:

    * a code present in ``code_dictionary`` maps to its description;
    * otherwise, a code whose length is a positive multiple of three is split
      into three-character segments (e.g. ``'CHRCTH'`` -> ``'CHR'`` + ``'CTH'``),
      each segment is looked up (``'Unknown'`` if absent) and the parts are
      joined with ``' / '``. GDELT composes actor codes this way, so such
      codes are no longer reported as invalid;
    * anything else (empty, or length not a multiple of three) maps to
      ``'Invalid Code'``.

    Parameters
    ----------
    codes_list : iterable of str
        Codes to translate.
    code_dictionary : dict
        Mapping from code to description.

    Returns
    -------
    str
        One entry per input code, joined with ``', '``; ``''`` for an empty
        input.
    """
    segment_length = 3

    def describe(code):
        # An exact dictionary hit always wins, whatever the code's length.
        if code in code_dictionary:
            return code_dictionary[code]
        if not code or len(code) % segment_length != 0:
            return 'Invalid Code'
        segments = [
            code[start:start + segment_length]
            for start in range(0, len(code), segment_length)
        ]
        return ' / '.join(code_dictionary.get(segment, 'Unknown') for segment in segments)

    return ', '.join(describe(code) for code in codes_list)

# Describe every actor-1 code list and store the text in a new column.
df['descriptions'] = df['actor1_code'].apply(
    map_codes_to_description, args=(code_dictionary,)
)

df


Unnamed: 0,source_url,actor1_code,actor1_country_code,actor1_known_group_code,actor1_type1_code,actor2_code,actor2_country_code,actor2_known_group_code,actor2_type1_code,descriptions
0,http://www.ourmidland.com/news/article/30-year...,[],[],[],[],[COP],[],[],[COP],
1,https://107jamz.com/ixp/159/p/gordon-mckernan-...,[],[],[],[],[CVL],[],[],[CVL],
2,https://alaskapublic.org/2024/08/12/kodiak-cel...,[],[],[],[],[COP],[],[],[COP],
3,https://citynews.com.au/2024/fatal-hotel-chopp...,[],[],[],[],[COP],[],[],[COP],
4,https://citynews.com.au/2024/scott-morrison-to...,[],[],[],[],[AUS],[AUS],[],[],
5,https://perezhilton.com/passengers-complained-...,[],[],[],[],[BRA],[BRA],[],[],
6,https://wbt.com/1095438/deacon-darren-bitter-d...,[],[],[],[],[CHRCTH],[],[],[],
7,https://wishtv.com/news/business/biggest-shake...,[],[],[],[],[BUS],[],[],[BUS],
8,https://wishtv.com/news/crime-watch-8/suspect-...,[],[],[],[],[COP],[],[],[COP],
9,https://wpde.com/news/nation-world/one-year-la...,[],[],[],[],[COP],[],[],[COP],
