In [1]:
import pandas as pd
from data_columns import *

In [2]:
# Define a function for concatenation
def concatenate_values(series):
    return ' '.join(series.dropna().astype(str))

# Split the string into a list of words, remove duplicates, and maintain original order
def remove_duplicates_from_string(s):
    return list(sorted(set(s.split()), key=s.split().index))

#Define a function to remove duplicates and join codes
def remove_duplicates_concatenate(df, columns):
    df = df.groupby('source_url', as_index=False).agg({
    col: concatenate_values for col in columns
    })

    for col in columns:
        df[col] = df[col].apply(remove_duplicates_from_string)

    return df

def event_to_description(codes_list, code_dictionary):
    descriptions = []
    for code in codes_list:
        code = int(code)
        descriptions.append(code_dictionary.get(code, ''))
    return ' '.join(descriptions)

def actor_to_description(codes_list, code_dictionary):
    descriptions = []
    for code in codes_list:
        if len(code) % 3 == 0:
            # Code length is a multiple of 3
            chunks = [code[i:i+3] for i in range(0, len(code), 3)]
            chunk_descriptions = [code_dictionary.get(chunk, '') for chunk in chunks]
            descriptions.append(' '.join(chunk_descriptions))
        else:
            continue
    return ' '.join(descriptions)

def remove_duplicates_description(text):
    words = text.split()
    seen = set()
    unique_words = []
    
    for word in words:
        if word not in seen:
            seen.add(word)
            unique_words.append(word)
    
    return ' '.join(unique_words)

In [3]:
actor_codes = ['actor1_code', 'actor1_country_code', 'actor1_known_group_code', 'actor1_type1_code', 
         'actor2_code', 'actor2_country_code', 'actor2_known_group_code', 'actor2_type1_code']

event_codes = ['event_code']

important_columns = actor_codes + event_codes

#other_important = ['actor1_name', 'actor2_name', 'actor1_geo_full_name','actor2_geo_fullname']

actors = pd.read_csv("Entity_Codes/actor_codes.csv")
countries = pd.read_csv("Entity_Codes/country_codes.csv")
events = pd.read_csv("Entity_Codes/event_codes.csv")
codes = pd.concat([actors, countries, events], ignore_index=True)
code_dictionary = dict(zip(codes['Code'], codes['Description']))

In [8]:
df = pd.read_csv('./Data_Storage/GDELT Event Files/20240813.export.CSV', sep = '\t', header = None)
df.columns = column_names

df = remove_duplicates_concatenate(df, important_columns)

  df = pd.read_csv('./Data_Storage/GDELT Event Files/20240813.export.CSV', sep = '\t', header = None)


In [12]:
master_data = pd.read_csv(r'masterData.csv')
master_data.columns = column_names
master_data = remove_duplicates_concatenate(master_data, important_columns)

  master_data = pd.read_csv(r'masterData.csv')


In [10]:
# Initialize the descriptions column
df['description'] = ''

# Iterate over each column and append descriptions for codes with letters
for column in actor_codes:
    df['description'] += df[column].apply(lambda x: actor_to_description(x, code_dictionary)) + ' '

# Iterate over each column and append descriptions for codes with numbers
for column in event_codes:
    df['description'] += df[column].apply(lambda x: event_to_description(x, code_dictionary)) + ' '

df['description'] = df['description'].apply(remove_duplicates_description)

df
    

Unnamed: 0,source_url,actor1_code,actor1_country_code,actor1_known_group_code,actor1_type1_code,actor2_code,actor2_country_code,actor2_known_group_code,actor2_type1_code,event_code,description
0,http://016.jose947.com/,[USA],[USA],[],[],[BUS],[],[],[BUS],[90],United States of America Business Investigate
1,http://0kcw.jose947.com/,[AGR],[],[],[AGR],[],[],[],[],[30],Agriculture Express intent to cooperate
2,http://africa.chinadaily.com.cn/a/202408/13/WS...,"[CHNGOV, GBR]","[CHN, GBR]",[],[GOV],"[GBR, CHNGOV]","[GBR, CHN]",[],[GOV],"[40, 161]","China Government, political parties United Kin..."
3,http://africa.chinadaily.com.cn/a/202408/13/WS...,[CHN],[CHN],[],[],[],[],[],[],"[51, 71]",China Praise or endorse Provide economic aid
4,http://agenda.ge/en/news/2024/40172,"[AZE, HUN, ROU, USA]","[AZE, HUN, USA]",[],[],"[HUN, ROU, USA, AZE]","[HUN, USA, AZE]",[],[],[57],Azerbaijan Hungary Romania United States of Am...
...,...,...,...,...,...,...,...,...,...,...,...
28634,https://zeenews.india.com/world/who-is-faiz-ha...,"[MIL, PAKMIL]",[PAK],[],[MIL],[PAK],[PAK],[],[],"[90, 173]","Military Pakistan Investigate Arrest, detain, ..."
28635,https://zenopa.com/news/fda-approves-ascendis-...,[GOV],[],[],[GOV],[],[],[],[],[20],"Government, political parties USE UNCONVENTION..."
28636,https://znsbahamas.com/pm-davis-keen-to-see-sm...,"[BHS, BHSGOV, GOV]",[BHS],[],[GOV],"[GOV, HLH, BHS]",[BHS],[],"[GOV, HLH]","[36, 43, 10, 51, 60, 71, 114, 42]","Bahamas Government, political parties ""Health""..."
28637,https://znsbahamas.com/residents-unite-against...,[CVL],[],[],[CVL],[BHS],[BHS],[],[],[141],Civilians Bahamas Demonstrate or rally


In [9]:
df

Unnamed: 0,source_url,actor1_code,actor1_country_code,actor1_known_group_code,actor1_type1_code,actor2_code,actor2_country_code,actor2_known_group_code,actor2_type1_code,event_code
0,http://016.jose947.com/,[USA],[USA],[],[],[BUS],[],[],[BUS],[90]
1,http://0kcw.jose947.com/,[AGR],[],[],[AGR],[],[],[],[],[30]
2,http://africa.chinadaily.com.cn/a/202408/13/WS...,"[CHNGOV, GBR]","[CHN, GBR]",[],[GOV],"[GBR, CHNGOV]","[GBR, CHN]",[],[GOV],"[40, 161]"
3,http://africa.chinadaily.com.cn/a/202408/13/WS...,[CHN],[CHN],[],[],[],[],[],[],"[51, 71]"
4,http://agenda.ge/en/news/2024/40172,"[AZE, HUN, ROU, USA]","[AZE, HUN, USA]",[],[],"[HUN, ROU, USA, AZE]","[HUN, USA, AZE]",[],[],[57]
...,...,...,...,...,...,...,...,...,...,...
28634,https://zeenews.india.com/world/who-is-faiz-ha...,"[MIL, PAKMIL]",[PAK],[],[MIL],[PAK],[PAK],[],[],"[90, 173]"
28635,https://zenopa.com/news/fda-approves-ascendis-...,[GOV],[],[],[GOV],[],[],[],[],[20]
28636,https://znsbahamas.com/pm-davis-keen-to-see-sm...,"[BHS, BHSGOV, GOV]",[BHS],[],[GOV],"[GOV, HLH, BHS]",[BHS],[],"[GOV, HLH]","[36, 43, 10, 51, 60, 71, 114, 42]"
28637,https://znsbahamas.com/residents-unite-against...,[CVL],[],[],[CVL],[BHS],[BHS],[],[],[141]


In [13]:
#TODO: ValueError: invalid literal for int() with base 10: '---'

master_data['description'] = ''

# Iterate over each column and append descriptions for codes with letters
for column in actor_codes:
    master_data['description'] += master_data[column].apply(lambda x: actor_to_description(x, code_dictionary)) + ' '

# Iterate over each column and append descriptions for codes with numbers
for column in event_codes:
    master_data['description'] += master_data[column].apply(lambda x: event_to_description(x, code_dictionary)) + ' '

master_data['description'] = master_data['description'].apply(remove_duplicates_description)

master_data
    

ValueError: invalid literal for int() with base 10: '---'

In [15]:
master_data.to_csv("masterDataDropDuplicates.csv", index=False)