In [17]:
import pandas as pd
from data_columns import *
import datetime
import os
from url_getter import get_sorted_data

In [42]:
def concatenate_values(series):
    """Join the non-null entries of a pandas Series into one string.

    Null entries are dropped, the remaining values are cast to ``str`` and
    joined with single spaces in their original order. A Series without any
    non-null value yields the empty string.
    """
    present = series.dropna()
    tokens = present.astype(str)
    return ' '.join(tokens)

def remove_duplicates_from_string(s):
    """Split ``s`` on whitespace and return its distinct tokens.

    Despite the name, the result is a list of tokens, not a string.

    Args:
        s: Whitespace-separated string.

    Returns:
        list[str]: The unique tokens of ``s`` in order of first occurrence;
        ``[]`` when ``s`` is empty or contains only whitespace.
    """
    # dict keys are unique and keep insertion order, so this de-duplicates in
    # a single pass rather than running a linear ``list.index`` per token.
    return list(dict.fromkeys(s.split()))

def remove_duplicates_concatenate(df, columns):
    """Collapse ``df`` to one row per ``source_url`` with unique tokens.

    For every name in ``columns`` the non-null values of each ``source_url``
    group are joined into one string (``concatenate_values``), which is then
    split into a list of unique tokens in first-seen order
    (``remove_duplicates_from_string``).

    Returns a new DataFrame containing ``source_url`` plus the requested
    columns; the input frame is not modified.
    """
    aggregations = dict.fromkeys(columns, concatenate_values)
    grouped = df.groupby('source_url', as_index=False).agg(aggregations)

    for name in columns:
        grouped[name] = grouped[name].apply(remove_duplicates_from_string)

    return grouped

def event_to_description(codes_list, code_dictionary):
    """Translate event codes into a space-separated string of descriptions.

    Args:
        codes_list: Iterable of code strings.
        code_dictionary: Mapping from code string to description text.

    Returns:
        str: Descriptions of the recognised codes, in input order, joined by
        single spaces. A code contributes nothing when it is not purely
        alphanumeric, is absent from ``code_dictionary``, or maps to an empty
        or non-string description, so unknown codes never leave stray spaces
        in the result. Returns ``''`` when no code is recognised.
    """
    descriptions = []
    for code in codes_list:
        # Tokens containing anything but letters/digits are not valid codes.
        if not code.isalnum():
            continue
        description = code_dictionary.get(code)
        # Skip unknown codes rather than appending '' (which would put extra
        # spaces in the joined text) and ignore non-string values such as
        # NaN, which ``str.join`` cannot handle.
        if isinstance(description, str) and description:
            descriptions.append(description)
    return ' '.join(descriptions)

def actor_to_description(codes_list, code_dictionary):
    """Translate composite actor codes into a string of descriptions.

    Each code is read as a sequence of consecutive 3-character chunks and
    every chunk is looked up in ``code_dictionary``.

    Args:
        codes_list: Iterable of code strings.
        code_dictionary: Mapping from 3-character code to description text.

    Returns:
        str: Descriptions of the recognised chunks, in input order, joined by
        single spaces. Codes whose length is not a multiple of 3 are skipped
        entirely; chunks that are unknown or map to an empty or non-string
        description contribute nothing, so they never leave stray spaces in
        the result. Returns ``''`` when nothing is recognised.
    """
    descriptions = []
    for code in codes_list:
        # Only codes that split evenly into 3-character chunks are decoded.
        if len(code) % 3 != 0:
            continue
        for start in range(0, len(code), 3):
            description = code_dictionary.get(code[start:start + 3])
            # Skip unknown chunks rather than appending '' (extra spaces in
            # the joined text) and ignore non-string values such as NaN,
            # which ``str.join`` cannot handle.
            if isinstance(description, str) and description:
                descriptions.append(description)
    return ' '.join(descriptions)

def remove_duplicates_description(text):
    """Return ``text`` with repeated whitespace-separated words removed.

    Only the first occurrence of each word is kept, in its original position,
    and the surviving words are re-joined with single spaces (so runs of
    whitespace in the input are collapsed as well).
    """
    # Insertion-ordered dict keys give "first occurrence wins" semantics.
    first_occurrences = dict.fromkeys(text.split())
    return ' '.join(first_occurrences)

def clean_dataframe(df):
    """Compute per-``source_url`` averages of the numeric metric columns.

    Groups ``df`` by ``source_url`` and takes the mean of ``num_sources``,
    ``avg_tone``, ``goldstein_scale``, ``num_articles`` and ``num_mentions``.

    Returns a new DataFrame with one row per ``source_url`` and exactly those
    columns (``source_url`` is kept as a regular column, not as the index).
    """
    averaged_columns = (
        'num_sources',
        'avg_tone',
        'goldstein_scale',
        'num_articles',
        'num_mentions',
    )
    aggregations = dict.fromkeys(averaged_columns, 'mean')
    per_url = df.groupby('source_url', as_index=False)
    return per_url.agg(aggregations)

def add_description(df, actor_codes, event_codes, code_dictionary, fraction=0.15):
    """Build a de-duplicated ``description`` column and return the sorted data.

    For every row, the codes stored in the ``actor_codes`` and ``event_codes``
    columns are translated to text via ``code_dictionary``; the pieces are
    concatenated and repeated words are removed.

    Args:
        df: DataFrame whose code columns hold iterables of code strings.
        actor_codes: Names of the columns decoded with ``actor_to_description``.
        event_codes: Names of the columns decoded with ``event_to_description``.
        code_dictionary: Mapping from code to description text.
        fraction: Multiplied by ``len(df)`` and truncated to an int to form the
            second argument of ``get_sorted_data``. Defaults to ``0.15``.
            NOTE(review): presumably this controls how many rows are
            kept/returned -- confirm against ``url_getter.get_sorted_data``.

    Returns:
        Whatever ``get_sorted_data(df, int(len(df) * fraction))`` returns.

    Note:
        ``df`` is modified in place: a ``description`` column is added (or
        overwritten if it already exists).
    """
    # Start from an empty string so the per-column pieces can be appended.
    df['description'] = ''

    # Append the decoded text of every actor-code column, space separated.
    for column in actor_codes:
        df['description'] += df[column].apply(
            lambda codes: actor_to_description(codes, code_dictionary)
        ) + ' '

    # Append the decoded text of every event-code column, space separated.
    for column in event_codes:
        df['description'] += df[column].apply(
            lambda codes: event_to_description(codes, code_dictionary)
        ) + ' '

    # Drop repeated words; this also normalises the separators added above.
    df['description'] = df['description'].apply(remove_duplicates_description)

    return get_sorted_data(df, int(len(df) * fraction))


In [55]:
# Columns whose values are actor codes (decoded in 3-character chunks).
actor_codes = ['actor1_code', 'actor1_known_group_code', 'actor1_type1_code',
         'actor2_code', 'actor2_known_group_code', 'actor2_type1_code']

# Columns whose values are event codes (decoded as whole codes).
event_codes = ['event_code']

# Every code column that is aggregated per source_url.
important_columns = actor_codes + event_codes

#other_important = ['actor1_name', 'actor2_name', 'actor1_geo_full_name','actor2_geo_fullname']

# Read the 'Code' column as text. Without an explicit dtype pandas infers
# integers for an all-numeric column, which strips leading zeros and produces
# int keys that can never match the string tokens used for the lookups.
actors = pd.read_csv("Entity_Codes/actor_codes.csv", dtype={'Code': str})
# countries = pd.read_csv("Entity_Codes/country_codes.csv")
events = pd.read_csv("Entity_Codes/event_codes.csv", dtype={'Code': str})

# Single lookup table: code -> description, covering actors and events.
codes = pd.concat([actors, events], ignore_index=True)
code_dictionary = dict(zip(codes['Code'], codes['Description']))

In [None]:
# Define the date range
start_date = datetime.datetime(2023, 8, 13)
end_date = datetime.datetime(2024, 8, 13)

# One entry per calendar day between the two dates (both included)
date_range = pd.date_range(start=start_date, end=end_date)

# Define the file path template
file_path_template = "./Data_Storage/GDELT Event Files/{date}.export.CSV"

# The export files are read without a header row, so at read time columns are
# addressed by position. The code columns are forced to text: letting pandas
# infer a numeric dtype would strip leading zeros (or render values such as
# '10.0' when blanks are present), and the resulting tokens would not match
# the string keys of the code dictionary.
code_column_dtypes = {
    list(column_names).index(name): str for name in important_columns
}

describe_columns = []
# Loop through each date in the range
for single_date in date_range:
    date_str = single_date.strftime("%Y%m%d")

    # Construct the file path for the current date
    file_path = file_path_template.format(date=date_str)

    # Days for which no export file is present are skipped
    if not os.path.exists(file_path):
        continue

    print(f"Processing file: {file_path}")

    # Read the tab-separated file, then attach the column names; assigning
    # them afterwards keeps the length check against the file's column count
    df = pd.read_csv(file_path, sep='\t', header=None, dtype=code_column_dtypes)
    df.columns = column_names

    # Per-URL averages of the numeric metrics
    df_avg = clean_dataframe(df)

    # One row per URL with the unique codes of every code column
    df = remove_duplicates_concatenate(df, important_columns)

    # Merge the processed codes with the computed averages
    df = df.merge(df_avg, on='source_url', how='left')

    # Add the descriptions
    sorted_data = add_description(df, actor_codes, event_codes, code_dictionary)

    describe_columns.append(sorted_data)


In [62]:
# Stack the per-file results into one frame with a fresh 0..n-1 index.
result_df = pd.concat(objs=describe_columns, ignore_index=True)

In [63]:
# Write the combined result to disk, leaving out the index column.
result_df.to_csv(path_or_buf='result.csv', index=False)