# Guide to the data
In this notebook I will show you how to access the data with Pandas.
Here you will find:
- How to open the file
- The correct column names
- An explanation of some relevant column names

## Part 1: Reading the data
You can run the data scraping script in a Jupyter notebook or on Google Colab to download the data. On Google Colab it runs quickly.
After you download the data, open the file as follows to avoid problems:

In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
# Folder that holds the raw GDELT event exports. Set the GDELT_DATA_DIR
# environment variable to point at your own copy; the default is the path
# this notebook was originally written against.
DATA_DIR = Path(os.environ.get(
    'GDELT_DATA_DIR',
    '/Users/grego/OneDrive/Documentos/GitHub/factored-datathon-2024-exploding-gradients/Data_Storage/GDELT Event Files',
))
EVENT_FILE = DATA_DIR / '20240813.export.CSV'

# The export is tab-separated and is read without a header row, so the
# columns come back numbered 0..N-1 until names are assigned.
df = pd.read_csv(EVENT_FILE, sep='\t', header=None)

  df = pd.read_csv('/Users/grego/OneDrive/Documentos/GitHub/factored-datathon-2024-exploding-gradients/Data_Storage/GDELT Event Files/20240813.export.CSV', sep = '\t', header = None)


## Part 2: Columns and explanation
Here is a list containing all the column names and a brief explanation of the ones I found relevant. The file is read without a header row, so the column names must be assigned manually.

In [2]:
# Names for the 58 positional columns of the event export, in file order.
# Every geo block (actor1 / actor2 / action) uses the same seven suffixes:
# type, full_name, country_code, adm1_code, lat, long, feature_id.
column_names = [
    'global_id',
    'day',  # Date the event took place in YYYYMMDD format
    'month_year',  # Alternative formatting YYYYMM
    'year',  # Year
    'fraction_date',  # Alternative formatting YYYY.FFFF, where FFFF is the fraction of the year completed by that day
    # actor 1
    'actor1_code',
    'actor1_name',  # Name of actor 1
    'actor1_country_code',
    'actor1_known_group_code',  # Which group the actor belongs to: NGO / IGO / rebel group. Ex: United Nations
    'actor1_ethnic_code',
    'actor1_religion1_code',
    'actor1_religion2_code',
    'actor1_type1_code',  # Type codes describe roles, for example police forces,
    'actor1_type2_code',  # government, military, education, elites, media, etc.
    'actor1_type3_code',  # -
    # actor 2
    'actor2_code',
    'actor2_name',  # Name of actor 2
    'actor2_country_code',
    'actor2_known_group_code',
    'actor2_ethnic_code',
    'actor2_religion1_code',
    'actor2_religion2_code',
    'actor2_type1_code',  # Same as in actor 1
    'actor2_type2_code',  # -
    'actor2_type3_code',  # -
    # event
    'is_root_event',  # Binary. Says if it is the root event. Can give insight into importance
    'event_code',
    'event_base_code',
    'event_root_code',
    'quad_class',  # Event taxonomy: 1. Verbal cooperation, 2. Material cooperation, 3. Verbal conflict, 4. Material conflict
    'goldstein_scale',  # Numeric score from -10 to +10 capturing the potential impact of the event on a country's stability
    'num_mentions',  # Number of mentions of the event across all documents. Can be seen as an importance measure
    'num_sources',  # Number of information sources containing mentions of the event
    'num_articles',  # Number of source documents containing mentions of this event
    'avg_tone',  # Avg tone of documents that mention the event. Goes from -100 (extremely negative) to 100 (extremely positive)
    # actor 1 geo
    'actor1_geo_type',  # Maps to: 1. Country, 2. US State, 3. US City, 4. World city, 5. World State
    'actor1_geo_full_name',  # Name of location
    'actor1_geo_country_code',
    'actor1_geo_adm1_code',
    'actor1_geo_lat',  # Latitude
    'actor1_geo_long',  # Longitude
    'actor1_geo_feature_id',
    # actor 2 geo
    'actor2_geo_type',  # Check actor 1
    'actor2_geo_full_name',
    'actor2_geo_country_code',
    'actor2_geo_adm1_code',
    'actor2_geo_lat',
    'actor2_geo_long',
    'actor2_geo_feature_id',
    # action geo
    'action_geo_type',  # Check actor 1
    'action_geo_full_name',
    'action_geo_country_code',
    'action_geo_adm1_code',
    'action_geo_lat',
    'action_geo_long',
    'action_geo_feature_id',
    # date and url
    'date_added',  # Date the event was added to the master database
    'source_url',  # URL
]

In [3]:
# Replace the positional column labels with the descriptive names, in place.
# NOTE(review): column_names must have exactly one entry per column of df;
# pandas is expected to reject a list of a different length.
df.columns = column_names

Now the data is ready, and you can use it as you normally would with Pandas.

In [4]:
# Optional (disabled): drop rows that are missing any of the numeric measures.
# df = df.dropna(subset=['avg_tone', 'goldstein_scale', 'num_mentions', 'num_sources', 'num_articles'])

# Count how many rows remain when each source URL is kept only once.
# Note: despite their names, len_before and len_after hold DataFrames, not
# lengths; df itself is not modified here (drop_duplicates returns a new frame).
len_before = df
len_after = df.drop_duplicates(subset=['source_url'])
print('Rows before:', len(len_before))
print('Rows after:', len(len_after))

Rows before: 132404
Rows after: 28639


In [5]:
# Count how many event rows cite each article and pick the most cited one
url_counts = df['source_url'].value_counts()
most_repeated_url, repeat_count = url_counts.idxmax(), url_counts.max()

print(f"The most repeated URL is: {most_repeated_url} with {repeat_count} occurrences.")

# Summarize the numeric measures of every row that cites that article
summary_columns = ['avg_tone', 'goldstein_scale', 'num_mentions', 'num_sources', 'num_articles']
cites_most_repeated = df['source_url'] == most_repeated_url
most_repeated_url_data = df[cites_most_repeated]
most_repeated_url_data[summary_columns].describe()

The most repeated URL is: https://www.theborneopost.com/2024/08/13/abg-jo-use-niah-caves-to-delve-deeper-into-history-of-human-settlement-in-borneo/ with 173 occurrences.


Unnamed: 0,avg_tone,goldstein_scale,num_mentions,num_sources,num_articles
count,173.0,173.0,173.0,173.0,173.0
mean,0.581913,2.494798,2.861272,1.011561,2.861272
std,0.262376,0.068426,1.779519,0.107208,1.779519
min,-2.784529,1.9,2.0,1.0,2.0
25%,0.59761,2.5,2.0,1.0,2.0
50%,0.59761,2.5,2.0,1.0,2.0
75%,0.59761,2.5,4.0,1.0,4.0
max,1.264252,2.8,20.0,2.0,20.0


In [6]:
# Numeric measures that are collapsed to their most common value per URL
columns_to_preserve = [
    'avg_tone',
    'goldstein_scale',
    'num_mentions',
    'num_sources',
    'num_articles',
]

# Function to get the mode (most frequent value)
def mode(series):
    """Return the most frequent value of ``series``.

    When several values tie, the first one listed by ``Series.mode()`` is
    returned. If ``Series.mode()`` comes back empty (for example when every
    value is missing), the first element of ``series`` is returned instead.
    """
    # Compute the mode once; the original evaluated series.mode() twice per call.
    modes = series.mode()
    # Positional access, so the result does not depend on the index labels.
    return modes.iloc[0] if not modes.empty else series.iloc[0]

# Collapse to one row per URL, taking the mode of each preserved column
mode_per_column = dict.fromkeys(columns_to_preserve, mode)
df_grouped = df.groupby('source_url').agg(mode_per_column).reset_index()

# Bring back the remaining columns, one row per URL, and attach them by URL
other_columns = df.drop(columns=columns_to_preserve).drop_duplicates('source_url')
df_combined = pd.merge(df_grouped, other_columns, on='source_url', how='left')

# Compare the shapes of the original and combined DataFrames
print('Original DataFrame shape:', df.shape)
print('Combined DataFrame shape:', df_combined.shape)

Original DataFrame shape: (132404, 58)
Combined DataFrame shape: (28639, 58)


In [7]:
# Small sample of de-duplicated events used to try out the scraper
N_TRIAL_ROWS = 10
trial = df_combined.head(N_TRIAL_ROWS)

## Web scraping

In [12]:
import scrapy
import pandas as pd
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from twisted.internet.defer import inlineCallbacks, Deferred
import nest_asyncio
from newspaper import Article
from twisted.internet import reactor

# Apply the necessary fix for asyncio to work in Jupyter Notebooks.
# NOTE(review): presumably needed because the notebook kernel already runs
# an event loop and nest_asyncio allows nesting inside it; confirm.
nest_asyncio.apply()

class NewsSpider(scrapy.Spider):
    """Fetch a list of news URLs and extract the article text of each page.

    Attributes:
        urls: The URLs to fetch, in the order they were given.
        extracted_texts: One entry per input URL, in the same order as
            ``urls``. Each entry is the article text, or one of the
            placeholders "No content extracted", "Error during extraction"
            or "Request failed". An entry stays ``None`` if no callback was
            ever invoked for that URL.
    """
    name = "news_spider"

    def __init__(self, urls, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.urls = list(urls)
        # Pre-sized so each result is stored at the position of its URL.
        # Requests finish in arbitrary order, so appending results as they
        # arrive would not line up with the input URLs.
        self.extracted_texts = [None] * len(self.urls)

    def start_requests(self):
        """Yield one request per input URL, tagged with its position."""
        for index, url in enumerate(self.urls):
            yield scrapy.Request(
                url=url,
                callback=self.parse,
                errback=self.errback_handler,
                cb_kwargs={"index": index},
                # Keep repeated URLs so every input position gets a result.
                dont_filter=True,
            )

    def _record(self, index, text):
        """Store ``text`` at ``index``; append when no position is known."""
        if index is None:
            self.extracted_texts.append(text)
        else:
            self.extracted_texts[index] = text

    def parse(self, response, index=None):
        """Extract the article text from an already-downloaded response."""
        try:
            article = Article(response.url)
            # Reuse the HTML Scrapy has already fetched instead of downloading
            # the page a second time with a blocking request.
            article.download(input_html=response.text)
            article.parse()
            text = article.text
            self._record(index, text if text else "No content extracted")
            self.logger.info(f"Extracted text for {response.url}")
        except Exception as e:
            # Best effort: one unparsable page must not abort the whole crawl.
            self.logger.error(f"Failed to extract text from {response.url} with error: {str(e)}")
            self._record(index, "Error during extraction")

    def errback_handler(self, failure):
        """Log a failed request and store a placeholder at its position."""
        self.logger.error(f"Request failed for {failure.request.url} with error: {failure.value}")
        self._record(failure.request.cb_kwargs.get("index"), "Request failed")

In [None]:
urls = trial['source_url'].tolist()
# Configure logging for Scrapy
configure_logging()
runner = CrawlerRunner()

@inlineCallbacks
def crawl():
    """Run NewsSpider over ``urls`` and return the finished spider instance."""
    # runner.crawl() does not hand back the spider, so create the crawler
    # explicitly and read the spider from it once the crawl is done.
    crawler = runner.create_crawler(NewsSpider)
    try:
        yield runner.crawl(crawler, urls=urls)
    finally:
        # Always stop the reactor, even if the crawl failed, so the cell
        # cannot hang forever inside reactor.run().
        if reactor.running:
            reactor.stop()
    return crawler.spider

# Run the crawl process
crawl_outcome = []
deferred = crawl()
deferred.addBoth(crawl_outcome.append)
# NOTE: a Twisted reactor cannot be restarted, so re-running this cell
# requires restarting the kernel first.
if not crawl_outcome:
    reactor.run()

# Retrieve the spider instance (or re-raise whatever made the crawl fail)
outcome = crawl_outcome[0]
if not isinstance(outcome, NewsSpider):
    outcome.raiseException()
spider = outcome

# The texts belong to the sampled `trial` rows, not to the full `df`, whose
# length does not match. NOTE(review): this assumes extracted_texts has one
# entry per URL, in the same order as `urls`; verify, since Scrapy completes
# requests asynchronously.
trial_with_text = trial.assign(extracted_text=spider.extracted_texts)

In [None]:
# Keep only the float and integer columns and list their names
numeric_dtypes = [float, int]
df_numeric = df.select_dtypes(include=numeric_dtypes)
df_numeric.columns

Index(['global_id', 'day', 'month_year', 'year', 'fraction_date',
       'is_root_event', 'event_code', 'event_base_code', 'event_root_code',
       'quad_class', 'goldstein_scale', 'num_mentions', 'num_sources',
       'num_articles', 'avg_tone', 'actor1_geo_type', 'actor1_geo_lat',
       'actor1_geo_long', 'actor2_geo_type', 'actor2_geo_lat',
       'actor2_geo_long', 'action_geo_type', 'action_geo_lat',
       'action_geo_long', 'date_added'],
      dtype='object')