In [1]:
import os
import requests
import pandas as pd
import numpy as np

from scrapy import Selector
from datetime import datetime

import zipfile

## GDELT 1.0 GKG Data

In [2]:
gkg_url = 'http://data.gdeltproject.org/gkg/index.html'
response = requests.get(gkg_url)
sel = Selector(text = response.text)

start_date = datetime(2023, 8, 13)
end_date = datetime(2024, 8, 13)

# Get the links as a url that can be downloaded later
links = sel.xpath('//a/@href').extract()

gkg_links = []
gkgcounts_links = []
base = "http://data.gdeltproject.org/gkg/"

for link in links:
    if link.endswith('.zip'):
        date_str = link.split('.')[0]
        try:
            file_date = datetime.strptime(date_str, "%Y%m%d")
            if start_date <= file_date <= end_date:
                if 'gkgcounts' in link:
                    gkgcounts_links.append(base + link)
                else:
                    gkg_links.append(base + link)
        except ValueError:
            continue

In [None]:
destination = "GDELT GKG Files"

os.makedirs(destination, exist_ok=True)

for link in gkg_links:
    file_name = link.split('/')[-1]
    file_path = os.path.join(destination, file_name)

    response = requests.get(link)

    with open(file_path, 'wb') as file:
        file.write(response.content)

    # Decompress the zip file
    with zipfile.ZipFile(file_path, 'r') as zip_ref:
        zip_ref.extractall(destination)
        print(f"Extracted {file_name} to {destination}")

    # Delete the zip file after extraction
    os.remove(file_path)

print("All files downloaded and extracted.")

In [None]:
destination = "GDELT GKG Files/gkgcounts"

os.makedirs(destination, exist_ok=True)

for link in gkgcounts_links:
    file_name = link.split('/')[-1]
    file_path = os.path.join(destination, file_name)

    response = requests.get(link)

    with open(file_path, 'wb') as file:
        file.write(response.content)

    # Decompress the zip file
    with zipfile.ZipFile(file_path, 'r') as zip_ref:
        zip_ref.extractall(destination)
        print(f"Extracted {file_name} to {destination}")

    # Delete the zip file after extraction
    os.remove(file_path)

print("All files downloaded and extracted.")

## GDELT 1.0 Events Data

In [5]:
events_url = "http://data.gdeltproject.org/events/index.html"
response = requests.get(events_url)
sel = Selector(text = response.text)

start_date = datetime(2023, 8, 13)
end_date = datetime(2024, 8, 13)

# Get the links as a url that can be downloaded later
links = sel.xpath('//a/@href').extract()

downloadable_links = []
base = "http://data.gdeltproject.org/events/"
for link in links:
    if link.endswith('.zip'):
        date_str = link.split('.')[0]
        try:
            file_date = datetime.strptime(date_str, "%Y%m%d")
            if start_date <= file_date <= end_date:
                downloadable_links.append(base + link)
        except ValueError:
            continue

In [None]:
destination = "GDELT Event Files"

os.makedirs(destination, exist_ok=True)

for link in downloadable_links:
    file_name = link.split('/')[-1]
    file_path = os.path.join(destination, file_name)

    response = requests.get(link)

    with open(file_path, 'wb') as file:
        file.write(response.content)

    # Decompress the zip file
    with zipfile.ZipFile(file_path, 'r') as zip_ref:
        zip_ref.extractall(destination)
        print(f"Extracted {file_name} to {destination}")

    # Delete the zip file after extraction
    os.remove(file_path)

print("All files downloaded and extracted.")

### Data Exploration

In [9]:
trial = pd.read_csv("./GDELT Event Files/20240813.export.CSV", sep= "\t", header = None)

  trial = pd.read_csv("./GDELT Event Files/20240813.export.CSV", sep= "\t", header = None)


In [10]:
csv_columns = ['global_id',
 'day', # Date the event took place in YYYYMMDD format
 'month_year', # Alternative formating YYYYMM
 'year', # Year
 'fraction_date', # Alternative formating YYYY.FFFF, where FFFF is the percentage of the year completed by that day
# actor 1
 'actor1_code',
 'actor1_name', # Name of Actor 1
 'actor1_country_code',
 'actor1_known_group_code', # Which group the actor belongs to NGO/ IGO/ rebel group. Ex: United Nations
 'actor1_ethnic_code',
 'actor1_religion1_code',
 'actor1_religion2_code',
 'actor1_type1_code', # Type codes talk about roles, for example police forces
 'actor1_type2_code', # goverment, military, education, elites, media, etc
 'actor1_type3_code', # -
# actor 2
 'actor2_code',
 'actor2_name', # Name of actor 2
 'actor2_country_code',
 'actor2_known_group_code',
 'actor2_ethnic_code',
 'actor2_religion1_code',
 'actor2_religion2_code',
 'actor2_type1_code', # Same as in actor 1
 'actor2_type2_code', # -
 'actor2_type3_code', # -
# ----------------
 'is_root_event', # Binary. Says if it is the root event. Can give insight into importance
 'event_code',
 'event_base_code',
 'event_root_code',
 'quad_class', # Event taxonomy: 1. Verbal cooperation, 2. Material Cooperation, 3. Verbal Conflict, 4. Material Conflict
 'goldstein_scale', # Numeric score from -10 to +10 capturing potential impact that the event will have in countries stability
 'num_mentions', # Number of mentions of the event across all documents. Can be seen as importance measure
 'num_sources', # Number of information sources containing mentions of the event
 'num_articles',# Number of source documents containing mentions of this event
 'avg_tone', # Avg tone of documents that mention the event. Goes from -100 (extremely negative) to 100 (extremely positive)
# actor 1 geo
 'actor1_geo_type', # Maps to: 1.Country, 2. US State, 3. US City, 4. World city, 5. World State
 'actor1_geo_full_name', # Name of location
 'actor1_geo_country_code',
 'actor1_geo_adm1_code',
 'actor1_geo_lat', # Latitude
 'actor1_geo_long', # Longitude
 'actor1_geo_feature_id',
# actor 2 geo
 'actor2_geo_type', # Check actor 1
 'actor2_geo_fullname',
 'actor2_geo_countrycode',
 'actor2_geo_adm1_code',
 'actor2_geo_lat',
 'actor2_geo_long',
 'actor2_geo_feature_id',
# action geo
 'action_geo_type', # Check actor 1
 'action2_geo_full_name',
 'action_geo_country_code',
 'action_geo_adm1_code',
 'action_geo_lat',
 'action_geo_long',
 'action_geo_feature_id',
# date and url
 'date_added', # Date the event was added to master database
 'source_url'] # URL

In [11]:
trial.columns = csv_columns

In [14]:
for i in range(1,3):
    df = pd.read_csv(f"GDELT Event Files/2024081{i}.export.CSV", sep= "\t", header = None)
    df.columns = csv_columns
    print(df.shape)

  df = pd.read_csv(f"GDELT Event Files/2024081{i}.export.CSV", sep= "\t", header = None)


(69353, 58)
(116950, 58)


In [15]:
df.iloc[:,24].value_counts()

actor2_type3_code
MED    20
COP    13
GOV     8
MIL     6
ELI     5
BUS     4
EDU     3
ENV     2
JUD     1
LEG     1
Name: count, dtype: int64

In [None]:
from newspaper import Article
import spacy

# Initialize spacy NER model
    # nlp = spacy.load("en_core_web_sm")

# List of URLs
# urls = [
#     "https://theafronews.com/price-fixing-accusations-cast-shadow-on-food-industry-giants/",
#     "https://theafronews.com/price-fixing-accusations-cast-shadow-on-food-industry-giants/",
#     "https://www.wkrb13.com/2023/08/12/ingalls-snyder-llc-sells-8972-shares-of-bank-of-america-co-nysebac.html",
#     "https://www.thetimes.co.uk/article/questions-over-who-really-owns-ppe-firm-linked-to-mone-given-200m-0t3vk5pbl"
# ]

# Function to extract content from a URL
def extract_article_text(url):
    article = Article(url)
    article.download()
    try:
        article.parse()
    except:
        pass
    return article.title

# Function to perform NER on the article title
    def perform_ner(text):
        doc = nlp(text)
        return [(ent.text, ent.label_) for ent in doc.ents]

# Process each URL
# for url in urls:
#     title, text = extract_article_text(url)
#     entities = perform_ner(title)
#     print(f"Title: {title}")
#     print(f"Entities: {entities}")
#     print("-" * 40)


In [None]:
prueba = df[:100]
prueba['titulo'] = prueba['source_url'].apply(extract_article_text)
prueba

In [None]:
df = pd.read_csv(r"masterData.csv")

df