In [3]:
import os
import requests
import pandas as pd
import numpy as np

from scrapy import Selector
from datetime import datetime

import zipfile

## GDELT 1.0 GKG Data

In [4]:
# Collect the download URLs of the GDELT 1.0 GKG archives listed on the index
# page whose leading date falls inside [start_date, end_date].
gkg_url = 'http://data.gdeltproject.org/gkg/index.html'
# The timeout stops a stalled connection from hanging the kernel, and
# raise_for_status() prevents an HTTP error page from being parsed as the index
# (which would silently yield empty link lists).
response = requests.get(gkg_url, timeout=60)
response.raise_for_status()
sel = Selector(text = response.text)

start_date = datetime(2023, 8, 13)
end_date = datetime(2024, 8, 13)

# Get the links as a url that can be downloaded later
links = sel.xpath('//a/@href').extract()

gkg_links = []
gkgcounts_links = []
base = "http://data.gdeltproject.org/gkg/"

for link in links:
    if not link.endswith('.zip'):
        continue

    # The text before the first dot is parsed as a YYYYMMDD date.
    date_str = link.split('.')[0]
    try:
        file_date = datetime.strptime(date_str, "%Y%m%d")
    except ValueError:
        # Link name does not start with a YYYYMMDD date; skip it.
        continue

    if start_date <= file_date <= end_date:
        # Links containing 'gkgcounts' are kept in their own list; every other
        # dated zip goes to gkg_links.
        if 'gkgcounts' in link:
            gkgcounts_links.append(base + link)
        else:
            gkg_links.append(base + link)

In [None]:
# Download every archive in gkg_links, extract it into `destination`, and
# remove the downloaded zip afterwards.
destination = "GDELT GKG Files"

os.makedirs(destination, exist_ok=True)

for link in gkg_links:
    file_name = link.split('/')[-1]
    file_path = os.path.join(destination, file_name)

    # Fail fast on HTTP errors so an error page is never saved as a ".zip"
    # (which would only surface later as a confusing BadZipFile), and use a
    # timeout so a stalled connection cannot hang the loop.
    response = requests.get(link, timeout=120)
    response.raise_for_status()

    with open(file_path, 'wb') as file:
        file.write(response.content)

    try:
        # Decompress the zip file
        with zipfile.ZipFile(file_path, 'r') as zip_ref:
            zip_ref.extractall(destination)
        print(f"Extracted {file_name} to {destination}")
    finally:
        # Delete the zip file after extraction; also runs when extraction
        # fails, so a corrupt archive is not left behind.
        os.remove(file_path)

print("All files downloaded and extracted.")

In [None]:
# Download every archive in gkgcounts_links, extract it into `destination`,
# and remove the downloaded zip afterwards.
destination = "GDELT GKG Files/gkgcounts"

os.makedirs(destination, exist_ok=True)

for link in gkgcounts_links:
    file_name = link.split('/')[-1]
    file_path = os.path.join(destination, file_name)

    # Fail fast on HTTP errors so an error page is never saved as a ".zip"
    # (which would only surface later as a confusing BadZipFile), and use a
    # timeout so a stalled connection cannot hang the loop.
    response = requests.get(link, timeout=120)
    response.raise_for_status()

    with open(file_path, 'wb') as file:
        file.write(response.content)

    try:
        # Decompress the zip file
        with zipfile.ZipFile(file_path, 'r') as zip_ref:
            zip_ref.extractall(destination)
        print(f"Extracted {file_name} to {destination}")
    finally:
        # Delete the zip file after extraction; also runs when extraction
        # fails, so a corrupt archive is not left behind.
        os.remove(file_path)

print("All files downloaded and extracted.")

## GDELT 1.0 Events Data

In [6]:
# Collect the download URLs of the GDELT 1.0 event archives listed on the
# index page whose leading date falls inside [start_date, end_date].
events_url = "http://data.gdeltproject.org/events/index.html"
# The timeout stops a stalled connection from hanging the kernel, and
# raise_for_status() prevents an HTTP error page from being parsed as the index
# (which would silently yield an empty link list).
response = requests.get(events_url, timeout=60)
response.raise_for_status()
sel = Selector(text = response.text)

start_date = datetime(2023, 8, 13)
end_date = datetime(2024, 8, 13)

# Get the links as a url that can be downloaded later
links = sel.xpath('//a/@href').extract()

downloadable_links = []
base = "http://data.gdeltproject.org/events/"
for link in links:
    if not link.endswith('.zip'):
        continue

    # The text before the first dot is parsed as a YYYYMMDD date.
    date_str = link.split('.')[0]
    try:
        file_date = datetime.strptime(date_str, "%Y%m%d")
    except ValueError:
        # Link name does not start with a YYYYMMDD date; skip it.
        continue

    if start_date <= file_date <= end_date:
        downloadable_links.append(base + link)

In [None]:
# Download every archive in downloadable_links, extract it into `destination`,
# and remove the downloaded zip afterwards.
destination = "GDELT Event Files"

os.makedirs(destination, exist_ok=True)

for link in downloadable_links:
    file_name = link.split('/')[-1]
    file_path = os.path.join(destination, file_name)

    # Fail fast on HTTP errors so an error page is never saved as a ".zip"
    # (which would only surface later as a confusing BadZipFile), and use a
    # timeout so a stalled connection cannot hang the loop.
    response = requests.get(link, timeout=120)
    response.raise_for_status()

    with open(file_path, 'wb') as file:
        file.write(response.content)

    try:
        # Decompress the zip file
        with zipfile.ZipFile(file_path, 'r') as zip_ref:
            zip_ref.extractall(destination)
        print(f"Extracted {file_name} to {destination}")
    finally:
        # Delete the zip file after extraction; also runs when extraction
        # fails, so a corrupt archive is not left behind.
        os.remove(file_path)

print("All files downloaded and extracted.")

### Data Exploration

In [34]:
# Load a single extracted daily export for exploration. The file is read as
# tab-separated with no header row. The path is relative to the working
# directory, matching the "GDELT Event Files" folder the download cell writes
# to, rather than a hard-coded absolute "/content/..." location.
trial = pd.read_csv(
    os.path.join("GDELT Event Files", "20230813.export.CSV"),
    sep="\t",
    header=None,
)

In [39]:
# Column labels for a GDELT 1.0 daily event export, in file order.
# Order follows the published GDELT 1.0 daily-update header
# (GLOBALEVENTID, SQLDATE, MonthYear, Year, FractionDate, actor fields, event
# fields, three 7-field geo groups, DATEADDED, SOURCEURL) -- 58 columns.
# The earlier list also had 58 entries but placed "insert_date"/"url" at
# positions 2-3 and included *_adm2_code fields, which are not part of the 1.0
# layout, so every label after the second was shifted onto the wrong data.
# NOTE(review): "published_date" is kept as the label for SQLDATE and
# "insert_date" for DATEADDED to preserve existing names -- confirm intent.
csv_columns = [
    # Identifier and date fields
    "gdelt_id", "published_date", "month_year", "year", "fraction_date",
    # Actor 1 attributes
    "actor1_code", "actor1_name", "actor1_country_code", "actor1_known_group_code",
    "actor1_ethnic_code", "actor1_religion1_code", "actor1_religion2_code",
    "actor1_type1_code", "actor1_type2_code", "actor1_type3_code",
    # Actor 2 attributes
    "actor2_code", "actor2_name", "actor2_country_code", "actor2_known_group_code",
    "actor2_ethnic_code", "actor2_religion1_code", "actor2_religion2_code",
    "actor2_type1_code", "actor2_type2_code", "actor2_type3_code",
    # Event action attributes
    "is_root_event", "event_code", "event_base_code", "event_root_code",
    "quad_class", "goldstein_scale", "num_mentions", "num_sources",
    "num_articles", "avg_tone",
    # Actor 1 geography
    "actor1_geo_type", "actor1_geo_full_name", "actor1_geo_country_code",
    "actor1_geo_adm1_code", "actor1_geo_lat", "actor1_geo_long",
    "actor1_geo_feature_id",
    # Actor 2 geography
    "actor2_geo_type", "actor2_geo_full_name", "actor2_geo_country_code",
    "actor2_geo_adm1_code", "actor2_geo_lat", "actor2_geo_long",
    "actor2_geo_feature_id",
    # Action geography
    "action_geo_type", "action_geo_full_name", "action_geo_country_code",
    "action_geo_adm1_code", "action_geo_lat", "action_geo_long",
    "action_geo_feature_id",
    # Data management fields
    "insert_date", "url",
]

In [40]:
# Label the header-less frame with the names from csv_columns; pandas raises
# ValueError if the number of names differs from the number of columns.
trial.columns = pd.Index(csv_columns)