In [1]:
import os
import zipfile
from datetime import datetime
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
from clickhouse_driver import Client


In [2]:
# Configuration: every value the pipeline reads is defined in this cell.

# Remote directory page that is scraped for archive links.
GDELT_BASE_URL = "http://data.gdeltproject.org/events/"

# Local working directories: downloaded archives, and their unpacked contents
# in a sub-directory of the download directory.
DOWNLOAD_DIR = "./gdelt_data"
EXTRACTED_DIR = f"{DOWNLOAD_DIR}/extracted"

# ClickHouse connection settings.
CLICKHOUSE_HOST = "localhost"
CLICKHOUSE_PORT = 9000

# Insert target. NOTE(review): these look like placeholders -- set them to the
# real database and table before running.
CLICKHOUSE_DATABASE = "your_database"
CLICKHOUSE_TABLE = "your_table"

In [3]:
# Ensure both working directories exist. exist_ok=True means an already
# existing directory is not an error, so this cell can be re-run freely.
for _workdir in (DOWNLOAD_DIR, EXTRACTED_DIR):
    os.makedirs(_workdir, exist_ok=True)


In [4]:
def list_files(base_url, timeout=30):
    """Return the hrefs of all ``.zip`` links found on the page at ``base_url``.

    Args:
        base_url: URL of an HTML page containing the links.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of href strings ending in ``.zip``, in page order. An empty
        list is returned (after printing the error) when the request fails.
    """
    try:
        response = requests.get(base_url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        # RequestException is the common base of HTTPError, ConnectionError and
        # Timeout, so network failures are reported the same way as HTTP errors.
        print(f"Failed to list files: {e}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    hrefs = (link.get('href') for link in soup.find_all('a'))
    # Anchors without an href attribute yield None; skip them instead of
    # crashing on None.endswith().
    return [href for href in hrefs if href and href.endswith('.zip')]


In [5]:
def download_file(url, local_filename, timeout=60, chunk_size=1 << 20):
    """Download ``url`` and write the response body to ``local_filename``.

    The body is streamed to disk chunk by chunk instead of being held in
    memory as a whole. If the transfer fails after writing has started, the
    partial file is removed so a truncated archive is not left behind.

    Args:
        url: Address of the file to fetch.
        local_filename: Path the response body is written to.
        timeout: Seconds to wait for the server before giving up.
        chunk_size: Number of bytes requested per streamed chunk.

    Returns:
        None. Success or failure is reported on stdout; request errors are
        not raised to the caller.
    """
    started_writing = False
    try:
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()
            started_writing = True
            with open(local_filename, 'wb') as f:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    f.write(chunk)
        print(f"Downloaded {url} to {local_filename}")
    except requests.RequestException as e:
        # Only delete what this call wrote: a request that failed before the
        # file was opened must not remove a file from an earlier run.
        if started_writing and os.path.exists(local_filename):
            os.remove(local_filename)
        print(f"Failed to download {url}: {e}")


In [6]:
def unzip_file(zip_path, extract_to):
    """Extract every member of the archive at ``zip_path`` into ``extract_to``.

    Args:
        zip_path: Path of the zip archive to unpack.
        extract_to: Directory the members are extracted into.

    Returns:
        None. A missing or invalid archive is reported on stdout instead of
        raising, so one bad file does not stop a batch of extractions.
    """
    try:
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(extract_to)
        print(f"Unzipped {zip_path} to {extract_to}")
    except (zipfile.BadZipFile, FileNotFoundError) as e:
        # A missing archive (for example after a download that failed without
        # raising) is handled like a corrupt one.
        print(f"Failed to unzip {zip_path}: {e}")


In [7]:
# Column names of the GDELT 1.0 daily event export, in file order. The export
# files are tab-delimited and carry no header row, so the names have to be
# supplied when parsing.
_GDELT_EVENT_COLUMNS = [
    'GLOBALEVENTID', 'SQLDATE', 'MonthYear', 'Year', 'FractionDate',
    'Actor1Code', 'Actor1Name', 'Actor1CountryCode', 'Actor1KnownGroupCode',
    'Actor1EthnicCode', 'Actor1Religion1Code', 'Actor1Religion2Code',
    'Actor1Type1Code', 'Actor1Type2Code', 'Actor1Type3Code',
    'Actor2Code', 'Actor2Name', 'Actor2CountryCode', 'Actor2KnownGroupCode',
    'Actor2EthnicCode', 'Actor2Religion1Code', 'Actor2Religion2Code',
    'Actor2Type1Code', 'Actor2Type2Code', 'Actor2Type3Code',
    'IsRootEvent', 'EventCode', 'EventBaseCode', 'EventRootCode', 'QuadClass',
    'GoldsteinScale', 'NumMentions', 'NumSources', 'NumArticles', 'AvgTone',
    'Actor1Geo_Type', 'Actor1Geo_FullName', 'Actor1Geo_CountryCode',
    'Actor1Geo_ADM1Code', 'Actor1Geo_Lat', 'Actor1Geo_Long', 'Actor1Geo_FeatureID',
    'Actor2Geo_Type', 'Actor2Geo_FullName', 'Actor2Geo_CountryCode',
    'Actor2Geo_ADM1Code', 'Actor2Geo_Lat', 'Actor2Geo_Long', 'Actor2Geo_FeatureID',
    'ActionGeo_Type', 'ActionGeo_FullName', 'ActionGeo_CountryCode',
    'ActionGeo_ADM1Code', 'ActionGeo_Lat', 'ActionGeo_Long', 'ActionGeo_FeatureID',
    'DATEADDED', 'SOURCEURL',
]


def process_files(extracted_dir):
    """Filter every extracted GDELT export and insert the matches into ClickHouse.

    Each file in ``extracted_dir`` whose name ends in ``.csv`` (any letter
    case, since the archives unpack to ``*.export.CSV``) is parsed as a
    tab-delimited, header-less table with the GDELT event column names. Rows
    passing the filter are inserted into
    ``CLICKHOUSE_DATABASE.CLICKHOUSE_TABLE``.

    Args:
        extracted_dir: Directory containing the unpacked export files.

    Returns:
        None. Insert failures are printed per file and do not stop the
        remaining files from being processed.
    """
    files = sorted(
        os.path.join(extracted_dir, f)
        for f in os.listdir(extracted_dir)
        if f.lower().endswith('.csv')
    )
    client = Client(host=CLICKHOUSE_HOST, port=CLICKHOUSE_PORT)
    try:
        for file in files:
            df = pd.read_csv(file, sep='\t', header=None, names=_GDELT_EVENT_COLUMNS)
            # Example: Filter for MCC related entries in Nepal
            # NOTE(review): GDELT documents Actor*CountryCode as 3-character
            # CAMEO codes (Nepal would be 'NPL'); 'NP' looks like the
            # 2-character FIPS code used by the *Geo_CountryCode columns.
            # Confirm which filter is intended.
            df_filtered = df[(df['Actor1CountryCode'] == 'NP') & (df['Actor1Name'].str.contains('MCC', na=False))]
            if df_filtered.empty:
                continue
            # Insert into ClickHouse
            columns = ", ".join(df_filtered.columns)
            values = [tuple(row) for row in df_filtered.values]
            try:
                client.execute(f'INSERT INTO {CLICKHOUSE_DATABASE}.{CLICKHOUSE_TABLE} ({columns}) VALUES', values)
                print(f"Inserted data from {file} into ClickHouse")
            except Exception as e:
                print(f"Failed to insert data from {file} into ClickHouse: {e}")
    finally:
        # Release the server connection even when parsing a file raises.
        client.disconnect()

In [8]:
def main():
    """Download, unpack and load the GDELT daily exports dated within 2018.

    Lists the archives linked from ``GDELT_BASE_URL``, keeps those whose
    name starts with a ``YYYYMMDD`` date between 2018-01-01 and 2018-12-31,
    downloads each into ``DOWNLOAD_DIR`` and unpacks it into
    ``EXTRACTED_DIR``. Afterwards everything in ``EXTRACTED_DIR`` is handed to
    ``process_files``.
    """
    start_date = datetime(2018, 1, 1)
    end_date = datetime(2018, 12, 31)

    files = list_files(GDELT_BASE_URL)
    print("Available files:")
    for file in files:
        # The href comes from a remote page; keep only its last path segment
        # for local use so a crafted link cannot point outside DOWNLOAD_DIR.
        file_name = os.path.basename(file)

        # Extract the date part from the filename (assuming it's the first 8 characters)
        file_date_str = file_name[:8]
        try:
            file_date = datetime.strptime(file_date_str, '%Y%m%d')
        except ValueError:
            print(f"Skipping file with invalid date format: {file}")
            continue

        # Only files dated within the desired range are fetched
        if not start_date <= file_date <= end_date:
            continue

        print(file)
        # URLs always use '/', unlike os.path.join which follows the local
        # platform's path rules.
        file_url = urljoin(GDELT_BASE_URL, file)
        local_filename = os.path.join(DOWNLOAD_DIR, file_name)

        # Download the file
        download_file(file_url, local_filename)

        # download_file reports failures without raising, so check that the
        # archive actually exists before trying to unpack it.
        if not os.path.exists(local_filename):
            print(f"Skipping unzip, archive not found: {local_filename}")
            continue

        # Unzip the file
        unzip_file(local_filename, EXTRACTED_DIR)

    # Process extracted files
    process_files(EXTRACTED_DIR)

# Run the pipeline only when this code is executed directly (as a script or
# notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()


Available files:
Skipping file with invalid date format: GDELT.MASTERREDUCEDV2.1979-2013.zip
20181231.export.CSV.zip
Downloaded http://data.gdeltproject.org/events/20181231.export.CSV.zip to ./gdelt_data\20181231.export.CSV.zip
Unzipped ./gdelt_data\20181231.export.CSV.zip to ./gdelt_data/extracted
20181230.export.CSV.zip
Downloaded http://data.gdeltproject.org/events/20181230.export.CSV.zip to ./gdelt_data\20181230.export.CSV.zip
Unzipped ./gdelt_data\20181230.export.CSV.zip to ./gdelt_data/extracted
20181229.export.CSV.zip
Downloaded http://data.gdeltproject.org/events/20181229.export.CSV.zip to ./gdelt_data\20181229.export.CSV.zip
Unzipped ./gdelt_data\20181229.export.CSV.zip to ./gdelt_data/extracted
20181228.export.CSV.zip
Downloaded http://data.gdeltproject.org/events/20181228.export.CSV.zip to ./gdelt_data\20181228.export.CSV.zip
Unzipped ./gdelt_data\20181228.export.CSV.zip to ./gdelt_data/extracted
20181227.export.CSV.zip
Downloaded http://data.gdeltproject.org/events/20181227