In [1]:
import boto3
import requests
import zipfile
import io
import os
import pandas as pd
from datetime import datetime
from parsel import Selector

In [4]:
# Helper function to extract the date from the file name
def extract_date(filename):
    """Parse the ``YYYYMMDD`` date that prefixes an S3 object's file name.

    The date is read from the last path component of ``filename`` (the text
    after the final ``/``), up to that component's first ``.``.

    Args:
        filename: S3 object key, e.g. ``"GDELT Event Files/20240814.csv"``.

    Returns:
        The parsed ``datetime``, or ``None`` when the file name does not
        start with a parsable date (for example a bare ``"folder/"`` key).
    """
    # Work on the final path component so that keys without a folder prefix
    # no longer raise IndexError and keys nested more than one level deep
    # are not parsed from an intermediate folder name.
    base_name = filename.rsplit('/', 1)[-1]
    date_str = base_name.split('.', 1)[0]
    try:
        return datetime.strptime(date_str, "%Y%m%d")
    except ValueError:
        return None

# Get the list of files in S3 and determine the most recent date
def get_most_recent_date(s3_client, bucket_name, folder_name):
    """Return the latest date encoded in the object keys under a prefix.

    Args:
        s3_client: boto3 S3 client used to list the bucket.
        bucket_name: Name of the bucket to inspect.
        folder_name: Key prefix to list, e.g. ``"GDELT Event Files/"``.

    Returns:
        The maximum ``datetime`` that ``extract_date`` yields for the listed
        keys, or ``datetime(1970, 1, 1)`` when no key carries a date.
    """
    # A single list_objects_v2 call returns at most 1000 keys; paginate so
    # that every object under the prefix is considered.
    paginator = s3_client.get_paginator('list_objects_v2')
    dates = []
    for page in paginator.paginate(Bucket=bucket_name, Prefix=folder_name):
        # 'Contents' is absent from a page that holds no objects.
        for obj in page.get('Contents', []):
            date = extract_date(obj['Key'])
            if date is not None:
                dates.append(date)
    return max(dates) if dates else datetime(1970, 1, 1)

# Fetch new files from GDELT events page
def fetch_new_files(start_date):
    """Collect download URLs of GDELT archives dated after ``start_date``.

    Reads the ``href`` of every link on the GDELT events and GKG index pages
    and keeps the ``.zip`` links whose name starts with a ``YYYYMMDD`` date
    strictly later than ``start_date``.

    Args:
        start_date: ``datetime`` cut-off; archives dated on or before it are
            ignored.

    Returns:
        Dict with keys ``'events'``, ``'gkg'`` and ``'gkgcounts'``, each
        mapping to a list of absolute URLs in index-page order. GKG links
        containing ``"gkgcounts"`` go to ``'gkgcounts'``, the rest to
        ``'gkg'``.

    Raises:
        requests.RequestException: if an index page cannot be fetched or
            responds with an HTTP error status.
    """
    new_files = {'events': [], 'gkg': [], 'gkgcounts': []}

    events_url = "http://data.gdeltproject.org/events/index.html"
    events_base = "http://data.gdeltproject.org/events/"
    for link in _list_new_zip_links(events_url, start_date):
        new_files['events'].append(events_base + link)

    gkg_url = 'http://data.gdeltproject.org/gkg/index.html'
    gkg_base = "http://data.gdeltproject.org/gkg/"
    for link in _list_new_zip_links(gkg_url, start_date):
        file_type = 'gkgcounts' if 'gkgcounts' in link else 'gkg'
        new_files[file_type].append(gkg_base + link)

    return new_files


def _parse_link_date(link):
    """Return the date that prefixes ``link`` (text before the first '.'), or None."""
    date_str = link.split('.', 1)[0]
    # strptime accepts unpadded fields (e.g. "201312" parses as 2013-01-02),
    # so only names that begin with exactly eight digits count as dated.
    if len(date_str) != 8 or not date_str.isdigit():
        return None
    try:
        return datetime.strptime(date_str, "%Y%m%d")
    except ValueError:
        return None


def _list_new_zip_links(index_url, start_date, timeout=30):
    """Return the '.zip' hrefs on ``index_url`` dated strictly after ``start_date``."""
    response = requests.get(index_url, timeout=timeout)
    # Fail loudly on an HTTP error instead of parsing the error page and
    # silently reporting that there is nothing new.
    response.raise_for_status()
    hrefs = Selector(text=response.text).xpath('//a/@href').extract()

    recent = []
    for link in hrefs:
        if not link.endswith('.zip'):
            continue
        file_date = _parse_link_date(link)
        if file_date is not None and file_date > start_date:
            recent.append(link)
    return recent

In [19]:
# Download and upload new files
def process_new_files(new_files, s3_client, bucket_name):
    """Download each GDELT zip archive and upload its members to S3.

    Every file inside a downloaded archive is streamed to ``bucket_name``
    under a folder chosen by the archive's file type. The uploaded name is
    the member name up to its first ``.`` plus a type-specific suffix.

    Args:
        new_files: Dict mapping a file type (``'events'``, ``'gkg'`` or
            ``'gkgcounts'``) to a list of archive URLs.
        s3_client: boto3 S3 client used for the uploads.
        bucket_name: Destination bucket.

    Raises:
        ValueError: if a file type with pending links has no known
            destination; raised before anything is downloaded.
        requests.RequestException: if an archive cannot be downloaded or the
            server responds with an HTTP error status.
        zipfile.BadZipFile: if a downloaded payload is not a zip archive.
    """
    # Destination folder and file-name suffix for each supported file type.
    destinations = {
        'events': ("GDELT Event Files", ""),
        'gkg': ("GDELT GKG Files", ".gkg"),
        'gkgcounts': ("GDELT GKG Files/gkgcounts", ".gkgcounts"),
    }

    # Validate up front so an unknown type can neither reuse the previous
    # type's folder nor fail halfway through with an unbound variable.
    unknown = [t for t, links in new_files.items() if links and t not in destinations]
    if unknown:
        raise ValueError(f"Unsupported file type(s): {unknown}")

    for file_type, links in new_files.items():
        if not links:
            continue
        folder_name, suffix = destinations[file_type]

        for link in links:
            response = requests.get(link, timeout=60)
            # Surface HTTP errors directly rather than as a confusing
            # BadZipFile when the error page is handed to zipfile.
            response.raise_for_status()

            with zipfile.ZipFile(io.BytesIO(response.content), 'r') as zip_ref:
                for file_info in zip_ref.infolist():
                    # Directory entries carry no data to upload.
                    if file_info.is_dir():
                        continue

                    date = file_info.filename.split('.')[0]
                    new_file_name = f"{date}{suffix}.csv"
                    s3_key = folder_name + '/' + new_file_name

                    with zip_ref.open(file_info) as extracted_file:
                        s3_client.upload_fileobj(extracted_file, bucket_name, s3_key)
                        print(f"Uploaded {new_file_name} to S3 bucket {bucket_name} with key {s3_key}")

In [11]:
# AWS S3 configuration
# The access key pair is read from the first data row of a local CSV file.
# NOTE(review): keep this credentials file out of version control.
credentials = pd.read_csv('../Data_Storage/isabelmorar_accessKeys.csv')
access_key_id = credentials.loc[0, 'Access key ID']
secret_access_key = credentials.loc[0, 'Secret access key']

# S3 client authenticated with the key pair loaded above.
s3_client = boto3.client(
    's3',
    region_name='us-east-1',
    aws_access_key_id=access_key_id,
    aws_secret_access_key=secret_access_key,
)

# Destination bucket and the key prefix that is scanned for the latest date.
bucket_name = 'datathonfactored2024'
folder_name = 'GDELT Event Files/'

In [20]:
# Latest date already present under the events prefix in the bucket.
start_date = get_most_recent_date(
    s3_client=s3_client,
    bucket_name=bucket_name,
    folder_name=folder_name,
)

# Archive URLs published after that date, grouped by file type.
new_files = fetch_new_files(start_date=start_date)

# Download each archive and upload its contents to the bucket.
process_new_files(
    new_files=new_files,
    s3_client=s3_client,
    bucket_name=bucket_name,
)

GDELT Event Files/20240814.csv
Uploaded 20240814.csv to S3 bucket datathonfactored2024 with key GDELT Event Files/20240814.csv
GDELT GKG Files/20240814.gkg.csv
Uploaded 20240814.gkg.csv to S3 bucket datathonfactored2024 with key GDELT GKG Files/20240814.gkg.csv
GDELT GKG Files/gkgcounts/20240814.gkgcounts.csv
Uploaded 20240814.gkgcounts.csv to S3 bucket datathonfactored2024 with key GDELT GKG Files/gkgcounts/20240814.gkgcounts.csv
