In [1]:
import pandas as pd
from datetime import timedelta

from gdelt_loading_files import load_gdelt_index, get_files_for_day_3h
from gdelt_cyclone_pipeline import run_pipeline

In [24]:
# Get the processed IBTrACS cyclone data

df_cyclones = pd.read_csv("../data/processed/ibtracs_era5_20251217_1447.csv")

df_cyclones['Timestamp'] = pd.to_datetime(df_cyclones['Timestamp'])

# keep only dates from 2022 to 2023
df_cyclones = df_cyclones[
    (df_cyclones['Timestamp'].dt.year >= 2022) &
    (df_cyclones['Timestamp'].dt.year <= 2023)
]

window_days = 3
all_dates_set = set()

for ts in df_cyclones['Timestamp']:
    # Create the ±3 day window for this timestamp
    window_range = [ts + timedelta(days=offset) for offset in range(-window_days, window_days+1)]
    # Convert to string YYYYMMDD and add to the set
    all_dates_set.update([d.strftime('%Y%m%d') for d in window_range])

# Sorted list of unique active dates including the ±3-day windows
active_dates = sorted(all_dates_set)

# Get unique cyclone names
cyclone_names = df_cyclones['Storm_Name'].str.lower().unique()

print(f"{len(cyclone_names)} active cyclones")
print(f"{len(active_dates)} active dates including ±{window_days} day windows")

149 active cyclones
677 active dates including ±3 day windows


In [25]:
df_cyclones.head(1)

Unnamed: 0,Storm_ID,Storm_Name,Ocean_Basin,Year,Timestamp,Latitude,Longitude,Observed_Wind_Max_Knots,Observed_Pressure_Min_mb,Storm_Speed_Knots,Storm_Direction_Deg,ERA5_Temp_2m_Kelvin,ERA5_Pressure_MSL_hPa,ERA5_Wind_U_Component,ERA5_Wind_V_Component,ERA5_Position_Error_km,latitude,longitude
0,b'2022008S17173',b'CODY',SP,2022.0,2022-01-07 12:00:00.000039936,-17.4,172.8,25.0,1004.0,3.0,205.0,300.53052,1002.84686,2.650101,-8.584717,23.9529,-17.5,173.0


In [4]:
# Load GDELT file index
gdelt_files = load_gdelt_index()
print(f"{len(gdelt_files)} fichiers GDELT indexés")

1117161 fichiers GDELT indexés


In [30]:
gdelt_mentions_files = []
for date_str in active_dates:
    files_for_day = get_files_for_day_3h(date_str, gdelt_files, type="mentions.CSV")
    gdelt_mentions_files.extend(files_for_day)

gdelt_gkg_files = []
for date_str in active_dates:
    files_for_day = get_files_for_day_3h(date_str, gdelt_files, type="gkg.csv")
    gdelt_gkg_files.extend(files_for_day)

In [None]:
"""# test with first date only
gdelt_mentions_files = []
gdelt_gkg_files = []

date_str = active_dates[0]
files_for_day = get_files_for_day_3h(date_str, gdelt_files, type="mentions.CSV")
gdelt_mentions_files.extend(files_for_day)

files_for_day = get_files_for_day_3h(date_str, gdelt_files, type="gkg.csv")
gdelt_gkg_files.extend(files_for_day)"""

In [None]:
"""# test this date 20220602 because no ocean_basin
date_str = "20220602"
files_for_day = get_files_for_day_3h(date_str, gdelt_files, type="mentions.CSV")
gdelt_mentions_files.extend(files_for_day)

files_for_day = get_files_for_day_3h(date_str, gdelt_files, type="gkg.csv")
gdelt_gkg_files.extend(files_for_day)"""

In [None]:
df_cyclones['date'] = df_cyclones['Timestamp'].dt.strftime('%Y%m%d')

# Run the full pipeline
df_all_articles = run_pipeline(df_cyclones, gdelt_mentions_files, gdelt_gkg_files)

In [32]:
df_final = pd.read_csv("../data/processed/cyclones_mentions_gdelt_3h.csv")
df_final.head()

Unnamed: 0,EventID,mention_timestamp,event_date,source,url,cyclone_name,file_timestamp,url_end
0,1022318927,20220108020000,20220108030000,wypr.org,https://www.wypr.org/2022-01-07/pacific-northw...,b'cody',20220108030000,pacific-northwest-storm-causes-flooding-and-la...
1,1022339688,20220108071500,20220108150000,fijivillage.com,https://www.fijivillage.com/feature/TD03F-is-n...,b'cody',20220108150000,TD03F-is-now-a-Tropical-Depression--likely-to-...
2,1022406102,20220109000000,20220109000000,rnz.co.nz,https://www.rnz.co.nz/international/pacific-ne...,b'cody',20220109000000,fiji-schools-close-evacuation-centres-open-as-...
3,1022406199,20220109000000,20220109000000,dailyrecordnews.com,https://www.dailyrecordnews.com/ap_news/washin...,b'cody',20220109000000,article_5b722a19-8929-5d33-a8de-afad943173a6.html
4,1022514078,20220110030000,20220110030000,newstalkzb.co.nz,https://www.newstalkzb.co.nz/news/national/wea...,b'cody',20220110030000,weather-tropical-cyclone-in-fiji-strengthens-b...


In [40]:
df_final['event_date'] = df_final['event_date'].astype(str)

# Extract day from 'event_date'
df_final['day'] = df_final['event_date'].str[:8]

# Group by 'day' and aggregate URLs and sources into lists
daily_agg = df_final.groupby('day').agg({
    'url': lambda x: list(x),
    'source': lambda x: list(x),
    'cyclone_name': 'first',
    'EventID': 'count'         # number of articles
}).reset_index()

# index by cyclone_name and day
daily_agg.set_index(['cyclone_name', 'day'], inplace=True)
daily_agg.rename(columns={'EventID': 'num_articles'}, inplace=True)
daily_agg.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,url,source,num_articles
cyclone_name,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
b'cody',20220108,[https://www.wypr.org/2022-01-07/pacific-north...,"[wypr.org, fijivillage.com]",2
b'cody',20220109,[https://www.rnz.co.nz/international/pacific-n...,"[rnz.co.nz, dailyrecordnews.com]",2
b'cody',20220110,[https://www.newstalkzb.co.nz/news/national/we...,"[newstalkzb.co.nz, rnz.co.nz]",2
b'cody',20220111,[https://www.stuff.co.nz/national/127472853/no...,[stuff.co.nz],1
b'cody',20220112,[https://www.rnz.co.nz/international/pacific-n...,[rnz.co.nz],1


In [41]:
daily_agg.to_csv("../data/processed/cyclones_daily_mentions_gdelt.csv")