In [None]:
import ast
import glob
import os

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, f1_score, multilabel_confusion_matrix
# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)

# Load the two GDELT-derived tables; paths are relative to the notebook's working directory.
# Both are merged further down on (ADMIN0, ADMIN1, ADMIN2, period).
events = pd.read_parquet("../data/gdelt/events/6_final/events_dataset.parquet")
gkg = pd.read_parquet("../data/gdelt/gkg/6_final/gkg_dataset.parquet")

In [None]:
# Sanity check of the events table: row count, column index, then the last three rows.
print(events.shape[0])
print(events.columns)
events.tail(3)

In [None]:
# Forecasting configuration
# -------------------------
# FORECAST_HORIZON is the number of periods ahead that CS_score is predicted:
#   1       -> nowcasting
#   2, 3, 4 -> forecasting
# Rationale (from the literature): GDELT features matter more at horizons of 2-4 periods,
# because the baseline (previous_CS) loses predictive power as the horizon grows.

FORECAST_HORIZON = 3  # Predict 3 periods ahead (can be changed to 2 or 4)

print("=== Forecasting Configuration ===")
print(f"Forecast Horizon: {FORECAST_HORIZON} period(s) ahead")
print(
    "Mode: NOWCASTING (predicting next period)"
    if FORECAST_HORIZON == 1
    else f"Mode: FORECASTING (predicting {FORECAST_HORIZON} periods ahead)"
)
# One print call; sep/end reproduce the line breaks of the individual prints.
print(
    "\nWhy forecasting helps GDELT features:",
    f"  - Baseline 'previous_CS' becomes less predictive ({FORECAST_HORIZON} periods old)",
    "  - GDELT temporal patterns can capture early warning signals",
    "  - More realistic for early warning systems\n",
    sep="\n",
)


In [None]:
# Outer-join the two GDELT sources so that district-periods present in only one of them are kept.
df = pd.merge(events, gkg, on=["ADMIN0", "ADMIN1", "ADMIN2", "period"], how="outer", suffixes=("_events", "_gkg"))
print(len(df.index))
print(max(df['period']))


def _admin0_missing(frame):
    """Return a boolean mask of rows whose ADMIN0 is NaN/None or the literal string 'None'."""
    # isna() already covers Python None; an elementwise `== None` comparison is always False.
    return frame['ADMIN0'].isna() | (frame['ADMIN0'] == 'None')


def _build_admin0_lookup(frame):
    """Map (ADMIN1, ADMIN2) -> ADMIN0 from `frame`.

    Only rows where all three levels are present are used; when a pair maps to
    several ADMIN0 values, the first one in row order wins.
    """
    complete = frame.loc[
        ~_admin0_missing(frame) & frame['ADMIN1'].notna() & frame['ADMIN2'].notna(),
        ['ADMIN0', 'ADMIN1', 'ADMIN2'],
    ].drop_duplicates(subset=['ADMIN1', 'ADMIN2'], keep='first')
    return dict(zip(zip(complete['ADMIN1'], complete['ADMIN2']), complete['ADMIN0']))


# Fix ADMIN0=None issue from outer merge
# Rows that carry ADMIN1/ADMIN2 but no ADMIN0 are repaired from (ADMIN1, ADMIN2) -> ADMIN0 lookups.
# NOTE(review): ADMIN0 is a merge key, so a source row with a missing ADMIN0 cannot have matched
# the other source's row for the same district/period. After the fill below such pairs may show
# up as duplicated keys - confirm with the duplicate check in the next cell.
mask_missing = _admin0_missing(df)
print(f"\nRows with ADMIN0=None or NaN: {mask_missing.sum()}")

events_lookup = _build_admin0_lookup(events)
gkg_lookup = _build_admin0_lookup(gkg)

# Fill missing ADMIN0: events lookup first, then gkg lookup for whatever is still missing.
for _lookup in (events_lookup, gkg_lookup):
    _still_missing = _admin0_missing(df)
    if not _still_missing.any():
        break
    _pairs = zip(df.loc[_still_missing, 'ADMIN1'], df.loc[_still_missing, 'ADMIN2'])
    # Unknown pairs resolve to None, which also clears any literal 'None' strings.
    df.loc[_still_missing, 'ADMIN0'] = [_lookup.get(_pair) for _pair in _pairs]

# Convert string 'None' to NaN for consistency
df.loc[df['ADMIN0'] == 'None', 'ADMIN0'] = None

print(f"Rows with ADMIN0=None after fixing: {df['ADMIN0'].isna().sum()}")

# Drop rows where ADMIN0 is still None (can't be grouped for y_next creation)
_unresolved = df['ADMIN0'].isna().sum()
if _unresolved > 0:
    print(f"   Dropping {_unresolved} rows with ADMIN0=None (cannot be grouped)")
    df = df[df['ADMIN0'].notna()].copy()

print(f"Final rows: {len(df)}")
print(f"Period range: {df['period'].min()} to {df['period'].max()}")
df.tail()

In [None]:
key_cols = ["ADMIN0", "ADMIN1", "ADMIN2", "period"]

def check_duplicates(df, name, keys=None):
    """Report key combinations that occur more than once in `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; must contain every column in `keys`.
    name : str
        Label used in the printed summary line.
    keys : sequence of str, optional
        Columns that should jointly identify a row. Defaults to `key_cols`.

    Returns
    -------
    pandas.DataFrame
        One row per duplicated key with the key columns and a count column "n".
    """
    keys = key_cols if keys is None else list(keys)
    dupes = (
        df
        # dropna=False: groupby drops rows with a missing key by default, which would
        # hide duplicates among rows whose ADMIN0 (or any other key) is missing.
        .groupby(keys, dropna=False)
        .size()
        .reset_index(name="n")
        .query("n > 1")
    )
    print(f"{name}: {len(dupes)} duplicated keys")
    return dupes

# Run the key-uniqueness check on each source table and on the merged frame.
# Each call prints a one-line count and returns the duplicated keys with their multiplicity.
dupes_events = check_duplicates(events, "events")
dupes_gkg = check_duplicates(gkg, "gkg")
dupes_merged = check_duplicates(df, "merged")

In [None]:
# BETWEEN-ASSESSMENT FORECASTING SETUP
# ====================================
# CRITICAL: This implements the correct "between-assessment forecasting" setup
# 
# Key principle: Observed IPC defines targets; unobserved months are prediction times, not missing labels.
#
# Target: y_next = next observed IPC assessment (forward-lagged label)
# Feature: IPC_last = last observed IPC before this row (forward-filled, constant between assessments)
#
# Rules:
# - Only use IPC_last as IPC feature (no rolling, no multiple lags, no trends)
# - CS_score_events is NEVER the target anymore
# - Keep ALL rows (including last periods without future assessments)
# - Filter to rows with valid y_next only when training models

print("="*70, "BETWEEN-ASSESSMENT FORECASTING SETUP", "="*70, sep="\n")
print("\nKey principle: Observed IPC defines targets; unobserved months are prediction times.\n")

# Composite identifiers: region = ADMIN0-ADMIN1, district = ADMIN0-ADMIN1-ADMIN2
df['region'] = df['ADMIN0'] + '-' + df['ADMIN1']
df['district'] = df['region'] + '-' + df['ADMIN2']

# Order rows by district, then period, with a fresh 0..n-1 index, so that the
# per-district forward/backward operations below are deterministic.
df = df.sort_values(by=['ADMIN0', 'ADMIN1', 'ADMIN2', 'period'], ignore_index=True)

# Observed IPC: take the events value, fall back to the gkg value, coerce to numeric
# (anything non-numeric becomes NaN).
df['CS_score'] = pd.to_numeric(
    df['CS_score_events'].fillna(df['CS_score_gkg']),
    errors='coerce',
)

# IPC_last: within each district, carry the most recent CS_score forward.
# This is the only IPC-derived feature; it stays constant between assessments.
print("1. Creating IPC_last (last observed IPC, forward-filled)...")
df['IPC_last'] = (
    df.groupby(['ADMIN0', 'ADMIN1', 'ADMIN2'], sort=False)['CS_score']
    .transform(lambda district_scores: district_scores.ffill())
)

# y_next: the next observed IPC assessment after each row (the prediction target).
print("2. Creating y_next (next observed IPC assessment - THE TARGET)...")

def get_next_assessment(group):
    """Return, for every row of one district, the next valid observed IPC value.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single district, already ordered by period, with a
        'CS_score' column.

    Returns
    -------
    pandas.Series
        Indexed like `group`. Each entry is the first later CS_score that is
        non-null and within 1..5, or NaN when no such later value exists.
        The row's own CS_score is never used for itself.
    """
    scores = pd.to_numeric(group['CS_score'], errors='coerce')
    # Keep only valid assessments (1..5); everything else becomes NaN.
    valid = scores.where(scores.between(1, 5))
    # shift(-1) makes row i see row i+1; bfill then pulls the nearest later valid
    # value backwards. This replaces the quadratic nested scan with linear work.
    return valid.shift(-1).bfill().rename(None)

# Compute the forward-looking label independently within each district.
# Selecting the CS_score column and using transform (rather than DataFrame-level
# groupby.apply) returns a Series aligned to df.index and avoids the
# "apply operated on the grouping columns" deprecation in recent pandas.
df['y_next'] = df.groupby(['ADMIN0', 'ADMIN1', 'ADMIN2'], sort=False)['CS_score'].transform(
    lambda scores: get_next_assessment(scores.to_frame('CS_score'))
)

# Keep ALL rows - don't drop periods without future assessments
# Rows without y_next (like 202402) can still be used for feature engineering
# Filter to rows with valid y_next only when training models
rows_with_y_next = df['y_next'].notna().sum()
rows_without_y_next = df['y_next'].isna().sum()
print(f"   Rows with y_next (can be used for training): {rows_with_y_next}")
print(f"   Rows without y_next (last periods, feature engineering only): {rows_without_y_next}")
print(f"   Total rows kept: {len(df)} (no periods dropped)")

# Clean y_next where it exists: round to whole phases and keep within 1-5.
# NaN rows pass through untouched. The column stays float because it contains NaN;
# writing ints back through .loc never changed the dtype, so no int cast is attempted.
# Cast to int after filtering to labelled rows when training.
mask_valid_y_next = df['y_next'].notna()
df['y_next'] = df['y_next'].round().clip(1, 5)

# Summary statistics
print("\n=== Summary ===")
print("IPC_last: last observed IPC (forward-filled, constant between assessments)")
print("y_next: next observed IPC assessment (THE TARGET)")
print("\nIPC_last distribution:")
print(df['IPC_last'].value_counts().sort_index())
print("\ny_next distribution (TARGET):")
print(df['y_next'].value_counts().sort_index())
print(f"\nRows with valid y_next: {df['y_next'].notna().sum()}")
print(f"Rows without y_next (last periods): {df['y_next'].isna().sum()}")
print(f"Current DataFrame shape: {df.shape}")
print(f"Period range: {df['period'].min()} to {df['period'].max()}")

# Show example for verification
print("\n=== Example (first ADMIN2) ===")
if len(df) > 0:
    example_admin2 = df['ADMIN2'].iloc[0]
    example = df[df['ADMIN2'] == example_admin2].head(12)[['ADMIN2', 'period', 'CS_score', 'IPC_last', 'y_next']]
    print(example.to_string(index=False))
    print("\nNote: IPC_last is constant between assessments, y_next is the next assessment value")

print("\n" + "="*70)

In [None]:
# Period span, then the last 20 rows in (ADMIN0, ADMIN1, ADMIN2, period) order.
print(min(df['period']), max(df['period']), sep="\n")
df = df.sort_values(by=['ADMIN0', 'ADMIN1', 'ADMIN2', 'period'])
df.loc[:, ['ADMIN0', 'ADMIN1', 'ADMIN2', 'CS_score_events', 'period', 'y_next']].tail(20)

In [None]:
# Feature Availability Analysis - RIGHT AFTER MERGE
# ================================================
# This must be done BEFORE any feature engineering or filtering
# to accurately assess data coverage using original SQLDATE and DATE arrays

print("=== Data Coverage Analysis (After Merge) ===\n")

# Denominator for every percentage reported in this cell.
total_rows = df.shape[0]
print(f"1. Total rows after merge: {total_rows:,}")

# Function to check if arrays are empty
def is_empty_array(x):
    """Return True when `x` holds no data.

    - None -> True
    - numpy array, list or tuple -> True only when it has zero elements
    - scalar -> True when pandas treats it as missing (NaN, NaT, pd.NA)
    - anything else -> False

    Containers are checked before `pd.isna`, so a one-element list such as
    `[nan]` counts as non-empty, the same as the equivalent ndarray.
    """
    if x is None:
        return True
    if isinstance(x, np.ndarray):
        return x.size == 0
    if isinstance(x, (list, tuple)):
        return len(x) == 0
    try:
        return bool(pd.isna(x))
    except (ValueError, TypeError):
        # pd.isna returned something without a single truth value (array-like).
        return False

def _pct(part, whole):
    """Percentage of `whole` represented by `part`; 0.0 when `whole` is zero."""
    return part / whole * 100 if whole else 0.0


# Check for events data - SQLDATE being empty means no events
if 'SQLDATE' in df.columns:
    df['has_events_data'] = (~df['SQLDATE'].apply(is_empty_array)).astype(int)
    events_count = df['has_events_data'].sum()
    print(f"2. Rows with events data (non-empty SQLDATE): {events_count:,} ({_pct(events_count, total_rows):.1f}%)")
else:
    df['has_events_data'] = 0
    print("2. SQLDATE column not found - cannot check events data")

# Check for GKG data - DATE being empty means no GKG data
if 'DATE' in df.columns:
    df['has_gkg_data'] = (~df['DATE'].apply(is_empty_array)).astype(int)
    gkg_count = df['has_gkg_data'].sum()
    print(f"3. Rows with GKG data (non-empty DATE): {gkg_count:,} ({_pct(gkg_count, total_rows):.1f}%)")
else:
    df['has_gkg_data'] = 0
    print("3. DATE column not found - cannot check GKG data")

# Overall feature availability: a row counts when either source has data.
df['has_any_features'] = (df['has_events_data'] | df['has_gkg_data']).astype(int)
features_count = df['has_any_features'].sum()
missing_features_count = total_rows - features_count
print(f"4. Rows with ANY GDELT features: {features_count:,} ({_pct(features_count, total_rows):.1f}%)")
print(f"   Rows WITHOUT GDELT features: {missing_features_count:,} ({_pct(missing_features_count, total_rows):.1f}%)")

# Check CS_score availability
valid_cs_count = 0
valid_cs_with_features_count = 0

if 'CS_score_events' in df.columns and 'CS_score_gkg' in df.columns:
    df['CS_score'] = df['CS_score_events'].fillna(df['CS_score_gkg'])
    df['CS_score'] = pd.to_numeric(df['CS_score'], errors='coerce')
    # between() is inclusive and False for NaN, so this keeps exactly the valid 1-5 scores.
    valid_cs = df[df['CS_score'].between(1, 5)]
    valid_cs_count = len(valid_cs)
    print(f"\n5. Rows with valid CS_score (1-5): {valid_cs_count:,} ({_pct(valid_cs_count, total_rows):.1f}%)")

    # Rows with valid CS_score AND features
    valid_cs_with_features = valid_cs[valid_cs['has_any_features'] == 1]
    valid_cs_with_features_count = len(valid_cs_with_features)
    print(f"6. Rows with valid CS_score AND GDELT features: {valid_cs_with_features_count:,}")
    # _pct guards the case of zero valid CS_score rows, which previously raised ZeroDivisionError.
    print(f"   - Coverage: {_pct(valid_cs_with_features_count, valid_cs_count):.1f}% of valid CS_score rows")
    print(f"   - Coverage: {_pct(valid_cs_with_features_count, total_rows):.1f}% of total rows")
else:
    print("\n5. CS_score columns not found")

print("\n=== Summary ===")
print(f"Total rows: {total_rows:,}")
if valid_cs_count > 0:
    print(f"Valid CS_score rows: {valid_cs_count:,}")
    print(f"Valid CS_score + Features: {valid_cs_with_features_count:,}")

In [None]:
# Feature Engineering for CS_score Prediction
# ============================================

def safe_list_agg(lst, func):
    """Aggregate one cell of a list-valued column into a single float.

    Parameters
    ----------
    lst : list, tuple, numpy.ndarray, str, int, float or None
        The cell value. A string starting with '[' or '(' is parsed as a Python
        literal; any other string is parsed as a single number.
    func : callable
        Reducer applied to the list of finite floats (e.g. np.mean, np.sum).

    Returns
    -------
    float
        The aggregated value, or NaN when the cell is missing, empty, has no
        finite numeric element, cannot be parsed, or `func` fails or returns a
        non-finite result.
    """
    if lst is None:
        return np.nan

    if isinstance(lst, np.ndarray):
        if lst.size == 0:
            return np.nan
        lst = lst.tolist()

    if isinstance(lst, str):
        if not lst.startswith(('[', '(')):
            try:
                return float(lst)
            except ValueError:
                return np.nan
        try:
            # literal_eval accepts only Python literals; unlike eval() it cannot
            # execute code embedded in the data.
            lst = ast.literal_eval(lst)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            return np.nan
        if not isinstance(lst, (list, tuple)):
            return np.nan
    elif not isinstance(lst, (list, tuple)):
        # Scalar cell: missing -> NaN, plain number -> float, anything else -> NaN.
        try:
            if pd.isna(lst):
                return np.nan
        except (ValueError, TypeError):
            pass
        if isinstance(lst, (int, float)):
            return float(lst)
        return np.nan

    if len(lst) == 0:
        return np.nan

    try:
        numeric_lst = []
        for x in lst:
            try:
                if x is None or pd.isna(x):
                    continue
            except (ValueError, TypeError):
                # pd.isna gave an array-like answer (nested value); float() below rejects it.
                pass
            try:
                val = float(x)
            except (ValueError, TypeError):
                continue
            if np.isfinite(val):
                numeric_lst.append(val)

        if not numeric_lst:
            return np.nan

        result = func(numeric_lst)
        return float(result) if np.isfinite(result) else np.nan
    except Exception:
        # Deliberate best effort: a cell that cannot be aggregated becomes NaN
        # instead of aborting the whole column.
        return np.nan

def safe_list_count(x):
    """Count the elements held by one cell of a list-valued column.

    None and missing scalars count as 0, arrays/lists/tuples count their
    elements (missing elements included), and any other value counts as 1.
    """
    if x is None:
        return 0
    if isinstance(x, np.ndarray):
        return x.size
    if isinstance(x, (list, tuple)):
        return len(x)
    # Scalar: a missing value contributes nothing, any other value is one item.
    try:
        return 0 if pd.isna(x) else 1
    except (ValueError, TypeError):
        # pd.isna had no single truth value; treat the object as one valid item.
        return 1

def aggregate_list_features(df, list_cols, prefix=""):
    """Replace list-valued columns with six per-row summary statistics.

    For every column of `list_cols` present in `df`, adds
    `<base>_mean`, `_max`, `_sum`, `_count`, `_std` and `_min`, where `<base>` is
    the column name without '_list', '_events' and '_gkg' (optionally prefixed
    with `prefix`), then drops the original column. Returns the new frame.
    """
    # (suffix, reducer) in output-column order; None marks the element count.
    reducers = (
        ("mean", np.mean),
        ("max", np.max),
        ("sum", np.sum),
        ("count", None),
        ("std", np.std),
        ("min", np.min),
    )
    present = [name for name in list_cols if name in df.columns]

    aggregated = {}
    for name in present:
        stem = name.replace('_list', '').replace('_events', '').replace('_gkg', '')
        if prefix:
            stem = f"{prefix}_{stem}"

        cells = df[name]
        for suffix, reducer in reducers:
            if reducer is None:
                aggregated[f"{stem}_{suffix}"] = cells.apply(safe_list_count)
            else:
                aggregated[f"{stem}_{suffix}"] = cells.apply(
                    lambda cell, fn=reducer: safe_list_agg(cell, fn)
                )

    # Attach everything in a single concat to avoid fragmenting the frame.
    if aggregated:
        df = pd.concat([df, pd.DataFrame(aggregated, index=df.index)], axis=1)

    if present:
        df = df.drop(columns=present)

    return df

# Step 1: Determine target variable
# ===================================
# IMPORTANT: CS_score from FEWSNET is only available roughly every 4 months.
# Intermediary months can be used for feature engineering (lags, moving averages)
# but will be filtered out before model training (only CS_score between 1-5 are valid).

# Option 1: Use events CS_score as primary, fill with gkg if missing.
# Coerced to numeric in the same way as the earlier cells, so this cell does not
# replace the numeric CS_score with an uncoerced one.
df['CS_score'] = pd.to_numeric(
    df['CS_score_events'].fillna(df['CS_score_gkg']),
    errors='coerce',
)

# Option 2: Average if both exist (uncomment if preferred)
# df['CS_score'] = df[['CS_score_events', 'CS_score_gkg']].mean(axis=1)

# Option 3: Use maximum (uncomment if preferred)
# df['CS_score'] = df[['CS_score_events', 'CS_score_gkg']].max(axis=1)

print("CS_score distribution (before filtering):")
print(df['CS_score'].value_counts().sort_index())
print(f"\nMissing CS_score: {df['CS_score'].isna().sum()}")
print("\nNote: Intermediary months (with missing CS_score) will be used for")
print("feature engineering but filtered out before model training.")


In [None]:
# Period span, then re-sort by district and period and show the last rows.
print(min(df['period']), max(df['period']), sep="\n")
df = df.sort_values(by=['ADMIN0', 'ADMIN1', 'ADMIN2', 'period'])
df.tail()

In [None]:
# Print every column name on its own line.
for column_name in df.columns:
    print(column_name)

### Feature engineering

In [None]:
# COMPREHENSIVE FEATURE ENGINEERING
# ===================================
# This cell performs ALL feature engineering for both IPC and GDELT features
# At this point: lists are NOT aggregated yet, y_next and IPC_last exist

print("="*70, "COMPREHENSIVE FEATURE ENGINEERING", "="*70, sep="\n")
print(f"\nStarting with {df.shape[0]} rows")
print(f"Columns before feature engineering: {df.shape[1]}\n")

# ============================================================================
# 1. AGGREGATE GDELT LIST FEATURES
# ============================================================================
print("1. Aggregating GDELT list features...")

# Helper functions (already defined in previous cell, but ensure they exist)
def safe_list_agg(lst, func):
    """Aggregate one cell of a list-valued column into a single float.

    `lst` may be None, a scalar, a list/tuple, a numpy array, or a string
    (a Python list/tuple literal, or a single number). `func` is the reducer
    applied to the finite numeric elements. Returns NaN when the cell is
    missing, empty, unparsable, has no finite numeric element, or when `func`
    fails or yields a non-finite result.
    """
    if lst is None:
        return np.nan
    if isinstance(lst, np.ndarray):
        if lst.size == 0:
            return np.nan
        lst = lst.tolist()
    if isinstance(lst, str):
        if not lst.startswith(('[', '(')):
            try:
                return float(lst)
            except ValueError:
                return np.nan
        try:
            # literal_eval accepts only Python literals; unlike eval() it cannot
            # execute code embedded in the data.
            lst = ast.literal_eval(lst)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            return np.nan
        if not isinstance(lst, (list, tuple)):
            return np.nan
    elif not isinstance(lst, (list, tuple)):
        # Scalar cell: missing -> NaN, plain number -> float, anything else -> NaN.
        try:
            if pd.isna(lst):
                return np.nan
        except (ValueError, TypeError):
            pass
        if isinstance(lst, (int, float)):
            return float(lst)
        return np.nan
    if len(lst) == 0:
        return np.nan
    try:
        numeric_lst = []
        for x in lst:
            try:
                if x is None or pd.isna(x):
                    continue
            except (ValueError, TypeError):
                # Nested/array-like element; float() below rejects it.
                pass
            try:
                val = float(x)
            except (ValueError, TypeError):
                continue
            if np.isfinite(val):
                numeric_lst.append(val)
        if not numeric_lst:
            return np.nan
        result = func(numeric_lst)
        return float(result) if np.isfinite(result) else np.nan
    except Exception:
        # Deliberate best effort: an un-aggregatable cell becomes NaN.
        return np.nan

def safe_list_count(x):
    """Return how many elements a cell holds (0 for None or a missing scalar, 1 for any other scalar)."""
    if x is None:
        return 0
    if isinstance(x, np.ndarray):
        return x.size
    if isinstance(x, (list, tuple)):
        return len(x)
    try:
        is_missing = pd.isna(x)
        return 0 if is_missing else 1
    except (ValueError, TypeError):
        # No single truth value available; count the object as one item.
        return 1

def aggregate_list_features(df, list_cols, prefix=""):
    """Collapse list-valued columns into a reduced set of per-row statistics.

    For each column of `list_cols` that exists in `df`:
    - text columns (name contains 'NER' or 'clean_text') yield only `<base>_count`;
    - other columns yield `<base>_mean`, `<base>_count` and `<base>_sum`, plus
      `<base>_max` when the column name contains a crisis keyword.
    `<base>` is the column name without '_list', '_events' and '_gkg', optionally
    prefixed with `prefix`. min and std are intentionally not produced.
    The original list columns are dropped and the new frame is returned.
    """
    crisis_terms = ('fatalities', 'displaced', 'injured', 'violence', 'torture', 'crisis')
    present = [name for name in list_cols if name in df.columns]

    aggregated = {}
    for name in present:
        stem = name.replace('_list', '').replace('_events', '').replace('_gkg', '')
        if prefix:
            stem = f"{prefix}_{stem}"
        cells = df[name]

        # Text columns: only the volume is meaningful.
        if 'NER' in name or 'clean_text' in name:
            aggregated[f"{stem}_count"] = cells.apply(safe_list_count)
            continue

        # Numeric columns: central tendency, volume and total.
        aggregated[f"{stem}_mean"] = cells.apply(lambda cell: safe_list_agg(cell, np.mean))
        aggregated[f"{stem}_count"] = cells.apply(safe_list_count)
        aggregated[f"{stem}_sum"] = cells.apply(lambda cell: safe_list_agg(cell, np.sum))

        # Peak intensity only where peaks matter (crisis indicators).
        lowered = name.lower()
        if any(term in lowered for term in crisis_terms):
            aggregated[f"{stem}_max"] = cells.apply(lambda cell: safe_list_agg(cell, np.max))

    # Single concat keeps the frame unfragmented.
    if aggregated:
        df = pd.concat([df, pd.DataFrame(aggregated, index=df.index)], axis=1)

    if present:
        df = df.drop(columns=present)

    return df

# Locate the list-valued columns of each source: the name contains '_list' and
# ends with the source suffix added by the merge.
list_cols_events = [c for c in df.columns if '_list' in c and c.endswith('_events')]
list_cols_gkg = [c for c in df.columns if '_list' in c and c.endswith('_gkg')]

print(f"   Found {len(list_cols_events)} events list columns")
print(f"   Found {len(list_cols_gkg)} gkg list columns")

# Aggregate events first, then gkg; each source gets its own prefix ('evt' / 'gkg').
for _source, _prefix, _source_cols in (
    ("events", "evt", list_cols_events),
    ("gkg", "gkg", list_cols_gkg),
):
    if _source_cols:
        print(f"   Aggregating {_source} features...")
        df = aggregate_list_features(df, _source_cols, prefix=_prefix)

# ============================================================================
# 2. CLEAN AND PREPARE IPC FEATURES
# ============================================================================
print("\n2. Cleaning IPC features...")

# Round IPC_last to the nearest whole phase. It stays float and no rows are
# dropped here; out-of-range values can be filtered later if needed.
df['IPC_last'] = df['IPC_last'].round(0)

_ipc_last = df['IPC_last']
print(f"   IPC_last range: {_ipc_last.min():.1f} to {_ipc_last.max():.1f}")
print(f"   Valid IPC_last (1-5): {_ipc_last.between(1, 5).sum()} rows")

# ============================================================================
# 3. CREATE TEMPORAL FEATURES FOR GDELT
# ============================================================================
print("\n3. Creating temporal features for GDELT...")

# Focus on most reliable features for temporal patterns
# Prioritize: sentiment, crisis indicators, food security
key_gdelt_features = [
    # Sentiment (most reliable NLP signals)
    'evt_compound_score_mean', 'evt_sentiment.compound_mean', 'gkg_compound_mean',

    # Crisis indicators (high signal-to-noise)
    'evt_fatalities_freq_mean', 'evt_displaced_freq_mean',
    'gkg_fatalities_freq_mean', 'gkg_displaced_freq_mean',

    # Food security (directly relevant to IPC)
    'evt_food_insecurity_freq_mean', 'gkg_food_insecurity_freq_mean',

    # Economic/agricultural (relevant but less reliable)
    'evt_economic_shocks_freq_mean', 'evt_agriculture_freq_mean',
]

# Filter to features that actually exist
key_gdelt_features = [f for f in key_gdelt_features if f in df.columns]

print(f"   Creating temporal features for {len(key_gdelt_features)} key GDELT features")

# Ensure sorted for lag operations
df = df.sort_values(['ADMIN0', 'ADMIN1', 'ADMIN2', 'period']).reset_index(drop=True)

# Group once and collect every new column in a dict. The columns are attached
# with a single concat at the end, because inserting them one by one fragments
# the frame. Column names, values and order match per-column assignment.
_by_district = df.groupby(['ADMIN0', 'ADMIN1', 'ADMIN2'], sort=False)
_temporal = {}

# Rolling means over 3 and 6 periods (current period included, partial windows allowed).
for window in [3, 6]:
    for feat in key_gdelt_features:
        _temporal[f"{feat}_rolling_{window}"] = _by_district[feat].transform(
            lambda x, w=window: x.rolling(window=w, min_periods=1).mean()
        )

# Lags of 1 and 2 periods within each district.
for lag in [1, 2]:
    for feat in key_gdelt_features:
        _temporal[f"{feat}_lag{lag}"] = _by_district[feat].shift(lag)

# Escalation flag: 1 when the 3-period mean exceeds the 6-period mean.
# The 6-period mean used here needs at least 2 observations (min_periods=2), so it
# differs from the *_rolling_6 feature above; the 3-period mean is the same and is reused.
# Comparisons involving NaN are False, so those rows get 0.
for feat in key_gdelt_features:
    rolling_3 = _temporal[f"{feat}_rolling_3"]
    rolling_6 = _by_district[feat].transform(
        lambda x: x.rolling(window=6, min_periods=2).mean()
    )
    _temporal[f"{feat}_escalation"] = (rolling_3 > rolling_6).astype(int)

if _temporal:
    # Drop same-named columns first so that re-running the cell replaces them
    # instead of creating duplicate column labels.
    _stale = [c for c in _temporal if c in df.columns]
    df = pd.concat([df.drop(columns=_stale), pd.DataFrame(_temporal, index=df.index)], axis=1)

# Skip: change, pct_change, anomaly (too noisy for NLP-derived features)

print("   Created temporal features: rolling windows (3, 6 periods), lags (1, 2 periods), escalation indicators")

# ============================================================================
# 4. CREATE FEATURE INDICATORS AND INTERACTIONS
# ============================================================================
print("\n4. Creating feature indicators and interactions...")

# GDELT feature availability indicator.
# '*_count' columns (0 for missing cells) and '*_escalation' flags (cast to int) are
# never null, so including them would make has_gdelt equal to 1 on every row.
# Only columns that can actually be missing are consulted.
gdelt_cols = [c for c in df.columns if c.startswith('evt_') or c.startswith('gkg_')]
_nullable_gdelt_cols = [c for c in gdelt_cols
                        if not c.endswith('_count') and not c.endswith('_escalation')]
if _nullable_gdelt_cols:
    has_gdelt = df[_nullable_gdelt_cols].notna().any(axis=1)
    df['has_gdelt'] = has_gdelt.astype(int)
else:
    df['has_gdelt'] = 0

# Combine sentiment features from events and gkg
if 'evt_compound_score_mean' in df.columns and 'gkg_compound_mean' in df.columns:
    # NOTE(review): a missing source is treated as 0, so the combined score is halved
    # when only one source has data - confirm this is intended.
    df['sentiment_combined'] = (df['evt_compound_score_mean'].fillna(0) + df['gkg_compound_mean'].fillna(0)) / 2
    # 1 when both sources have the same sign; 0 otherwise (including when either is missing).
    df['sentiment_agreement'] = ((df['evt_compound_score_mean'] * df['gkg_compound_mean']) > 0).astype(int)

# Combine crisis indicators: sum of all evt_/gkg_ '*_mean' columns about fatalities,
# displacement or injuries (missing treated as 0).
crisis_features = [c for c in df.columns if any(x in c for x in ['fatalities', 'displaced', 'injured'])
                   and c.endswith('_mean') and (c.startswith('evt_') or c.startswith('gkg_'))]
if crisis_features:
    df['crisis_intensity'] = df[crisis_features].fillna(0).sum(axis=1)

# Food security combined: mean of the evt_/gkg_ food-insecurity '*_mean' columns (missing treated as 0).
food_features = [c for c in df.columns if 'food_insecurity' in c and c.endswith('_mean')
                 and (c.startswith('evt_') or c.startswith('gkg_'))]
if food_features:
    df['food_security_combined'] = df[food_features].fillna(0).mean(axis=1)

print("   Created interaction and combined features")

# ============================================================================
# 5. CREATE GEOGRAPHIC DUMMY VARIABLES
# ============================================================================
print("\n5. Creating geographic dummy variables...")

# One-hot encode the three base administrative levels only. 'region' and 'district'
# are concatenations of these levels and would add nothing new.
geographic_cols = []
for admin_level in ('ADMIN0', 'ADMIN1', 'ADMIN2'):
    if admin_level not in df.columns:
        continue
    # Full one-hot encoding (no reference category dropped).
    dummies = pd.get_dummies(df[admin_level], prefix=admin_level)
    df = pd.concat([df, dummies], axis=1)
    geographic_cols += list(dummies.columns)
    print(f"   Created {dummies.shape[1]} dummy variables for {admin_level}")

geographic_features = geographic_cols
print(f"   Total geographic features: {len(geographic_features)}")
print("   Note: 'region' and 'district' are kept as categorical identifiers but not converted to dummies (redundant with ADMIN0+ADMIN1+ADMIN2)")

# ============================================================================
# 6. SUMMARY
# ============================================================================
print("\n" + "="*70)
print("FEATURE ENGINEERING COMPLETE")
print("="*70)

print(f"\nFinal dataset: {len(df)} rows")
print(f"Total columns: {len(df.columns)}")

# Feature counts
ipc_features = ['IPC_last']

# GDELT base features: evt_/gkg_ prefixed features + raw GKG numeric columns
gdelt_base = [c for c in df.columns if (c.startswith('evt_') or c.startswith('gkg_'))
              and not any(x in c for x in ['_lag', '_rolling', '_escalation'])]

# Add raw GKG numeric columns (these are GDELT features but don't have gkg_ prefix)
gkg_raw = [c for c in ['n_displaced', 'n_killed', 'n_injured', 'n_missing', 'usd_aid',
                       'n_food_related', 'n_water_related', 'n_price_related',
                       'n_conflict_related', 'n_disease_related', 'n_weather_related',
                       'n_market_related', 'n_policy_related', 'tone', 'tone_abs', 'is_negative']
           if c in df.columns]
gdelt_base.extend(gkg_raw)

# GDELT temporal features: evt_/gkg_ prefixed features with temporal patterns
gdelt_temporal = [c for c in df.columns if (c.startswith('evt_') or c.startswith('gkg_'))
                   and any(x in c for x in ['_lag', '_rolling', '_escalation'])]

# Geographic features: the dummy columns created in section 5. The list must not be
# reset to empty here: doing so reported 0 geographic features and pushed every
# dummy column into other_features.
geographic_features = list(geographic_cols)

# 'region' and 'district' are kept as categorical identifiers for grouping but not converted to dummies (redundant)
# Exclude original categorical columns from feature lists (but keep them in dataframe for grouping purposes)
geographic_categorical_cols = ['ADMIN0', 'ADMIN1', 'ADMIN2', 'region', 'district']

# Exclude all non-feature columns (geographic categoricals, metadata, scores, indicators, target, list columns)
excluded_cols = (ipc_features + gdelt_base + gdelt_temporal + geographic_features +
                 ['y_next', 'has_gdelt', 'has_events_data', 'has_gkg_data', 'has_any_features',
                  'CS_score', 'CS_score_events', 'CS_score_gkg',
                  'period', 'SQLDATE', 'EventCode', 'SOURCEURL', 'NumMentions', 'NumSources',
                  'NumArticles', 'valid_SOURCEURL', 'DATE', 'V2Themes', 'DocumentIdentifier',
                  'Amounts', 'valid_DocumentIdentifier'] +
                 geographic_categorical_cols +
                 # Also exclude any remaining list columns that weren't aggregated
                 [c for c in df.columns if '_list' in c])
# Set membership keeps the filter linear in the number of columns.
_excluded_lookup = set(excluded_cols)
other_features = [c for c in df.columns if c not in _excluded_lookup]

print("\nFeature breakdown:")
print(f"  - IPC features: {len(ipc_features)}")
print(f"  - GDELT base features: {len(gdelt_base)} (mean, count, sum, max for crisis)")
print(f"  - GDELT temporal features: {len(gdelt_temporal)} (rolling windows, lags, escalation)")
print(f"  - Geographic features: {len(geographic_features)} (dummy variables)")
print(f"  - Other features: {len(other_features)}")
print(f"  - Total features: {len(ipc_features) + len(gdelt_base) + len(gdelt_temporal) + len(geographic_features) + len(other_features)}")
print("\nNote: Simplified feature set for noisy NLP-derived signals:")
print("  - Base: mean, count, sum (all features) + max (crisis features only)")
print("  - Temporal: rolling windows (3, 6 periods), lags (1, 2 periods), escalation indicators")
print("  - Removed: min, std, change, pct_change, anomaly (too noisy for NLP features)")

print(f"\nRows with y_next: {df['y_next'].notna().sum()} ({df['y_next'].notna().mean()*100:.1f}%)")
print(f"Rows with IPC_last: {df['IPC_last'].notna().sum()} ({df['IPC_last'].notna().mean()*100:.1f}%)")
print(f"Rows with GDELT features: {df['has_gdelt'].sum()} ({df['has_gdelt'].mean()*100:.1f}%)")

# ============================================================================
# 6. DROP PROCESSED/METADATA COLUMNS
# ============================================================================
print("\n6. Dropping processed metadata columns...")

# Raw event/GKG metadata and per-source scores that have already been processed.
# The candidates are filtered against df so re-running the cell is harmless.
_metadata_candidates = (
    'SQLDATE', 'EventCode', 'SOURCEURL', 'NumMentions',
    'NumSources', 'NumArticles', 'valid_SOURCEURL', 'CS_score_gkg',
    'DATE', 'V2Themes', 'DocumentIdentifier', 'Amounts', 'valid_DocumentIdentifier',
    'CS_score_events',
)
cols_to_drop = [name for name in _metadata_candidates if name in df.columns]
if len(cols_to_drop) > 0:
    # In-place on purpose: later cells keep working with the same `df` object
    df.drop(columns=cols_to_drop, inplace=True)
    print(f"   Dropped {len(cols_to_drop)} metadata columns")

_n_ipc = len(ipc_features)
_n_base = len(gdelt_base)
_n_temporal = len(gdelt_temporal)
_n_geo = len(geographic_features)

print("\n" + "="*70)
print("READY FOR MODELING")
print("="*70)
print("\nFeature sets available:")
print(f"  - IPC only: {_n_ipc} IPC features + {_n_geo} geographic features")
print(f"  - IPC + GDELT base: {_n_ipc} IPC + {_n_base} GDELT base + {_n_geo} geographic")
print(f"  - IPC + GDELT + temporal: {_n_ipc} IPC + {_n_base} GDELT base + {_n_temporal} GDELT temporal + {_n_geo} geographic")
print("="*70)

In [None]:
# List every remaining column, one per line, to verify the feature-engineering result
for column_name in df.columns:
    print(column_name)

In [None]:
# Report the first and last period present in the data
_periods = df['period']
print(min(_periods))
print(max(_periods))

# Order rows by administrative hierarchy, then chronologically within each location
_sort_keys = ['ADMIN0', 'ADMIN1', 'ADMIN2', 'period']
df = df.sort_values(by=_sort_keys)
df.tail()

### Configure the train/test windows and evaluation loops

In [None]:
# Settings for the rolling-window evaluation; both are read when the
# train/test splits are built from the sorted list of periods in df.
n_training_periods = 24  # Number of consecutive periods right before each test period that form its training window
n_eval_periods = 1       # Number of rolling evaluations to run (going backwards from latest period)

In [None]:
# ============================================================================
# ROLLING WINDOW EVALUATION CONFIGURATION
# ============================================================================
# Builds `evaluation_splits`: one dict per evaluation, pairing a single test
# period with the `n_training_periods` periods immediately preceding it.
# Evaluation 1 tests on the latest period in df, evaluation 2 on the one
# before it, and so on.
#
# NOTE(review): each training window ends on the period right before its test
# period. If `y_next` holds the label FORECAST_HORIZON periods ahead (it is
# built outside this cell - TODO confirm), the last training rows use labels
# that would not yet be observable at the test period. Consider leaving a gap
# of FORECAST_HORIZON periods between the training window and the test period.

# Get all unique periods and sort them
all_periods = sorted(df['period'].unique())
if len(all_periods) == 0:
    # Fail with a clear message instead of a bare IndexError on all_periods[-1]
    raise ValueError("df contains no periods - cannot build evaluation splits")
latest_period = all_periods[-1]

print("="*70)
print("ROLLING WINDOW EVALUATION SETUP")
print("="*70)
print(f"\nTotal periods available: {len(all_periods)}")
print(f"Period range: {all_periods[0]} to {all_periods[-1]}")
print(f"\nConfiguration:")
print(f"  - Training periods per evaluation: {n_training_periods}")
print(f"  - Number of evaluations: {n_eval_periods}")
print(f"\nEvaluation schedule (going backwards from latest period):")

# Create evaluation splits
evaluation_splits = []

for i in range(n_eval_periods):
    # Test period: latest period minus i
    test_period_idx = len(all_periods) - 1 - i

    if test_period_idx < 0:
        # More evaluations requested than periods exist. A negative index would
        # silently wrap around to the end of the list, so stop here instead.
        print(f"\n  ⚠️  Evaluation {i+1}: Cannot create split - not enough periods")
        print(f"     Requested {n_eval_periods} evaluations but only {len(all_periods)} periods are available")
        break

    test_period = all_periods[test_period_idx]

    # Training periods: the n_training_periods periods right before the test period
    train_start_idx = test_period_idx - n_training_periods
    train_periods = all_periods[max(train_start_idx, 0):test_period_idx]

    # Skip when history is too short, or when n_training_periods <= 0 leaves
    # the training window empty (which would break train_periods[0] below).
    if train_start_idx < 0 or len(train_periods) == 0:
        print(f"\n  ⚠️  Evaluation {i+1}: Cannot create split - not enough periods")
        print(f"     Need {n_training_periods} training periods before {test_period}")
        continue

    evaluation_splits.append({
        'eval_id': i + 1,
        'test_period': test_period,
        'train_periods': train_periods,
        'train_start': train_periods[0],
        'train_end': train_periods[-1],
        'n_train_periods': len(train_periods)
    })

    print(f"\n  Evaluation {i+1}:")
    print(f"    Test period: {test_period}")
    print(f"    Training periods: {train_periods[0]} to {train_periods[-1]} ({len(train_periods)} periods)")

print(f"\n" + "="*70)
print(f"Created {len(evaluation_splits)} evaluation splits")
print("="*70)

# Store splits for use in modeling
# Each split can be accessed as: evaluation_splits[i]['test_period'], evaluation_splits[i]['train_periods']

# ============================================================================
# TRAIN AND EVALUATE LOGISTIC REGRESSION MODEL
# ============================================================================
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

print("\n" + "="*70)
print("TRAINING AND EVALUATION")
print("="*70)

# ============================================================================
# DEFINE FEATURE SETS
# ============================================================================
# Reuse the lists built by the feature-engineering cell when they exist in the
# kernel; otherwise rebuild them from the column-name conventions.
_gdelt_prefixes = ('evt_', 'gkg_')
_temporal_markers = ('_lag', '_rolling', '_escalation')

ipc_feat = ipc_features if 'ipc_features' in globals() else ['IPC_last']

if 'gdelt_base' not in globals():
    # Prefixed GDELT columns without any temporal marker ...
    gdelt_base_feat = [col for col in df.columns
                       if col.startswith(_gdelt_prefixes)
                       and all(marker not in col for marker in _temporal_markers)]
    # ... plus the raw (unprefixed) GKG numeric columns that are present in df
    gkg_raw = [col for col in ('n_displaced', 'n_killed', 'n_injured', 'n_missing', 'usd_aid',
                               'n_food_related', 'n_water_related', 'n_price_related',
                               'n_conflict_related', 'n_disease_related', 'n_weather_related',
                               'n_market_related', 'n_policy_related', 'tone', 'tone_abs', 'is_negative')
               if col in df.columns]
    gdelt_base_feat += gkg_raw
else:
    gdelt_base_feat = gdelt_base

if 'gdelt_temporal' not in globals():
    # Prefixed GDELT columns carrying at least one temporal marker
    gdelt_temporal_feat = [col for col in df.columns
                           if col.startswith(_gdelt_prefixes)
                           and any(marker in col for marker in _temporal_markers)]
else:
    gdelt_temporal_feat = gdelt_temporal

# Geographic dummy variables are recognised by their ADMINx_ name prefix
geographic_feat = [col for col in df.columns
                   if col.startswith(('ADMIN0_', 'ADMIN1_', 'ADMIN2_'))]


def _keep_existing(columns):
    """Return the entries of `columns` that are columns of df, in their original order."""
    return [col for col in columns if col in df.columns]


# Remove features that don't exist (e.g. columns dropped after the lists were built)
ipc_feat = _keep_existing(ipc_feat)
gdelt_base_feat = _keep_existing(gdelt_base_feat)
gdelt_temporal_feat = _keep_existing(gdelt_temporal_feat)
geographic_feat = _keep_existing(geographic_feat)

# Nested feature combinations, from the IPC baseline to the full GDELT model;
# every combination includes the geographic dummies.
feature_sets = {
    'IPC only': ipc_feat + geographic_feat,
    'IPC + GDELT base': ipc_feat + gdelt_base_feat + geographic_feat,
    'IPC + GDELT + temporal': ipc_feat + gdelt_base_feat + gdelt_temporal_feat + geographic_feat
}

# Sets give the same per-group counts as list membership, without rescanning
# each list for every feature.
_ipc_lookup = set(ipc_feat)
_base_lookup = set(gdelt_base_feat)
_temporal_lookup = set(gdelt_temporal_feat)
_geo_lookup = set(geographic_feat)

print("\nFeature sets available:")
for name, features in feature_sets.items():
    n_ipc = sum(col in _ipc_lookup for col in features)
    n_gdelt_base = sum(col in _base_lookup for col in features)
    n_gdelt_temp = sum(col in _temporal_lookup for col in features)
    n_geo = sum(col in _geo_lookup for col in features)
    print(f"  - {name}: {n_ipc} IPC + {n_gdelt_base} GDELT base + {n_gdelt_temp} GDELT temporal + {n_geo} geographic = {len(features)} total")

# Train one logistic regression per (feature set, evaluation split) and record
# its metrics. `all_results` maps feature-set name -> list of per-split result
# dicts (a list stays empty when every split of that feature set was skipped).
all_results = {}

# Fixed, ordered label set used for every confusion matrix. Without it,
# confusion_matrix() sizes itself from the classes seen in each split, so
# matrices from different splits could not be averaged or compared cell by cell.
# Classes 1-5 are always included (the valid target range used for the test
# filter below), plus any other integer label found in y_next, so that no
# prediction is silently left out of the matrix.
class_labels = sorted(
    set(range(1, 6)) | set(df['y_next'].dropna().astype(int).unique().tolist())
)

# Evaluate each feature set
for feature_set_name, feature_cols in feature_sets.items():
    print(f"\n{'='*70}")
    print(f"FEATURE SET: {feature_set_name}")
    print(f"{'='*70}")
    print(f"Using {len(feature_cols)} features")

    feature_results = []

    for split in evaluation_splits:
        print(f"\n{'='*70}")
        print(f"Evaluation {split['eval_id']}: Test period {split['test_period']}")
        print(f"{'='*70}")

        # Create train/test splits
        train_df = df[
            (df['period'].isin(split['train_periods'])) &
            (df['y_next'].notna())  # Only use rows with valid targets for training
        ].copy()

        test_df = df[df['period'] == split['test_period']].copy()

        # For test data: use y_next if available, otherwise use CS_score (for latest period)
        # Create target column: prefer y_next, fallback to CS_score (rounded to integer)
        # NOTE(review): clip(1, 5) forces out-of-range CS_score values into the
        # valid range, so the 1-5 filter below can never reject a fallback
        # target. Confirm whether such rows should be excluded instead.
        test_df['target'] = test_df['y_next'].fillna(test_df['CS_score'].round().clip(1, 5))

        # Filter test to only rows with valid target (y_next or CS_score in valid range 1-5)
        test_df_eval = test_df[
            (test_df['target'].notna()) &
            (test_df['target'] >= 1) &
            (test_df['target'] <= 5)
        ].copy()

        print(f"  Training samples: {len(train_df)}")
        print(f"  Test samples (total): {len(test_df)}")
        print(f"  Test samples (with y_next): {(test_df['y_next'].notna()).sum()}")
        print(f"  Test samples (with CS_score): {(test_df['CS_score'].notna() & (test_df['CS_score'] >= 1) & (test_df['CS_score'] <= 5)).sum()}")
        print(f"  Test samples (with valid target): {len(test_df_eval)}")

        if len(train_df) == 0:
            print(f"  ⚠️  Skipping - no training data")
            continue

        if len(test_df_eval) == 0:
            print(f"  ⚠️  Skipping - no test data with valid target (y_next or CS_score)")
            continue

        # Prepare features and target; missing feature values are treated as 0.
        # After fillna(0) no NaN can remain, so no second missing-value pass is needed.
        X_train = train_df[feature_cols].fillna(0)
        y_train = train_df['y_next'].astype(int)

        X_test = test_df_eval[feature_cols].fillna(0)
        y_test = test_df_eval['target'].astype(int)

        # LogisticRegression.fit raises when the target has a single class;
        # skip such a window like the other unusable splits instead of aborting.
        if y_train.nunique() < 2:
            print(f"  ⚠️  Skipping - training target contains a single class")
            continue

        # Scale features: statistics are fitted on the training window only and
        # then applied unchanged to the test period.
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        # Train logistic regression
        print(f"  Training logistic regression...")
        lr_model = LogisticRegression(
            max_iter=1000,
            random_state=42,
            solver='lbfgs'
        )
        lr_model.fit(X_train_scaled, y_train)

        # Predictions
        y_pred = lr_model.predict(X_test_scaled)

        # Calculate metrics (weighted by class support; classes that are never
        # predicted score 0 instead of raising a warning)
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
        recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
        f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
        # Rows are true labels, columns are predictions, both ordered as class_labels
        cm = confusion_matrix(y_test, y_pred, labels=class_labels)

        # Store results
        feature_results.append({
            'eval_id': split['eval_id'],
            'test_period': split['test_period'],
            'n_train': len(train_df),
            'n_test': len(test_df_eval),
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1': f1,
            'confusion_matrix': cm,
            'confusion_matrix_labels': class_labels
        })

        print(f"\n  Results:")
        print(f"    Accuracy:  {accuracy:.4f}")
        print(f"    Precision: {precision:.4f}")
        print(f"    Recall:    {recall:.4f}")
        print(f"    F1 Score:  {f1:.4f}")
        print(f"\n  Confusion Matrix:")
        print(f"    Labels (rows = true, columns = predicted): {class_labels}")
        print(f"    {cm}")

    # Store results for this feature set
    all_results[feature_set_name] = feature_results

# ============================================================================
# AVERAGE RESULTS ACROSS ALL EVALUATIONS AND FEATURE SETS
# ============================================================================
# Only feature sets with at least one completed evaluation are summarised.
# `all_results` always holds one (possibly empty) list per feature set, so the
# check must look at the lists themselves, not at the number of keys.
completed_results = {name: runs for name, runs in all_results.items() if len(runs) > 0}

if len(completed_results) > 0:
    print("\n" + "="*70)
    print("SUMMARY: AVERAGE RESULTS BY FEATURE SET")
    print("="*70)

    # Membership lookups for the per-group feature counts in the summary table
    _ipc_lookup = set(ipc_feat)
    _base_lookup = set(gdelt_base_feat)
    _temporal_lookup = set(gdelt_temporal_feat)
    _geo_lookup = set(geographic_feat)

    # One summary row per feature set, filled while the averages are printed
    summary_data = []

    for feature_set_name, results in completed_results.items():
        print(f"\n{feature_set_name}:")
        print("-" * 70)

        avg_accuracy = np.mean([r['accuracy'] for r in results])
        avg_precision = np.mean([r['precision'] for r in results])
        avg_recall = np.mean([r['recall'] for r in results])
        avg_f1 = np.mean([r['f1'] for r in results])

        print(f"  Average Metrics (across {len(results)} evaluations):")
        print(f"    Accuracy:  {avg_accuracy:.4f}")
        print(f"    Precision: {avg_precision:.4f}")
        print(f"    Recall:    {avg_recall:.4f}")
        print(f"    F1 Score:  {avg_f1:.4f}")

        # Average confusion matrix. Matrices can only be averaged cell by cell
        # when they all have the same shape; otherwise report it instead of
        # letting np.mean fail on a ragged list.
        matrices = [np.asarray(r['confusion_matrix']) for r in results]
        print(f"\n  Average Confusion Matrix:")
        if len({m.shape for m in matrices}) == 1:
            # astype(int) truncates the averaged counts toward zero
            avg_cm = np.mean(matrices, axis=0).astype(int)
            print(f"    {avg_cm}")
        else:
            avg_cm = None
            print("    (not available: confusion matrices have different shapes across evaluations)")

        # Count features per group for the summary table
        selected = feature_sets[feature_set_name]
        n_features = len(selected)
        n_ipc = sum(c in _ipc_lookup for c in selected)
        n_gdelt_base = sum(c in _base_lookup for c in selected)
        n_gdelt_temp = sum(c in _temporal_lookup for c in selected)
        n_geo = sum(c in _geo_lookup for c in selected)

        summary_data.append({
            'Feature Set': feature_set_name,
            'N Features': n_features,
            'IPC': n_ipc,
            'GDELT Base': n_gdelt_base,
            'GDELT Temporal': n_gdelt_temp,
            'Geographic': n_geo,
            'Accuracy': avg_accuracy,
            'Precision': avg_precision,
            'Recall': avg_recall,
            'F1 Score': avg_f1,
            'N Evaluations': len(results)
        })

    print(f"\n{'='*70}")
    print(f"Total feature sets evaluated: {len(completed_results)}")
    print("="*70)

    # Create summary dataframe, best accuracy first
    summary_df = pd.DataFrame(summary_data)
    summary_df = summary_df.sort_values('Accuracy', ascending=False)

    print("\n" + "="*70)
    print("SUMMARY TABLE")
    print("="*70)
    print(summary_df.to_string(index=False))

else:
    print("\n⚠️  No evaluations completed - check data availability")


# Why Literature Shows Better Results: Key Differences

## Potential Reasons for Better Performance in Literature:

1. **Different Prediction Tasks**:
   - **Transitions/Changes**: Predicting when CS_score will worsen (crisis onset) rather than exact level
   - **Early Warning**: Predicting 2-4 periods ahead (forecasting) rather than next period (nowcasting)
   - **Binary Classification**: Predicting crisis (CS≥3) vs non-crisis (CS<3) rather than 5-class classification

2. **Feature Engineering**:
   - **Temporal Patterns**: Using lagged and rolling features (as we just added) rather than only current-period values
   - **Anomaly Detection**: Focusing on deviations from historical patterns
   - **Interaction Features**: Combining GDELT with other data sources (weather, prices, etc.)

3. **Data Coverage**:
   - **Spatial Aggregation**: Some studies aggregate to ADMIN0 or ADMIN1 level where coverage is better
   - **Temporal Aggregation**: Using quarterly rather than monthly data
   - **Filtering**: Focusing on regions/periods with good GDELT coverage

4. **Evaluation Metrics**:
   - **Recall for Rare Events**: Focusing on detecting crises (high recall for CS≥3) rather than overall accuracy
   - **Early Detection**: Measuring how early they detect transitions, not just accuracy

5. **Baseline Comparison**:
   - **Weaker Baselines**: Some studies don't include `previous_CS` in baseline, making improvement easier to show
   - **Different Baselines**: Comparing against simpler models or using different evaluation windows

## Our Current Approach:
- ✅ Now includes temporal GDELT features (lags, rolling windows, escalation indicators); anomaly features were removed as too noisy
- ✅ Uses same framework as previous work (allowing fair comparison)
- ⚠️ Still predicting exact CS_score level (highly autocorrelated)
- ⚠️ Using monthly data with CS_score available every 4 months

## Potential Improvements:
1. **Predict Transitions**: Predict whether CS_score will worsen ($CS_{t+1} > CS_t$) rather than its exact value
2. **Forecasting Horizon**: Predict 2-4 periods ahead instead of the next period (controlled by `FORECAST_HORIZON` in the configuration cell)
3. **Crisis Detection**: Binary classification (crisis vs non-crisis) with focus on recall
4. **Feature Selection**: Use only temporal GDELT features, filter out low-importance current-period features


## Feature Engineering Summary

### Features Created:

1. **List Aggregations** (from both events and gkg):
   - Mean, Count, Sum for all list features; Max for crisis features only (Min and Std were removed as too noisy)
   - Separated by `evt_` and `gkg_` prefixes

2. **Combined Features**:
   - CS_score combinations (combined, diff, has_both)
   - Casualty rates and totals
   - Theme aggregations (conflict intensity, crisis severity)
   - Sentiment combinations
   - Coverage intensity metrics

3. **Temporal Features**:
   - Lag features (CS_score_lag1, lag2, lag3)
   - Moving averages (ma2, ma3)
   - Change and percentage change features
   - Cyclical period encoding (sin/cos)

4. **Geographic Features**:
   - Aggregated CS_score by ADMIN0 and ADMIN1 levels
   - Standard deviations by geographic level
   - Count of regions with same score

5. **Interaction Features**:
   - Ratios (casualty_rate, aid_per_casualty)
   - Normalized features (tone_abs_normalized)
   - Coverage metrics (mentions_per_source, articles_per_source)

### Next Steps for ML:

1. **Feature Selection**: Consider using:
   - Correlation-based selection
   - Mutual information
   - Recursive feature elimination
   - L1 regularization (Lasso)

2. **Categorical Encoding**: If you have categorical features:
   - One-hot encoding for low cardinality
   - Target encoding for high cardinality
   - Embedding for very high cardinality

3. **Scaling**: Consider:
   - StandardScaler or MinMaxScaler for numeric features
   - Especially important for distance-based algorithms

4. **Model Suggestions**:
   - Random Forest (handles non-linear relationships well)
   - Gradient Boosting (XGBoost, LightGBM, CatBoost)
   - Neural Networks (if you have enough data)
   - Consider class weights if classes are imbalanced
