## Data Processing:

In [1]:
import pandas as pd
import numpy as np
import os
import torch
from torch.utils.data import Dataset

### Load full ACLED data

file_path = "C:/Users/benja/Documents/ACLED/ACLED_1997_1_1-2025_1_31.csv"
df = pd.read_csv(file_path, engine="python")

# Convert event_date column to datetime
df['event_date'] = pd.to_datetime(df['event_date'])

# Extract year and compute `week_start` (Monday of that week)
df['year'] = df['event_date'].dt.year
df['week_start'] = df['event_date'] - pd.to_timedelta(df['event_date'].dt.weekday, unit='D')
df['week_start'] = pd.to_datetime(df['week_start'])

In [2]:
### Filter based on dataset purpose

# Impute population using median (admin1 level first, then country level)
df['population_best'] = df.groupby(['country', 'admin1'])['population_best'].transform(lambda x: x.fillna(x.median()))
df['population_best'] = df.groupby('country')['population_best'].transform(lambda x: x.fillna(x.median()))

# Define event types for aggregation
event_cols = ['count_battles', 'count_protests', 'count_riots', 'count_explosions', 'count_civ_violence']

# Socioeconomic Dataset: Use all available years up to 2024
socioeconomic_df = df[df['year'] <= 2024]

# RF Model: Use data from 2016 onward
rf_start_year = 2016
rf_df = df[df['year'] >= rf_start_year]

# ConvLSTM Model: Use region-specific start years
region_start_dates = {
    'Middle East': 2016, 'South America': 2018, 'Middle Africa': 1997, 'Northern Africa': 1997,
    'Western Africa': 1997, 'Central America': 2018, 'Eastern Africa': 1997, 'North America': 2020,
    'Southern Africa': 1997, 'Europe': 2018, 'Caucasus and Central Asia': 2018, 'South Asia': 2010,
    'East Asia': 2018, 'Southeast Asia': 2010, 'Oceania': 2021, 'Caribbean': 2018
}

# Assign start year per region
df['region_start_year'] = df['region'].map(region_start_dates)

# Only apply this filter to ConvLSTM dataset, not the full df
conv_df = df.dropna(subset=['region_start_year'])
conv_df = conv_df[conv_df['year'] >= conv_df['region_start_year']]

  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, ou

In [3]:
### Aggregate

# Aggregate fatalities and event types at weekly Admin1 level
agg_df = df.groupby(['country', 'admin1', 'week_start']).agg(
    fatalities=('fatalities', 'sum'),
    population=('population_best', 'median'),
    count_battles=('event_type', lambda x: (x == 'Battles').sum()),
    count_protests=('event_type', lambda x: (x == 'Protests').sum()),
    count_riots=('event_type', lambda x: (x == 'Riots').sum()),
    count_explosions=('event_type', lambda x: (x == 'Explosions/Remote violence').sum()),
    count_civ_violence=('event_type', lambda x: (x == 'Violence against civilians').sum())
).reset_index()

In [4]:
### Handle missing weeks

# Generate full range of weeks per country-admin1
min_week_start = agg_df['week_start'].min()
max_week_start = agg_df['week_start'].max()
all_weeks = pd.date_range(start=min_week_start, end=max_week_start, freq='W-MON')
all_weeks_df = pd.DataFrame({'week_start': all_weeks})

# Get unique country-admin1 pairs
all_regions = agg_df[['country', 'admin1']].drop_duplicates()

# Create full (country, admin1, week_start) grid
full_grid = all_regions.merge(all_weeks_df, how='cross')

# Merge with existing aggregated data
full_df = full_grid.merge(agg_df, on=['country', 'admin1', 'week_start'], how='left')

# Fill missing event counts and fatalities with 0
full_df.loc[:, ['fatalities'] + event_cols] = full_df[['fatalities'] + event_cols].fillna(0)

# Re-impute population values after merging
full_df['population'] = full_df.groupby(['country', 'admin1'])['population'].transform(lambda x: x.fillna(x.median()))
full_df['population'] = full_df.groupby('country')['population'].transform(lambda x: x.fillna(x.median()))

  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)


In [5]:
### Prepare RF with lags

# Add RF target: `fatalities_t+26`
full_df['fatalities_t+26'] = full_df.groupby(['country', 'admin1'])['fatalities'].shift(-26)

# Add lag features
lags = [2, 4, 12, 26, 52]  

for lag in lags:
    full_df[f'fatalities_lag_{lag}'] = full_df.groupby(['country', 'admin1'])['fatalities'].shift(lag)
    for event in event_cols:
        full_df[f'{event}_lag_{lag}'] = full_df.groupby(['country', 'admin1'])[event].shift(lag)

# Drop rows where `fatalities_t+26` is NaN
full_df = full_df.dropna(subset=['fatalities_t+26'])

In [None]:
### Prepare ConvLSTM dataset (legacy)

# Merge ConvLSTM dataset with aggregated weekly features
conv_df = conv_df.merge(agg_df, on=['country', 'admin1', 'week_start'], how='left')

# Ensure all expected columns exist after merge
expected_columns = ['fatalities', 'population'] + event_cols
for col in expected_columns:
    if col not in conv_df.columns:
        conv_df[col] = 0  # Add missing columns with default 0

# Fill any remaining NaN values with 0
conv_df[expected_columns] = conv_df[expected_columns].fillna(0)

class MultiStepConflictDataset(Dataset):
    def __init__(self, df, seq_length=52, forecast_horizon=26):
        self.seq_length = seq_length
        self.forecast_horizon = forecast_horizon
        self.X, self.y = self.create_sequences(df)

    def create_sequences(self, df):
        X, y = [], []
        unique_regions = df[['country', 'admin1']].drop_duplicates()

        for _, region in unique_regions.iterrows():
            region_data = df[(df['country'] == region['country']) & (df['admin1'] == region['admin1'])].copy()
            region_data = region_data.sort_values(by='week_start')

            for i in range(len(region_data) - self.seq_length - self.forecast_horizon):
                seq_X = region_data.iloc[i:i+self.seq_length][['fatalities', 'population'] + event_cols].values
                seq_y = region_data.iloc[i+self.seq_length:i+self.seq_length+self.forecast_horizon]['fatalities'].values
                X.append(seq_X)
                y.append(seq_y)

        return np.array(X), np.array(y)

# Instantiate ConvLSTM dataset only once
conv_dataset = MultiStepConflictDataset(conv_df, seq_length=52, forecast_horizon=26)

In [9]:
### Prepare socioeconomic

# Ensure all (country, year) pairs exist
all_years = pd.DataFrame({'year': range(df['year'].min(), df['year'].max() + 1)})
all_countries = df[['country']].drop_duplicates()
year_grid = all_countries.merge(all_years, how='cross')

socioeconomic_df = df.groupby(['country', 'year']).agg(
    total_fatalities=('fatalities', 'sum'),
    population=('population_best', 'median'),
    total_battles=('event_type', lambda x: (x == 'Battles').sum()),
    total_protests=('event_type', lambda x: (x == 'Protests').sum()),
    total_riots=('event_type', lambda x: (x == 'Riots').sum()),
    total_explosions=('event_type', lambda x: (x == 'Explosions/Remote violence').sum()),
    total_civ_violence=('event_type', lambda x: (x == 'Violence against civilians').sum())
).reset_index()

socioeconomic_df = year_grid.merge(socioeconomic_df, on=['country', 'year'], how='left').fillna(0)

In [10]:
### Save all datasets

output_dir = "C:/Users/benja/Documents/ACLED/processed_data"
os.makedirs(output_dir, exist_ok=True)

full_df.to_csv(os.path.join(output_dir, "rf_data.csv"), index=False)
np.save(os.path.join(output_dir, "convLSTM_data_X.npy"), conv_dataset.X)
np.save(os.path.join(output_dir, "convLSTM_data_y.npy"), conv_dataset.y)
socioeconomic_df.to_csv(os.path.join(output_dir, "socioeconomic_data.csv"), index=False)

print("Processed data saved at", output_dir)

Processed data saved at C:/Users/benja/Documents/ACLED/processed_data


In [14]:
### New RF version 2016-

import pandas as pd

# Load existing RF dataset
rf_file_path = "C:/Users/benja/Documents/ACLED/processed_data/rf_data.csv"
rf_df = pd.read_csv(rf_file_path, parse_dates=['week_start'])

# Remove rows where week_start is before 2016
rf_df = rf_df[rf_df['week_start'].dt.year >= 2016]

# Save the corrected dataset
rf_df.to_csv(rf_file_path, index=False)

print("RF dataset successfully filtered to start from 2016.")

RF dataset successfully filtered to start from 2016.


In [None]:
### Check tensors for DL (legacy)

import numpy as np

# Load ConvLSTM tensors
X_path = "C:/Users/benja/Documents/ACLED/processed_data/convLSTM_data_X.npy"
y_path = "C:/Users/benja/Documents/ACLED/processed_data/convLSTM_data_y.npy"

X = np.load(X_path)
y = np.load(y_path)

# Check shape
print(f"Shape of X (input features): {X.shape}")  # Expected: (num_samples, seq_length=52, num_features)
print(f"Shape of y (target values): {y.shape}")  # Expected: (num_samples, forecast_horizon=26)

# Check for missing values
print(f"Any NaNs in X? {np.isnan(X).sum()}")
print(f"Any NaNs in y? {np.isnan(y).sum()}")

print(f"Any Infs in X? {np.isinf(X).sum()}")
print(f"Any Infs in y? {np.isinf(y).sum()}")

# Check data type
print(f"Data type of X: {X.dtype}")
print(f"Data type of y: {y.dtype}")

# Basic statistics to ensure reasonable values
print(f"X Min: {np.min(X)}, X Max: {np.max(X)}, X Mean: {np.mean(X)}")
print(f"y Min: {np.min(y)}, y Max: {np.max(y)}, y Mean: {np.mean(y)}")

Shape of X (input features): (2061965, 52, 7)
Shape of y (target values): (2061965, 26)
Any NaNs in X? 0
Any NaNs in y? 0
Any Infs in X? 0
Any Infs in y? 0
Data type of X: float64
Data type of y: float64
X Min: 0.0, X Max: 16368.0, X Mean: 5.973562193142181
y Min: 0.0, y Max: 16368.0, y Mean: 41.53904757392547


In [2]:
import numpy as np
import pandas as pd
import os

### Load raw data
file_path = "C:/Users/benja/Documents/ACLED/ACLED_1997_1_1-2025_1_31.csv"
df = pd.read_csv(file_path, engine="python")
df['event_date'] = pd.to_datetime(df['event_date'])
df['week_start'] = df['event_date'] - pd.to_timedelta(df['event_date'].dt.weekday, unit='D')
df['week_start'] = pd.to_datetime(df['week_start'])

### Population imputation
df['population_best'] = df.groupby(['country', 'admin1'])['population_best'].transform(lambda x: x.fillna(x.median()))
df['population_best'] = df.groupby('country')['population_best'].transform(lambda x: x.fillna(x.median()))

### Weekly aggregation per (country, admin1)
event_types = {
    'count_battles': 'Battles',
    'count_protests': 'Protests',
    'count_riots': 'Riots',
    'count_explosions': 'Explosions/Remote violence',
    'count_civ_violence': 'Violence against civilians'
}

agg_df = df.groupby(['country', 'admin1', 'week_start']).agg(
    fatalities=('fatalities', 'sum'),
    population=('population_best', 'median'),
    **{k: ('event_type', lambda x, v=v: (x == v).sum()) for k, v in event_types.items()}
).reset_index()

### Fill in missing weeks (very important)
all_weeks = pd.date_range(start=agg_df['week_start'].min(), end=agg_df['week_start'].max(), freq='W-MON')
all_regions = agg_df[['country', 'admin1']].drop_duplicates()
full_grid = all_regions.merge(pd.DataFrame({'week_start': all_weeks}), how='cross')
conv_df = full_grid.merge(agg_df, on=['country', 'admin1', 'week_start'], how='left')

conv_df[['fatalities'] + list(event_types)] = conv_df[['fatalities'] + list(event_types)].fillna(0)
conv_df['population'] = conv_df.groupby(['country', 'admin1'])['population'].transform(lambda x: x.fillna(x.median()))
conv_df['population'] = conv_df.groupby('country')['population'].transform(lambda x: x.fillna(x.median()))

### Sequence generation (52-in, 26-out)
target_cols = ['fatalities',
               'count_battles',
               'count_protests',
               'count_riots',
               'count_explosions',
               'count_civ_violence']
input_cols  = ['fatalities', 'population'] + target_cols[1:]  # 7 features

X, y, sequence_info = [], [], []

for (country, admin1), group in conv_df.groupby(['country', 'admin1']):
    group = group.sort_values('week_start').reset_index(drop=True)
    for i in range(len(group) - 52 - 26):
        seq_X = group.iloc[i : i+52][input_cols].values
        seq_y = group.iloc[i+52 : i+52+26][target_cols].values
        X.append(seq_X)
        y.append(seq_y)
        sequence_info.append({
            "country": country,
            "admin1": admin1,
            "week_start": group.loc[i+52, "week_start"],
            "history_start": group.loc[i, "week_start"]
        })

X = np.array(X).astype(np.float32)  # (samples, 52, 7)
y = np.array(y).astype(np.float32)  # (samples, 26, 6)
index_df = pd.DataFrame(sequence_info)

### Save
output_dir = "C:/Users/benja/Documents/ACLED/processed_data"
os.makedirs(output_dir, exist_ok=True)
np.save(os.path.join(output_dir, "X_tcn.npy"), X)
np.save(os.path.join(output_dir, "y_tcn.npy"), y)
index_df.to_csv(os.path.join(output_dir, "sequence_index.csv"), index=False)

print(f"Saved: X_tcn.npy {X.shape}, y_tcn.npy {y.shape}, index: {index_df.shape}")

  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, ou

Saved: X_tcn.npy (4709484, 52, 7), y_tcn.npy (4709484, 26, 6), index: (4709484, 4)
