In [None]:
# Imports
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm 
from sklearn.linear_model import LinearRegression 
from sklearn.preprocessing import PolynomialFeatures 

# Report the library versions in use so the output records the run environment.
for label, module in (("Pandas version:", pd), ("NumPy version:", np)):
    print(label, module.__version__)

# Turn off pandas' chained-assignment check for the rest of the notebook.
pd.set_option("mode.chained_assignment", None)

In [None]:
# Data Loading and Initial Filtering
# Read the raw ACLED export from disk and preview it.
df_acled = pd.read_csv("data/raw/afgh_may25.csv")

print("Initial ACLED dataset loaded:")
display(df_acled.head())
print(f"Initial ACLED data shape: {df_acled.shape}")

# Keep an independent copy of the Afghanistan rows so later edits to df_af
# never write back into df_acled.
is_afghanistan = df_acled['country'].eq("Afghanistan")
df_af = df_acled.loc[is_afghanistan].copy()

print("\nAfghanistan-specific data filtered:")
display(df_af.head())
print(f"Afghanistan data shape: {df_af.shape}")

In [None]:
# Data Cleaning for EDA
# Convert the epoch-seconds `timestamp` column into a datetime column on both frames.
# NOTE(review): in ACLED exports `timestamp` is presumably the record upload/update
# time rather than the event date (`event_date` is used for monthly grouping later) — confirm.
df_acled['readable_date'] = pd.to_datetime(df_acled['timestamp'], unit='s')
df_af['readable_date'] = pd.to_datetime(df_af['timestamp'], unit='s')

# Missing associated actors are labelled explicitly instead of being left as NaN.
df_af[['assoc_actor_1', 'assoc_actor_2']] = df_af[['assoc_actor_1', 'assoc_actor_2']].fillna('Unknown')

print("\nData types adjusted and missing values handled for EDA:")
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# display() only added a stray "None" to the cell output.
df_af.info()

In [None]:
# Global vs. Afghanistan Fatalities Overview
# Coerce fatalities to numbers; unparseable or missing entries are counted as 0.
df_acled['fatalities'] = pd.to_numeric(df_acled['fatalities'], errors='coerce').fillna(0)
df_af['fatalities'] = pd.to_numeric(df_af['fatalities'], errors='coerce').fillna(0)

total_fat = df_acled['fatalities'].sum()
total_fat_since_2017 = df_acled.loc[df_acled['year'] >= 2017, 'fatalities'].sum()
total_af = df_af['fatalities'].sum()
# Restrict Afghanistan to the same 2017+ window as the global total so the
# "Since 2017" share compares like with like. If df_af holds no rows before 2017
# this equals total_af and nothing changes.
total_af_since_2017 = df_af.loc[df_af['year'] >= 2017, 'fatalities'].sum()

# Shares are reported as percentages rounded to 2 decimals; a zero denominator yields 0.
relative_fatalities_global = round((total_af / total_fat) * 100, 2) if total_fat > 0 else 0
relative_fatalities_since_2017 = round((total_af_since_2017 / total_fat_since_2017) * 100, 2) if total_fat_since_2017 > 0 else 0

print(f'Total number of fatalities (All Countries, All Time): {int(total_fat):,}')
print(f'Total number of fatalities (All Countries, Since 2017): {int(total_fat_since_2017):,}')
print(f'Total number of fatalities (Afghanistan, All Time): {int(total_af):,}')
print(f'Relative percent of Afghanistan fatalities out of Global total (All Time): {relative_fatalities_global}%')
print(f'Relative percent of Afghanistan fatalities out of Global total (Since 2017): {relative_fatalities_since_2017}%')

# Table 1: totals are pre-formatted as strings with thousands separators.
table_1_data = {
    'Category': ['Global Total', 'Afghanistan'],
    'Total Fatalities (All)': [f"{int(total_fat):,}", f"{int(total_af):,}"],
    'Relative % (Global)': ["—", f"{relative_fatalities_global}%"],
    'Total Fatalities (Since 2017)': [f"{int(total_fat_since_2017):,}", f"{int(total_af_since_2017):,}"],
    'Relative % (Since 2017)': ["—", f"{relative_fatalities_since_2017}%"]
}
table_1_df = pd.DataFrame(table_1_data)
print("\nTable 1: Fatalities and Relative Percentage (Global vs. Afghanistan, Total and Since 2017)")
display(table_1_df)

print("\nTable 2: Summary Statistics for Fatalities (Afghanistan)")
display(df_af['fatalities'].describe().to_frame().rename(columns={'fatalities': 'Value'}))

In [None]:
# Fatalities Trends by Year (Global & Afghanistan)
def _plot_yearly_bars(series, color, title, ylabel, outfile, label_offset, label_style, tick_style):
    """Draw one annotated bar chart of fatalities per year, save it and show it.

    series: fatalities indexed by year; the index supplies bar positions and x ticks.
    color: bar colour.
    title / ylabel: figure title and y-axis label.
    outfile: path the figure is written to with plt.savefig.
    label_offset: gap (in data units) between a bar's top and its value label.
    label_style: extra keyword arguments forwarded to plt.text for the value labels.
    tick_style: extra keyword arguments forwarded to plt.xticks.
    Returns the bar container produced by plt.bar.
    """
    plt.figure(figsize=(12, 7))
    bars = plt.bar(series.index, series.values, color=color)
    for rect in bars:
        height = rect.get_height()
        # Only bars with a positive total get a value label.
        if height > 0:
            plt.text(rect.get_x() + rect.get_width()/2, height + label_offset, int(height),
                     ha='center', va='bottom', color='black', **label_style)
    plt.xlabel("Year", labelpad=15)
    plt.xticks(series.index, **tick_style)
    plt.ylabel(ylabel)
    plt.title(title, fontsize=14)
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.savefig(outfile)
    plt.show()
    return bars


# Yearly fatality totals: all rows, rows from 2017 onward, and Afghanistan only.
yearly_fatalities_all = df_acled.groupby('year')['fatalities'].sum()
yearly_fatalities_recent = df_acled.loc[df_acled['year'] >= 2017].groupby('year')['fatalities'].sum()
yearly_fatalities_af = df_af.groupby('year')['fatalities'].sum()

all_fatalities_plot = _plot_yearly_bars(
    yearly_fatalities_all,
    color='steelblue',
    title="Fatalities by Year (All Countries, 1997-2025)",
    ylabel="Fatalities (Total)",
    outfile='visualizations/global_fatalities_by_year_all_time.png',
    label_offset=200,
    label_style={'fontsize': 7, 'rotation': 30},
    tick_style={'rotation': 90},
)

all_fatalities_17 = _plot_yearly_bars(
    yearly_fatalities_recent,
    color='steelblue',
    title="Total Fatalities by Year (All Countries, 2017-2025)",
    ylabel="Fatalities (#)",
    outfile='visualizations/global_fatalities_by_year_2017_2025.png',
    label_offset=200,
    label_style={'fontsize': 8, 'fontweight': 'bold'},
    tick_style={'rotation': 45},
)

all_fatalities_af = _plot_yearly_bars(
    yearly_fatalities_af,
    color='darkred',
    title="Fatalities by Year (Afghanistan, 2017-2025)",
    ylabel="Fatalities (Total)",
    outfile='visualizations/afghanistan_fatalities_by_year_2017_2025.png',
    label_offset=5,
    label_style={'fontsize': 8, 'fontweight': 'bold'},
    tick_style={'rotation': 45, 'fontsize': 10},
)

# Stacked view: align Afghanistan to the 2017+ years (years without Afghan rows
# count as 0) and subtract it from the overall total to get the remainder.
years = yearly_fatalities_recent.index
afg_values = yearly_fatalities_af.reindex(years, fill_value=0)
global_values = yearly_fatalities_recent - afg_values

plt.figure(figsize=(12, 7))
plt.bar(years, global_values, label='Global Fatalities (Excl. Afghanistan)', color='steelblue')
# The Afghanistan segment sits on top of the rest-of-world segment.
plt.bar(years, afg_values, bottom=global_values, label='Afghanistan Fatalities', color='darkred')
plt.title("Stacked Fatalities by Year (Global vs. Afghanistan, 2017-2025)", fontsize=14)
plt.xlabel("Year", labelpad=15)
plt.ylabel("Fatalities (Total)")
plt.xticks(years, rotation=45, fontsize=10)
plt.legend(title="Region")
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.savefig('visualizations/stacked_fatalities_global_vs_afghanistan.png')
plt.show()

In [None]:
# Monthly Fatalities and Trend Lines
def _plot_monthly_fatalities(monthly, months, tick_positions, title, outfile, trend=None, trend_style=None):
    """Scatter monthly fatalities by event type, optionally overlay a trend line, then save and show.

    monthly: frame with 'year_month', 'fatalities' and 'event_type' columns.
    months: the distinct 'year_month' labels in axis order.
    tick_positions: integer positions (into `months`) that receive an x tick label.
    title: figure title.
    outfile: path the figure is written to with plt.savefig.
    trend: optional sequence with one trend value per entry of `months`.
    trend_style: keyword arguments forwarded to plt.plot for the trend line.
    """
    has_trend = trend is not None
    plt.figure(figsize=(14, 8))
    sns.scatterplot(data=monthly, x='year_month', y='fatalities', hue='event_type', style='event_type',
                    palette='tab10', s=100, alpha=0.7)
    if has_trend:
        # One value per month, so the line is single-valued along the x axis.
        plt.plot(list(months), trend, **(trend_style or {}))
    plt.title(title, fontsize=16)
    plt.xlabel("Period (Year-Month)", fontsize=12, labelpad=15)
    plt.ylabel("Fatalities", fontsize=12, labelpad=15)
    plt.xticks(tick_positions, months[tick_positions], rotation=60, ha='right')
    plt.legend(title="Event Type / Trend" if has_trend else "Event Type",
               bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
    if has_trend:
        plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.savefig(outfile)
    plt.show()


df_af['year_month'] = pd.to_datetime(df_af['event_date']).dt.to_period('M').astype(str)
# One row per (month, event type); groupby returns the rows sorted by month label.
monthly_fatalities_af = df_af.groupby(['year_month', 'event_type'])['fatalities'].sum().reset_index()

unique_months = monthly_fatalities_af['year_month'].unique()
# Always define tick_positions: every plot below uses it, so leaving it unset for
# short series (<= 20 months) raised a NameError on the trend plots.
if len(unique_months) > 20:
    tick_interval = len(unique_months) // 8
    tick_positions = np.arange(0, len(unique_months), tick_interval)
else:
    tick_positions = np.arange(len(unique_months))

_plot_monthly_fatalities(
    monthly_fatalities_af, unique_months, tick_positions,
    "Monthly Fatalities by Event Type (Afghanistan)",
    'visualizations/monthly_fatalities_by_event_type.png',
)

# Time variable for the trend fits: the ordinal position of each row's month
# (0 for the first observed month, 1 for the next, ...). The row number must not
# be used here because each month contributes one row per event type, which would
# stretch months with more event types. Positions follow the observed months, which
# matches the categorical x axis of the plots (months without any row are absent).
month_index = pd.factorize(monthly_fatalities_af['year_month'])[0]
X_time_numeric = month_index.reshape(-1, 1)
y_fatalities = monthly_fatalities_af['fatalities'].values
# One x value per distinct month, used to evaluate each fitted trend for plotting.
X_months = np.arange(len(unique_months)).reshape(-1, 1)

linear_model = LinearRegression().fit(X_time_numeric, y_fatalities)
y_pred_linear = linear_model.predict(X_time_numeric)

_plot_monthly_fatalities(
    monthly_fatalities_af, unique_months, tick_positions,
    "Monthly Fatalities with Linear Regression (Afghanistan)",
    'visualizations/monthly_fatalities_linear_regression.png',
    trend=linear_model.predict(X_months),
    trend_style=dict(color='red', linestyle='--', linewidth=2, label='Linear Trend'),
)

poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(X_time_numeric)
poly_model = LinearRegression().fit(X_poly, y_fatalities)
y_pred_poly = poly_model.predict(X_poly)

_plot_monthly_fatalities(
    monthly_fatalities_af, unique_months, tick_positions,
    "Monthly Fatalities with Polynomial Regression (Afghanistan)",
    'visualizations/monthly_fatalities_polynomial_regression.png',
    trend=poly_model.predict(poly.transform(X_months)),
    trend_style=dict(color='green', linestyle='-', linewidth=2, label='Polynomial Trend (Degree 2)'),
)

lowess_model_fit = sm.nonparametric.lowess(y_fatalities, X_time_numeric.flatten(), frac=0.2)
# lowess returns (x, fitted) pairs, one per input row; collapse rows that share a
# month to a single value so the plotted curve has one point per month.
lowess_by_month = (
    pd.Series(lowess_model_fit[:, 1])
    .groupby(lowess_model_fit[:, 0].astype(int))
    .mean()
    .reindex(np.arange(len(unique_months)))
)

_plot_monthly_fatalities(
    monthly_fatalities_af, unique_months, tick_positions,
    "Monthly Fatalities with LOESS Regression (Afghanistan)",
    'visualizations/monthly_fatalities_loess_regression.png',
    trend=lowess_by_month.to_numpy(),
    trend_style=dict(color='purple', linestyle=':', linewidth=3, label='LOESS Trend'),
)

In [None]:
# Distribution of Fatalities and Actor Frequencies
# Recompute the month label here so this cell does not rely on an earlier cell for it.
df_af['year_month'] = pd.to_datetime(df_af['event_date']).dt.to_period('M').astype(str)

# Total fatalities for every (region, month) pair, using only rows with year < 2024.
# NOTE(review): if `region` is the same broad value for every Afghanistan row, this
# is effectively a per-month total — confirm a sub-national column was not intended.
fatal_month_dist = (
    df_af.loc[df_af['year'] < 2024]
    .groupby(['region', 'year_month'])['fatalities']
    .sum()
    .reset_index()
)

plt.figure(figsize=(10, 6))
plt.hist(fatal_month_dist['fatalities'], bins=50, edgecolor='black', color='lightcoral')
plt.xlabel('Number of Fatalities', fontsize=12)
plt.ylabel('Frequency', fontsize=12)
plt.title('Distribution of Fatalities per Region-Month (Afghanistan, up to 2023)', fontsize=14)
plt.grid(True, linestyle='--', alpha=0.6)
plt.tight_layout()
plt.savefig('visualizations/fatalities_distribution_histogram.png')
plt.show()

# Number of events per `inter1` value across the full dataset (counted once, reused for both axes).
inter1_counts = df_acled['inter1'].value_counts()

plt.figure(figsize=(12, 7))
sns.barplot(x=inter1_counts.index, y=inter1_counts.values, palette='viridis')
plt.title("Event Frequency by Initiator Interaction Type (ACLED Global)", fontsize=16)
plt.xlabel("Interaction Type 1", labelpad=15)
plt.ylabel("Number of Events", labelpad=15)
plt.xticks(rotation=45, ha='right')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.savefig('visualizations/event_frequency_by_initiator_type.png')
plt.show()

# Total fatalities per country, then the 20 countries with the highest totals (largest first).
country_totals = df_acled.groupby("country", as_index=False)['fatalities'].sum()
top_n_countries = country_totals.sort_values('fatalities', ascending=False).head(20)

plt.figure(figsize=(12, 8))
# seaborn draws the first row of a horizontal bar plot at the top, so the descending
# sort already puts the largest country first. The former plt.gca().invert_yaxis()
# call flipped that and pushed the largest country to the bottom, unlike the
# top-actors chart in this cell.
sns.barplot(x='fatalities', y='country', data=top_n_countries, palette='rocket')
plt.xlabel("Fatalities (#)", labelpad=15)
plt.ylabel("Country", labelpad=15)
plt.title("Top 20 Countries by Fatalities (ACLED Global)", fontsize=16)
plt.grid(axis='x', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.savefig('visualizations/top_20_countries_by_fatalities.png')
plt.show()

# Pool both actor columns into one series (missing entries removed) and keep the
# ten names that occur most often.
actors_combined = pd.concat([df_af[column] for column in ('actor1', 'actor2')]).dropna()
top_actors = actors_combined.value_counts().head(10)

plt.figure(figsize=(12, 7))
sns.barplot(x=top_actors.values, y=top_actors.index, palette='crest')
plt.xlabel("Frequency", labelpad=15)
plt.ylabel("Actor", labelpad=15)
plt.title("Top 10 Most Frequent Actors (Afghanistan)", fontsize=16)
plt.grid(axis='x', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.savefig('visualizations/top_10_actors_afghanistan.png')
plt.show()