Team members : Thibault GAUTHÃ‰, Nathan GEORGES, Thomas JIN, Yijia ZENG, Zilong XU

Dataset : Humanitarian Aid : Country level data / Personnel 



In [14]:
# Import libraries

import pandas as pd
import numpy as np
import plotly.express as px  
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from statsmodels.tsa.holtwinters import ExponentialSmoothing
import warnings

# Suppress warnings
warnings.filterwarnings('ignore')

In [15]:
# Function definitions

def load_data(file_path):
    """
    Function for reading a CSV file.
    Input: file_path (str) - the path to the csv file
    Output: pandas DataFrame containing the loaded data
    """
    return pd.read_csv(file_path)

def preprocess_data(df):
    """
    Cleans the dataframe: converts dates and fills missing values.
    Input: df (pandas DataFrame) - raw dataframe with potential missing values
    Output: pandas DataFrame - cleaned dataframe with datetime dates and filled missing values
    """
    # Convert Date column to datetime objects
    df['Date'] = pd.to_datetime(df['Date'])
    
    # Fill NaN values with 0 for numerical contribution columns
    fill_cols = ['Troop Contributions', 'Police Contributions', 
                 'EOM Contributions', 'Total Contributions']
    df[fill_cols] = df[fill_cols].fillna(0)
    
    return df


def get_global_trend(df):
    """
    Indicator 1: Evolution of Total Contributions over time.
    Input: df (pandas DataFrame) - preprocessed dataframe with Date and Total Contributions columns
    Output: pandas DataFrame with Date and aggregated Total Contributions
    """
    trend = df.groupby('Date')['Total Contributions'].sum().reset_index()
    return trend

def get_top_contributors(df, n=10):
    """
    Indicator 2: Top n contributing countries (historical sum).
    Input: df (pandas DataFrame) - preprocessed dataframe; n (int) - number of top contributors to return (default=10)
    Output: pandas DataFrame with Contributor and Total Contributions for top n countries
    """
    top = df.groupby('Contributor')['Total Contributions'].sum().nlargest(n).reset_index()
    return top

def get_contribution_composition(df):
    """
    Indicator 3: Total split between Troops, Police, and EOM.
    Input: df (pandas DataFrame) - preprocessed dataframe with contribution type columns
    Output: pandas DataFrame with Type and Count columns showing distribution
    """
    # Summing up the specific columns
    composition = df[['Troop Contributions', 'Police Contributions', 'EOM Contributions']].sum().reset_index()
    composition.columns = ['Type', 'Count'] # Rename for easier plotting
    return composition

def get_regional_distribution(df):
    """
    Indicator 4: Total contributions grouped by Region.
    Input: df (pandas DataFrame) - preprocessed dataframe with Contributor Region column
    Output: pandas DataFrame with Contributor Region and Total Contributions
    """
    region_dist = df.groupby('Contributor Region')['Total Contributions'].sum().reset_index()
    return region_dist

def explore_data_structure(df):
    """
    Performs the required data summarization.
    Input: The dataframe.
    Output: None (prints info).
    """
    print("--- Data Types ---")
    print(df.dtypes)
    print("\n--- Missing Values ---")
    print(df.isnull().sum())
    print("\n--- Descriptive Statistics ---")
    print(df.describe())

def get_normalized_contributions(df):
    """
    Indicator 5: Normalized contributions showing each country's share of total contributions as a percentage.
    Input: df (pandas DataFrame) - preprocessed dataframe with Contributor and Total Contributions columns
    Output: pandas DataFrame with Contributor and Share_Percentage columns for top contributors
    """
    # Calculate total contributions per country
    country_totals = df.groupby('Contributor')['Total Contributions'].sum().reset_index()
    
    # Calculate global total
    global_total = country_totals['Total Contributions'].sum()
    
    # Calculate percentage share
    country_totals['Share_Percentage'] = (country_totals['Total Contributions'] / global_total) * 100
    
    # Sort by share and get top 10
    norm_df = country_totals.nlargest(10, 'Share_Percentage')[['Contributor', 'Share_Percentage']]
    
    return norm_df

def forecast_contributions(df, periods=12):
    """
    TEMPORAL ANALYSIS: Forecasts future peacekeeping contributions using exponential smoothing.
    Input: df (pandas DataFrame) - preprocessed dataframe with Date and Total Contributions
           periods (int) - number of months to forecast ahead
    Output: tuple (historical_df, forecast_df) - historical data and forecasted values
    """
    # Aggregate by month for time series
    monthly_data = df.groupby(pd.Grouper(key='Date', freq='MS'))['Total Contributions'].sum().reset_index()
    monthly_data = monthly_data[monthly_data['Total Contributions'] > 0]  # Remove zeros
    
    # Fit Exponential Smoothing model (Holt-Winters)
    model = ExponentialSmoothing(
        monthly_data['Total Contributions'], 
        seasonal_periods=12,
        trend='add',
        seasonal='add'
    )
    fitted_model = model.fit()
    
    # Make predictions
    forecast = fitted_model.forecast(steps=periods)
    
    # Create forecast dataframe
    last_date = monthly_data['Date'].max()
    forecast_dates = pd.date_range(start=last_date + pd.DateOffset(months=1), periods=periods, freq='MS')
    forecast_df = pd.DataFrame({
        'Date': forecast_dates,
        'Forecasted_Contributions': forecast.values
    })
    
    return monthly_data, forecast_df, fitted_model

def spatial_clustering(df, n_clusters=4):
    """
    SPATIAL ANALYSIS: Clusters countries based on contribution patterns and regional features.
    Input: df (pandas DataFrame) - preprocessed dataframe
           n_clusters (int) - number of clusters to create
    Output: pandas DataFrame with Contributor, cluster assignments, and contribution metrics
    """
    # Aggregate data by country
    country_features = df.groupby('Contributor').agg({
        'Total Contributions': ['sum', 'mean', 'std'],
        'Troop Contributions': 'sum',
        'Police Contributions': 'sum',
        'EOM Contributions': 'sum',
        'Contributor Region': 'first'
    }).reset_index()
    
    # Flatten column names
    country_features.columns = ['Contributor', 'Total_Sum', 'Total_Mean', 'Total_Std', 
                                 'Troops_Sum', 'Police_Sum', 'EOM_Sum', 'Region']
    
    # Fill NaN std with 0
    country_features['Total_Std'] = country_features['Total_Std'].fillna(0)
    
    # Select features for clustering
    feature_cols = ['Total_Sum', 'Total_Mean', 'Total_Std', 'Troops_Sum', 'Police_Sum', 'EOM_Sum']
    X = country_features[feature_cols]
    
    # Standardize features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # Apply K-means clustering
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    country_features['Cluster'] = kmeans.fit_predict(X_scaled)
    
    # Add cluster labels
    country_features['Cluster_Label'] = country_features['Cluster'].map({
        0: 'Low Contributors',
        1: 'Medium Contributors', 
        2: 'High Contributors',
        3: 'Very High Contributors'
    })
    
    return country_features, kmeans, scaler


In [25]:
# --- Main Execution Block ---
if __name__ == "__main__":
    # datacolec and cleaning
    file_path = "country_level_data.csv"
    
    # Load and clean
    print(43*"=")
    print("=== PEACEKEEPING CONTRIBUTIONS ANALYSIS ===")
    print(43*"=")
    df_raw = load_data(file_path)
    df = preprocess_data(df_raw)

    # modification
    print("--- DATA EXPLORATION ---")
    explore_data_structure(df)
    
    print(f"\nData loaded. Shape: {df.shape}")
    print(f"Date range: {df['Date'].min().date()} to {df['Date'].max().date()}")

    # --- Step 2: Compute Indicators ---
    print(43*"=")
    print("Computing indicators...")
    trend_df = get_global_trend(df)
    top_df = get_top_contributors(df, n=10)
    comp_df = get_contribution_composition(df)
    reg_df = get_regional_distribution(df)
    norm_df = get_normalized_contributions(df)
    print("Indicators computed.")
    
    # --- TEMPORAL ANALYSIS: Forecasting ---
    print(43*"=")
    print("Performing temporal forecasting...")
    monthly_data, forecast_df, model = forecast_contributions(df, periods=24)
    print(f"Forecast generated for next 24 months")
    
    # --- SPATIAL ANALYSIS: Clustering ---
    print(43*"=")
    print("Performing spatial clustering...")
    cluster_df, kmeans_model, scaler = spatial_clustering(df, n_clusters=4)
    print(f"Countries clustered into 4 groups")
    
    # --- Visualization (Inline) ---
    print(43*"=")
    print("Generating visualizations...")
    
    # 1. Global Trend Line Chart
    fig1 = px.line(trend_df, x='Date', y='Total Contributions', 
                   title='Indicator 1: Global Peacekeeping Personnel Trend (1990-2017)')
    fig1.show()
    
    # 2. Top Contributors Bar Chart
    fig2 = px.bar(top_df, x='Total Contributions', y='Contributor', orientation='h',
                  title='Indicator 2: Top 10 Contributing Countries (Historical Total)',
                  text='Total Contributions')
    fig2.show()
    
    # 3. Pie Chart
    fig3 = px.pie(comp_df, values='Count', names='Type', 
                  title='Indicator 3: Composition of Peacekeeping Forces',
                  color_discrete_sequence=px.colors.sequential.RdBu)
    fig3.show()
    
    # 4. Regional Distribution Bar Chart
    fig4 = px.bar(reg_df, x='Contributor Region', y='Total Contributions', 
                  title='Indicator 4: Contributions by Region',
                  color='Contributor Region')
    fig4.show()

    # 5. Normalized Contributions Bar Chart
    fig5 = px.bar(norm_df, x='Share_Percentage', y='Contributor', orientation='h',
                  title='Indicator 5: Normalized Contributions (% of Global Total)',
                  text='Share_Percentage',
                  labels={'Share_Percentage': 'Share of Total Contributions (%)'})
    fig5.update_traces(texttemplate='%{text:.2f}%', textposition='outside')
    fig5.show()
    
    # 6. TEMPORAL: Forecast Visualization
    fig6 = px.line(title='Indicator 6:Temporal Analysis: Peacekeeping Contributions Forecast (24 months)')
    fig6.add_scatter(x=monthly_data['Date'], y=monthly_data['Total Contributions'], 
                     mode='lines', name='Historical Data', line=dict(color='blue'))
    fig6.add_scatter(x=forecast_df['Date'], y=forecast_df['Forecasted_Contributions'], 
                     mode='lines', name='Forecast', line=dict(color='red'))
    fig6.update_xaxes(title_text='Date')
    fig6.update_yaxes(title_text='Total Contributions')
    fig6.show()
    
    # 7. SPATIAL: Cluster Visualization
    top_30 = cluster_df.nlargest(30, 'Total_Sum')
    fig7 = px.scatter(top_30, x='Total_Sum', y='Total_Mean', 
                      color='Cluster_Label', size='Troops_Sum',
                      hover_data=['Contributor', 'Region'],
                      title='Indicator 7: Spatial Analysis: Country Clustering by Contribution Patterns (Top 30)',
                      labels={'Total_Sum': 'Total Contributions (Sum)', 
                              'Total_Mean': 'Average Contribution per Period'})
    fig7.show()
    
    # 8. SPATIAL: Cluster Distribution by Region
    cluster_region = cluster_df.groupby(['Cluster_Label', 'Region']).size().reset_index(name='Count')
    fig8 = px.bar(cluster_region, x='Cluster_Label', y='Count', color='Region',
                  title='Indicator 8: Spatial Analysis: Cluster Distribution Across Regions',
                  barmode='stack')
    fig8.show()
    
    print("\n=== ANALYSIS COMPLETE ===")
    print(f"\nTemporal Forecast Summary:")
    print(f"  - Last historical value: {monthly_data['Total Contributions'].iloc[-1]:,.0f}")
    print(f"  - Forecasted 6-month avg: {forecast_df['Forecasted_Contributions'].head(6).mean():,.0f}")
    print(f"  - Forecasted 24-month avg: {forecast_df['Forecasted_Contributions'].mean():,.0f}")
    
    print(f"\nSpatial Clustering Summary:")
    print(cluster_df['Cluster_Label'].value_counts().sort_index())


=== PEACEKEEPING CONTRIBUTIONS ANALYSIS ===
--- DATA EXPLORATION ---
--- Data Types ---
Date                                 datetime64[ns]
Contributor                                  object
Contributor ISO-3                            object
Contributor Capital Longitude               float64
Contributor Capital Latitude                float64
Contributor Continent                        object
Contributor Region                           object
Contributor UN Bloc                          object
Contributor - P5, G4 or A3                   object
Contributor - NAM                             int64
Contributor - G77                             int64
Contributor - AU                              int64
Contributor - Arab League                     int64
Contributor - OIC                             int64
Contributor - CIS                             int64
Contributor - G20                             int64
Contributor - EU                              int64
Number of Missions Contribut


=== ANALYSIS COMPLETE ===

Temporal Forecast Summary:
  - Last historical value: 96,865
  - Forecasted 6-month avg: 96,560
  - Forecasted 24-month avg: 96,178

Spatial Clustering Summary:
Cluster_Label
High Contributors           3
Low Contributors          108
Medium Contributors         9
Very High Contributors     35
Name: count, dtype: int64
