In [8]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

def build_hazard_impact(
    df,
    hazard_name,
    period_size=5,
    min_events=1
):
    """
    Aggregate per-country hazard impacts into fixed-width year bins.

    Pipeline:
    - drops rows where both ``affected`` and ``death`` are missing
    - assigns each row to a period: ``(year // period_size) * period_size``
    - sums deaths and affected people per (country, period)
    - drops (country, period) groups with fewer than ``min_events`` rows

    No scaling/standardization is applied; the outputs are raw totals.
    Because pandas' group ``sum`` skips NaN, a group whose values are all
    missing for one measure gets a total of 0 for that measure.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``country``, ``year``, ``affected`` and
        ``death``. The input frame is not modified.
    hazard_name : str
        Prefix used for the two output total columns.
    period_size : int, default 5
        Width of each time bin in years. Must be positive.
    min_events : int, default 1
        Minimum number of rows (with a non-null ``year``) a
        (country, period) group needs in order to be kept.

    Returns
    -------
    pandas.DataFrame
        Columns ``country``, ``period``, ``{hazard_name}_total_affected`` and
        ``{hazard_name}_total_deaths``, with a fresh 0..n-1 index.

    Raises
    ------
    KeyError
        If any required column is missing from ``df``.
    ValueError
        If ``period_size`` is not positive.
    """
    # Fail early with a readable message instead of an error from deep
    # inside pandas (KeyError is what the pandas calls would raise anyway).
    required = ("country", "year", "affected", "death")
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(
            f"build_hazard_impact: missing required column(s): {missing}"
        )

    # Floor-division by zero does not raise on a pandas Series; it would
    # silently yield inf/NaN periods, so reject it explicitly.
    if period_size <= 0:
        raise ValueError(
            f"period_size must be a positive number of years, got {period_size!r}"
        )

    affected_col = f"{hazard_name}_total_affected"
    deaths_col = f"{hazard_name}_total_deaths"

    # Every step below returns a new frame, so the caller's df is untouched.
    events = (
        # Drop rows where both affected & death are missing
        df.dropna(subset=["affected", "death"], how="all")
          .sort_values(["country", "year"])
          # Label each row with the start year of its period_size-year bin
          .assign(period=lambda frame: (frame["year"] // period_size) * period_size)
    )

    # AGGREGATE WITHIN COUNTRY-PERIOD
    totals = (
        events.groupby(["country", "period"])
              .agg(
                  events=("year", "count"),
                  total_deaths=("death", "sum"),
                  total_affected=("affected", "sum"),
              )
              .reset_index()
    )

    # Filter by minimum events
    totals = totals[totals["events"] >= min_events]

    # Rename columns to include hazard name
    totals = totals.rename(columns={
        "total_affected": affected_col,
        "total_deaths": deaths_col,
    })

    # Re-number the rows so filtering does not leave gaps in the index.
    return totals[["country", "period", affected_col, deaths_col]].reset_index(drop=True)


In [11]:
# EXTREME TEMPERATURE DATA PROCESSING
# Load both series and give the value columns the generic names
# ('affected' / 'death') that build_hazard_impact requires.
extreme_temp_aff = pd.read_csv(
    '../data/raw/total_count/total-affected-by-extreme-temperatures/affected.csv'
).rename(columns={'total_affected_temperature': 'affected'})
extreme_temp_death = pd.read_csv(
    '../data/raw/total_count/deaths-from-extreme-temperatures/deaths.csv'
).rename(columns={'deaths_temperature': 'death'})

# Default (inner) join: only country-years present in both files survive.
# NOTE(review): confirm that dropping one-sided rows is intended.
extreme_temp_df = extreme_temp_aff.merge(extreme_temp_death, on=['country', 'year'])

# Aggregate into periods and persist the per-hazard table.
extreme_temp_final = build_hazard_impact(extreme_temp_df, "extreme_temp")
extreme_temp_final.to_csv("../data/processed/extreme_temp_data.csv", index=False)

print("Extreme temperature data processed:")
print(extreme_temp_final.head())
print(f"\nTotal rows: {len(extreme_temp_final)}")

Extreme temperature data processed:
       country  period  extreme_temp_total_affected  extreme_temp_total_deaths
0  Afghanistan    1990                        200.0                      224.0
1  Afghanistan    2000                     200000.0                      327.0
2  Afghanistan    2005                     170684.0                     1338.0
3  Afghanistan    2010                         68.0                       45.0
4  Afghanistan    2020                     327422.0                     1363.0

Total rows: 505


In [12]:
# DROUGHT DATA PROCESSING
# Load both series and give the value columns the generic names
# ('affected' / 'death') that build_hazard_impact requires.
drought_aff = pd.read_csv(
    '../data/raw/total_count/total-affected-by-drought/affected.csv'
).rename(columns={'total_affected_drought': 'affected'})
drought_death = pd.read_csv(
    '../data/raw/total_count/deaths-from-drought/deaths.csv'
).rename(columns={'deaths_drought': 'death'})

# Default (inner) join: only country-years present in both files survive.
# NOTE(review): confirm that dropping one-sided rows is intended.
drought_df = drought_aff.merge(drought_death, on=['country', 'year'])

# Aggregate into periods and persist the per-hazard table.
drought_final = build_hazard_impact(drought_df, "drought")
drought_final.to_csv("../data/processed/drought_data.csv", index=False)

print("Drought data processed:")
print(drought_final.head())
print(f"\nTotal rows: {len(drought_final)}")


Drought data processed:
       country  period  drought_total_affected  drought_total_deaths
0  Afghanistan    1965                 48000.0                   0.0
1  Afghanistan    1970                     0.0                   0.0
2  Afghanistan    2000               2580000.0                  37.0
3  Afghanistan    2005               2180000.0                   0.0
4  Afghanistan    2010               1750000.0                   0.0

Total rows: 860


In [13]:
# FLOOD DATA PROCESSING
# Load both series and give the value columns the generic names
# ('affected' / 'death') that build_hazard_impact requires.
flood_aff = pd.read_csv(
    '../data/raw/total_count/total-affected-by-floods/affected.csv'
).rename(columns={'total_affected_flood': 'affected'})
flood_death = pd.read_csv(
    '../data/raw/total_count/deaths-from-floods/deaths.csv'
).rename(columns={'deaths_flood': 'death'})

# Default (inner) join: only country-years present in both files survive.
# NOTE(review): confirm that dropping one-sided rows is intended.
flood_df = flood_aff.merge(flood_death, on=['country', 'year'])

# Aggregate into periods and persist the per-hazard table.
flood_final = build_hazard_impact(flood_df, "flood")
flood_final.to_csv("../data/processed/flood_data.csv", index=False)

print("Flood data processed:")
print(flood_final.head())
print(f"\nTotal rows: {len(flood_final)}")


Flood data processed:
       country  period  flood_total_affected  flood_total_deaths
0  Afghanistan    1955                   0.0                51.0
1  Afghanistan    1960                   0.0               107.0
2  Afghanistan    1970              250000.0               150.0
3  Afghanistan    1975              351684.0               171.0
4  Afghanistan    1980               30000.0                 0.0

Total rows: 1652


In [14]:
# STORM DATA PROCESSING
# Load both series and give the value columns the generic names
# ('affected' / 'death') that build_hazard_impact requires.
storm_aff = pd.read_csv(
    '../data/raw/total_count/total-affected-by-storms/affected.csv'
).rename(columns={'total_affected_storm': 'affected'})
storm_death = pd.read_csv(
    '../data/raw/total_count/deaths-from-storms/deaths.csv'
).rename(columns={'deaths_storm': 'death'})

# Default (inner) join: only country-years present in both files survive.
# NOTE(review): confirm that dropping one-sided rows is intended.
storm_df = storm_aff.merge(storm_death, on=['country', 'year'])

# Aggregate into periods and persist the per-hazard table.
storm_final = build_hazard_impact(storm_df, "storm")
storm_final.to_csv("../data/processed/storms_data.csv", index=False)

print("Storm data processed:")
print(storm_final.head())
print(f"\nTotal rows: {len(storm_final)}")


Storm data processed:
       country  period  storm_total_affected  storm_total_deaths
0  Afghanistan    1990                   0.0                10.0
1  Afghanistan    2005               22656.0               331.0
2  Afghanistan    2010                   5.0                84.0
3  Afghanistan    2015                9055.0                67.0
4  Afghanistan    2020                7481.0                13.0

Total rows: 1584


In [16]:
# MERGE ALL HAZARDS INTO FINAL DATASET
# Outer-join the four per-hazard tables on (country, period) so that a
# country-period reported for any single hazard is kept. Missing cells are
# then filled with 0 (no impact for that hazard in that period), and the
# rows are ordered by country and period with a fresh index.
final_df = (
    extreme_temp_final
    .merge(drought_final, on=['country', 'period'], how='outer')
    .merge(flood_final, on=['country', 'period'], how='outer')
    .merge(storm_final, on=['country', 'period'], how='outer')
    .fillna(0)
    .sort_values(['country', 'period'])
    .reset_index(drop=True)
)

# Save final merged dataset
final_df.to_csv("../data/processed/final_total_data.csv", index=False)

print("Final merged dataset:")
print(f"\nTotal rows: {len(final_df)}")
print(f"\nColumns: {list(final_df.columns)}")


Final merged dataset:

Total rows: 2440

Columns: ['country', 'period', 'extreme_temp_total_affected', 'extreme_temp_total_deaths', 'drought_total_affected', 'drought_total_deaths', 'flood_total_affected', 'flood_total_deaths', 'storm_total_affected', 'storm_total_deaths']
