In [21]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

def build_hazard_impact(
    df,
    hazard_name,
    period_size=5,
    min_events=1
):
    """
    Build a per-country, per-period hazard impact feature.

    Pipeline:
    - drops rows where BOTH ``affected`` and ``death`` are missing
      (rows with only one of them missing are kept)
    - bins ``year`` into ``period_size``-year periods
    - averages ``affected`` and ``death`` within each country-period
    - keeps country-periods backed by at least ``min_events`` rows
    - log1p-transforms, then z-scores both averages
    - sums the two z-scores into ``<hazard_name>_impact``

    Parameters
    ----------
    df : pandas.DataFrame
        Required columns: country | year | affected | death.
        The caller's frame is not modified.
    hazard_name : str
        Prefix of the output column, ``f"{hazard_name}_impact"``.
    period_size : int, default 5
        Width of a time bin in years; must be positive.
    min_events : int, default 1
        Minimum number of rows a country-period needs to be kept.

    Returns
    -------
    pandas.DataFrame
        Columns ``country``, ``period``, ``<hazard_name>_impact``.
        When no row survives the null drop or the coverage filter, an
        empty frame with those three columns is returned.

    Raises
    ------
    KeyError
        If a required column is missing from ``df``.
    ValueError
        If ``period_size`` is not positive.

    Notes
    -----
    The z-scores are fitted on the rows passed in this call, so scores
    from separate calls (e.g. different hazards) are each relative to
    their own distribution. A country-period in which one metric is
    missing on every row has a NaN mean for that metric; the summed
    impact is then expected to be NaN as well (StandardScaler is
    documented to pass NaN through -- verify if this matters downstream).
    """
    required = ["country", "year", "affected", "death"]
    absent = [col for col in required if col not in df.columns]
    if absent:
        raise KeyError(
            f"build_hazard_impact: missing required column(s): {absent}"
        )
    if period_size <= 0:
        raise ValueError(
            f"period_size must be a positive number of years, got {period_size!r}"
        )

    impact_col = f"{hazard_name}_impact"
    out_cols = ["country", "period", impact_col]

    # dropna/sort_values return new frames, so the caller's df is untouched.
    # Only rows where both metrics are missing are dropped.
    df = (
        df.dropna(subset=["affected", "death"], how="all")
          .sort_values(["country", "year"])
    )
    if df.empty:
        # Nothing to aggregate; avoid fitting the scaler on zero samples.
        return pd.DataFrame(columns=out_cols)

    # CREATE TIME PERIOD (period_size-year bins, labelled by their first year)
    df["period"] = (df["year"] // period_size) * period_size

    # AGGREGATE WITHIN COUNTRY-PERIOD
    agg = (
        df.groupby(["country", "period"])
          .agg(
              affected=("affected", "mean"),
              death=("death", "mean"),
              events=("year", "count")
          )
          .reset_index()
    )

    # COVERAGE FILTER
    # .copy() makes the filtered result an independent frame so the column
    # assignments below do not trigger SettingWithCopyWarning.
    agg = agg.loc[agg["events"] >= min_events].copy()
    if agg.empty:
        return pd.DataFrame(columns=out_cols)

    # LOG TRANSFORM (reduce skew)
    agg["affected_log"] = np.log1p(agg["affected"])
    agg["death_log"] = np.log1p(agg["death"])

    # STANDARDIZE (z-score), fitted on the rows of this call only
    scaler = StandardScaler()
    agg[["affected_z", "death_z"]] = scaler.fit_transform(
        agg[["affected_log", "death_log"]]
    )

    # FINAL IMPACT SCORE: equal-weight sum of the two standardized metrics
    agg[impact_col] = agg["affected_z"] + agg["death_z"]

    # RETURN CLEAN FEATURE
    return agg[out_cols]


In [22]:
# Load both raw drought tables, renaming their columns to the
# country | year | affected | death schema expected by build_hazard_impact.
drought_aff = pd.read_csv(
    '../data/raw/total-affected-by-drought/affected.csv'
).rename(columns={
    'Country name': 'country',
    'Year': 'year',
    'Total number of people affected by drought per 100,000': 'affected',
})
drought_death = pd.read_csv(
    '../data/raw/death-rate-from-drought/death.csv'
).rename(columns={
    'Country name': 'country',
    'Year': 'year',
    'Death rates from drought': 'death',
})

# Default (inner) join: only country-years present in BOTH files survive.
drought_df = drought_aff.merge(drought_death, on=['country', 'year'])

# Build the impact feature and persist it (the frame's index is written too).
drought_final = build_hazard_impact(drought_df, "drought")
drought_final.to_csv("../data/processed/drought_data.csv")

In [23]:
# Load both raw storm tables, renaming their columns to the
# country | year | affected | death schema expected by build_hazard_impact.
storms_aff = pd.read_csv(
    '../data/raw/total-affected-by-storms/affected.csv'
).rename(columns={
    'Country name': 'country',
    'Year': 'year',
    'Total number of people affected by storms per 100,000': 'affected',
})
storms_death = pd.read_csv(
    '../data/raw/death-rate-from-storms/death.csv'
).rename(columns={
    'Country name': 'country',
    'Year': 'year',
    'Death rates from storms': 'death',
})

# Default (inner) join: only country-years present in BOTH files survive.
storms_df = storms_aff.merge(storms_death, on=['country', 'year'])

# Build the impact feature and persist it (the frame's index is written too).
storms_final = build_hazard_impact(storms_df, "storms")
storms_final.to_csv("../data/processed/storms_data.csv")

In [24]:
# Load both raw extreme-temperature tables, renaming their columns to the
# country | year | affected | death schema expected by build_hazard_impact.
extreme_temp_aff = pd.read_csv(
    '../data/raw/total-affected-by-extreme-temperatures/affected.csv'
).rename(columns={
    'Country name': 'country',
    'Year': 'year',
    'Total number of people affected by extreme temperatures per 100,000': 'affected',
})
extreme_temp_death = pd.read_csv(
    '../data/raw/death-rate-from-extreme-temperatures/death.csv'
).rename(columns={
    'Country name': 'country',
    'Year': 'year',
    'Death rates from extreme temperatures': 'death',
})

# Default (inner) join: only country-years present in BOTH files survive.
extreme_temp_df = extreme_temp_aff.merge(extreme_temp_death, on=['country', 'year'])

# Build the impact feature and persist it (the frame's index is written too).
extreme_temp_final = build_hazard_impact(extreme_temp_df, "extreme_temp")
extreme_temp_final.to_csv("../data/processed/extreme_temp_data.csv")

In [25]:
# Display the extreme-temperature feature table (country, period, extreme_temp_impact).
extreme_temp_final

Unnamed: 0,country,period,extreme_temp_impact
0,Afghanistan,1990,0.130512
1,Afghanistan,2000,3.357964
2,Afghanistan,2005,3.261712
3,Afghanistan,2010,-0.739896
4,Afghanistan,2020,2.932144
...,...,...,...
499,World,2010,0.565942
500,World,2015,0.178903
501,World,2020,1.520206
502,World,2025,-1.013837


In [26]:
# Display the storms feature table (country, period, storms_impact).
storms_final

Unnamed: 0,country,period,storms_impact
0,Afghanistan,1990,-1.466066
1,Afghanistan,2005,0.196208
2,Afghanistan,2010,-1.400176
3,Afghanistan,2015,-0.349956
4,Afghanistan,2020,-0.581605
...,...,...,...
1553,Zimbabwe,1975,-1.357780
1554,Zimbabwe,2000,-1.467594
1555,Zimbabwe,2005,-1.535481
1556,Zimbabwe,2015,1.431947


In [27]:
# Display the drought feature table (country, period, drought_impact).
drought_final

Unnamed: 0,country,period,drought_impact
0,Afghanistan,1965,-0.069536
1,Afghanistan,1970,-1.661444
2,Afghanistan,2000,0.569529
3,Afghanistan,2005,0.527329
4,Afghanistan,2010,0.615053
...,...,...,...
853,Zimbabwe,2000,0.881927
854,Zimbabwe,2005,0.589703
855,Zimbabwe,2010,0.589699
856,Zimbabwe,2015,0.786495
