In [1]:
# Imports
import numpy as np
import polars as pl

from lets_plot import*

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

from catboost import CatBoostRegressor
from catboost import CatBoostClassifier
from catboost import Pool

from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

LetsPlot.setup_html(no_js=True)
pl.Config.set_tbl_rows(20)

polars.config.Config

In [10]:
df = pl.read_csv('data/climate_resilience_simulation_dataset.csv')
df_train, df_test = train_test_split(df, test_size=0.20, random_state=19970507, shuffle=True)

In [11]:
df.schema

Schema([('City', String),
        ('Disaster_Type', String),
        ('Urban_Planning_Type', String),
        ('Disaster_Severity', Float64),
        ('Population_Density', Int64),
        ('Avg_Income', Int64),
        ('Response_Time_hr', Float64),
        ('Damage_Cost_USD', Float64),
        ('Recovery_Time_days', Int64),
        ('Resilience_Score', Float64)])

In [12]:
df.describe()

statistic,City,Disaster_Type,Urban_Planning_Type,Disaster_Severity,Population_Density,Avg_Income,Response_Time_hr,Damage_Cost_USD,Recovery_Time_days,Resilience_Score
str,str,str,str,f64,f64,f64,f64,f64,f64,f64
"""count""","""500""","""500""","""500""",500.0,500.0,500.0,500.0,500.0,500.0,500.0
"""null_count""","""0""","""0""","""0""",0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""mean""",,,,5.57162,5641.11,75584.468,35.9128,303893.1911,89.36,74.71942
"""std""",,,,2.591805,2583.013209,26283.709454,21.087089,215345.548197,63.665477,15.186488
"""min""","""Boston""","""Flood""","""Dense Urban""",1.07,1027.0,30053.0,1.1,18066.0,4.0,30.42
"""25%""",,,,3.34,3470.0,52216.0,16.47,129092.71,38.0,65.0
"""50%""",,,,5.66,5687.0,77333.0,35.62,255988.69,72.0,77.91
"""75%""",,,,7.8,7825.0,99515.0,54.03,433311.0,129.0,86.99
"""max""","""Seattle""","""Wildfire""","""Suburban Sprawl""",10.0,9991.0,119995.0,71.9,974537.31,281.0,97.07


In [13]:
print(f'Column names: {df_train.columns}')
print(f'The number of rows in the training data is {df_train.height}')
print(f'The number of columns in the training data is {df_train.width}')

Column names: ['City', 'Disaster_Type', 'Urban_Planning_Type', 'Disaster_Severity', 'Population_Density', 'Avg_Income', 'Response_Time_hr', 'Damage_Cost_USD', 'Recovery_Time_days', 'Resilience_Score']
The number of rows in the training data is 400
The number of columns in the training data is 10


In [14]:
target = {
    'City': 'city',
    'Disaster_Type': 'disaster_type',
    'Urban_Planning_Type': 'urban_planning_type',
    'Disaster_Severity': 'disaster_severity',
    'Population_Density':'population_density',
    'Avg_Income': 'avg_income',
    'Response_Time_hr': 'response_time_hr',
    'Damage_Cost_USD': 'damage_cost_usd',
    'Recovery_Time_days': 'recovery_time_days',
    'Resilience_Score': 'resilience_score'
}
df_train = df_train.rename(target)

df_test = df_test.rename(target)
df_train.columns, df_test.columns

(['city',
  'disaster_type',
  'urban_planning_type',
  'disaster_severity',
  'population_density',
  'avg_income',
  'response_time_hr',
  'damage_cost_usd',
  'recovery_time_days',
  'resilience_score'],
 ['city',
  'disaster_type',
  'urban_planning_type',
  'disaster_severity',
  'population_density',
  'avg_income',
  'response_time_hr',
  'damage_cost_usd',
  'recovery_time_days',
  'resilience_score'])

In [15]:
df_train.null_count(), df_test.null_count(), df_train.shape, df_test.shape

(shape: (1, 10)
 ┌──────┬────────────┬────────────┬────────────┬───┬────────────┬───────────┬───────────┬───────────┐
 │ city ┆ disaster_t ┆ urban_plan ┆ disaster_s ┆ … ┆ response_t ┆ damage_co ┆ recovery_ ┆ resilienc │
 │ ---  ┆ ype        ┆ ning_type  ┆ everity    ┆   ┆ ime_hr     ┆ st_usd    ┆ time_days ┆ e_score   │
 │ u32  ┆ ---        ┆ ---        ┆ ---        ┆   ┆ ---        ┆ ---       ┆ ---       ┆ ---       │
 │      ┆ u32        ┆ u32        ┆ u32        ┆   ┆ u32        ┆ u32       ┆ u32       ┆ u32       │
 ╞══════╪════════════╪════════════╪════════════╪═══╪════════════╪═══════════╪═══════════╪═══════════╡
 │ 0    ┆ 0          ┆ 0          ┆ 0          ┆ … ┆ 0          ┆ 0         ┆ 0         ┆ 0         │
 └──────┴────────────┴────────────┴────────────┴───┴────────────┴───────────┴───────────┴───────────┘,
 shape: (1, 10)
 ┌──────┬────────────┬────────────┬────────────┬───┬────────────┬───────────┬───────────┬───────────┐
 │ city ┆ disaster_t ┆ urban_plan ┆ disaster_s ┆ 

In [8]:
print(df_train.get_column('city').value_counts())
print(df_train.get_column('disaster_type').value_counts())
print(df_train.get_column('urban_planning_type').value_counts())

shape: (10, 2)
┌───────────────┬───────┐
│ city          ┆ count │
│ ---           ┆ ---   │
│ str           ┆ u32   │
╞═══════════════╪═══════╡
│ Los Angeles   ┆ 37    │
│ Chicago       ┆ 45    │
│ Houston       ┆ 44    │
│ Miami         ┆ 47    │
│ New York      ┆ 40    │
│ San Francisco ┆ 47    │
│ Boston        ┆ 30    │
│ Phoenix       ┆ 35    │
│ Denver        ┆ 44    │
│ Seattle       ┆ 31    │
└───────────────┴───────┘
shape: (5, 2)
┌───────────────┬───────┐
│ disaster_type ┆ count │
│ ---           ┆ ---   │
│ str           ┆ u32   │
╞═══════════════╪═══════╡
│ Flood         ┆ 71    │
│ Tornado       ┆ 83    │
│ Hurricane     ┆ 82    │
│ Wildfire      ┆ 74    │
│ Heatwave      ┆ 90    │
└───────────────┴───────┘
shape: (5, 2)
┌──────────────────────┬───────┐
│ urban_planning_type  ┆ count │
│ ---                  ┆ ---   │
│ str                  ┆ u32   │
╞══════════════════════╪═══════╡
│ Mixed Use            ┆ 74    │
│ Suburban Sprawl      ┆ 92    │
│ Green Infrastructure ┆

In [18]:
df_train.write_csv('data/train.csv')
df_test.write_csv('data/test.csv')

In [19]:
features = [
    'city', 'disaster_type', 'urban_planning_type',
    'disaster_severity', 'population_density',
    'avg_income', 'response_time_hr',
    'damage_cost_usd',
]

target = 'recovery_time_days'

In [21]:
import polars as pl
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

In [25]:
def encode_categoricals(df: pl.DataFrame, cat_cols):
    """auto label encoding """
    out = df.clone()
    mappings = {}

    for c in cat_cols:
        uniq = out[c].unique().to_list()
        mapping = {v: i for i, v in enumerate(uniq)}
        mappings[c] = mapping

        out = out.with_columns(
            pl.col(c).replace(mapping, default=-1).alias(c)
        )

    return out, mappings


def compute_metrics(y_true, y_pred):
    """compute metrics """
    return {
        'mse': mean_squared_error(y_true, y_pred),
        'mae': mean_absolute_error(y_true, y_pred),
        'r2': r2_score(y_true, y_pred),
    }



In [116]:
import os
import joblib

def save_model(model, name: str, dir_path="models"):
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)

    if name.lower() == "catboost":
        # catboost 必须用自身的保存方式
        model.save_model(os.path.join(dir_path, f"{name}.cbm"))
    else:
        joblib.dump(model, os.path.join(dir_path, f"{name}.pkl"))

def train_and_evaluate(df_train: pl.DataFrame, df_test: pl.DataFrame):

    categorical_cols = [c for c in features if df_train[c].dtype == pl.Utf8]

    # category encoding
    df_train_enc, map_train = encode_categoricals(df_train, categorical_cols)
    df_test_enc, _ = encode_categoricals(df_test, categorical_cols)

    X_train = df_train_enc[features].to_numpy()
    y_train = df_train_enc[target].to_numpy()

    X_test = df_test_enc[features].to_numpy()
    y_test = df_test_enc[target].to_numpy()

    # disaster_type group by
    test_group_type = df_test['disaster_type'].to_list()

    # model using
    models = {
        "GradientBoosting": GradientBoostingRegressor(),
        "XGBoost": XGBRegressor(
            n_estimators=300,
            learning_rate=0.08,
            max_depth=6,
            subsample=0.9,
            colsample_bytree=0.8,
            objective='reg:squarederror'
        ),
        "AdaBoost": AdaBoostRegressor(n_estimators=100, learning_rate=0.08),
        "LightGBM": LGBMRegressor(
            n_estimators=300,
            learning_rate=0.08,
            num_leaves=31,
            objective='regression'
        ),
        "CatBoost": CatBoostRegressor(
            iterations=300,
            learning_rate=0.08,
            depth=6,
            loss_function='RMSE',
            verbose=False
        ),
    }

    global_results = {}
    group_results = {}

    # training loop
    for name, model in models.items():
        print(f"\n==== Training {name} ====")

        model.fit(X_train, y_train)
        preds = model.predict(X_test)

        # global metrics
        global_results[name] = compute_metrics(y_test, preds)

        # group by disaster_type
        group_metrics = {}
        df_eval = pl.DataFrame({
            "disaster_type": test_group_type,
            "y_true": y_test,
            "y_pred": preds,
        })

        for group_val in df_eval["disaster_type"].unique().to_list():
            sub = df_eval.filter(pl.col("disaster_type") == group_val)
            gm = compute_metrics(
                sub["y_true"].to_numpy(),
                sub["y_pred"].to_numpy()
            )
            group_metrics[group_val] = gm

        group_results[name] = group_metrics
        save_model(model, name)
        print(f"Model saved to models/{name}.*")

    return global_results, group_results

In [120]:
global_results, group_results = train_and_evaluate(df_train, df_test)

(Deprecated in version 1.0.0)
  pl.col(c).replace(mapping, default=-1).alias(c)



==== Training GradientBoosting ====
Model saved to models/GradientBoosting.*

==== Training XGBoost ====
Model saved to models/XGBoost.*

==== Training AdaBoost ====
Model saved to models/AdaBoost.*

==== Training LightGBM ====
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000194 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 688
[LightGBM] [Info] Number of data points in the train set: 400, number of used features: 8
[LightGBM] [Info] Start training from score 88.145000
Model saved to models/LightGBM.*

==== Training CatBoost ====
Model saved to models/CatBoost.*




In [121]:
global_results

{'GradientBoosting': {'mse': 2746.9501051804655,
  'mae': 39.553191612909814,
  'r2': 0.43672082122295985},
 'XGBoost': {'mse': 2821.19384765625,
  'mae': 40.224945068359375,
  'r2': 0.4214966297149658},
 'AdaBoost': {'mse': 2414.523261496384,
  'mae': 37.90386296675669,
  'r2': 0.5048870100302048},
 'LightGBM': {'mse': 2956.698504324753,
  'mae': 41.65628959859097,
  'r2': 0.3937106093530828},
 'CatBoost': {'mse': 2774.0197617968615,
  'mae': 40.211236417970845,
  'r2': 0.43117002001987126}}

In [122]:
group_results

{'GradientBoosting': {'Wildfire': {'mse': 2650.9362737281745,
   'mae': 36.46993358061885,
   'r2': 0.3625998754593176},
  'Heatwave': {'mse': 3774.145449493282,
   'mae': 51.61557936008905,
   'r2': 0.2501003430929506},
  'Flood': {'mse': 4337.552944099405,
   'mae': 46.22324808181839,
   'r2': 0.4198512623511711},
  'Hurricane': {'mse': 1740.585744977323,
   'mae': 34.44828717374186,
   'r2': 0.5978524675174859},
  'Tornado': {'mse': 1937.7186588309512,
   'mae': 33.41766353484796,
   'r2': 0.4742688483318469}},
 'XGBoost': {'Hurricane': {'mse': 1785.1773681640625,
   'mae': 32.8838996887207,
   'r2': 0.5875499248504639},
  'Tornado': {'mse': 1756.207763671875,
   'mae': 33.84458541870117,
   'r2': 0.5235153436660767},
  'Flood': {'mse': 4881.474609375,
   'mae': 52.912742614746094,
   'r2': 0.3471015691757202},
  'Heatwave': {'mse': 4334.412109375,
   'mae': 53.481597900390625,
   'r2': 0.13877874612808228},
  'Wildfire': {'mse': 2301.669921875,
   'mae': 34.742122650146484,
   'r2'

In [125]:
import json
with open("results/group_results.json", "w") as f:
    json.dump(group_results, f, indent = 2)

with open("results/global_results.json", "w") as f:
    json.dump(global_results, f, indent = 2)