In [1]:
# Imports
import numpy as np
import polars as pl

from lets_plot import*

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

from catboost import CatBoostRegressor
from catboost import CatBoostClassifier
from catboost import Pool

from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

LetsPlot.setup_html(no_js=True)
pl.Config.set_tbl_rows(20)

polars.config.Config

In [2]:
df = pl.read_csv('data/climate_resilience_simulation_dataset.csv')
df_train, df_test = train_test_split(df, test_size=0.20, random_state=19970507, shuffle=True)

In [3]:
df.schema

Schema([('City', String),
        ('Disaster_Type', String),
        ('Urban_Planning_Type', String),
        ('Disaster_Severity', Float64),
        ('Population_Density', Int64),
        ('Avg_Income', Int64),
        ('Response_Time_hr', Float64),
        ('Damage_Cost_USD', Float64),
        ('Recovery_Time_days', Int64),
        ('Resilience_Score', Float64)])

In [4]:
df.describe()

statistic,City,Disaster_Type,Urban_Planning_Type,Disaster_Severity,Population_Density,Avg_Income,Response_Time_hr,Damage_Cost_USD,Recovery_Time_days,Resilience_Score
str,str,str,str,f64,f64,f64,f64,f64,f64,f64
"""count""","""500""","""500""","""500""",500.0,500.0,500.0,500.0,500.0,500.0,500.0
"""null_count""","""0""","""0""","""0""",0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""mean""",,,,5.57162,5641.11,75584.468,35.9128,303893.1911,89.36,74.71942
"""std""",,,,2.591805,2583.013209,26283.709454,21.087089,215345.548197,63.665477,15.186488
"""min""","""Boston""","""Flood""","""Dense Urban""",1.07,1027.0,30053.0,1.1,18066.0,4.0,30.42
"""25%""",,,,3.34,3470.0,52216.0,16.47,129092.71,38.0,65.0
"""50%""",,,,5.66,5687.0,77333.0,35.62,255988.69,72.0,77.91
"""75%""",,,,7.8,7825.0,99515.0,54.03,433311.0,129.0,86.99
"""max""","""Seattle""","""Wildfire""","""Suburban Sprawl""",10.0,9991.0,119995.0,71.9,974537.31,281.0,97.07


In [5]:
print(f'Column names: {df_train.columns}')
print(f'The number of rows in the training data is {df_train.height}')
print(f'The number of columns in the training data is {df_train.width}')

Column names: ['City', 'Disaster_Type', 'Urban_Planning_Type', 'Disaster_Severity', 'Population_Density', 'Avg_Income', 'Response_Time_hr', 'Damage_Cost_USD', 'Recovery_Time_days', 'Resilience_Score']
The number of rows in the training data is 400
The number of columns in the training data is 10


In [6]:
target = {
    'City': 'city',
    'Disaster_Type': 'disaster_type',
    'Urban_Planning_Type': 'urban_planning_type',
    'Disaster_Severity': 'disaster_severity',
    'Population_Density':'population_density',
    'Avg_Income': 'avg_income',
    'Response_Time_hr': 'response_time_hr',
    'Damage_Cost_USD': 'damage_cost_usd',
    'Recovery_Time_days': 'recovery_time_days',
    'Resilience_Score': 'resilience_score'
}
df_train = df_train.rename(target)

df_test = df_test.rename(target)
df_train.columns, df_test.columns

(['city',
  'disaster_type',
  'urban_planning_type',
  'disaster_severity',
  'population_density',
  'avg_income',
  'response_time_hr',
  'damage_cost_usd',
  'recovery_time_days',
  'resilience_score'],
 ['city',
  'disaster_type',
  'urban_planning_type',
  'disaster_severity',
  'population_density',
  'avg_income',
  'response_time_hr',
  'damage_cost_usd',
  'recovery_time_days',
  'resilience_score'])

In [7]:
df_train.null_count(), df_test.null_count(), df_train.shape, df_test.shape

(shape: (1, 10)
 ┌──────┬────────────┬────────────┬────────────┬───┬────────────┬───────────┬───────────┬───────────┐
 │ city ┆ disaster_t ┆ urban_plan ┆ disaster_s ┆ … ┆ response_t ┆ damage_co ┆ recovery_ ┆ resilienc │
 │ ---  ┆ ype        ┆ ning_type  ┆ everity    ┆   ┆ ime_hr     ┆ st_usd    ┆ time_days ┆ e_score   │
 │ u32  ┆ ---        ┆ ---        ┆ ---        ┆   ┆ ---        ┆ ---       ┆ ---       ┆ ---       │
 │      ┆ u32        ┆ u32        ┆ u32        ┆   ┆ u32        ┆ u32       ┆ u32       ┆ u32       │
 ╞══════╪════════════╪════════════╪════════════╪═══╪════════════╪═══════════╪═══════════╪═══════════╡
 │ 0    ┆ 0          ┆ 0          ┆ 0          ┆ … ┆ 0          ┆ 0         ┆ 0         ┆ 0         │
 └──────┴────────────┴────────────┴────────────┴───┴────────────┴───────────┴───────────┴───────────┘,
 shape: (1, 10)
 ┌──────┬────────────┬────────────┬────────────┬───┬────────────┬───────────┬───────────┬───────────┐
 │ city ┆ disaster_t ┆ urban_plan ┆ disaster_s ┆ 

In [8]:
print(df_train.get_column('city').value_counts())
print(df_train.get_column('disaster_type').value_counts())
print(df_train.get_column('urban_planning_type').value_counts())

shape: (10, 2)
┌───────────────┬───────┐
│ city          ┆ count │
│ ---           ┆ ---   │
│ str           ┆ u32   │
╞═══════════════╪═══════╡
│ Los Angeles   ┆ 37    │
│ San Francisco ┆ 47    │
│ Chicago       ┆ 45    │
│ Phoenix       ┆ 35    │
│ Seattle       ┆ 31    │
│ Houston       ┆ 44    │
│ New York      ┆ 40    │
│ Denver        ┆ 44    │
│ Boston        ┆ 30    │
│ Miami         ┆ 47    │
└───────────────┴───────┘
shape: (5, 2)
┌───────────────┬───────┐
│ disaster_type ┆ count │
│ ---           ┆ ---   │
│ str           ┆ u32   │
╞═══════════════╪═══════╡
│ Hurricane     ┆ 82    │
│ Wildfire      ┆ 74    │
│ Tornado       ┆ 83    │
│ Heatwave      ┆ 90    │
│ Flood         ┆ 71    │
└───────────────┴───────┘
shape: (5, 2)
┌──────────────────────┬───────┐
│ urban_planning_type  ┆ count │
│ ---                  ┆ ---   │
│ str                  ┆ u32   │
╞══════════════════════╪═══════╡
│ Dense Urban          ┆ 78    │
│ Mixed Use            ┆ 74    │
│ Resilient Design     ┆

In [9]:
df_train.write_csv('data/train.csv')
df_test.write_csv('data/test.csv')

In [10]:
features = [
    'city', 'disaster_type', 'urban_planning_type',
    'disaster_severity', 'population_density',
    'avg_income', 'response_time_hr',
    'damage_cost_usd',
]

target = 'recovery_time_days'

In [11]:
import polars as pl
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

In [12]:
def encode_categoricals(df: pl.DataFrame, cat_cols):
    """auto label encoding """
    out = df.clone()
    mappings = {}

    for c in cat_cols:
        uniq = out[c].unique().to_list()
        mapping = {v: i for i, v in enumerate(uniq)}
        mappings[c] = mapping

        out = out.with_columns(
            pl.col(c).replace(mapping, default=-1).alias(c)
        )

    return out, mappings


def compute_metrics(y_true, y_pred):
    """compute metrics """
    return {
        'mse': mean_squared_error(y_true, y_pred),
        'mae': mean_absolute_error(y_true, y_pred),
        'r2': r2_score(y_true, y_pred),
    }



In [30]:
import os
import json
import polars as pl
import numpy as np
import joblib
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor, StackingRegressor
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import KFold
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
import shap

# ------------------------- 配置 -------------------------
features = [
    'city', 'disaster_type', 'urban_planning_type',
    'disaster_severity', 'population_density',
    'avg_income', 'response_time_hr',
    'damage_cost_usd',
]
target = 'recovery_time_days'

OUT_DIR_MODELS = 'models'
OUT_DIR_DATA = 'data'
OUT_DIR_REPORTS = 'results'

def encode_categoricals(df: pl.DataFrame, cat_cols):
    out = df.clone()
    mappings = {}

    for c in cat_cols:
        uniq = out[c].unique().to_list()
        mapping = {v: i for i, v in enumerate(uniq)}
        mappings[c] = mapping
        out = out.with_columns(
            pl.col(c).replace(mapping, default=-1).alias(c)
        )

    return out, mappings


def compute_metrics(y_true, y_pred):
    mse = mean_squared_error(y_true, y_pred)
    return {
        'mse': float(mse),
        'rmse': float(np.sqrt(mse)),
        'mae': float(mean_absolute_error(y_true, y_pred)),
        'r2': float(r2_score(y_true, y_pred)),
    }


def save_model(model, name: str, dir_path=OUT_DIR_MODELS):
    path = os.path.join(dir_path, f"{name}")
    if name.lower().startswith('catboost'):
        model.save_model(path + '.cbm')
    else:
        joblib.dump(model, path + '.pkl')


# ------------------------- 报告与可视化 -------------------------

def plot_metric_bar(metric_dict, metric_name, save_path):
    names = list(metric_dict.keys())
    vals = [metric_dict[n][metric_name] for n in names]

    plt.figure(figsize=(8, 4))
    plt.bar(names, vals)
    plt.xticks(rotation=30)
    plt.title(f'{metric_name.upper()} by model')
    plt.tight_layout()
    plt.savefig(save_path)
    plt.close()


def save_shap_summary(model, X, model_name, save_path):
    # 尝试用 TreeExplainer 优先，如果失败回退到 KernelExplainer（很慢）
    try:
        explainer = shap.TreeExplainer(model)
        shap_values = explainer.shap_values(X)
        shap.summary_plot(shap_values, X, show=False)
        plt.tight_layout()
        plt.savefig(save_path)
        plt.close()
    except Exception as e:
        try:
            # KernelExplainer 需要一个期待值基线
            explainer = shap.KernelExplainer(model.predict, shap.kmeans(X, 10))
            shap_values = explainer.shap_values(X[:200])  # 取子集以免过慢
            shap.summary_plot(shap_values, X[:200], show=False)
            plt.tight_layout()
            plt.savefig(save_path)
            plt.close()
        except Exception as e2:
            # 如果两个都失败，记录错误
            with open(save_path + '.error.txt', 'w') as f:
                f.write(str(e))
                f.write('\n')
                f.write(str(e2))


# ------------------------- Ensemble (Blending + Stacking) -------------------------

def blending_predict(models_preds, weights=None):
    # models_preds: dict name -> preds (np.array)
    names = list(models_preds.keys())
    preds = np.vstack([models_preds[n] for n in names])  # shape (n_models, n_samples)
    if weights is None:
        # 根据每个模型的 RMSE 反比权重 (从名字映射外部)
        weights = np.ones(len(names)) / len(names)
    weights = np.array(weights)
    weights = weights / weights.sum()
    blended = np.average(preds, axis=0, weights=weights)
    return blended


def build_stacking_model(trained_models, X_train, y_train):
    # trained_models: list of (name, model)
    estimators = [(n, m) for n, m in trained_models]
    # meta learner 使用 RidgeCV
    stack = StackingRegressor(estimators=estimators, final_estimator=RidgeCV(), cv=5, n_jobs=-1)
    stack.fit(X_train, y_train)
    return stack



def run_full_pipeline(df_train: pl.DataFrame, df_test: pl.DataFrame, save_report_html=True):
    # 1) 编码
    categorical_cols = [c for c in features if df_train[c].dtype == pl.Utf8]
    df_train_enc, mappings = encode_categoricals(df_train, categorical_cols)
    df_test_enc, _ = encode_categoricals(df_test, categorical_cols)

    # 保存编码后的数据
    train_path = os.path.join(OUT_DIR_DATA, 'train_encoded.parquet')
    test_path = os.path.join(OUT_DIR_DATA, 'test_encoded.parquet')
    df_train_enc.write_parquet(train_path)
    df_test_enc.write_parquet(test_path)

    # 转 numpy / pandas
    X_train = df_train_enc[features].to_numpy()
    y_train = df_train_enc[target].to_numpy()
    X_test = df_test_enc[features].to_numpy()
    y_test = df_test_enc[target].to_numpy()

    # 用于分组评估的字段（原始 df_test 的 disaster_type）
    test_group_type = df_test['disaster_type'].to_list()

    # 2) 定义并训练模型
    models = {
        'GradientBoosting': GradientBoostingRegressor(),
        'XGBoost': XGBRegressor(n_estimators=300, learning_rate=0.05, max_depth=6,
                                subsample=0.9, colsample_bytree=0.8, objective='reg:squarederror'),
        'AdaBoost': AdaBoostRegressor(n_estimators=100, learning_rate=0.08),
        'LightGBM': LGBMRegressor(n_estimators=300, learning_rate=0.05, num_leaves=31),
        'CatBoost': CatBoostRegressor(iterations=300, learning_rate=0.05, depth=6, verbose=False),
    }

    trained_models = []
    preds_store = {}
    global_results = {}
    group_results = {}

    for name, model in models.items():
        print(f"Training {name} ...")
        model.fit(X_train, y_train)
        preds = model.predict(X_test)
        preds_store[name] = preds
        trained_models.append((name, model))

        save_model(model, name)

        global_results[name] = compute_metrics(y_test, preds)

        # group
        df_eval = pl.DataFrame({'disaster_type': test_group_type, 'y_true': y_test, 'y_pred': preds})
        group_metrics = {}
        for g in df_eval['disaster_type'].unique().to_list():
            sub = df_eval.filter(pl.col('disaster_type') == g)
            gm = compute_metrics(sub['y_true'].to_numpy(), sub['y_pred'].to_numpy())
            group_metrics[g] = gm
        group_results[name] = group_metrics

        print(f"{name} done. Global metrics: {global_results[name]}")

    # 3) SHAP analysis
    shap_X = pd.DataFrame(X_test, columns=features)
    max_shap_rows = 1000
    shap_X_small = shap_X if shap_X.shape[0] <= max_shap_rows else shap_X.sample(max_shap_rows, random_state=42)

    for name, model in trained_models:
        print(f"Generating SHAP for {name} ...")
        save_path = os.path.join(OUT_DIR_REPORTS, f'shap_{name}.png')
        try:
            save_shap_summary(model, shap_X_small, name, save_path)
            print(f"SHAP saved: {save_path}")
        except Exception as e:
            print(f"SHAP failed for {name}: {e}")

    # 4) metric bar chart
    # metrics: mse, rmse, mae, r2
    for metric in ['mse', 'rmse', 'mae', 'r2']:
        plot_metric_bar(global_results, metric, os.path.join(OUT_DIR_REPORTS, f'global_{metric}.png'))

    # 5) Ensemble：Blending
    # 根据每个模型的全局 rmse 计算权重的反比
    rmses = {n: global_results[n]['rmse'] for n in global_results}
    inv_rmse = {n: 1.0 / (rmses[n] + 1e-12) for n in rmses}
    names = list(inv_rmse.keys())
    weights = np.array([inv_rmse[n] for n in names])
    weights = weights / weights.sum()

    blended_preds = blending_predict(preds_store, weights=weights)
    global_results['Blending'] = compute_metrics(y_test, blended_preds)
    # per-group
    df_eval = pl.DataFrame({'disaster_type': test_group_type, 'y_true': y_test, 'y_pred': blended_preds})
    group_metrics = {}
    for g in df_eval['disaster_type'].unique().to_list():
        sub = df_eval.filter(pl.col('disaster_type') == g)
        gm = compute_metrics(sub['y_true'].to_numpy(), sub['y_pred'].to_numpy())
        group_metrics[g] = gm
    group_results['Blending'] = group_metrics

    # 保存 blending 结果图
    plot_metric_bar(global_results, 'rmse', os.path.join(OUT_DIR_REPORTS, 'global_rmse_with_blending.png'))

    # 6) Ensemble：Stacking (on original train set)
    print('Building stacking model...')
    stack = build_stacking_model(trained_models, X_train, y_train)
    stack_preds = stack.predict(X_test)
    global_results['Stacking'] = compute_metrics(y_test, stack_preds)

    df_eval = pl.DataFrame({'disaster_type': test_group_type, 'y_true': y_test, 'y_pred': stack_preds})
    group_metrics = {}
    for g in df_eval['disaster_type'].unique().to_list():
        sub = df_eval.filter(pl.col('disaster_type') == g)
        gm = compute_metrics(sub['y_true'].to_numpy(), sub['y_pred'].to_numpy())
        group_metrics[g] = gm
    group_results['Stacking'] = group_metrics

    save_model(stack, 'Stacking')

    # 7) 保存指标 JSON
    metrics_path = os.path.join(OUT_DIR_REPORTS, 'metrics_global.json')
    with open(metrics_path, 'w') as f:
        json.dump(global_results, f, indent=2)

    group_path = os.path.join(OUT_DIR_REPORTS, 'metrics_group.json')
    with open(group_path, 'w') as f:
        json.dump(group_results, f, indent=2)

    # 8) report html generate
    if save_report_html:
        html_path = os.path.join(OUT_DIR_REPORTS, 'report.html')
        with open(html_path, 'w') as f:
            f.write('<html><head><meta charset="utf-8"><title>Model Report</title></head><body>')
            f.write('<h1>Models Global Metrics</h1>')
            f.write('<pre>' + json.dumps(global_results, indent=2) + '</pre>')
            f.write('<h2>Metrics by model (RMSE chart)</h2>')
            f.write('<img src="global_rmse.png" alt="rmse"/>')
            # include shap imgs
            for name in list(models.keys()):
                img = f'shap_{name}.png'
                if os.path.exists(os.path.join(OUT_DIR_REPORTS, img)):
                    f.write(f'<h3>SHAP: {name}</h3>')
                    f.write(f'<img src="{img}" style="max-width:800px;"/>')
            f.write('</body></html>')
        print(f'Report HTML saved to {html_path}')

    print('Pipeline finished. Artifacts:')
    print(' - models/  (saved model files)')
    print(' - data/    (train_encoded.parquet, test_encoded.parquet)')
    print(' - reports/ (plots, metrics json, report.html)')

    return global_results, group_results


In [31]:
global_results, group_results = run_full_pipeline(df_train, df_test)

(Deprecated in version 1.0.0)
  pl.col(c).replace(mapping, default=-1).alias(c)


Training GradientBoosting ...
GradientBoosting done. Global metrics: {'mse': 2843.7752787586774, 'rmse': 53.32705953602427, 'mae': 41.49197078511181, 'r2': 0.4168662180558971}
Training XGBoost ...
XGBoost done. Global metrics: {'mse': 2756.14501953125, 'rmse': 52.49900017649146, 'mae': 40.7761116027832, 'r2': 0.4348353147506714}
Training AdaBoost ...
AdaBoost done. Global metrics: {'mse': 2440.845882130334, 'rmse': 49.40491759056312, 'mae': 38.3544599616984, 'r2': 0.49948939319472285}
Training LightGBM ...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000174 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 688
[LightGBM] [Info] Number of data points in the train set: 400, number of used features: 8
[LightGBM] [Info] Start training from score 88.145000




LightGBM done. Global metrics: {'mse': 2683.03366865775, 'rmse': 51.79800834643886, 'mae': 38.828363957682676, 'r2': 0.4498272834797633}
Training CatBoost ...
CatBoost done. Global metrics: {'mse': 2577.415245276822, 'rmse': 50.768250366511765, 'mae': 38.39527915971245, 'r2': 0.47148499712863434}
Generating SHAP for GradientBoosting ...
SHAP saved: results/shap_GradientBoosting.png
Generating SHAP for XGBoost ...


  0%|          | 0/100 [00:00<?, ?it/s]

SHAP saved: results/shap_XGBoost.png
Generating SHAP for AdaBoost ...


  0%|          | 0/100 [00:00<?, ?it/s]

SHAP saved: results/shap_AdaBoost.png
Generating SHAP for LightGBM ...
SHAP saved: results/shap_LightGBM.png
Generating SHAP for CatBoost ...
SHAP saved: results/shap_CatBoost.png
Building stacking model...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000226 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 688
[LightGBM] [Info] Number of data points in the train set: 400, number of used features: 8
[LightGBM] [Info] Start training from score 88.145000


  return _ForkingPickler.loads(res)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000623 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 554
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001277 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 554
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001240 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 553
[LightGBM] [Info] Number of data points in the train set: 320, number of used features: 8
[LightGBM] [Info] Number of data points in the train set: 320, number of used features: 8
[LightGBM] [Info] Number of data points in the train set: 320, number of used features: 8
[LightGBM] [Info] Start training from score 86.862500
[LightGBM] [Info] Start training from score 93.150000
[LightGBM] [Info] Start training from

  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)






Report HTML saved to results/report.html
Pipeline finished. Artifacts:
 - models/  (saved model files)
 - data/    (train_encoded.parquet, test_encoded.parquet)
 - reports/ (plots, metrics json, report.html)


In [32]:
global_results

{'GradientBoosting': {'mse': 2843.7752787586774,
  'rmse': 53.32705953602427,
  'mae': 41.49197078511181,
  'r2': 0.4168662180558971},
 'XGBoost': {'mse': 2756.14501953125,
  'rmse': 52.49900017649146,
  'mae': 40.7761116027832,
  'r2': 0.4348353147506714},
 'AdaBoost': {'mse': 2440.845882130334,
  'rmse': 49.40491759056312,
  'mae': 38.3544599616984,
  'r2': 0.49948939319472285},
 'LightGBM': {'mse': 2683.03366865775,
  'rmse': 51.79800834643886,
  'mae': 38.828363957682676,
  'r2': 0.4498272834797633},
 'CatBoost': {'mse': 2577.415245276822,
  'rmse': 50.768250366511765,
  'mae': 38.39527915971245,
  'r2': 0.47148499712863434},
 'Blending': {'mse': 2532.503632600307,
  'rmse': 50.32398665249313,
  'mae': 38.58757506234705,
  'r2': 0.4806944022278645},
 'Stacking': {'mse': 2590.2979455308714,
  'rmse': 50.89496974683128,
  'mae': 40.00111557385197,
  'r2': 0.46884331943458146}}

In [33]:
group_results

{'GradientBoosting': {'Tornado': {'mse': 2031.0876031810421,
   'rmse': 45.06758927634184,
   'mae': 35.6857116061335,
   'r2': 0.448936500718067},
  'Hurricane': {'mse': 1376.151179812661,
   'rmse': 37.09651169332045,
   'mae': 32.37094104220002,
   'r2': 0.6820519742382629},
  'Wildfire': {'mse': 2700.3034320734164,
   'rmse': 51.96444392152596,
   'mae': 39.44933860222779,
   'r2': 0.35072986817573126},
  'Heatwave': {'mse': 4902.385247904966,
   'rmse': 70.0170354121407,
   'mae': 57.35922632189051,
   'r2': 0.02592598387439049},
  'Flood': {'mse': 3981.5531782264484,
   'rmse': 63.09954974662219,
   'mae': 47.69764194443387,
   'r2': 0.46746631568566277}},
 'XGBoost': {'Wildfire': {'mse': 2607.534912109375,
   'rmse': 51.06402757430494,
   'mae': 40.52964401245117,
   'r2': 0.37303537130355835},
  'Flood': {'mse': 3884.930419921875,
   'rmse': 62.329210005597496,
   'mae': 44.68368148803711,
   'r2': 0.4803895950317383},
  'Heatwave': {'mse': 3898.765625,
   'rmse': 62.4400962923

In [35]:
df_test_enc = pl.read_parquet('data/test_encoded.parquet')

In [37]:
df_test.head()

city,disaster_type,urban_planning_type,disaster_severity,population_density,avg_income,response_time_hr,damage_cost_usd,recovery_time_days,resilience_score
str,str,str,f64,i64,i64,f64,f64,i64,f64
"""San Francisco""","""Tornado""","""Resilient Design""",9.29,8195,46299,34.28,347178.03,128,68.77
"""Chicago""","""Flood""","""Mixed Use""",1.4,2885,32960,61.29,95790.92,29,96.04
"""Miami""","""Hurricane""","""Dense Urban""",5.97,5637,79894,34.1,408170.01,67,77.33
"""Boston""","""Flood""","""Resilient Design""",4.42,1096,76105,22.2,199491.34,119,72.67
"""Phoenix""","""Hurricane""","""Resilient Design""",1.66,2103,62858,7.97,124875.11,26,93.94


In [38]:
df_test['disaster_type'].value_counts()

disaster_type,count
str,u32
"""Heatwave""",18
"""Flood""",15
"""Tornado""",21
"""Wildfire""",23
"""Hurricane""",23
