In [None]:
import json
import os
import random

import joblib
import matplotlib.colors as colors
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import uniform
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, RocCurveDisplay
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, PredefinedSplit
from xgboost import XGBRegressor

In [None]:
# Hold-out frames for the two evaluation years; in both files the first CSV
# column is used as the row index.
validation_df, test_df = (
    pd.read_csv(csv_path, index_col=[0])
    for csv_path in ('data/datasets/validation_2020.csv',
                     'data/datasets/test_2021.csv')
)

In [None]:
# Sample datasets keyed by their integer id (0..19); the first CSV column is
# used as the row index.
# NOTE(review): this path has no "data/" prefix, unlike the other dataset
# paths in this notebook - confirm it is intentional.
sample_datasets = {}
for group_id in range(20):
    sample_datasets[group_id] = pd.read_csv(f"datasets/group/{group_id}.csv", index_col=[0])

In [None]:
# Fitted models per family; list position i holds the model saved for
# sample dataset i.  joblib.load unpickles its input, so only point it at
# model files produced by this project.
best_models = {
    "logistic": [joblib.load(f"best_models/logreg_{i}.model") for i in range(20)],
    "random_forest": [joblib.load(f"best_models/rf_{i}.model") for i in range(20)],
    "xgboost": [joblib.load(f"best_models/xgboost_{i}.model") for i in range(20)],
}

In [None]:
# Per-fire table; its SIZECLASS, COST and TOTALKM^2 columns are read further
# down.  The first CSV column becomes the row index.
fire_csv_path = "data/input/FIRE/fire_df.csv"
fire_data = pd.read_csv(fire_csv_path, index_col=[0])

# Show the first five rows as the cell output.
fire_data.head(5)

In [None]:
# Read the distribution table, drop its first three rows and transpose, so
# the remaining rows become columns.
fire_distribution_raw = pd.read_csv("data/input/FIRE/fire_distribution.csv", index_col=[0])
fire_distribution_df = fire_distribution_raw[3:].T

fire_distribution_df

In [None]:
# SIZECLASS value -> list of the fire_data index labels that carry it.
size_class_column = fire_data["SIZECLASS"]
index_dict = {
    label: list(fire_data[size_class_column == label].index)
    for label in size_class_column.unique()
}

# Flat pool of class labels: every column label of fire_distribution_df is
# repeated as many times as the value stored in its row '0'.
distribution = []
for label in fire_distribution_df.columns:
    distribution.extend([label] * fire_distribution_df.loc['0', label])
    
def calc_damage_estimate(profile, n, rng=None, fires=None, class_pool=None, fires_by_class=None):
    """Simulate the total cost of fires whose combined area adds up to ``profile``.

    Each simulation draws fires until their cumulative area reaches
    ``profile``: a size class is drawn uniformly from the class pool (so a
    class is picked in proportion to how often it occurs in the pool), then
    one fire is drawn uniformly from that class.  A fire that would overshoot
    the target is only charged for the fraction of its area that still fits.

    Parameters
    ----------
    profile : float
        Target total area, in the same unit as the ``TOTALKM^2`` column.
    n : int
        Number of independent simulations to run.
    rng : random.Random, optional
        Source of randomness; pass a seeded instance for reproducible
        results.  Defaults to the module-level ``random`` generator.
    fires : pandas.DataFrame, optional
        Table with ``COST`` and ``TOTALKM^2`` columns.  Defaults to the
        notebook-level ``fire_data``.
    class_pool : sequence, optional
        Size-class labels to draw from.  Defaults to the notebook-level
        ``distribution``.
    fires_by_class : mapping, optional
        Size-class label -> list of row labels of ``fires``.  Defaults to the
        notebook-level ``index_dict``.

    Returns
    -------
    list
        ``n`` simulated totals, in the same unit as the ``COST`` column.
    """
    rng = random if rng is None else rng
    fires = fire_data if fires is None else fires
    class_pool = distribution if class_pool is None else class_pool
    fires_by_class = index_dict if fires_by_class is None else fires_by_class

    estimates = []

    for _ in range(n):
        damage_in_dollar = 0
        total_area = 0

        while total_area < profile:
            # choice() is uniform over the sequence.  The earlier
            # ``randint(0, len(pool)) - 1`` produced indices -1..len-1, and
            # because index -1 is the last element, that element was drawn
            # twice as often as every other one.
            sample_class = rng.choice(class_pool)
            fire_id = rng.choice(fires_by_class[sample_class])

            cost = fires.loc[fire_id, "COST"]
            km_squared = fires.loc[fire_id, "TOTALKM^2"]

            if (total_area + km_squared) > profile:
                # Last fire: charge only the share of its area that is needed
                # to reach the target.
                damage_in_dollar += (profile - total_area) / km_squared * cost
            else:
                damage_in_dollar += cost
            total_area += km_squared

        estimates.append(damage_in_dollar)
    return estimates

In [None]:
# Scale factors looked up below as .loc[sample id, model family]; the first
# CSV column becomes the row index.
scale_params_path = "models/optimal_scale_params.csv"
optimal_scale_param_df = pd.read_csv(scale_params_path, index_col=[0])

optimal_scale_param_df

In [None]:
def auc_and_mse_score(y_test, y_preds, cells_per_month=426327, n_months=12):
    """Score predictions by ROC AUC and by the MSE of their chunk-wise totals.

    Both inputs are cut positionally into ``n_months`` consecutive chunks of
    ``cells_per_month`` entries.  The observed and the predicted values are
    summed inside every chunk, and the mean squared difference between the two
    series of sums is returned next to the ROC AUC of the raw inputs.  A chunk
    that lies past the end of the inputs is empty and sums to 0.

    Parameters
    ----------
    y_test : sequence
        Observed targets.
    y_preds : sequence
        Predicted scores, aligned positionally with ``y_test``.
    cells_per_month : int, default 426327
        Entries per chunk.  NOTE(review): the default matches the block size
        used elsewhere in this notebook to slice single months out of the
        yearly frames - presumably the number of rows per month; confirm.
    n_months : int, default 12
        Number of chunks.

    Returns
    -------
    tuple
        ``(roc_auc, mse_score)``.
    """
    roc_auc = roc_auc_score(y_test, y_preds)

    bounds = [(k * cells_per_month, (k + 1) * cells_per_month) for k in range(n_months)]
    profiles = np.array([np.sum(y_test[lo:hi]) for lo, hi in bounds])
    predicted_profiles = np.array([np.sum(y_preds[lo:hi]) for lo, hi in bounds])

    mse_score = np.mean((profiles - predicted_profiles) ** 2)

    return roc_auc, mse_score

# 2021 test year

In [None]:
y_test = test_df["TARGET"]
X_test = test_df[[col for col in test_df if col != "TARGET"]]

# How each model family turns features into a score per row: the logistic
# model reports the probability of class 1, the two regressors predict the
# score directly.
predictors = {
    "logistic": lambda fitted, X: fitted.predict_proba(X)[:, 1],
    "random_forest": lambda fitted, X: fitted.predict(X),
    "xgboost": lambda fitted, X: fitted.predict(X),
}

# Per family: damage simulations per sample id, plus one AUC and one
# profile-MSE value per sample.
dollar_estimates = {k: {m: [] for m in range(20)} for k in predictors}
auc_scores = {k: [] for k in predictors}
profile_mse_scores = {k: [] for k in predictors}

# Only the sample ids are needed here; the models were fitted beforehand.
for i in sample_datasets:
    for kind, predict in predictors.items():
        model = best_models[kind][i]

        y_preds = predict(model, X_test)

        # Scale the raw scores with the factor stored for this sample/family.
        opt_c = optimal_scale_param_df.loc[i, kind]
        scaled_preds = opt_c * y_preds

        roc_auc, mse_score = auc_and_mse_score(y_test, scaled_preds)

        auc_scores[kind].append(roc_auc)
        profile_mse_scores[kind].append(mse_score)

        # 50 damage simulations for the summed scaled scores.
        dollar_estimates[kind][i] += calc_damage_estimate(np.sum(scaled_preds), 50)

In [None]:
# One box per model family, built from the per-sample AUC values.
fig, ax = plt.subplots(figsize=(10, 4))
ax.set_ylabel("AUC", fontsize=14)

auc_boxes = [scores for scores in auc_scores.values()]
ax.boxplot(auc_boxes)
# Tick labels are set after boxplot(), which creates the three ticks.
ax.set_xticklabels(['Logisztikus regresszió', 'Random Forest', 'XGBRegressor'], fontsize=12)

plt.savefig("data/output/results/test_auc_compare.png")
plt.show()

In [None]:
# One box per model family, built from the per-sample profile MSE values.
fig, ax = plt.subplots(figsize=(10, 4))
ax.set_ylabel("valószínűség-profil MSE", fontsize=14)

mse_boxes = [scores for scores in profile_mse_scores.values()]
ax.boxplot(mse_boxes)
# Tick labels are set after boxplot(), which creates the three ticks.
ax.set_xticklabels(['Logisztikus regresszió', 'Random Forest', 'XGBRegressor'], fontsize=12)

plt.savefig("data/output/results/test_profile_mse.png")
plt.show()

# 2020 validation year

In [None]:
# How each model family turns features into a score per row: the logistic
# model reports the probability of class 1, the two regressors predict the
# score directly.
predictors = {
    "logistic": lambda fitted, X: fitted.predict_proba(X)[:, 1],
    "random_forest": lambda fitted, X: fitted.predict(X),
    "xgboost": lambda fitted, X: fitted.predict(X),
}

# Per family: damage simulations per sample id.
dollar_estimates_2020 = {k: {m: [] for m in range(20)} for k in predictors}

y_valid = validation_df["TARGET"]
X_valid = validation_df[[col for col in validation_df if col != "TARGET"]]

# Only the sample ids are needed here; the models were fitted beforehand.
for i in sample_datasets:
    for kind, predict in predictors.items():
        model = best_models[kind][i]

        y_preds = predict(model, X_valid)

        # Scale the raw scores with the factor stored for this sample/family.
        opt_c = optimal_scale_param_df.loc[i, kind]

        # 50 damage simulations for the summed scaled scores.
        dollar_estimates_2020[kind][i] += calc_damage_estimate(np.sum(opt_c * y_preds), 50)

# Comparison

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(12, 5))
ax_2020, ax_2021 = ax

# Left panel: all 2020 simulations of a family pooled into one box, divided
# by 10**9 to match the axis label.
to_plot = [np.array(list(dollar_estimates_2020[family].values())).flatten() / 10**9
           for family in ('logistic', 'random_forest', 'xgboost')]

ax_2020.set_title("2020", fontsize=18, pad=7)
ax_2020.boxplot(to_plot)
ax_2020.set_xticklabels(['Logisztikus\nregresszió', 'Random\nForest', 'XGBRegressor'], fontsize=12)
ax_2020.set_ylabel("kárbecslés $mrd", fontsize=14)
ax_2020.set_xlim(0.5, 3.5)
# Horizontal reference line, labelled in the legend.
ax_2020.hlines([12.079], 0, 4, color='red', label='hivatalos becslés')

ax_2020.legend()

# Right panel: the same pooling for the 2021 simulations.
to_plot = [np.array(list(dollar_estimates[family].values())).flatten() / 10**9
           for family in ('logistic', 'random_forest', 'xgboost')]

ax_2021.set_title("2021", fontsize=18, pad=7)
ax_2021.boxplot(to_plot)
ax_2021.set_xticklabels(['Logisztikus\nregresszió', 'Random\nForest', 'XGBRegressor'], fontsize=12)
ax_2021.set_ylabel("kárbecslés $mrd", fontsize=14)
# Empty, marker-less series: it only contributes a text entry to the legend.
ax_2021.plot([], [], ' ', label='nincs még \nhivatalos becslés')
ax_2021.legend()

plt.savefig("data/output/results/compare_cost_estimate.png")
plt.show()

# Predicted probability map example

In [None]:
# Size of one positional block of validation_df rows.
# NOTE(review): assumes the frame is ordered month by month starting with
# January, so block 1 is February and block 7 is August - confirm.
CELLS_PER_MONTH = 426327
feature_columns = [col for col in validation_df if col != 'TARGET']

february_rows = validation_df.iloc[1 * CELLS_PER_MONTH: 2 * CELLS_PER_MONTH]
august_rows = validation_df.iloc[7 * CELLS_PER_MONTH: 8 * CELLS_PER_MONTH]

february_y = february_rows['TARGET']
february_df = february_rows[feature_columns]

august_y = august_rows['TARGET']
august_df = august_rows[feature_columns]

# Backward-compatible alias for the original, misspelled variable name.
aufust_y = august_y

In [None]:
# plt.get_cmap is the supported lookup; matplotlib.cm.get_cmap (reached via
# plt.cm.get_cmap) was deprecated in Matplotlib 3.7 and removed in 3.9.
cm = plt.get_cmap("RdYlGn_r")

# Keep only the [a, b] part of the base colormap, sampled at n points.
n = 100
a = 0.2
b = 0.9

new_cmap = colors.LinearSegmentedColormap.from_list(f'trunc({n},{a:.2f},{b:.2f})',
                                                   cm(np.linspace(a, b, n)))

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(16, 8))
february_ax, august_ax = ax

# Both maps use the xgboost model stored at position 19 and its scale factor.
opt_c = optimal_scale_param_df.loc[19, 'xgboost']
xgb_model = best_models['xgboost'][19]

# Scaled predictions, used as the point colours.
febr_c = opt_c * xgb_model.predict(february_df)
aug_c = opt_c * xgb_model.predict(august_df)

# Left panel: one point per row at its lon/lat position.
february_ax.set_axis_off()
febr_sc = february_ax.scatter(february_df['lon'].values, february_df['lat'].values,
                              c=febr_c, s=0.1, cmap=new_cmap)
february_ax.set_title("XGBoost predikció térkép, \n2020 február", fontsize=18)
plt.colorbar(febr_sc, ax=february_ax)

# Right panel: same layout for the second frame.
august_ax.set_axis_off()
aug_sc = august_ax.scatter(august_df['lon'].values, august_df['lat'].values,
                           c=aug_c, s=0.1, cmap=new_cmap)
august_ax.set_title("XGBoost predikció térkép, \n2020 augusztus", fontsize=18)
plt.colorbar(aug_sc, ax=august_ax)

plt.savefig("data/output/results/xgboost_pred_maps.png")
plt.show()