In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, RocCurveDisplay
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, PredefinedSplit
from scipy.stats import uniform
from xgboost import XGBRegressor

import time
import joblib
import random
import os
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Read the 20 per-group sample CSVs (datasets/group/0.csv ... 19.csv), keyed by group number.
sample_datasets = {}
for group_id in range(20):
    sample_datasets[group_id] = pd.read_csv(f"datasets/group/{group_id}.csv", index_col=[0])

In [None]:
# File-name stem used on disk for each model family.
model_file_stems = {"logistic": "logreg", "random_forest": "rf", "xgboost": "xgboost"}

# best_models[family][i] holds the object loaded from best_models/<stem>_<i>.model.
# NOTE(review): joblib.load unpickles the file, so these artifacts must come
# from a trusted source.
best_models = {family: [] for family in model_file_stems}

for i in range(20):
    for family, stem in model_file_stems.items():
        best_models[family].append(joblib.load(f"best_models/{stem}_{i}.model"))

In [None]:
# Per-fire records; the first CSV column becomes the row index.
fire_csv_path = "fire/fire_df.csv"
fire_data = pd.read_csv(fire_csv_path, index_col=[0])
fire_data.head()

In [None]:
# Load the distribution table, skip its first three rows, then transpose it so
# the remaining row labels become the columns.
fire_distribution_raw = pd.read_csv("fire/fire_distribution.csv", index_col=[0])
fire_distribution_tail = fire_distribution_raw[3:]
fire_distribution_df = fire_distribution_tail.T
fire_distribution_df

In [None]:
# Map every SIZECLASS value to the list of fire_data index labels in that class.
size_classes = fire_data["SIZECLASS"]
index_dict = {
    label: list(fire_data[size_classes == label].index)
    for label in size_classes.unique()
}

# Flat sampling pool: each column label of fire_distribution_df is repeated as
# many times as the count stored for it in row '0'.
distribution = []
for label in fire_distribution_df.columns:
    repeat = fire_distribution_df.loc['0', label]
    distribution.extend([label] * repeat)
    
def calc_damage_estimate(profile, n, rng=None):
    """Simulate the cost of burning ``profile`` units of area, ``n`` times.

    Each run repeatedly draws a size class from the module-level
    ``distribution`` list, then a fire of that class from ``index_dict``, and
    adds that fire's ``COST`` (looked up in ``fire_data``) until the summed
    ``TOTALKM^2`` reaches ``profile``. The fire that crosses the target is
    charged only for the share of its area that was still missing.

    Parameters
    ----------
    profile : number
        Target burnt area, in the same unit as ``fire_data["TOTALKM^2"]``.
    n : int
        Number of independent simulation runs.
    rng : random.Random, optional
        Source of randomness. Defaults to the global ``random`` module; pass
        a seeded ``random.Random`` instance to get reproducible estimates.

    Returns
    -------
    list
        ``n`` simulated total costs, one per run. A run is 0 when
        ``profile`` is not positive, because no fire is drawn at all.
    """
    picker = random if rng is None else rng
    estimates = []

    for _ in range(n):
        damage_in_dollar = 0
        total_area = 0

        while total_area < profile:
            # choice() gives every pool entry the same weight. The previous
            # `randint(0, len(distribution)) - 1` could also yield -1, which
            # wraps to the last entry and made it twice as likely as the rest.
            sample_class = picker.choice(distribution)
            i = picker.choice(index_dict[sample_class])

            cost = fire_data.loc[i, "COST"]
            km_squared = fire_data.loc[i, "TOTALKM^2"]

            if (total_area + km_squared) > profile:
                # Only the part of this fire needed to reach the target counts.
                damage_in_dollar += (profile - total_area) / km_squared * cost
            else:
                damage_in_dollar += cost
            total_area += km_squared

        estimates.append(damage_in_dollar)
    return estimates

In [None]:
# Scale factors table; later cells read it as .loc[<model index>, <model key>].
scale_params_path = "optimal_scale_params.csv"
optimal_scale_param_df = pd.read_csv(scale_params_path, index_col=[0])
optimal_scale_param_df

In [None]:
# One empty list of damage estimates per model label and year (2015-2019).
yearly_estimates = {}
for model_label in ['Logisztikus regresszió', 'Random Forest', 'XGBRegressor']:
    yearly_estimates[model_label] = {year: [] for year in range(2015, 2020)}

In [None]:
# Static variables table; it is concatenated column-wise with every monthly file later on.
static_csv_path = "data/datasets/static_variables.csv"
static_df = pd.read_csv(static_csv_path, index_col=[0])

In [None]:
def _empty_year_month_grid():
    """Return {model index 0-19: {year 2015-2019: {month 1-12: 0}}}."""
    return {
        model_idx: {yr: {mo: 0 for mo in range(1, 13)} for yr in range(2015, 2020)}
        for model_idx in range(20)
    }


year_month_dict_logreg = _empty_year_month_grid()
year_month_dict_rf = _empty_year_month_grid()
year_month_dict_xgboost = _empty_year_month_grid()

# (key in best_models / column in optimal_scale_param_df, result grid, prediction call).
# The logistic models contribute their second predict_proba column, the other
# two families their plain predict() output.
projection_targets = (
    ("logistic", year_month_dict_logreg, lambda est, X: est.predict_proba(X)[:, 1]),
    ("random_forest", year_month_dict_rf, lambda est, X: est.predict(X)),
    ("xgboost", year_month_dict_xgboost, lambda est, X: est.predict(X)),
)

for year in range(2015, 2020):
    for month in range(1, 13):
        print(year, month)
        monthly_df = pd.read_csv(f"data/datasets/raw_datasets/{year}-{month}.csv", index_col=[0])
        # Static and monthly columns side by side, without the TARGET column.
        features = pd.concat([static_df, monthly_df], axis=1).drop(columns=['TARGET'])

        for j in range(20):
            print('\t', j)
            for family, grid, predict in projection_targets:
                scale = optimal_scale_param_df.loc[j, family]
                # Sum of the scaled predictions over all rows of this month.
                grid[j][year][month] = np.sum(scale * predict(best_models[family][j], features))

In [None]:
# Model label -> monthly projection grid produced by the projection loop.
monthly_projections = {
    'Logisztikus regresszió': year_month_dict_logreg,
    'Random Forest': year_month_dict_rf,
    'XGBRegressor': year_month_dict_xgboost,
}

# Start from empty per-year lists so that re-running this cell does not
# accumulate estimates from a previous run.
yearly_estimates = {
    label: {year: [] for year in range(2015, 2020)} for label in monthly_projections
}

for label, per_model_grid in monthly_projections.items():
    for year_dict in per_model_grid.values():
        for y, vals in year_dict.items():
            # The 12 monthly values are summed into one yearly target, which is
            # simulated 25 times.
            yearly_total = np.sum(list(vals.values()))
            yearly_estimates[label][y].extend(calc_damage_estimate(yearly_total, 25))

In [None]:
# Write one CSV per model: columns are the years, rows the individual estimates.
train_exports = (
    ('Logisztikus regresszió', "train_log_df.csv"),
    ('Random Forest', "train_rf_df.csv"),
    ('XGBRegressor', "train_xgb_df.csv"),
)
for label, file_name in train_exports:
    pd.DataFrame(yearly_estimates[label]).to_csv("data/output/projection_results/" + file_name)

In [None]:
name_dict = {"Logisztikus regresszió": 'log_df.csv', 'Random Forest': 'rf_df.csv', 'XGBRegressor': 'xgb_df.csv'}


def _read_in_billions(prefix, model_name):
    """Load one projection result file and divide every value by 10**9."""
    path = f"data/output/projection_results/{prefix}_{name_dict[model_name]}"
    return pd.read_csv(path, index_col=[0]) / 10**9


def _median_column(df_train, df_test_valid, df_scenario):
    """Stack the medians of the train file, of '2020', of '2021' and of the scenario file."""
    # NOTE(review): if df_test_valid['2020'] selects a single column, .median()
    # is a scalar, and pd.concat only accepts Series/DataFrame objects -
    # confirm the layout of the test_valid files.
    return pd.concat([df_train.median(),
                      df_test_valid['2020'].median(),
                      df_test_valid['2021'].median(),
                      df_scenario.median()])


def _append_column(table, column):
    """Start the table with `column`, or attach `column` to the right of `table`."""
    if table is None:
        return column
    return pd.concat([table, column], axis=1)


ssp5 = None
ssp2 = None

# name_dict lists the models in the order the table columns are labelled below.
for model_name in name_dict:
    df_train = _read_in_billions("train", model_name)
    df_test_valid = _read_in_billions("test_valid", model_name)
    df_ssp5 = _read_in_billions("SSP5", model_name)
    df_ssp2 = _read_in_billions("SSP2", model_name)

    ssp5 = _append_column(ssp5, _median_column(df_train, df_test_valid, df_ssp5))
    ssp2 = _append_column(ssp2, _median_column(df_train, df_test_valid, df_ssp2))

# Hard-coded official figures for the first six rows; '-' marks rows without one.
official_values = [4.71, 0.48, 18.01, 26.35, 0.16, 12.08, '-'] + ['-']*25
official_est = pd.Series(official_values, name='Hivatalos becslés', index=ssp2.index)

output = pd.concat([ssp5, official_est, ssp2], axis=1)

model_columns = ['Logisztikus regresszió', 'Random Forest', 'XGBoost']
output.columns = pd.MultiIndex.from_tuples(
    [('SSP5', model) for model in model_columns]
    + [('', 'Hivatalos becslés')]
    + [('SSP2', model) for model in model_columns],
    names=['Szcenárió', 'Modell'])

# Median of every model column from row position 7 onwards (presumably the
# 2025-2049 rows, going by the label of the summary row - verify).
late_medians = []
for col in output.columns:
    if 'Hivatalos becslés' in col:
        continue
    late_medians.append(output[col][7:].median())
# Empty cell for the official-estimate column, which sits in the middle.
late_medians.insert(3, '')

# Add the summary row under a temporary label, then restore the original row
# labels with the summary label at the end.
row_labels = list(output.index) + ['2025-2049 mediánja']
output.loc[-1] = late_medians
output.index = row_labels

output.round(2)