# Training models on full dataset

Here, we train the models on the full dataset, using the best hyperparameters from cross-validation. We save the models for later use in simulations. We provide model explanations, using various interpretability techniques.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

import os
# Set the verbosity level BEFORE sklearnex is imported: the variable is meant
# to be present in the environment when the extension initializes its logger
# (the documented usage exports it in the shell before Python starts), so
# assigning it after the import/patch risks having no effect.
os.environ["SKLEARNEX_VERBOSE"] = "INFO"

# Swap supported scikit-learn estimators for their accelerated implementations.
from sklearnex import patch_sklearn
patch_sklearn()

from sklearn.metrics import r2_score, mean_squared_error
from xgboost import XGBRegressor
import joblib

# Show every column and every row when a DataFrame is displayed (no truncation).
# NOTE(review): with max_rows=None, displaying a large frame renders all rows.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

## Data

In [2]:
# Load the full dataset. low_memory=False makes pandas read the file in one
# pass instead of in chunks, so each column gets a single inferred dtype.
data = pd.read_csv('data/data.csv', low_memory=False)

In [3]:
# Encode the coastal flag as 0/1; a missing flag is treated as "not coastal".
data['Coastal?'] = data['Coastal?'].fillna(False).astype(int)

# Integer-encode the climate zone. Series.map leaves any value that is not a
# key of the dict (including missing values) as NaN, and get_data() drops rows
# containing NaN, so rows with a zone outside these four are excluded from
# training without any warning.
mapping_dict = {'Arid': 0, 'Snow': 1, 'Temperate': 2, 'Tropical': 3}
data['Climate Zone'] = data['Climate Zone'].map(mapping_dict)

In [4]:
# Predictor columns used by every model. The names are the DataFrame column
# labels verbatim (they embed LaTeX markup), and the list order is the column
# order of the feature matrix returned by get_data().
features = ['$\\Delta$Built Fraction', '$\\Delta$Grass Fraction', '$\\Delta$Tree Fraction', 
            '$\\Delta$Built Albedo', '$\\Delta$Grass Albedo', '$\\Delta$Tree Albedo', 
            '$\\Delta$Elevation', 'Coastal?', 'Climate Zone']

## Helper Functions

In [5]:
def get_data(label):
    """Return the feature matrix and target vector for one target column.

    Selects the module-level ``features`` columns plus ``label`` from the
    module-level ``data`` frame and discards every row that has a missing
    value in any of those columns.

    Parameters
    ----------
    label : str
        Name of the target column in ``data``.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` holds the ``features`` columns and ``y`` the
        ``label`` column of the complete rows.
    """
    selected_columns = [*features, label]
    complete_rows = data[selected_columns].dropna().copy()
    return complete_rows[features], complete_rows[label]

In [6]:
def get_model(label, n_estimators, max_depth, filename):
    """Fit an XGBoost regressor for ``label`` on all complete rows and save it.

    The model is trained on everything ``get_data(label)`` returns, its
    in-sample fit metrics are printed, and the fitted estimator is written to
    ``models/full/<filename>_xgb_estimator.joblib``.

    Parameters
    ----------
    label : str
        Name of the target column.
    n_estimators : int
        Number of boosting rounds passed to ``XGBRegressor``.
    max_depth : int
        Maximum tree depth passed to ``XGBRegressor``.
    filename : str
        Prefix of the saved model file.

    Returns
    -------
    XGBRegressor
        The fitted model.
    """
    X, y = get_data(label)
    model = XGBRegressor(n_estimators=n_estimators, max_depth=max_depth, importance_type='gain',
                         random_state=1, tree_method='hist', grow_policy='lossguide')
    model.fit(X, y)
    y_pred = model.predict(X)

    # These are in-sample metrics: predictions are made on the training data.
    # R2 is computed from the predictions already in hand rather than through
    # model.score(), which would run a second prediction pass.
    print('R2: ', round(r2_score(y, y_pred), 3))
    # Take the square root explicitly: the `squared=False` argument of
    # mean_squared_error is deprecated and removed in newer scikit-learn.
    rmse = np.sqrt(mean_squared_error(y, y_pred))
    print('RMSE: ', round(rmse, 3))
    # Mean bias error, observed minus predicted.
    print('MBE: ', round(np.mean(y - y_pred), 3))

    # Create the output directory on first use so the dump cannot fail on a
    # fresh checkout.
    output_dir = 'models/full/'
    os.makedirs(output_dir, exist_ok=True)
    joblib.dump(model, output_dir + filename + '_xgb_estimator.joblib')
    return model

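## Full Models

Note: the R2, RMSE and MBE values printed below are computed on the same data the models were fitted on (in-sample), so they describe goodness of fit rather than generalization performance.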

### $\Delta$AT Day

In [7]:
# Fit, report and save the daytime Delta-AT model (100 trees, max depth 10).
model_at_d = get_model(label='$\\Delta$AT Day', n_estimators=100, max_depth=10, filename='AT_day')

R2:  0.988
RMSE:  0.115
MBE:  0.0


### $\Delta$AT Night

In [8]:
# Fit, report and save the nighttime Delta-AT model (100 trees, max depth 10).
model_at_n = get_model(label='$\\Delta$AT Night', n_estimators=100, max_depth=10, filename='AT_night')

R2:  0.982
RMSE:  0.114
MBE:  0.0


### $\Delta$LST Day

In [9]:
# Fit, report and save the daytime Delta-LST model (150 trees, max depth 10).
model_lst_d = get_model(label='$\\Delta$LST Day', n_estimators=150, max_depth=10, filename='LST_day')

R2:  0.978
RMSE:  0.475
MBE:  -0.0


### $\Delta$LST Night

In [10]:
# Fit, report and save the nighttime Delta-LST model (150 trees, max depth 10).
model_lst_n = get_model(label='$\\Delta$LST Night', n_estimators=150, max_depth=10, filename='LST_night')

R2:  0.973
RMSE:  0.187
MBE:  0.0
