# Predicting urban heat island (UHI) intensity using ML methods

In [1]:
# import general packages
import numpy as np
import pandas as pd
import warnings
import os
import pickle
import yaml

In [2]:
# Silence every warning for the rest of the session.
warnings.filterwarnings("ignore")

# Work from the project's modules directory; the relative config path below
# is resolved against it.
home_directory = os.path.expanduser('~')
os.chdir(f'{home_directory}/DS_Project/modules')

# Read the project configuration.
# NOTE(review): if config.yml only holds plain YAML types, yaml.safe_load
# would be the safer loader - confirm before switching.
config_path = 'config.yml'
with open(config_path, 'r') as f:
    config = yaml.full_load(f)

# Folder holding the UHI model inputs. The trailing slash is intentional:
# file names are appended to `path` by plain string concatenation.
data_root = config['data']['data']
path = data_root + '/uhi_model/'

In [3]:
from models.UHI_modeling.UHI import *

In [4]:
# import sklearn models
from sklearn.exceptions import ConvergenceWarning
from sklearn.model_selection import cross_val_score, cross_validate, KFold, GridSearchCV
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBRegressor

from sklearn.metrics import make_scorer, mean_squared_error, mean_absolute_error, r2_score, explained_variance_score
from sklearn.preprocessing import PolynomialFeatures

In [5]:
# Load the pickled table containing the features and the target for the
# selected grid size.
# NOTE: unpickling can execute code stored in the file - only load files
# produced by this project.
grid_size_meters = 100
pickle_file = f'{path}final_{grid_size_meters}_a.pkl'
with open(pickle_file, 'rb') as file:
    final = pickle.load(file)

In [6]:
# Extract the model inputs (X) and the target (Y) from the loaded table as
# plain arrays.
features = [
    'impervious',
    'building',
    'low vegetation',
    'water',
    'trees',
    'road',
    'avg_height',
]
target = 'nLST'
X = final[features].values
Y = final[target].values
assert len(X) == len(Y), "X and Y must have the same length."

In [7]:
# Expand X with all polynomial terms up to degree 2 (no bias column); this
# matrix feeds the 'Linear Regression 2nd degree poly' model.
poly_2 = PolynomialFeatures(degree=2, include_bias=False)
poly_2.fit(X)
X_poly_2 = poly_2.transform(X)

In [8]:
# Candidate models to compare. Each entry maps a display name to a tuple of
# (estimator prototype, hyperparameter grid for GridSearchCV); an empty grid
# means the estimator is evaluated with its constructor settings only.
#
# Both linear-regression entries use the same estimator; they differ only in
# the feature matrix they are trained on (raw vs. 2nd-degree polynomial).
#
# Every estimator that has internal randomness is given a fixed seed so that
# re-running the notebook reproduces the same scores. SVR and the linear
# models take no seed here.
RANDOM_STATE = 42
models = {
    'Linear Regression': (LinearRegression(), {}),
    'Linear Regression 2nd degree poly': (LinearRegression(), {}),
    'Decision Tree': (
        DecisionTreeRegressor(random_state=RANDOM_STATE),
        {'max_depth': [5, 10], 'min_samples_split': [2, 10]},
    ),
    'Random Forest': (
        RandomForestRegressor(random_state=RANDOM_STATE),
        {'n_estimators': [100, 300], 'max_depth': [10]},
    ),
    'Support Vector Machine': (SVR(), {'kernel': ['linear', 'rbf']}),
    'Neural Network': (
        MLPRegressor(max_iter=500, random_state=RANDOM_STATE),
        {'hidden_layer_sizes': [(100,), (50, 50)], 'alpha': [0.0001, 0.001]},
    ),
    'XGBoost': (
        XGBRegressor(random_state=RANDOM_STATE),
        {'n_estimators': [100, 300], 'max_depth': [3, 7], 'learning_rate': [0.1, 0.001]},
    ),
    'ElasticNet': (ElasticNet(), {'alpha': [0.1, 1], 'l1_ratio': [0.2, 0.5, 0.8]}),
    'AdaBoost': (
        AdaBoostRegressor(random_state=RANDOM_STATE),
        {'n_estimators': [50, 200], 'learning_rate': [0.01, 0.1]},
    ),
}

In [9]:
# Metrics reported for every model, keyed by the label shown in the results
# table. Each metric function is wrapped with make_scorer using its defaults.
metric_functions = {
    'Mean Squared Error': mean_squared_error,
    'Mean Absolute Error': mean_absolute_error,
    'R-squared': r2_score,
    'Explained Variance Score': explained_variance_score,
}
performance_measures = {
    label: make_scorer(metric) for label, metric in metric_functions.items()
}

In [10]:
# Tune every candidate with a grid search, then score the tuned configuration
# with k-fold cross-validation on each metric in `performance_measures`.
#
# Results:
#   performance_results[model_name][measure_name] -> mean score over the folds
#   best_estimators[model_name] -> tuned estimator refit on all data
#
# NOTE: the reported scores are computed on the same folds that were used to
# select the hyperparameters, so they are somewhat optimistic; a nested
# cross-validation would be needed for an unbiased estimate.
performance_results = {}
best_estimators = {}
num_folds = 5
# The splitter does not depend on the model, so build it once. The fixed seed
# gives every model exactly the same folds.
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)
for model_name, (model, param_grid) in models.items():
    print(model_name)
    # Only the polynomial entry is trained on the expanded feature matrix.
    X_model = X_poly_2 if model_name == 'Linear Regression 2nd degree poly' else X
    grid_search = GridSearchCV(model, param_grid, scoring='neg_mean_squared_error', cv=kf)
    grid_search.fit(X_model, Y)
    best_model = grid_search.best_estimator_
    best_estimators[model_name] = best_model
    # A single cross_validate call fits the model once per fold and evaluates
    # all metrics on those same fits. Calling cross_val_score once per metric
    # would refit the model each time, so for stochastic models every metric
    # would describe a different fit.
    cv_scores = cross_validate(
        best_model, X_model, Y, cv=kf, scoring=performance_measures, error_score='raise'
    )
    performance_scores = {
        measure_name: np.mean(cv_scores['test_' + measure_name])
        for measure_name in performance_measures
    }
    performance_results[model_name] = performance_scores

Linear Regression
Linear Regression 2nd degree poly
Decision Tree
Random Forest
Support Vector Machine
Neural Network
XGBoost
Parameters: { "max_iter" } are not used.

Parameters: { "max_iter" } are not used.

Parameters: { "max_iter" } are not used.

Parameters: { "max_iter" } are not used.

Parameters: { "max_iter" } are not used.

Parameters: { "max_iter" } are not used.

Parameters: { "max_iter" } are not used.

Parameters: { "max_iter" } are not used.

Parameters: { "max_iter" } are not used.

Parameters: { "max_iter" } are not used.

Parameters: { "max_iter" } are not used.

Parameters: { "max_iter" } are not used.

Parameters: { "max_iter" } are not used.

Parameters: { "max_iter" } are not used.

Parameters: { "max_iter" } are not used.

Parameters: { "max_iter" } are not used.

Parameters: { "max_iter" } are not used.

Parameters: { "max_iter" } are not used.

Parameters: { "max_iter" } are not used.

Parameters: { "max_iter" } are not used.

Parameters: { "max_iter" } are not

In [11]:
# Tabulate the cross-validation results: one column per model, one row per
# metric.
performance_df = pd.DataFrame(performance_results)
performance_df

Unnamed: 0,Linear Regression,Linear Regression 2nd degree poly,Decision Tree,Random Forest,Support Vector Machine,Neural Network,XGBoost,ElasticNet,AdaBoost
Mean Squared Error,4.115615,3.849607,3.520746,3.292522,3.914391,3.438375,3.285543,5.566831,3.793168
Mean Absolute Error,1.572295,1.514052,1.428925,1.392044,1.493624,1.430164,1.393522,1.822503,1.528819
R-squared,0.608773,0.634026,0.665452,0.686785,0.627886,0.673307,0.687646,0.470873,0.639839
Explained Variance Score,0.608849,0.63408,0.66539,0.686907,0.628021,0.674745,0.687672,0.470924,0.641518


In [12]:
# Pick the winning model according to `best_measure` and obtain a tuned,
# fitted estimator for it.
best_measure = 'Mean Squared Error'
# find_best_model comes from models.UHI_modeling.UHI; it is unpacked here as a
# (model name, score) pair - presumably the best-ranked entry for the metric;
# verify its ranking direction against the helper's definition.
best_model_name, best_score = find_best_model(performance_results, best_measure)

# models[best_model_name][0] is only the prototype estimator: GridSearchCV
# fits clones, so the prototype is never fitted and still carries its
# untuned constructor settings. Rerun the search for the winner so that
# best_model is the tuned estimator refit on all data.
best_prototype, best_param_grid = models[best_model_name]
X_best = X_poly_2 if best_model_name == 'Linear Regression 2nd degree poly' else X
best_search = GridSearchCV(
    best_prototype,
    best_param_grid,
    scoring='neg_mean_squared_error',
    cv=KFold(n_splits=num_folds, shuffle=True, random_state=42),
)
best_search.fit(X_best, Y)
best_model = best_search.best_estimator_
print(best_model_name)

XGBoost
