# Predicting urban heat intensity (UHI) using ML methods

In [1]:
# import general packages
import numpy as np
import pandas as pd
import warnings
import os
import pickle
import yaml

In [2]:
# ignore warnings & adjust location
warnings.filterwarnings("ignore")
home_directory = os.path.expanduser( '~' )
os.chdir(home_directory + '/DS_Project/modules')
config_path = 'config.yml'
with open(config_path, 'r') as f:
    config = yaml.load(f, Loader=yaml.FullLoader)

In [3]:
from models.UHI_modeling.disaggregate import *

In [4]:
# import sklearn models
from sklearn.exceptions import ConvergenceWarning
from sklearn.model_selection import cross_val_score, KFold, GridSearchCV
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBRegressor

from sklearn.metrics import make_scorer, mean_squared_error, mean_absolute_error, r2_score, explained_variance_score
from sklearn.preprocessing import PolynomialFeatures

In [5]:
# load features and target
grid_size_meters = 250
with open(path + 'final_' + str(grid_size_meters) + '_c.pkl', 'rb') as file:
    final = pickle.load(file)

In [6]:
# bring features and targets into right format
features = ['impervious','building','low vegetation','water','trees','road','avg_height']
X = final[features].values
Y = final['nLST'].values
assert len(X) == len(Y), "X and Y must have the same length."

In [7]:
# also create 2nd order polynomial for linear regression
poly_2 = PolynomialFeatures(degree=2, include_bias=False)
X_poly_2 = poly_2.fit_transform(X)

In [8]:
# Define the models to test with their respective hyperparameter grids
models = {
    'Linear Regression': (LinearRegression(), {}),
    'Linear Regression 2nd degree poly': (LinearRegression(), {}),
    'Decision Tree': (DecisionTreeRegressor(), {'max_depth': [None, 5, 10], 'min_samples_split': [2, 5, 10]}),
    'Random Forest': (RandomForestRegressor(), {'n_estimators': [100, 200, 300], 'max_depth': [None, 5, 10]}),
    'Support Vector Machine': (SVR(), {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf'], 'gamma': ['scale', 'auto']}),
    'Neural Network': (MLPRegressor(max_iter=1000), {'hidden_layer_sizes': [(100,), (50, 50)], 'alpha': [0.0001, 0.001], 'activation': ['relu', 'tanh']}),
    'XGBoost': (XGBRegressor(max_iter=1000), {'n_estimators': [100, 200, 300], 'max_depth': [3, 5, 7], 'learning_rate': [0.1, 0.01, 0.001]}),
    'ElasticNet': (ElasticNet(), {'alpha': [0.1, 1, 10], 'l1_ratio': [0.2, 0.5, 0.8]}),
    'AdaBoost': (AdaBoostRegressor(), {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 1.0]})
}

In [9]:
performance_measures = {
    'Mean Squared Error': make_scorer(mean_squared_error),
    'Mean Absolute Error': make_scorer(mean_absolute_error),
    'R-squared': make_scorer(r2_score),
    'Explained Variance Score': make_scorer(explained_variance_score)
}

In [10]:
# Perform k-fold cross-validation with hyperparameter tuning and calculate performance measures
performance_results = {}
num_folds = 5
for model_name, (model, param_grid) in models.items():
    print(f"Model: {model_name}")
    kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)
    grid_search = GridSearchCV(model, param_grid, scoring='neg_mean_squared_error', cv=kf)
    if model_name == 'Linear Regression 2nd degree poly':
        grid_search.fit(X_poly_2, Y)
    else:
        grid_search.fit(X, Y)
    best_model = grid_search.best_estimator_
    performance_scores = {}
    for measure_name, measure_func in performance_measures.items():
        if model_name == 'Linear Regression 2nd degree poly':
            scores = cross_val_score(best_model, X_poly_2, Y, cv=kf, scoring=measure_func, error_score='raise')
        else:
            scores = cross_val_score(best_model, X, Y, cv=kf, scoring=measure_func, error_score='raise')
        mean_score = np.mean(scores)
        performance_scores[measure_name] = mean_score
    performance_results[model_name] = performance_scores
    for measure_name, score in performance_scores.items():
        print(f"{measure_name}: {score:.4f}")
    print("Best Hyperparameters:", grid_search.best_params_)
    print()

Model: Linear Regression
Mean Squared Error: 1.6469
Mean Absolute Error: 0.9709
R-squared: 0.6562
Explained Variance Score: 0.6579
Best Hyperparameters: {}

Model: Linear Regression 2nd degree poly
Mean Squared Error: 1.6234
Mean Absolute Error: 0.9632
R-squared: 0.6635
Explained Variance Score: 0.6667
Best Hyperparameters: {}

Model: Decision Tree
Mean Squared Error: 2.1755
Mean Absolute Error: 1.1080
R-squared: 0.5472
Explained Variance Score: 0.5555
Best Hyperparameters: {'max_depth': 5, 'min_samples_split': 10}

Model: Random Forest
Mean Squared Error: 1.6784
Mean Absolute Error: 0.9944
R-squared: 0.6532
Explained Variance Score: 0.6585
Best Hyperparameters: {'max_depth': 5, 'n_estimators': 300}

Model: Support Vector Machine
Mean Squared Error: 1.6605
Mean Absolute Error: 0.9689
R-squared: 0.6546
Explained Variance Score: 0.6563
Best Hyperparameters: {'C': 10, 'gamma': 'scale', 'kernel': 'linear'}

Model: Neural Network
Mean Squared Error: 1.7119
Mean Absolute Error: 1.0086
R-squa

In [11]:
performance_df = pd.DataFrame.from_dict(performance_results)
performance_df

Unnamed: 0,Linear Regression,Linear Regression 2nd degree poly,Decision Tree,Random Forest,Support Vector Machine,Neural Network,XGBoost,ElasticNet,AdaBoost
Mean Squared Error,1.646909,1.623369,2.175494,1.678431,1.660524,1.711934,1.722149,2.538173,1.679697
Mean Absolute Error,0.970893,0.963165,1.107967,0.994351,0.96889,1.008562,0.990308,1.197766,1.002656
R-squared,0.65624,0.663523,0.547229,0.653159,0.654597,0.643665,0.643837,0.474677,0.651077
Explained Variance Score,0.657939,0.666738,0.555542,0.65855,0.656289,0.647989,0.645657,0.478997,0.655867


In [12]:
def find_best_model(performance_results, performance_measure):
    best_model = None
    best_score = None

    for model_name, scores in performance_results.items():
        score = scores[performance_measure]

        if best_score is None:
            best_model = model_name
            best_score = score
        else:
            if (
                (performance_measure == 'Mean Squared Error' or performance_measure == 'Mean Absolute Error')
                and score < best_score
            ):
                best_model = model_name
                best_score = score
            elif (
                (performance_measure == 'R-squared' or performance_measure == 'Explained Variance Score')
                and score > best_score
            ):
                best_model = model_name
                best_score = score

    return best_model, best_score

In [13]:
best_measure = 'Mean Squared Error'
best_model_name, best_score = find_best_model(performance_results, best_measure)
best_model = models[best_model_name][0]
print(best_model_name)

Linear Regression 2nd degree poly


In [14]:
best_model.fit(X, Y)

In [19]:
new_X = [[0.25, 0.25, 0.4, 0.1, 0.0, 0.0, 3]]
best_model.predict(new_X)

array([39.01878695])

In [20]:
with open((path_model + best_model_name + '_' + str(grid_size_meters) + '_c.pkl').replace(" ", "_"), 'wb') as file:
    pickle.dump(best_model, file)