# Modeling urban heat intensity (UHI)

In [39]:
# start-up
import pickle
import yaml
config_path = '/home/tu/tu_tu/tu_zxmny46/DS_Project/modules/config.yml'
# config_path = 'C:/Users/stefan/OneDrive - bwedu/04_semester/DS_Project/DS_Project/modules/config.yml'
with open(config_path, 'r') as f:
    config = yaml.load(f, Loader=yaml.FullLoader)

In [40]:
# import general packages
import numpy as np
import pandas as pd
import warnings

In [41]:
# import sklearn models
from sklearn.exceptions import ConvergenceWarning
from sklearn.model_selection import cross_val_score, KFold, GridSearchCV
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBRegressor
from sklearn.metrics import make_scorer, mean_squared_error, mean_absolute_error, r2_score, explained_variance_score

In [42]:
# some randomly generated data
np.random.seed(42)

n = 1000
true_coefficients = np.array([2, -1, 0.5])
p = len(true_coefficients)

X = np.random.randn(n, p)
Y = np.dot(X, true_coefficients) + np.random.randn(n)

print("Shape of X:", X.shape)
print("Shape of Y:", Y.shape)

Shape of X: (1000, 3)
Shape of Y: (1000,)


In [43]:
# with open(config['data']['dwd'] + '/uhi_data.pkl', 'rb') as file:
#     data = pickle.load(file)

# X, Y = data['X'], data['Y']

In [44]:
# Define the models to test with their respective hyperparameter grids
models = {
    'Linear Regression': (LinearRegression(), {}),
    'Decision Tree': (DecisionTreeRegressor(), {'max_depth': [None, 5, 10], 'min_samples_split': [2, 5, 10]}),
    'Random Forest': (RandomForestRegressor(), {'n_estimators': [100, 200, 300], 'max_depth': [None, 5, 10]}),
    'Support Vector Machine': (SVR(), {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf'], 'gamma': ['scale', 'auto']}),
    'Neural Network': (MLPRegressor(max_iter=1000), {'hidden_layer_sizes': [(100,), (50, 50)], 'alpha': [0.0001, 0.001], 'activation': ['relu', 'tanh']}),
    'XGBoost': (XGBRegressor(max_iter=1000), {'n_estimators': [100, 200, 300], 'max_depth': [3, 5, 7], 'learning_rate': [0.1, 0.01, 0.001]}),
    'ElasticNet': (ElasticNet(), {'alpha': [0.1, 1, 10], 'l1_ratio': [0.2, 0.5, 0.8]}),
    'AdaBoost': (AdaBoostRegressor(), {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 1.0]})
}

In [45]:
performance_measures = {
    'Mean Squared Error': make_scorer(mean_squared_error),
    'Mean Absolute Error': make_scorer(mean_absolute_error),
    'R-squared': make_scorer(r2_score),
    'Explained Variance Score': make_scorer(explained_variance_score)
}

In [46]:
# Perform k-fold cross-validation with hyperparameter tuning and calculate performance measures
performance_results = {}
num_folds = 5
for model_name, (model, param_grid) in models.items():
    print(f"Model: {model_name}")
    kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)
    grid_search = GridSearchCV(model, param_grid, scoring='neg_mean_squared_error', cv=kf)
    grid_search.fit(X, Y)
    best_model = grid_search.best_estimator_
    performance_scores = {}
    for measure_name, measure_func in performance_measures.items():
        # scores = cross_val_score(best_model, X, Y, cv=kf, scoring=measure_func)
        scores = cross_val_score(best_model, X, Y, cv=kf, scoring=measure_func, error_score='raise')
        mean_score = np.mean(scores)
        performance_scores[measure_name] = mean_score
    performance_results[model_name] = performance_scores
    for measure_name, score in performance_scores.items():
        print(f"{measure_name}: {score:.4f}")
    print("Best Hyperparameters:", grid_search.best_params_)
    print()

Model: Linear Regression
Mean Squared Error: 1.0579
Mean Absolute Error: 0.8291
R-squared: 0.8287
Explained Variance Score: 0.8301
Best Hyperparameters: {}

Model: Decision Tree
Mean Squared Error: 1.7311
Mean Absolute Error: 1.0331
R-squared: 0.7179
Explained Variance Score: 0.7187
Best Hyperparameters: {'max_depth': 5, 'min_samples_split': 5}

Model: Random Forest
Mean Squared Error: 1.2550
Mean Absolute Error: 0.8982
R-squared: 0.7965
Explained Variance Score: 0.7984
Best Hyperparameters: {'max_depth': 10, 'n_estimators': 100}

Model: Support Vector Machine
Mean Squared Error: 1.0625
Mean Absolute Error: 0.8319
R-squared: 0.8280
Explained Variance Score: 0.8296
Best Hyperparameters: {'C': 10, 'gamma': 'scale', 'kernel': 'linear'}

Model: Neural Network
Mean Squared Error: 1.0724
Mean Absolute Error: 0.8390
R-squared: 0.8261
Explained Variance Score: 0.8260
Best Hyperparameters: {'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': (100,)}

Model: XGBoost
Parameters: { "max_i

In [47]:
performance_df = pd.DataFrame.from_dict(performance_results)
performance_df

Unnamed: 0,Linear Regression,Decision Tree,Random Forest,Support Vector Machine,Neural Network,XGBoost,ElasticNet,AdaBoost
Mean Squared Error,1.057879,1.731068,1.255033,1.062462,1.072443,1.176417,1.091909,1.309013
Mean Absolute Error,0.82914,1.033122,0.898197,0.831858,0.838956,0.86138,0.837623,0.913482
R-squared,0.828724,0.717867,0.796529,0.827957,0.826076,0.809418,0.823254,0.785266
Explained Variance Score,0.830132,0.718718,0.798395,0.829619,0.825965,0.810501,0.82523,0.786258


In [48]:
def find_best_model(performance_results, performance_measure):
    best_model = None
    best_score = None

    for model_name, scores in performance_results.items():
        score = scores[performance_measure]

        if best_score is None:
            best_model = model_name
            best_score = score
        else:
            if (
                (performance_measure == 'Mean Squared Error' or performance_measure == 'Mean Absolute Error')
                and score < best_score
            ):
                best_model = model_name
                best_score = score
            elif (
                (performance_measure == 'R-squared' or performance_measure == 'Explained Variance Score')
                and score > best_score
            ):
                best_model = model_name
                best_score = score

    return best_model, best_score

In [49]:
best_measure = 'Mean Squared Error'
best_model_name, best_score = find_best_model(performance_results, best_measure)
best_model = models[best_model_name][0]
print(best_model_name)

Linear Regression


In [50]:
best_model.fit(X, Y)

In [51]:
new_X = [[2.0, 3.0, 4.0]]
best_model.predict(new_X)

array([2.78384468])

In [52]:
with open(config['data']['data'] + '/uhi_model/uhi_model.pkl', 'wb') as file:
    pickle.dump(best_model, file)