# Modeling urban heat island (UHI) intensity

In [None]:
# import general packages
import numpy as np
import pandas as pd
import warnings
import os
import pickle
import yaml

In [None]:
# adjust paths
# Switch into the project's modules directory so that the relative
# 'config.yml' path below resolves, then read the YAML configuration.
home_directory = os.path.expanduser('~')
modules_directory = home_directory + '/DS_Project/modules'
os.chdir(modules_directory)

config_path = 'config.yml'
with open(config_path, 'r') as config_file:
    config = yaml.load(config_file, Loader=yaml.FullLoader)

In [None]:
# import sklearn models
from sklearn.exceptions import ConvergenceWarning
from sklearn.model_selection import cross_val_score, KFold, GridSearchCV
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBRegressor

from sklearn.metrics import make_scorer, mean_squared_error, mean_absolute_error, r2_score, explained_variance_score
from sklearn.preprocessing import PolynomialFeatures

In [None]:
# Base directory (with trailing slash) for the pickled feature/target/model
# files, built from the ['data']['dwd'] entry of the loaded config.
path = config['data']['dwd'] + '/uhi_model/'

In [None]:
# some randomly generated data
# Synthetic linear toy set: Y is X times known coefficients plus standard
# normal noise. The seed is fixed so the draws below are reproducible.
np.random.seed(42)

true_coefficients = np.array([2, -1, 0.5])
n, p = 1000, true_coefficients.shape[0]

# Draw order matters for reproducibility: features first, then the noise.
X = np.random.randn(n, p)
noise = np.random.randn(n)
Y = np.dot(X, true_coefficients) + noise

for label, array in (("Shape of X:", X), ("Shape of Y:", Y)):
    print(label, array.shape)

In [None]:
# load features and target
# NOTE: pickle.load can execute arbitrary code embedded in the file; only
# load artifacts that were produced by this project.
grid_size_meters = 1000
# grid_size_meters is an int, so it is formatted into the file name
# (plain `str + int` concatenation raises TypeError).
with open(path + f'uhi_features_grid{grid_size_meters}.pkl', 'rb') as file:
    features = pickle.load(file)
with open(path + f'uhi_target_grid{grid_size_meters}.pkl', 'rb') as file:
    targets = pickle.load(file)

In [None]:
# bring features and targets into right format
# X: every column of `features` except 'target_variable', as a plain array.
X = features.drop(columns='target_variable').values
# Y: the 'LST' column of `targets`, reshaped into a single-column 2-D array.
Y = targets['LST'].values.reshape(-1, 1)
assert len(X) == len(Y), "X and Y must have the same length."

In [None]:
# also create polynomials
# Degree-2 and degree-3 polynomial expansions of X, without the constant
# (bias) column. These feed the two polynomial Linear Regression variants.
poly_2 = PolynomialFeatures(degree=2, include_bias=False)
poly_3 = PolynomialFeatures(degree=3, include_bias=False)

X_poly_2 = poly_2.fit_transform(X)
X_poly_3 = poly_3.fit_transform(X)

In [None]:
# Define the models to test with their respective hyperparameter grids
# Maps a display name to (estimator, param_grid). An empty grid means the
# estimator is evaluated with its constructor settings only.
models = {
    'Linear Regression': (LinearRegression(), {}),
    'Linear Regression 2nd degree poly': (LinearRegression(), {}),
    'Linear Regression 3rd degree poly': (LinearRegression(), {}),
    'Decision Tree': (
        DecisionTreeRegressor(),
        {'max_depth': [None, 5, 10], 'min_samples_split': [2, 5, 10]},
    ),
    'Random Forest': (
        RandomForestRegressor(),
        {'n_estimators': [100, 200, 300], 'max_depth': [None, 5, 10]},
    ),
    'Support Vector Machine': (
        SVR(),
        {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf'], 'gamma': ['scale', 'auto']},
    ),
    'Neural Network': (
        MLPRegressor(max_iter=1000),
        {'hidden_layer_sizes': [(100,), (50, 50)], 'alpha': [0.0001, 0.001], 'activation': ['relu', 'tanh']},
    ),
    # XGBoost has no `max_iter` parameter; the number of boosting rounds is
    # controlled by `n_estimators`, which is searched in the grid below.
    'XGBoost': (
        XGBRegressor(),
        {'n_estimators': [100, 200, 300], 'max_depth': [3, 5, 7], 'learning_rate': [0.1, 0.01, 0.001]},
    ),
    'ElasticNet': (
        ElasticNet(),
        {'alpha': [0.1, 1, 10], 'l1_ratio': [0.2, 0.5, 0.8]},
    ),
    'AdaBoost': (
        AdaBoostRegressor(),
        {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 1.0]},
    ),
}

In [None]:
# Reporting metrics, keyed by display name and wrapped as scorers.
# make_scorer is called without extra arguments, so no sign flip is requested.
metric_functions = (
    ('Mean Squared Error', mean_squared_error),
    ('Mean Absolute Error', mean_absolute_error),
    ('R-squared', r2_score),
    ('Explained Variance Score', explained_variance_score),
)
performance_measures = {
    display_name: make_scorer(metric)
    for display_name, metric in metric_functions
}

In [None]:
# Perform k-fold cross-validation with hyperparameter tuning and calculate performance measures
#
# For every candidate model:
#   1. tune its hyperparameters with a grid search (negative MSE, k folds),
#   2. re-score the tuned estimator on each reporting metric,
#   3. store the mean score per metric in performance_results[model_name].
# Tuning and scoring use the SAME feature matrix, so the polynomial variants
# are both tuned and evaluated on their expanded features.

# Models that need expanded inputs; every other model uses the raw matrix X.
model_features = {
    'Linear Regression 2nd degree poly': X_poly_2,
    'Linear Regression 3rd degree poly': X_poly_3,
}
performance_results = {}
num_folds = 5
for model_name, (model, param_grid) in models.items():
    print(f"Model: {model_name}")
    X_model = model_features.get(model_name, X)
    # Fixed random_state keeps the fold assignment identical across models.
    kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)
    grid_search = GridSearchCV(model, param_grid, scoring='neg_mean_squared_error', cv=kf)
    grid_search.fit(X_model, Y)
    best_model = grid_search.best_estimator_
    performance_scores = {}
    for measure_name, measure_func in performance_measures.items():
        # error_score='raise' surfaces a failing fit instead of hiding it.
        scores = cross_val_score(best_model, X_model, Y, cv=kf, scoring=measure_func, error_score='raise')
        performance_scores[measure_name] = np.mean(scores)
    performance_results[model_name] = performance_scores
    for measure_name, score in performance_scores.items():
        print(f"{measure_name}: {score:.4f}")
    print("Best Hyperparameters:", grid_search.best_params_)
    print()

In [None]:
# Tabulate the results: one column per model, one row per performance measure.
performance_df = pd.DataFrame.from_dict(performance_results, orient='columns')
performance_df

In [None]:
def find_best_model(performance_results, performance_measure):
    """Pick the model with the best score for one performance measure.

    Args:
        performance_results: Mapping of model name to a mapping of
            measure name to score.
        performance_measure: Name of the measure to compare on.
            'Mean Squared Error' and 'Mean Absolute Error' are minimized;
            'R-squared' and 'Explained Variance Score' are maximized.

    Returns:
        A ``(model_name, score)`` tuple. Ties keep the model seen first.
        ``(None, None)`` is returned for empty input. For any other measure
        name no comparison is made and the first model is returned.

    Raises:
        KeyError: If a model has no entry for ``performance_measure``.
    """
    lower_is_better = performance_measure in ('Mean Squared Error', 'Mean Absolute Error')
    higher_is_better = performance_measure in ('R-squared', 'Explained Variance Score')

    winner, winning_score = None, None
    for candidate, metrics in performance_results.items():
        value = metrics[performance_measure]

        if winning_score is None:
            # Nothing recorded yet: the current candidate becomes the baseline.
            improved = True
        elif lower_is_better:
            improved = value < winning_score
        elif higher_is_better:
            improved = value > winning_score
        else:
            improved = False

        if improved:
            winner, winning_score = candidate, value

    return winner, winning_score

In [None]:
# Choose the winning model according to a single reporting metric.
best_measure = 'Mean Squared Error'
best_model_name, best_score = find_best_model(performance_results, best_measure)
# Each entry of `models` is an (estimator, param_grid) pair.
# NOTE: this takes the estimator object stored in `models`; the grid search's
# best_estimator_ is never written back into that dict.
best_model, best_param_grid = models[best_model_name]
print(best_model_name)

In [None]:
# Train the selected estimator on the complete raw feature matrix X.
# NOTE(review): a polynomial Linear Regression variant would also be fitted on
# raw X here rather than on X_poly_2 / X_poly_3 - confirm this is intended.
best_model.fit(X, Y)

In [None]:
# Smoke-test the fitted model on one hand-written sample.
# NOTE(review): the sample has three features, which matches the synthetic
# data generated earlier; the loaded feature table may have a different
# number of columns - verify before running.
new_X = [[2.0, 3.0, 4.0]]
best_model.predict(new_X)

In [None]:
# Persist the fitted model in the same directory as the feature/target files.
# grid_size_meters is an int, so it is formatted into the file name
# (plain `str + int` concatenation raises TypeError).
with open(path + f'uhi_model_{grid_size_meters}.pkl', 'wb') as file:
    pickle.dump(best_model, file)