# Modeling urban heat intensity (UHI)

In [2]:
# start-up
import os
import pickle

import yaml
# Location of the project's YAML configuration file.
# The path is machine-specific, so it can be overridden without editing the
# notebook by setting the DS_PROJECT_CONFIG environment variable, e.g. to
# '/home/tu/tu_tu/tu_zxmny46/DS_Project/modules/config.yml'.
# When the variable is unset, the original Windows path is used.
config_path = os.environ.get(
    'DS_PROJECT_CONFIG',
    'C:/Users/stefan/OneDrive - bwedu/04_semester/DS_Project/DS_Project/modules/config.yml',
)
# Decode explicitly as UTF-8: without `encoding`, open() uses the platform's
# locale encoding, which would mis-read non-ASCII characters in the config on
# systems whose default is not UTF-8.
with open(config_path, 'r', encoding='utf-8') as f:
    # NOTE(review): FullLoader can construct more than plain scalars/lists/dicts.
    # If the config only holds plain data, yaml.safe_load(f) is the safer choice;
    # only load config files from a trusted source.
    config = yaml.load(f, Loader=yaml.FullLoader)

In [3]:
# import general packages
import numpy as np
import pandas as pd

In [10]:
# import sklearn models
from sklearn.model_selection import cross_val_score, KFold, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import make_scorer, mean_squared_error, mean_absolute_error, r2_score, explained_variance_score

In [5]:
# Synthetic stand-in data: a linear signal plus standard-normal noise.
np.random.seed(42)  # fix the global NumPy seed so the draws below repeat

n = 10000
# The last two coefficients are zero, so those two features do not affect Y.
true_coefficients = np.array([2, -1, 0.5, 1.5, -0.5, 0, 0])
p = true_coefficients.shape[0]

# Draw order matters for reproducibility: features first, then the noise.
X = np.random.randn(n, p)
noise = np.random.randn(n)
Y = X.dot(true_coefficients) + noise

for label, array in (("X", X), ("Y", Y)):
    print(f"Shape of {label}:", array.shape)

Shape of X: (10000, 7)
Shape of Y: (10000,)


In [None]:
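# Real UHI data (currently disabled): uncomment the lines below to replace the
# synthetic X, Y generated above with the pickled dataset from the DWD data folder.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
# with open(config['data']['dwd'] + '/uhi_data.pkl', 'rb') as file:
#     data = pickle.load(file)

# X, Y = data['X'], data['Y']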

In [6]:
# Define the models to test with their respective hyperparameter grids.
# Each entry maps a display name to (estimator, param_grid); an empty grid means
# the estimator is evaluated with its default settings only.
#
# Estimators with internal randomness get a fixed random_state so that repeated
# runs (and runs in parallel worker processes) give the same results instead of
# depending on the state of the global NumPy RNG.
#
# GaussianNB is intentionally not included: it is a classifier, not a regressor,
# so it is not a valid candidate for the continuous target Y used here.
models = {
    'Linear Regression': (LinearRegression(), {}),
    'Decision Tree': (DecisionTreeRegressor(random_state=42), {'max_depth': [None, 5, 10], 'min_samples_split': [2, 5, 10]}),
    'Random Forest': (RandomForestRegressor(random_state=42), {'n_estimators': [100, 200, 300], 'max_depth': [None, 5, 10]}),
    'Support Vector Machine': (SVR(), {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf'], 'gamma': ['scale', 'auto']}),
    'Neural Network': (MLPRegressor(random_state=42), {'hidden_layer_sizes': [(100,), (50, 50)], 'alpha': [0.0001, 0.001], 'activation': ['relu', 'tanh']}),
    'K-Nearest Neighbors': (KNeighborsRegressor(), {'n_neighbors': [3, 5, 7], 'weights': ['uniform', 'distance']})
}

In [11]:
# Metrics reported for every model, keyed by the label printed in the results.
# Each metric function is wrapped with make_scorer using its defaults, so the
# error metrics are reported as plain (non-negated) values.
_metric_functions = [
    ('Mean Squared Error', mean_squared_error),
    ('Mean Absolute Error', mean_absolute_error),
    ('R-squared', r2_score),
    ('Explained Variance Score', explained_variance_score),
]
performance_measures = {
    label: make_scorer(metric) for label, metric in _metric_functions
}

In [12]:
# Perform k-fold cross-validation with hyperparameter tuning and calculate performance measures.
#
# For every model, a grid search selects the hyperparameters with the best
# (least negative) mean negative-MSE across the folds. All reporting metrics
# are scored during that same search, so the per-metric fold means of the
# selected candidate are read from cv_results_ rather than re-running
# cross-validation once per metric on the same folds.
#
# NOTE: the reported scores come from the same folds that were used to pick the
# hyperparameters, so they are optimistic estimates of generalisation error.
num_folds = 5

# The splitter is identical for every model (fixed seed), so build it once.
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# Metric used for model selection, plus all metrics that are only reported.
tuning_metric = 'neg_mean_squared_error'
scoring = {**performance_measures, tuning_metric: tuning_metric}

for model_name, (model, param_grid) in models.items():
    print(f"Model: {model_name}")
    grid_search = GridSearchCV(
        model,
        param_grid,
        scoring=scoring,
        refit=tuning_metric,  # with several scorers, refit names the one used for selection
        cv=kf,
        n_jobs=-1,  # evaluate candidates/folds in parallel on all available cores
    )
    grid_search.fit(X, Y)
    best_model = grid_search.best_estimator_
    # Mean test score over the folds for the selected hyperparameter candidate.
    performance_scores = {
        measure_name: grid_search.cv_results_[f"mean_test_{measure_name}"][grid_search.best_index_]
        for measure_name in performance_measures
    }
    for measure_name, score in performance_scores.items():
        print(f"{measure_name}: {score:.4f}")
    print("Best Hyperparameters:", grid_search.best_params_)
    print()

Model: Linear Regression
Mean Squared Error: 1.0157
Mean Absolute Error: 0.8037
R-squared: 0.8841
Explained Variance Score: 0.8843
Best Hyperparameters: {}

Model: Decision Tree
Mean Squared Error: 2.0736
Mean Absolute Error: 1.1410
R-squared: 0.7640
Explained Variance Score: 0.7638
Best Hyperparameters: {'max_depth': 10, 'min_samples_split': 10}

Model: Random Forest


KeyboardInterrupt: 