# KNN Regression Model

In [1]:
# import necessary packages

import pandas as pd
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
# TODO: modell mit eurem ersetzen
from sklearn.neighbors import KNeighborsRegressor

In [2]:
# read data

train_df = pd.read_csv('data/preprocessed_data/train.csv')
val_df = pd.read_csv('data/preprocessed_data/validation.csv')
test_df = pd.read_csv('data/preprocessed_data/test.csv')
corona_df = pd.read_csv('data/preprocessed_data/corona.csv')

In [3]:
# split data

X_train = train_df.drop(columns='count')
y_train = train_df['count']

X_val = val_df.drop(columns='count')
y_val = val_df['count']

X_test = test_df.drop(columns='count')
y_test = test_df['count']

X_corona = corona_df.drop(columns='count')
y_corona = corona_df['count']

In [4]:
X_train.head()

Unnamed: 0,latitude,longitude,distance_to_center,year,month,weekday,pm,holiday,mean_temperature,total_precipitation,stations_count,elevation_meters,density
0,0.476413,0.830476,0.267743,0.0,0.0,0.166667,0.0,0.0,0.295455,0.125352,0.0,0.07377,0.0
1,0.418223,0.624988,0.074418,0.0,0.0,0.166667,0.0,0.0,0.295455,0.125352,0.0,0.090164,0.393443
2,0.418223,0.624988,0.074418,0.0,0.0,0.166667,1.0,0.0,0.295455,0.125352,0.0,0.090164,0.393443
3,0.421166,0.562329,0.050914,0.0,0.0,0.166667,0.0,0.0,0.295455,0.125352,0.0,0.262295,0.721311
4,0.363304,0.505772,0.055828,0.0,0.0,0.166667,0.0,0.0,0.295455,0.125352,0.0,0.42623,0.540984


In [5]:
# hyperparameter tuning

try:
    # TODO: file-name ersetzen (in diesem File werden die Ergebnisse des Hyperparameter Tuning gespeichert)
    hyperparameters_df = pd.read_csv('data/hyperparameter_tuning/knn.csv')
    pass
    
except FileNotFoundError:
    
    # df containing hyperparameters and evaluation metrics of each run
    hyperparameters_df = pd.DataFrame()
    
    # this function is used by optuna to tune the hyperparameters
    def objective(trial):
        # TODO: die Hyperparameter mit denen eures Modells ersetzen
        # - integers: trial.suggest_int(name, low, high)
        # - floats: trial.suggest_int(name, low, high)
        # - kategorisch: trial.suggest_categorical(name, choices)
        # (https://optuna.readthedocs.io/en/v2.0.0/reference/generated/optuna.trial.Trial.html)
        
        # define hyperparameters
        n_neighbors = trial.suggest_int('n_neighbors', 1, 20)
        
        #TODO: regression with weights=distance always returns rmse=0 ?
        weights = trial.suggest_categorical('weigths', ['uniform', 'distance'])
        metric = trial.suggest_categorical('metric', ['euclidean', 'manhattan'])
        
        
        # TODO: mit eurem Modell ersetzen
        # setup and train model
        knn_reg = KNeighborsRegressor(
            n_neighbors=n_neighbors,
            weights=weights,
            metric=metric
        )
        knn_reg.fit(X_train, y_train)
        
        # make predictions
        y_val_pred = knn_reg.predict(X_val)
        
        # evaluate predictions
        r_squared = r2_score(y_val, y_val_pred)
        rmse = mean_squared_error(y_val, y_val_pred) ** 0.5
        
        # TODO: mit euren Hyperparametern ersetzen
        # insert results in dataframe
        global hyperparameters_df
        hyperparameters_df = hyperparameters_df.append(
            {'n_neighbors': n_neighbors,
             'weights': weights,
             'metric' : metric,
             'r_squared': r_squared,
             'rmse': rmse},
            ignore_index=True
        )
        
        # return rmse -> optuna will optimize rmse
        return rmse
        
        
    study = optuna.create_study()
    # start optimization
    study.optimize(objective, n_trials=10)
    
    # TODO: evtl. müsst ihr auch noch mal die Datentypen anpassen
    # convert to correct data types
    hyperparameters_df['n_neighbors'] = hyperparameters_df['n_neighbors'].astype('int')
    
    # sort hyperparameter tuning results and save file
    hyperparameters_df = hyperparameters_df.sort_values('rmse', ascending=True)
    hyperparameters_df = hyperparameters_df.reset_index(drop=True)
    hyperparameters_df.to_csv('data/hyperparameter_tuning/knn.csv', index=False)

[32m[I 2022-11-27 11:06:50,665][0m A new study created in memory with name: no-name-dc012c6f-5618-43ad-a594-c10b44ead81a[0m
  hyperparameters_df = hyperparameters_df.append(
[32m[I 2022-11-27 11:07:20,408][0m Trial 0 finished with value: 16.946509698834245 and parameters: {'n_neighbors': 14, 'weigths': 'distance', 'metric': 'manhattan'}. Best is trial 0 with value: 16.946509698834245.[0m
  hyperparameters_df = hyperparameters_df.append(
[32m[I 2022-11-27 11:07:47,520][0m Trial 1 finished with value: 16.878547011159398 and parameters: {'n_neighbors': 12, 'weigths': 'distance', 'metric': 'manhattan'}. Best is trial 1 with value: 16.878547011159398.[0m
  hyperparameters_df = hyperparameters_df.append(
[32m[I 2022-11-27 11:08:23,973][0m Trial 2 finished with value: 18.89721506582835 and parameters: {'n_neighbors': 14, 'weigths': 'uniform', 'metric': 'euclidean'}. Best is trial 1 with value: 16.878547011159398.[0m
  hyperparameters_df = hyperparameters_df.append(
[32m[I 2022-11

In [6]:
hyperparameters_df

Unnamed: 0,n_neighbors,weights,metric,r_squared,rmse
0,2,uniform,manhattan,0.625153,16.068241
1,7,distance,manhattan,0.602285,16.551119
2,11,distance,manhattan,0.588592,16.833626
3,12,distance,manhattan,0.586394,16.878547
4,14,distance,manhattan,0.583056,16.94651
5,11,uniform,manhattan,0.5802,17.004454
6,12,uniform,manhattan,0.578166,17.045594
7,16,uniform,manhattan,0.571796,17.173809
8,11,uniform,euclidean,0.487056,18.796467
9,14,uniform,euclidean,0.481543,18.897215


In [7]:
hyperparameters_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   n_neighbors  10 non-null     int32  
 1   weights      10 non-null     object 
 2   metric       10 non-null     object 
 3   r_squared    10 non-null     float64
 4   rmse         10 non-null     float64
dtypes: float64(2), int32(1), object(2)
memory usage: 488.0+ bytes


In [8]:
# final model evaluation

# TODO: mit eurem Modell und Hyperparametern ersetzen
# build and train model using the most successful hyperparameters
knn_reg = KNeighborsRegressor(
    n_neighbors=hyperparameters_df.loc[0, 'n_neighbors'],
    weights=hyperparameters_df.loc[0, 'weights'],
    metric=hyperparameters_df.loc[0, 'metric']
)
knn_reg.fit(X_train, y_train)

# make predictions
y_test_pred = knn_reg.predict(X_test)

# evaluate predictions
r_squared = r2_score(y_test, y_test_pred)
rmse = mean_squared_error(y_test, y_test_pred) ** 0.5

print(f'R^2:\t{r_squared}')
print(f'RMSE:\t{rmse}')

R^2:	0.5186334013905602
RMSE:	18.45080543699161


In [10]:
# predictions on corona year 2020

# make predictions
y_corona_pred = knn_reg.predict(X_corona)

# evaluate predictions
r_squared_c = r2_score(y_corona, y_corona_pred)
rmse_c = mean_squared_error(y_corona, y_corona_pred) ** 0.5

print(f'R^2:\t{r_squared_c}')
print(f'RMSE:\t{rmse_c}')

R^2:	-0.7616299232449355
RMSE:	20.778309759299695
