# KNN Regression Model

In [1]:
# import necessary packages

import pandas as pd
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
# TODO: modell mit eurem ersetzen
from sklearn.neighbors import KNeighborsRegressor

In [2]:
# read data

train_df = pd.read_csv('data/preprocessed_data/train.csv')
val_df = pd.read_csv('data/preprocessed_data/validation.csv')
test_df = pd.read_csv('data/preprocessed_data/test.csv')

In [3]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 723872 entries, 0 to 723871
Data columns (total 12 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   latitude             723872 non-null  float64
 1   longitude            723872 non-null  float64
 2   distance_to_center   723872 non-null  float64
 3   year                 723872 non-null  int64  
 4   month                723872 non-null  int64  
 5   weekday              723872 non-null  int64  
 6   pm                   723872 non-null  int64  
 7   holiday              723872 non-null  bool   
 8   mean_temperature     716016 non-null  float64
 9   total_precipitation  710964 non-null  float64
 10  stations_count       723872 non-null  int64  
 11  count                723872 non-null  int64  
dtypes: bool(1), float64(5), int64(6)
memory usage: 61.4 MB


In [4]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 233379 entries, 0 to 233378
Data columns (total 12 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   latitude             233379 non-null  float64
 1   longitude            233379 non-null  float64
 2   distance_to_center   233379 non-null  float64
 3   year                 233379 non-null  int64  
 4   month                233379 non-null  int64  
 5   weekday              233379 non-null  int64  
 6   pm                   233379 non-null  int64  
 7   holiday              233379 non-null  bool   
 8   mean_temperature     233379 non-null  float64
 9   total_precipitation  231039 non-null  float64
 10  stations_count       233379 non-null  int64  
 11  count                233379 non-null  int64  
dtypes: bool(1), float64(5), int64(6)
memory usage: 19.8 MB


In [5]:
# drop null values
train_df = train_df.dropna()
test_df = test_df.dropna()
val_df = val_df.dropna()

In [6]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 710964 entries, 0 to 723871
Data columns (total 12 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   latitude             710964 non-null  float64
 1   longitude            710964 non-null  float64
 2   distance_to_center   710964 non-null  float64
 3   year                 710964 non-null  int64  
 4   month                710964 non-null  int64  
 5   weekday              710964 non-null  int64  
 6   pm                   710964 non-null  int64  
 7   holiday              710964 non-null  bool   
 8   mean_temperature     710964 non-null  float64
 9   total_precipitation  710964 non-null  float64
 10  stations_count       710964 non-null  int64  
 11  count                710964 non-null  int64  
dtypes: bool(1), float64(5), int64(6)
memory usage: 65.8 MB


In [7]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 231039 entries, 0 to 233378
Data columns (total 12 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   latitude             231039 non-null  float64
 1   longitude            231039 non-null  float64
 2   distance_to_center   231039 non-null  float64
 3   year                 231039 non-null  int64  
 4   month                231039 non-null  int64  
 5   weekday              231039 non-null  int64  
 6   pm                   231039 non-null  int64  
 7   holiday              231039 non-null  bool   
 8   mean_temperature     231039 non-null  float64
 9   total_precipitation  231039 non-null  float64
 10  stations_count       231039 non-null  int64  
 11  count                231039 non-null  int64  
dtypes: bool(1), float64(5), int64(6)
memory usage: 21.4 MB


In [8]:
# split data

X_train = train_df.drop(columns='count')
y_train = train_df['count']

X_val = val_df.drop(columns='count')
y_val = val_df['count']

X_test = test_df.drop(columns='count')
y_test = test_df['count']

In [9]:
X_train.head()

Unnamed: 0,latitude,longitude,distance_to_center,year,month,weekday,pm,holiday,mean_temperature,total_precipitation,stations_count
0,45.523854,-73.519677,4.440272,2014,4,1,0,False,5.8,8.9,459
1,45.50931,-73.554431,1.288393,2014,4,1,0,False,5.8,8.9,459
2,45.50931,-73.554431,1.288393,2014,4,1,1,False,5.8,8.9,459
3,45.511119,-73.567974,0.844553,2014,4,1,0,False,5.8,8.9,459
4,45.51533,-73.559148,1.50972,2014,4,1,0,False,5.8,8.9,459


In [10]:
# hyperparameter tuning

try:
    # TODO: file-name ersetzen (in diesem File werden die Ergebnisse des Hyperparameter Tuning gespeichert)
    hyperparameters_df = pd.read_csv('data/hyperparameter_tuning/knn.csv')
    pass
    
except FileNotFoundError:
    
    # df containing hyperparameters and evaluation metrics of each run
    hyperparameters_df = pd.DataFrame()
    
    # this function is used by optuna to tune the hyperparameters
    def objective(trial):
        # TODO: die Hyperparameter mit denen eures Modells ersetzen
        # - integers: trial.suggest_int(name, low, high)
        # - floats: trial.suggest_int(name, low, high)
        # - kategorisch: trial.suggest_categorical(name, choices)
        # (https://optuna.readthedocs.io/en/v2.0.0/reference/generated/optuna.trial.Trial.html)
        
        # define hyperparameters
        n_neighbors = trial.suggest_int('n_neighbors', 1, 20)
        
        #TODO: regression with weights=distance always returns rmse=0 ?
        weights = trial.suggest_categorical('weigths', ['uniform', 'distance'])
        metric = trial.suggest_categorical('metric', ['euclidean', 'manhattan'])
        
        
        # TODO: mit eurem Modell ersetzen
        # setup and train model
        knn_reg = KNeighborsRegressor(
            n_neighbors=n_neighbors,
            weights=weights,
            metric=metric
        )
        knn_reg.fit(X_train, y_train)
        
        # make predictions
        y_val_pred = knn_reg.predict(X_val)
        
        # evaluate predictions
        r_squared = r2_score(y_val, y_val_pred)
        rmse = mean_squared_error(y_val, y_val_pred) ** 0.5
        
        # TODO: mit euren Hyperparametern ersetzen
        # insert results in dataframe
        global hyperparameters_df
        hyperparameters_df = hyperparameters_df.append(
            {'n_neighbors': n_neighbors,
             'weights': weights,
             'metric' : metric,
             'r_squared': r_squared,
             'rmse': rmse},
            ignore_index=True
        )
        
        # return rmse -> optuna will optimize rmse
        return rmse
        
        
    study = optuna.create_study()
    # start optimization
    study.optimize(objective, n_trials=10)
    
    # TODO: evtl. müsst ihr auch noch mal die Datentypen anpassen
    # convert to correct data types
    hyperparameters_df['n_neighbors'] = hyperparameters_df['n_neighbors'].astype('int')
    
    # sort hyperparameter tuning results and save file
    hyperparameters_df = hyperparameters_df.sort_values('rmse', ascending=True)
    hyperparameters_df = hyperparameters_df.reset_index(drop=True)
    hyperparameters_df.to_csv('data/hyperparameter_tuning/knn.csv', index=False)

[32m[I 2022-11-20 19:31:12,064][0m A new study created in memory with name: no-name-0708794b-5b53-4111-b7ad-ba93dc827a69[0m
  hyperparameters_df = hyperparameters_df.append(
[32m[I 2022-11-20 19:31:27,688][0m Trial 0 finished with value: 20.457143837626237 and parameters: {'n_neighbors': 10, 'weigths': 'distance', 'metric': 'euclidean'}. Best is trial 0 with value: 20.457143837626237.[0m
  hyperparameters_df = hyperparameters_df.append(
[32m[I 2022-11-20 19:31:39,135][0m Trial 1 finished with value: 20.269802689056775 and parameters: {'n_neighbors': 13, 'weigths': 'uniform', 'metric': 'manhattan'}. Best is trial 1 with value: 20.269802689056775.[0m
  hyperparameters_df = hyperparameters_df.append(
[32m[I 2022-11-20 19:31:55,738][0m Trial 2 finished with value: 20.740277132206565 and parameters: {'n_neighbors': 20, 'weigths': 'distance', 'metric': 'euclidean'}. Best is trial 1 with value: 20.269802689056775.[0m
  hyperparameters_df = hyperparameters_df.append(
[32m[I 2022-1

In [11]:
hyperparameters_df

Unnamed: 0,n_neighbors,weights,metric,r_squared,rmse
0,2,uniform,euclidean,0.51415,18.399197
1,5,distance,euclidean,0.425467,20.00806
2,13,uniform,manhattan,0.410337,20.269803
3,13,uniform,manhattan,0.410337,20.269803
4,9,distance,euclidean,0.406016,20.343943
5,18,distance,manhattan,0.400409,20.439725
6,17,uniform,manhattan,0.400326,20.441142
7,10,distance,euclidean,0.399387,20.457144
8,12,distance,euclidean,0.391311,20.594228
9,20,distance,euclidean,0.382647,20.740277


In [12]:
hyperparameters_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   n_neighbors  10 non-null     int32  
 1   weights      10 non-null     object 
 2   metric       10 non-null     object 
 3   r_squared    10 non-null     float64
 4   rmse         10 non-null     float64
dtypes: float64(2), int32(1), object(2)
memory usage: 488.0+ bytes


In [13]:
# final model evaluation

# TODO: mit eurem Modell und Hyperparametern ersetzen
# build and train model using the most successful hyperparameters
knn_reg = KNeighborsRegressor(
    n_neighbors=hyperparameters_df.loc[0, 'n_neighbors'],
    weights=hyperparameters_df.loc[0, 'weights'],
    metric=hyperparameters_df.loc[0, 'metric']
)
knn_reg.fit(X_train, y_train)

# make predictions
y_test_pred = knn_reg.predict(X_test)

# evaluate predictions
r_squared = r2_score(y_test, y_test_pred)
rmse = mean_squared_error(y_test, y_test_pred) ** 0.5

print(f'R^2:\t{r_squared}')
print(f'RMSE:\t{rmse}')

R^2:	0.43987235301150496
RMSE:	19.881053272975713
