# KNN Regression Model

In [1]:
# import necessary packages

import pandas as pd
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
# TODO: modell mit eurem ersetzen
from sklearn.neighbors import KNeighborsRegressor

In [2]:
# read data

train_df = pd.read_csv('data/preprocessed_data/train.csv')
test_df = pd.read_csv('data/preprocessed_data/test.csv')

In [3]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 419752 entries, 0 to 419751
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   start_date           419752 non-null  object 
 1   latitude             419752 non-null  float64
 2   longitude            419752 non-null  float64
 3   month                419752 non-null  int64  
 4   weekday              419752 non-null  int64  
 5   pm                   419752 non-null  int64  
 6   mean_temperature     403958 non-null  float64
 7   total_precipitation  402899 non-null  float64
 8   count                419752 non-null  int64  
dtypes: float64(4), int64(4), object(1)
memory usage: 28.8+ MB


In [4]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 214900 entries, 0 to 214899
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   start_date           214900 non-null  object 
 1   latitude             214900 non-null  float64
 2   longitude            214900 non-null  float64
 3   month                214900 non-null  int64  
 4   weekday              214900 non-null  int64  
 5   pm                   214900 non-null  int64  
 6   mean_temperature     204306 non-null  float64
 7   total_precipitation  203247 non-null  float64
 8   count                214900 non-null  int64  
dtypes: float64(4), int64(4), object(1)
memory usage: 14.8+ MB


In [5]:
# drop start date for regression (is represented as month, day, pm) 
train_df = train_df.drop(columns='start_date')
test_df = test_df.drop(columns='start_date')

# drop null values
train_df = train_df.dropna()
test_df = test_df.dropna()

In [6]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 402899 entries, 0 to 419751
Data columns (total 8 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   latitude             402899 non-null  float64
 1   longitude            402899 non-null  float64
 2   month                402899 non-null  int64  
 3   weekday              402899 non-null  int64  
 4   pm                   402899 non-null  int64  
 5   mean_temperature     402899 non-null  float64
 6   total_precipitation  402899 non-null  float64
 7   count                402899 non-null  int64  
dtypes: float64(4), int64(4)
memory usage: 27.7 MB


In [7]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 203247 entries, 0 to 214899
Data columns (total 8 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   latitude             203247 non-null  float64
 1   longitude            203247 non-null  float64
 2   month                203247 non-null  int64  
 3   weekday              203247 non-null  int64  
 4   pm                   203247 non-null  int64  
 5   mean_temperature     203247 non-null  float64
 6   total_precipitation  203247 non-null  float64
 7   count                203247 non-null  int64  
dtypes: float64(4), int64(4)
memory usage: 14.0 MB


In [8]:
# split data

X_train = train_df.drop(columns='count')
y_train = train_df['count']

X_val, X_test, y_val, y_test = train_test_split(
    test_df.drop(columns='count'),
    test_df['count'],
    test_size=0.5,
    shuffle=True,
    random_state=7
)

In [9]:
X_train.head()

Unnamed: 0,latitude,longitude,month,weekday,pm,mean_temperature,total_precipitation
0,45.533703,-73.515283,4,5,0,11.3,3.1
1,45.509328,-73.554347,4,5,0,11.3,3.1
2,45.509328,-73.554347,4,5,1,11.3,3.1
3,45.539292,-73.541031,4,5,0,11.3,3.1
4,45.539292,-73.541031,4,5,1,11.3,3.1


In [10]:
# hyperparameter tuning

try:
    # TODO: file-name ersetzen (in diesem File werden die Ergebnisse des Hyperparameter Tuning gespeichert)
    hyperparameters_df = pd.read_csv('data/hyperparameter_tuning/knn.csv')
    pass
    
except FileNotFoundError:
    
    # df containing hyperparameters and evaluation metrics of each run
    hyperparameters_df = pd.DataFrame()
    
    # this function is used by optuna to tune the hyperparameters
    def objective(trial):
        # TODO: die Hyperparameter mit denen eures Modells ersetzen
        # - integers: trial.suggest_int(name, low, high)
        # - floats: trial.suggest_int(name, low, high)
        # - kategorisch: trial.suggest_categorical(name, choices)
        # (https://optuna.readthedocs.io/en/v2.0.0/reference/generated/optuna.trial.Trial.html)
        
        # define hyperparameters
        n_neighbors = trial.suggest_int('n_neighbors', 1, 20)
        
        #TODO: regression with weights=distance always returns rmse=0 ?
        #weights = trial.suggest_categorical('weigths', ['uniform', 'distance'])
        #metric = trial.suggest_categorical('metric', ['euclidean', 'manhattan'])
        
        
        # TODO: mit eurem Modell ersetzen
        # setup and train model
        knn_reg = KNeighborsRegressor(
            n_neighbors=n_neighbors,
            #weights=weights,
            #metric=metric
        )
        knn_reg.fit(X_train, y_train)
        
        # make predictions
        y_val_pred = knn_reg.predict(X_val)
        
        # evaluate predictions
        r_squared = r2_score(y_val, y_val_pred)
        rmse = mean_squared_error(y_val, y_val_pred) ** 0.5
        
        # TODO: mit euren Hyperparametern ersetzen
        # insert results in dataframe
        global hyperparameters_df
        hyperparameters_df = hyperparameters_df.append(
            {'n_neighbors': n_neighbors,
             #'weights': weights,
             #'metric' : metric,
             'r_squared': r_squared,
             'rmse': rmse},
            ignore_index=True
        )
        
        # return rmse -> optuna will optimize rmse
        return rmse
        
        
    study = optuna.create_study()
    # start optimization
    study.optimize(objective, n_trials=10)
    
    # TODO: evtl. müsst ihr auch noch mal die Datentypen anpassen
    # convert to correct data types
    hyperparameters_df['n_neighbors'] = hyperparameters_df['n_neighbors'].astype('int')
    
    # sort hyperparameter tuning results and save file
    hyperparameters_df = hyperparameters_df.sort_values('rmse', ascending=True)
    hyperparameters_df = hyperparameters_df.reset_index(drop=True)
    hyperparameters_df.to_csv('data/hyperparameter_tuning/knn.csv', index=False)

[32m[I 2022-11-14 18:20:55,505][0m A new study created in memory with name: no-name-61e72440-b63b-48a5-9a76-c5edaf34d07a[0m
  hyperparameters_df = hyperparameters_df.append(
[32m[I 2022-11-14 18:20:59,044][0m Trial 0 finished with value: 17.29337765270503 and parameters: {'n_neighbors': 8}. Best is trial 0 with value: 17.29337765270503.[0m
  hyperparameters_df = hyperparameters_df.append(
[32m[I 2022-11-14 18:21:02,253][0m Trial 1 finished with value: 17.9030367121702 and parameters: {'n_neighbors': 10}. Best is trial 0 with value: 17.29337765270503.[0m
  hyperparameters_df = hyperparameters_df.append(
[32m[I 2022-11-14 18:21:05,373][0m Trial 2 finished with value: 17.29337765270503 and parameters: {'n_neighbors': 8}. Best is trial 0 with value: 17.29337765270503.[0m
  hyperparameters_df = hyperparameters_df.append(
[32m[I 2022-11-14 18:21:08,860][0m Trial 3 finished with value: 18.333061743402354 and parameters: {'n_neighbors': 18}. Best is trial 0 with value: 17.2933776

In [11]:
hyperparameters_df

Unnamed: 0,n_neighbors,r_squared,rmse
0,2,0.81146,11.515159
1,7,0.59138,16.952291
2,8,0.574771,17.293378
3,8,0.574771,17.293378
4,10,0.54426,17.903037
5,10,0.54426,17.903037
6,13,0.532631,18.130011
7,15,0.526904,18.240758
8,18,0.522104,18.333062
9,20,0.518635,18.399483


In [12]:
hyperparameters_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   n_neighbors  10 non-null     int32  
 1   r_squared    10 non-null     float64
 2   rmse         10 non-null     float64
dtypes: float64(2), int32(1)
memory usage: 328.0 bytes


In [13]:
# final model evaluation

# TODO: mit eurem Modell und Hyperparametern ersetzen
# build and train model using the most successful hyperparameters
knn_reg = KNeighborsRegressor(
    n_neighbors=hyperparameters_df.loc[0, 'n_neighbors'],
    #weights=hyperparameters_df.loc[0, 'weights'],
    #metric=hyperparameters_df.loc[0, 'metric']
)
knn_reg.fit(X_train, y_train)

# make predictions
y_test_pred = knn_reg.predict(X_test)

# evaluate predictions
r_squared = r2_score(y_test, y_test_pred)
rmse = mean_squared_error(y_test, y_test_pred) ** 0.5

print(f'R^2:\t{r_squared}')
print(f'RMSE:\t{rmse}')

R^2:	0.812069998450133
RMSE:	11.38943349200627
