# Model Template

In [31]:
# import necessary packages

import pandas as pd
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
# TODO: modell mit eurem ersetzen
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler

# Empfohlen von sklearn als Alternative zu SVR, da besser skalierbar auf größere Datensätze
from sklearn.svm import LinearSVR
from sklearn.linear_model import SGDRegressor

In [32]:
# read data

train_df = pd.read_csv('data/preprocessed_data/train.csv')
val_df = pd.read_csv('data/preprocessed_data/validation.csv')
test_df = pd.read_csv('data/preprocessed_data/test.csv')

In [33]:
# drop start date for regression (is represented as month, day, pm) 
# train_df = train_df.drop(columns='start_date')
# test_df = test_df.drop(columns='start_date')

# drop null values
train_df = train_df.dropna()
val_df = val_df.dropna()
test_df = test_df.dropna()

In [None]:
scaler = StandardScaler()

train_df = scaler.fit_transform(train_df) 
val_df = scaler.fit_transform(val_df) 
test_df = scaler.fit_transform(test_df)

In [34]:
# split data

X_train = train_df.drop(columns='count')
y_train = train_df['count']

X_val = val_df.drop(columns='count')
y_val = val_df['count']

X_test = test_df.drop(columns='count')
y_test = test_df['count']

In [27]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 710964 entries, 0 to 723871
Data columns (total 11 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   latitude             710964 non-null  float64
 1   longitude            710964 non-null  float64
 2   distance_to_center   710964 non-null  float64
 3   year                 710964 non-null  int64  
 4   month                710964 non-null  int64  
 5   weekday              710964 non-null  int64  
 6   pm                   710964 non-null  int64  
 7   holiday              710964 non-null  bool   
 8   mean_temperature     710964 non-null  float64
 9   total_precipitation  710964 non-null  float64
 10  stations_count       710964 non-null  int64  
dtypes: bool(1), float64(5), int64(5)
memory usage: 60.3 MB


## Support Vector Regression

In [None]:
# hyperparameter tuning

try:
    # TODO: file-name ersetzen (in diesem File werden die Ergebnisse des Hyperparameter Tuning gespeichert)
    hyperparameters_df = pd.read_csv('data/hyperparameter_tuning/SVR.csv')
    
except FileNotFoundError:
    
    # df containing hyperparameters and evaluation metrics of each run
    hyperparameters_df = pd.DataFrame()
    
    # this function is used by optuna to tune the hyperparameters
    def objective(trial):
        # TODO: die Hyperparameter mit denen eures Modells ersetzen
        # - integers: trial.suggest_int(name, low, high)
        # - floats: trial.suggest_int(name, low, high)
        # - kategorisch: trial.suggest_categorical(name, choices)
        # (https://optuna.readthedocs.io/en/v2.0.0/reference/generated/optuna.trial.Trial.html)
        # define hyperparameters
        kernel = trial.suggest_categorical('kernel', ['poly', 'rbf'])
        gamma = trial.suggest_categorical('gamma', ['scale', 'auto'])
        c_regularizaion = trial.suggest_float('C', 1, 100)
        
        # TODO: mit eurem Modell ersetzen
        # setup and train model
        SVR_reg = SVR(
            kernel=kernel,
            gamma=gamma,
            C=c_regularizaion
        )
        SVR_reg.fit(X_train, y_train)
        
        # make predictions
        y_val_pred = SVR_reg.predict(X_val)
        
        # evaluate predictions
        r_squared = r2_score(y_val, y_val_pred)
        rmse = mean_squared_error(y_val, y_val_pred) ** 0.5
        
        # TODO: mit euren Hyperparametern ersetzen
        # insert results in dataframe
        global hyperparameters_df
        hyperparameters_df = hyperparameters_df.append(
            {'kernel': kernel,
             'gamma': gamma,
             'C': c_regularizaion,
             'r_squared': r_squared,
             'rmse': rmse},
            ignore_index=True
        )
        
        # return rmse -> optuna will optimize rmse
        return rmse
        
        
    study = optuna.create_study()
    # start optimization
    study.optimize(objective, n_trials=25)
    
    # TODO: evtl. müsst ihr auch noch mal die Datentypen anpassen
    # convert to correct data types
    # hyperparameters_df[['n_estimators', 'max_depth']] = hyperparameters_df[['n_estimators', 'max_depth']].astype('int')
    
    # sort hyperparameter tuning results and save file
    hyperparameters_df = hyperparameters_df.sort_values('rmse', ascending=True)
    hyperparameters_df = hyperparameters_df.reset_index(drop=True)
    hyperparameters_df.to_csv('data/hyperparameter_tuning/SVR.csv', index=False)

In [None]:
hyperparameters_df.head()

Unnamed: 0,kernel,gamma,C,r_squared,rmse


In [None]:
# # final model evaluation

# # TODO: mit eurem Modell und Hyperparametern ersetzen
# # build and train model using the most successful hyperparameters
# SVR_reg = SVR()
# SVR_reg.fit(X_train, y_train)

# # make predictions
# y_test_pred = SVR_reg.predict(X_test)

# # evaluate predictions
# r_squared = r2_score(y_test, y_test_pred)
# rmse = mean_squared_error(y_test, y_test_pred) ** 0.5

# print(f'R^2:\t{r_squared}')
# print(f'RMSE:\t{rmse}')

# LinearSVR

In [30]:
# hyperparameter tuning

try:
    # TODO: file-name ersetzen (in diesem File werden die Ergebnisse des Hyperparameter Tuning gespeichert)
    hyperparameters_df = pd.read_csv('data/hyperparameter_tuning/linearSVR.csv')
    
except FileNotFoundError:
    
    # df containing hyperparameters and evaluation metrics of each run
    hyperparameters_df = pd.DataFrame()
    
    # this function is used by optuna to tune the hyperparameters
    def objective(trial):
        # TODO: die Hyperparameter mit denen eures Modells ersetzen
        # - integers: trial.suggest_int(name, low, high)
        # - floats: trial.suggest_int(name, low, high)
        # - kategorisch: trial.suggest_categorical(name, choices)
        # (https://optuna.readthedocs.io/en/v2.0.0/reference/generated/optuna.trial.Trial.html)
        # define hyperparameters
        epsilon = trial.suggest_float('epsilon', 0, 1, step=0.1)
        loss = trial.suggest_categorical('loss', ['epsilon_insensitive', 'squared_epsilon_insensitive'])
        c_regularizaion = trial.suggest_float('C', 1, 100)
        
        # TODO: mit eurem Modell ersetzen
        # setup and train model
        LinearSVR_reg = LinearSVR(
            epsilon=epsilon,
            loss=loss,
            C=c_regularizaion,
            random_state=1
        )
        LinearSVR_reg.fit(X_train, y_train)
        
        # make predictions
        y_val_pred = LinearSVR_reg.predict(X_val)
        
        # evaluate predictions
        r_squared = r2_score(y_val, y_val_pred)
        rmse = mean_squared_error(y_val, y_val_pred) ** 0.5
        
        # TODO: mit euren Hyperparametern ersetzen
        # insert results in dataframe
        global hyperparameters_df
        hyperparameters_df = hyperparameters_df.append(
            { 'epsilon': epsilon,
            'loss': loss,
             'C': c_regularizaion,
             'r_squared': r_squared,
             'rmse': rmse},
            ignore_index=True
        )
        
        # return rmse -> optuna will optimize rmse
        return rmse
        
        
    study = optuna.create_study()
    # start optimization
    study.optimize(objective, n_trials=25)
    
    # TODO: evtl. müsst ihr auch noch mal die Datentypen anpassen
    # convert to correct data types
    hyperparameters_df['epsilon'] = hyperparameters_df['epsilon'].astype('float')
    hyperparameters_df['C'] = hyperparameters_df['C'].astype('int')
    
    # sort hyperparameter tuning results and save file
    hyperparameters_df = hyperparameters_df.sort_values('rmse', ascending=True)
    hyperparameters_df = hyperparameters_df.reset_index(drop=True)
    hyperparameters_df.to_csv('data/hyperparameter_tuning/linearSVR.csv', index=False)

[32m[I 2022-11-17 19:44:06,731][0m A new study created in memory with name: no-name-0280e7be-7cf4-4bdc-9048-5569b83d306e[0m
[33m[W 2022-11-17 19:48:00,529][0m Trial 0 failed because of the following error: ValueError('Input X contains NaN.\nLinearSVR does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values')[0m
Traceback (most recent call last):
  File "C:\Users\NicoDöring\AppData\Local\Temp\ipykernel_28568\3349086273.py", line 5, in <module>
    

ValueError: Input X contains NaN.
LinearSVR does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [20]:
hyperparameters_df.head()

In [21]:
# final model evaluation

# TODO: mit eurem Modell und Hyperparametern ersetzen
# build and train model using the most successful hyperparameters
LinearSVR_reg = LinearSVR(
            epsilon=hyperparameters_df.loc[0, 'epsilon'],
            loss=hyperparameters_df.loc[0, 'loss'],
            C=hyperparameters_df.loc[0, 'C'],
            random_state=1
)

LinearSVR_reg.fit(X_train, y_train)

# make predictions
y_test_pred = LinearSVR_reg.predict(X_test)

# evaluate predictions
r_squared = r2_score(y_test, y_test_pred)
rmse = mean_squared_error(y_test, y_test_pred) ** 0.5

print(f'R^2:\t{r_squared}')
print(f'RMSE:\t{rmse}')

KeyError: 'epsilon'