# Model Template

In [1]:
# import necessary packages

import pandas as pd
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
# TODO: modell mit eurem ersetzen
from sklearn.svm import SVR

# Empfohlen von sklearn als Alternative zu SVR, da besser skalierbar auf größere Datensätze
from sklearn.svm import LinearSVR
from sklearn.linear_model import SGDRegressor

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# read data

train_df = pd.read_csv('data/preprocessed_data/train.csv')
test_df = pd.read_csv('data/preprocessed_data/test.csv')

In [3]:
# drop start date for regression (is represented as month, day, pm) 
train_df = train_df.drop(columns='start_date')
test_df = test_df.drop(columns='start_date')

# drop null values
train_df = train_df.dropna()
test_df = test_df.dropna()

In [4]:
# split data

X_train = train_df.drop(columns='count')
y_train = train_df['count']

X_val, X_test, y_val, y_test = train_test_split(
    test_df.drop(columns='count'),
    test_df['count'],
    test_size=0.5,
    shuffle=True,
    random_state=7
)

In [5]:
y_train.describe()

count    402899.000000
mean         23.192028
std          26.375188
min           1.000000
25%           7.000000
50%          15.000000
75%          29.000000
max         547.000000
Name: count, dtype: float64

## Support Vector Regression

In [6]:
# hyperparameter tuning

try:
    # TODO: file-name ersetzen (in diesem File werden die Ergebnisse des Hyperparameter Tuning gespeichert)
    hyperparameters_df = pd.read_csv('data/hyperparameter_tuning/SVR.csv')
    
except FileNotFoundError:
    
    # df containing hyperparameters and evaluation metrics of each run
    hyperparameters_df = pd.DataFrame()
    
    # this function is used by optuna to tune the hyperparameters
    def objective(trial):
        # TODO: die Hyperparameter mit denen eures Modells ersetzen
        # - integers: trial.suggest_int(name, low, high)
        # - floats: trial.suggest_int(name, low, high)
        # - kategorisch: trial.suggest_categorical(name, choices)
        # (https://optuna.readthedocs.io/en/v2.0.0/reference/generated/optuna.trial.Trial.html)
        # define hyperparameters
        kernel = trial.suggest_categorical('kernel', ['poly', 'rbf'])
        gamma = trial.suggest_categorical('gamma', ['scale', 'auto'])
        c_regularizaion = trial.suggest_float('C', 1, 100)
        
        # TODO: mit eurem Modell ersetzen
        # setup and train model
        SVR_reg = SVR(
            kernel=kernel,
            gamma=gamma,
            C=c_regularizaion
        )
        SVR_reg.fit(X_train, y_train)
        
        # make predictions
        y_val_pred = SVR_reg.predict(X_val)
        
        # evaluate predictions
        r_squared = r2_score(y_val, y_val_pred)
        rmse = mean_squared_error(y_val, y_val_pred) ** 0.5
        
        # TODO: mit euren Hyperparametern ersetzen
        # insert results in dataframe
        global hyperparameters_df
        hyperparameters_df = hyperparameters_df.append(
            {'kernel': kernel,
             'gamma': gamma,
             'C': c_regularizaion,
             'r_squared': r_squared,
             'rmse': rmse},
            ignore_index=True
        )
        
        # return rmse -> optuna will optimize rmse
        return rmse
        
        
    study = optuna.create_study()
    # start optimization
    study.optimize(objective, n_trials=25)
    
    # TODO: evtl. müsst ihr auch noch mal die Datentypen anpassen
    # convert to correct data types
    # hyperparameters_df[['n_estimators', 'max_depth']] = hyperparameters_df[['n_estimators', 'max_depth']].astype('int')
    
    # sort hyperparameter tuning results and save file
    hyperparameters_df = hyperparameters_df.sort_values('rmse', ascending=True)
    hyperparameters_df = hyperparameters_df.reset_index(drop=True)
    hyperparameters_df.to_csv('data/hyperparameter_tuning/SVR.csv', index=False)

In [7]:
hyperparameters_df.head()

Unnamed: 0,kernel,gamma,C,r_squared,rmse


In [8]:
# # final model evaluation

# # TODO: mit eurem Modell und Hyperparametern ersetzen
# # build and train model using the most successful hyperparameters
# SVR_reg = SVR()
# SVR_reg.fit(X_train, y_train)

# # make predictions
# y_test_pred = SVR_reg.predict(X_test)

# # evaluate predictions
# r_squared = r2_score(y_test, y_test_pred)
# rmse = mean_squared_error(y_test, y_test_pred) ** 0.5

# print(f'R^2:\t{r_squared}')
# print(f'RMSE:\t{rmse}')

# LinearSVR

In [9]:
# hyperparameter tuning

try:
    # TODO: file-name ersetzen (in diesem File werden die Ergebnisse des Hyperparameter Tuning gespeichert)
    hyperparameters_df = pd.read_csv('data/hyperparameter_tuning/LinearSVR.csv')
    
except FileNotFoundError:
    
    # df containing hyperparameters and evaluation metrics of each run
    hyperparameters_df = pd.DataFrame()
    
    # this function is used by optuna to tune the hyperparameters
    def objective(trial):
        # TODO: die Hyperparameter mit denen eures Modells ersetzen
        # - integers: trial.suggest_int(name, low, high)
        # - floats: trial.suggest_int(name, low, high)
        # - kategorisch: trial.suggest_categorical(name, choices)
        # (https://optuna.readthedocs.io/en/v2.0.0/reference/generated/optuna.trial.Trial.html)
        # define hyperparameters
        epsilon = trial.suggest_float('epsilon', 0, 1, step=0.1)
        loss = trial.suggest_categorical('loss', ['epsilon_insensitive', 'squared_epsilon_insensitive'])
        c_regularizaion = trial.suggest_float('C', 1, 100)
        
        # TODO: mit eurem Modell ersetzen
        # setup and train model
        LinearSVR_reg = LinearSVR(
            epsilon=epsilon,
            loss=loss,
            C=c_regularizaion,
            random_state=1
        )
        LinearSVR_reg.fit(X_train, y_train)
        
        # make predictions
        y_val_pred = LinearSVR_reg.predict(X_val)
        
        # evaluate predictions
        r_squared = r2_score(y_val, y_val_pred)
        rmse = mean_squared_error(y_val, y_val_pred) ** 0.5
        
        # TODO: mit euren Hyperparametern ersetzen
        # insert results in dataframe
        global hyperparameters_df
        hyperparameters_df = hyperparameters_df.append(
            { 'epsilon': epsilon,
            'loss': loss,
             'C': c_regularizaion,
             'r_squared': r_squared,
             'rmse': rmse},
            ignore_index=True
        )
        
        # return rmse -> optuna will optimize rmse
        return rmse
        
        
    study = optuna.create_study()
    # start optimization
    study.optimize(objective, n_trials=10)
    
    # TODO: evtl. müsst ihr auch noch mal die Datentypen anpassen
    # convert to correct data types
    hyperparameters_df['epsilon'] = hyperparameters_df['epsilon'].astype('float')
    hyperparameters_df['C'] = hyperparameters_df['C'].astype('int')
    print(hyperparameters_df['C'])
    
    # sort hyperparameter tuning results and save file
    hyperparameters_df = hyperparameters_df.sort_values('rmse', ascending=True)
    hyperparameters_df = hyperparameters_df.reset_index(drop=True)
    hyperparameters_df.to_csv('data/hyperparameter_tuning/LinearSVR.csv', index=False)

[32m[I 2022-11-15 18:52:10,852][0m A new study created in memory with name: no-name-c2f39912-5bae-4e27-939c-7b41978b5ced[0m
  hyperparameters_df = hyperparameters_df.append(
[32m[I 2022-11-15 18:54:12,799][0m Trial 0 finished with value: 41.569516724488714 and parameters: {'epsilon': 0.6000000000000001, 'loss': 'squared_epsilon_insensitive', 'C': 2.548294496818163}. Best is trial 0 with value: 41.569516724488714.[0m
  hyperparameters_df = hyperparameters_df.append(
[32m[I 2022-11-15 18:56:09,970][0m Trial 1 finished with value: 25.40356959341293 and parameters: {'epsilon': 0.9, 'loss': 'epsilon_insensitive', 'C': 53.50992880565961}. Best is trial 1 with value: 25.40356959341293.[0m
  hyperparameters_df = hyperparameters_df.append(
[32m[I 2022-11-15 18:58:11,512][0m Trial 2 finished with value: 42.25913250155488 and parameters: {'epsilon': 0.5, 'loss': 'squared_epsilon_insensitive', 'C': 23.684705461449912}. Best is trial 1 with value: 25.40356959341293.[0m
  hyperparameters

0     2
1    53
2    23
3    70
4    90
5    54
6     3
7    87
8    38
9    98
Name: C, dtype: int32


In [10]:
hyperparameters_df.head()

Unnamed: 0,epsilon,loss,C,r_squared,rmse
0,0.9,epsilon_insensitive,53,0.082401,25.40357
1,0.7,epsilon_insensitive,38,-0.961364,37.140475
2,0.6,squared_epsilon_insensitive,2,-1.457046,41.569517
3,0.6,squared_epsilon_insensitive,3,-1.475062,41.72164
4,0.8,squared_epsilon_insensitive,70,-1.514311,42.051149


In [12]:
# final model evaluation

# TODO: mit eurem Modell und Hyperparametern ersetzen
# build and train model using the most successful hyperparameters
LinearSVR_reg = LinearSVR(
            epsilon=hyperparameters_df.loc[0, 'epsilon'],
            loss=hyperparameters_df.loc[0, 'loss'],
            C=hyperparameters_df.loc[0, 'C'],
            random_state=1
)

LinearSVR_reg.fit(X_train, y_train)

# make predictions
y_test_pred = LinearSVR_reg.predict(X_test)

# evaluate predictions
r_squared = r2_score(y_test, y_test_pred)
rmse = mean_squared_error(y_test, y_test_pred) ** 0.5

print(f'R^2:\t{r_squared}')
print(f'RMSE:\t{rmse}')

R^2:	0.02911294246303986
RMSE:	25.88739915504945


