In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn import metrics, svm
from sklearn.pipeline import make_pipeline
from sklearn.calibration import CalibratedClassifierCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score



In [2]:
# Importing clean, ready to go dataset from previous exercise 2
df = pd.read_csv("../exercise2/log_reg_df_clean.csv")
df.head()

Unnamed: 0,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,type_of_meal_plan,required_car_parking_space,lead_time,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,...,room_type_reserved_Room_Type 1,room_type_reserved_Room_Type 2,room_type_reserved_Room_Type 4,room_type_reserved_Room_Type 5,room_type_reserved_Room_Type 6,room_type_reserved_Room_Type 7,market_segment_type_Complementary,market_segment_type_Corporate,market_segment_type_Offline,market_segment_type_Online
0,2,0,1,2,1,0,224,0,0,0,...,1,0,0,0,0,0,0,0,1,0
1,2,0,2,3,0,0,5,0,0,0,...,1,0,0,0,0,0,0,0,0,1
2,1,0,2,1,1,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,1
3,2,0,0,2,1,0,211,0,0,0,...,1,0,0,0,0,0,0,0,0,1
4,2,0,1,1,0,0,48,0,0,0,...,1,0,0,0,0,0,0,0,0,1


Split the dataset:

In [3]:
# X = everything else in the DataFrame minus the target variable
# y = only the target variable
X = df.drop('booking_status', axis=1)
y = df['booking_status']

# split the data into train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [4]:
# Initiate regression and fit in the data
model = make_pipeline(StandardScaler(), svm.SVC(probability=True))
model.fit(X_train, y_train)

Check metrics:

In [5]:
 # get predictions for error metrics
predictions = model.predict(X_test)
 # print the classification report based on true values and predictions
print(classification_report(y_test, predictions))

# get overall accuracy of the model and print it
acc = round(accuracy_score(y_test, predictions), 2)
print(f"\nModel overall accuracy: {acc}")

# get roc acu score of the model and print it
roc_auc = round(roc_auc_score(y_test.values, model.predict_proba(X_test)[:, 1]), 2)
print(f"Model ROC AUC score: {roc_auc}")


              precision    recall  f1-score   support

           0       0.79      0.64      0.71      2535
           1       0.83      0.91      0.87      4853

    accuracy                           0.82      7388
   macro avg       0.81      0.78      0.79      7388
weighted avg       0.82      0.82      0.81      7388


Model overall accuracy: 0.82
Model ROC AUC score: 0.88


So, model overall accuracy is 82.44% and ROC AUC score is at almost 0.88 which can be considered as really well perfoming model.

In next table we see results of different versions of usual logistic regression from previous exercise. In this case accuracy of SVC model is slightly better and ROC AUC score is noticebly greater.

<div>
<style scoped>
    .dataframe tbody tr th:only-of-type {
        vertical-align: middle;
    }

    .dataframe tbody tr th {
        vertical-align: top;
    }

    .dataframe thead th {
        text-align: right;
    }
</style>
<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>Metric</th>
      <th>Basic Model</th>
      <th>StandardScaler + LogisticRegression</th>
      <th>StandardScaler + Regularization + LogisticRegression</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>0</th>
      <td>Total accuracy</td>
      <td>0.78</td>
      <td>0.78</td>
      <td>0.81</td>
    </tr>
    <tr>
      <th>1</th>
      <td>Macro precision</td>
      <td>0.77</td>
      <td>0.77</td>
      <td>0.80</td>
    </tr>
    <tr>
      <th>2</th>
      <td>Macro recall</td>
      <td>0.75</td>
      <td>0.75</td>
      <td>0.78</td>
    </tr>
    <tr>
      <th>3</th>
      <td>Macro F1</td>
      <td>0.75</td>
      <td>0.75</td>
      <td>0.78</td>
    </tr>
    <tr>
      <th>4</th>
      <td>Weighted precision</td>
      <td>0.78</td>
      <td>0.78</td>
      <td>0.81</td>
    </tr>
    <tr>
      <th>5</th>
      <td>Weighted recall</td>
      <td>0.78</td>
      <td>0.78</td>
      <td>0.81</td>
    </tr>
    <tr>
      <th>6</th>
      <td>Weighted F1</td>
      <td>0.78</td>
      <td>0.78</td>
      <td>0.81</td>
    </tr>
    <tr>
      <th>7</th>
      <td>ROC AUC</td>
      <td>0.76</td>
      <td>0.76</td>
      <td>0.61</td>
    </tr>
  </tbody>
</table>
</div>

Creating simple df for future metrics of different variants of optimized SVC models:

In [6]:
# Empty dataframe for metrics
metrics_df = pd.DataFrame({'Metric': ['Total accuracy', 'ROC AUC']})

In [7]:
metrics_df["Default SVC"] = [acc, roc_auc]
metrics_df

Unnamed: 0,Metric,Default SVC
0,Total accuracy,0.82
1,ROC AUC,0.88


Now let's try to hightune parameters and try out different kernels of this model, such as common SVC, Linear SVC and NuSVC. Using GridSearchCV

In [8]:
from sklearn.model_selection import GridSearchCV

In [9]:
# Initialize parameters, our common SVC model with linear kernel supports such parameters as:

# C - values between 0-1000. Regularization parameter

# Let's try different random parameters and see in which way we should move. After that increase or decrease values closer to best parameters and see if there is any space for improvement

param_grid = {
    'C': [0.1, 0.5, 1, 10],
    "kernel": ["linear"],
}

In [11]:
# make a pipeline, and wrap SVM model into the GridSearchCV
# probability=True parameter slows down fitting but allows to use predict_proba which is used in roc_auc_score metrics. However, fitting was too slow and I decided to go another way around using decision scores.
# n_jobs parameter allows to tun paralell threads tto improve speed of GridSearchCV. -2 means that python will use all CPUs will be used but 1
model = make_pipeline(StandardScaler(), GridSearchCV(svm.SVC(), param_grid, refit=True, verbose=3, n_jobs=-2))
model.fit(X_train, y_train) 

Fitting 5 folds for each of 4 candidates, totalling 20 fits


In [14]:
 # get predictions for error metrics
predictions = model.predict(X_test)
 # print the classification report based on true values and predictions
print(classification_report(y_test, predictions))

# get overall accuracy of the model and print it
acc = round(accuracy_score(y_test, predictions), 2)
print(f"\nModel overall accuracy: {acc}")

# If probability=True is not set, can use the decision_function method.
decision_scores = model.decision_function(X_test)
# get roc acu score of the model and print it
roc_auc = round(roc_auc_score(y_test, decision_scores), 2)
print(f"Model ROC AUC score: {roc_auc}")

metrics_df["SVC, linear kernel vol.1"] = [acc, roc_auc]
metrics_df


              precision    recall  f1-score   support

           0       0.74      0.60      0.66      2535
           1       0.81      0.89      0.85      4853

    accuracy                           0.79      7388
   macro avg       0.78      0.75      0.76      7388
weighted avg       0.79      0.79      0.79      7388


Model overall accuracy: 0.79
Model ROC AUC score: 0.85


Unnamed: 0,Metric,Default SVC,"SVC, linear kernel vol.1"
0,Total accuracy,0.82,0.79
1,ROC AUC,0.88,0.85


Let's try to optimize the model with the same kernel a little bit further. 

In [15]:
model[1].best_params_

{'C': 0.5, 'kernel': 'linear'}

So the best C was 0.5 which means we could try again with values 0.1-1 and see if it does any help.

In [23]:
param_grid = {
    'C': [50,100,500,1000],
    "kernel": ["linear"],
}

model = make_pipeline(StandardScaler(), GridSearchCV(svm.SVC(), param_grid, refit=True, verbose=3, n_jobs=-2))
model.fit(X_train, y_train) 

Fitting 5 folds for each of 4 candidates, totalling 20 fits


KeyboardInterrupt: 

In [21]:
 # get predictions for error metrics
predictions = model.predict(X_test)
 # print the classification report based on true values and predictions
print(classification_report(y_test, predictions))

# get overall accuracy of the model and print it
acc = round(accuracy_score(y_test, predictions), 2)
print(f"\nModel overall accuracy: {acc}")

# If probability=True is not set, can use the decision_function method.
decision_scores = model.decision_function(X_test)
# get roc acu score of the model and print it
roc_auc = round(roc_auc_score(y_test, decision_scores), 2)
print(f"Model ROC AUC score: {roc_auc}")

metrics_df["SVC, linear kernel vol.2"] = [acc, roc_auc]
metrics_df

              precision    recall  f1-score   support

           0       0.74      0.60      0.66      2535
           1       0.81      0.89      0.85      4853

    accuracy                           0.79      7388
   macro avg       0.78      0.75      0.76      7388
weighted avg       0.79      0.79      0.79      7388


Model overall accuracy: 0.79
Model ROC AUC score: 0.85


Unnamed: 0,Metric,Default SVC,"SVC, linear kernel vol.1","SVC, linear kernel vol.2"
0,Total accuracy,0.82,0.79,0.79
1,ROC AUC,0.88,0.85,0.85


In [22]:
model[1].best_params_

{'C': 0.5, 'kernel': 'linear'}

So we got exactly the same result