In [27]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from scipy.stats import randint


In [3]:
DataSet = pd.read_csv('./DataSets/DataSet_for_Models.csv')

In [4]:
X = DataSet.drop(['is_canceled', 'Unnamed: 0', 'reservation_status_0', 'reservation_status_1', 'reservation_status_2'], axis = 1)
Y = DataSet['is_canceled']

### Train Test Split

In [5]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=34)

# Model Training

### Logistic Regression

In [6]:
model = LogisticRegression()
model.fit(X_train, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [7]:
Y_pred = model.predict(X_test)
Accuracy_LR = accuracy_score(Y_test, Y_pred)
Accuracy_LR

0.7662551755865665

### Decision Tree Classifier

In [8]:
model = DecisionTreeClassifier()
model.fit(X_train, Y_train)

In [9]:

Y_pred = model.predict(X_test)
Accuracy_DTC = accuracy_score(Y_test, Y_pred)
Accuracy_DTC

0.7965036037417574

### Random Forest Classifier

In [32]:
model = RandomForestClassifier()
model.fit(X_train, Y_train)

In [33]:
Y_pred = model.predict(X_test)

In [34]:
Accuracy_RFC = accuracy_score(Y_test, Y_pred)
Accuracy_RFC

0.8463425854930225

### HyperParaMeter Tunning

### Decision Tree Classifier

Grid Search

In [62]:
param_grid = {
    'max_depth': [ 5, 6, 8],
    'min_samples_split': [3, 2, 5],
    'min_samples_leaf': [3, 1, 2],
    
}

In [36]:
model = DecisionTreeClassifier()
# cv is k-fold cross validation 
grid_search = GridSearchCV(model, param_grid, cv=5)
grid_search.fit(X_train, Y_train)

In [37]:
print("Best Hyperparameters:", grid_search.best_params_)

Best Hyperparameters: {'max_depth': 8, 'min_samples_leaf': 1, 'min_samples_split': 2}


In [38]:
best_model = grid_search.best_estimator_
Y_pred = best_model.predict(X_test)
Accuracy_HPT_DT = accuracy_score(Y_test,Y_pred)
Accuracy_HPT_DT

0.8204263149823646

Random Search

In [56]:
param_dist_RS = {
    'max_depth': [2, 4, 5, 8, 10, 12],
    'min_samples_split': randint(1, 10),
    'min_samples_leaf': randint(1, 10),
}

In [57]:
model = DecisionTreeClassifier()
random_search = RandomizedSearchCV(model, param_distributions=param_dist_RS, n_iter=10, cv=5)

random_search.fit(X_train, Y_train)

In [58]:
print("Best Hyperparameters:", random_search.best_params_)

Best Hyperparameters: {'max_depth': 12, 'min_samples_leaf': 8, 'min_samples_split': 5}


In [59]:
best_model = random_search.best_estimator_
Y_pred = best_model.predict(X_test)
Accuracy_HPT_DT_RS = accuracy_score(Y_test,Y_pred)
Accuracy_HPT_DT_RS

0.8284005520625671

### Random Forest Classifier

Grid Search

In [74]:
param_grid = {
    'max_depth': [6, 8, 12],
    'min_samples_split': [2, 3],
    'min_samples_leaf': [1, 2]
}

In [75]:
model = RandomForestClassifier(n_estimators = 50)
# cv is k-fold cross validation 
grid_search = GridSearchCV(model, param_grid, cv=5)
grid_search.fit(X_train, Y_train)

In [76]:
print("Best Hyperparameters:", grid_search.best_params_)

Best Hyperparameters: {'max_depth': 12, 'min_samples_leaf': 1, 'min_samples_split': 3}


In [77]:
best_model = grid_search.best_estimator_
Y_pred = best_model.predict(X_test)
Accuracy_HPT_RF = accuracy_score(Y_test,Y_pred)
Accuracy_HPT_RF

0.8170525992945867

Random Search

In [80]:
param_dist_RS = {
    'max_depth': [4, 8, 12],
    'min_samples_split': randint(1, 8),
    'min_samples_leaf': randint(1, 4)
}

In [82]:
model = RandomForestClassifier(n_estimators = 50)
random_search = RandomizedSearchCV(model, param_distributions=param_dist_RS, n_iter=10, cv=5)

random_search.fit(X_train, Y_train)

In [83]:
print("Best Hyperparameters:", random_search.best_params_)

Best Hyperparameters: {'max_depth': 12, 'min_samples_leaf': 3, 'min_samples_split': 5}


In [85]:
best_model = random_search.best_estimator_
Y_pred = best_model.predict(X_test)
Accuracy_HPT_RF_RS = accuracy_score(Y_test,Y_pred)
Accuracy_HPT_RF_RS

0.8166308848336145

### Suport Vector Binary Classification

In [21]:
svm_classifier = SVC(kernel='linear', C=1.0)
svm_classifier.fit(X_train.head(30000), Y_train.head(30000))

In [22]:
Y_pred = svm_classifier.predict(X_test)

In [23]:
Accuracy_SVM = accuracy_score(Y_test, Y_pred)
conf_matrix = confusion_matrix(Y_test, Y_pred)
#print(classification_report(Y_test, Y_pred))
Accuracy_SVM

0.7540637938966416

## Creating a Table for Calculations of Accuaracy

In [86]:
data = {"Accuracy": ["Logistic Regression", "DecisionTreeClassifier", "RandomForestClassifier", "DecisionTreeClassifier HPT GS", "DecisionTreeClassifier HPT RS", "RandomForestClassifier HPT GS", "RandomForestClassifier HPT RS", 'SVM'],
        "Values": [Accuracy_LR, Accuracy_DTC, Accuracy_RFC, Accuracy_HPT_DT, Accuracy_HPT_DT_RS, Accuracy_HPT_RF, Accuracy_HPT_RF_RS, Accuracy_SVM]}

In [87]:
results = pd.DataFrame(data)
results

Unnamed: 0,Accuracy,Values
0,Logistic Regression,0.766255
1,DecisionTreeClassifier,0.796504
2,RandomForestClassifier,0.846343
3,DecisionTreeClassifier HPT GS,0.820426
4,DecisionTreeClassifier HPT RS,0.816631
5,RandomForestClassifier HPT GS,0.817053
6,RandomForestClassifier HPT RS,0.816631
7,SVM,0.754064
