<h1>Deriving new Features From Given Data</h1>
<p>Using the given data we will try and derive some new feature columns that may yield a better result in the prediction of customer cancellations.</p>

In [2]:
#imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [3]:
#import the Train_df
Train_df = pd.read_csv('Train_df.csv')

In [4]:
Train_df.head()

Unnamed: 0,LeadTime,ArrivalYear,ArrivalMonth,ArrivalDate,NumWeekendNights,NumWeekNights,Parking,NumAdults,NumChildren,RepeatedGuest,...,RoomType_Room_Type 7,MealPlan_Meal Plan 1,MealPlan_Meal Plan 2,MealPlan_Meal Plan 3,MealPlan_Not Selected,MarketSegment_Aviation,MarketSegment_Complementary,MarketSegment_Corporate,MarketSegment_Offline,MarketSegment_Online
0,10,2018,3,31,0,1,0,1,0,0,...,0,1,0,0,0,0,0,1,0,0
1,116,2018,2,28,2,1,0,1,0,0,...,0,1,0,0,0,0,0,0,0,1
2,11,2018,7,25,1,2,0,2,1,0,...,0,1,0,0,0,0,0,0,0,1
3,3,2017,9,12,0,1,0,2,0,0,...,0,1,0,0,0,0,0,0,0,1
4,28,2018,3,7,1,3,0,2,0,0,...,0,1,0,0,0,0,0,0,1,0


<p>Let's create 2 new feature columns</p>
<ul>
<li>Total customers</li>
<li>Total nights to stay</li>
</ul>

In [6]:
#total customers is children + adults
Train_df['Total_Customers'] = Train_df['NumChildren'] + Train_df['NumAdults']

In [7]:
# total nights
Train_df['Total_Nights'] = Train_df['NumWeekNights'] + Train_df['NumWeekendNights']

In [8]:
#lets now try running linear regression on the data
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error



In [9]:
#import the test data and preprocess it
Test_df = pd.read_csv('test_df.csv')


In [10]:
#pre process the test data
Test_df['Total_Customers'] = Test_df['NumChildren'] + Test_df['NumAdults']
Test_df['Total_Nights'] = Test_df['NumWeekNights'] + Test_df['NumWeekendNights']


In [15]:
#let's split the train data into train and test
X = Train_df.drop(['isCanceled'], axis=1)
y = Train_df['isCanceled']


#split the data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [17]:
#preform linear regression
lr = LinearRegression()
lr.fit(X_train, y_train)

#predict the test data
y_pred = lr.predict(X_test)

import numpy as np
y_pred_round = np.round(y_pred)
#compare y_pred_round to y_test

#calculate accuracy
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred_round)


0.8053066850447967

In [18]:
#now let's try logistic regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [23]:
#we can use the same train and test data as before
#preform logistic regression
lr = LogisticRegression(max_iter=10)
lr.fit(X_train, y_train)

#predict the test data
y_pred = lr.predict(X_test)

#calculate accuracy

accuracy_score(y_test, y_pred)




STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.7629221226740179

In [24]:
#lets try a svm
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

#we can use the same train and test data as before
#preform logistic regression
svm = SVC()
svm.fit(X_train, y_train)

#predict the test data
y_pred = svm.predict(X_test)

#calculate accuracy

accuracy_score(y_test, y_pred)


0.7567195037904894

In [33]:
# random forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

#we can use the same train and test data as before
#preform logistic regression
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

#predict the test data
y_pred = rf.predict(X_test)

#calculate accuracy

accuracy_score(y_test, y_pred)

0.90144727773949

In [32]:
#lets try tuning the random forest
from sklearn.model_selection import GridSearchCV

# Create the parameter grid based on the results of random search
param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]

}

# Create a based model
rf = RandomForestClassifier()

# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                            cv = 3, n_jobs = -1, verbose = 2)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

#predict the test data
y_pred = grid_search.predict(X_test)

#calculate accuracy

accuracy_score(y_test, y_pred)

Fitting 3 folds for each of 288 candidates, totalling 864 fits


0.8871467953135769

In [28]:
#why not try an RNN
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, Embedding, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers
from tensorflow.keras import backend as K
from tensorflow.keras import metrics
from tensorflow.keras import losses
from tensorflow.keras import optimizers

#we can use the same train and test data as before
#preform logistic regression
rnn = Sequential()
rnn.add(Dense(32, activation='relu', input_shape=(X_train.shape[1],)))
rnn.add(Dense(32, activation='relu'))
rnn.add(Dense(1, activation='sigmoid'))
rnn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
rnn.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

#predict the test data
y_pred = rnn.predict(X_test)

#calculate accuracy using a threshold of 0.5
y_pred_round = np.round(y_pred)
accuracy_score(y_test, y_pred_round)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


0.7910062026188835

In [29]:
#let's create a function that varies epochs, batch_size, and optimizer, and layers
def create_model(epochs, batch_size, optimizer, layers):
    model = Sequential()
    model.add(Dense(32, activation='relu', input_shape=(X_train.shape[1],)))
    for i in range(layers):
        model.add(Dense(32, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_test, y_test))
    return model


In [31]:
#let's try n nearest neighbors
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

#we can use the same train and test data as before
#preform logistic regression
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

#predict the test data
y_pred = knn.predict(X_test)

#calculate accuracy

accuracy_score(y_test, y_pred)




0.8115093039283253

In [40]:
#let's try a CNN
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, Embedding, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers
from tensorflow.keras import backend as K
from tensorflow.keras import metrics
from tensorflow.keras import losses
from tensorflow.keras import optimizers

#we can use the same train and test data as before
#preform logistic regression
cnn = Sequential()
cnn.add(Dense(8, activation='relu', input_shape=(X_train.shape[1],)))
cnn.add(Dense(8, activation='relu'))
cnn.add(Dense(1, activation='sigmoid'))
cnn.add(Dropout(0.2))

cnn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

cnn.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

#predict the test data
y_pred = cnn.predict(X_test)

#calculate accuracy using a threshold of 0.5
y_pred_round = np.round(y_pred)
accuracy_score(y_test, y_pred_round)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


0.32598208132322537

In [35]:
#how about an ltsm
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, Embedding, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers
from tensorflow.keras import backend as K
from tensorflow.keras import metrics
from tensorflow.keras import losses
from tensorflow.keras import optimizers

#we can use the same train and test data as before
#preform logistic regression
ltsm = Sequential()
ltsm.add(Dense(32, activation='relu', input_shape=(X_train.shape[1],)))
ltsm.add(Dense(32, activation='relu'))
ltsm.add(Dense(1, activation='sigmoid'))
#add dropout
ltsm.add(Dropout(0.2))


ltsm.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
ltsm.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

#predict the test data
y_pred = ltsm.predict(X_test)

#calculate accuracy using a threshold of 0.5
y_pred_round = np.round(y_pred)
accuracy_score(y_test, y_pred_round)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


0.32598208132322537

<h1>Hyper Parameter Tuning on Random Forest</h1>

In [78]:
#start by defining a parameter grid
param_grid = {'bootstrap': [True],
'max_features': [2, 3,6],
'max_depth': [30,120, 100,160, None],
'min_samples_leaf': [3, 4, 5,8],
'n_estimators': [250,450,550,750]
}

In [79]:
#lets run a grid search
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

# Create a based model
rf = RandomForestRegressor()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid,
                            cv = 3, n_jobs = -1, verbose = 2)

grid_search.fit(X_train, y_train)

#predict the test data
y_pred = grid_search.predict(X_test)

#calculate accuracy using a threshold of 0.6

y_pred_round = np.round(y_pred)
accuracy_score(y_test, y_pred_round)
#get f score, recall    and precision
from sklearn.metrics import f1_score, recall_score, precision_score
print(f1_score(y_test, y_pred_round))
print(recall_score(y_test, y_pred_round))
print(precision_score(y_test, y_pred_round))



Fitting 3 folds for each of 240 candidates, totalling 720 fits
0.8339854667411962
0.7885835095137421
0.8849347568208779


In [80]:
import joblib
#save the model
joblib.dump(grid_search, 'grid_search_firstNight.pkl')

['grid_search_firstNight.pkl']

In [84]:
#start by defining a parameter grid
param_grid = {'bootstrap': [True],
'max_features': [1,8],
'max_depth': [10,220, None],
'min_samples_leaf': [2,12],
'n_estimators': [280,790]
}

In [76]:
#using a 0.6 threshold what is the false positive rate
pointsix = []
for each in y_pred:
    if each > 0.7:
        pointsix.append(1)
    else:
        pointsix.append(0)

from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, pointsix)


array([[3898,   14],
       [ 911,  981]], dtype=int64)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

# Create a based model
rf = RandomForestRegressor()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid,
                            cv = 3, n_jobs = -1, verbose = 2)

grid_search.fit(X_train, y_train)

#predict the test data
y_pred = grid_search.predict(X_test)

#calculate accuracy using a threshold of 0.6

y_pred_round = np.round(y_pred)
accuracy_score(y_test, y_pred_round)
#get f score, recall    and precision
from sklearn.metrics import f1_score, recall_score, precision_score
print(f1_score(y_test, y_pred_round))
print(recall_score(y_test, y_pred_round))
print(precision_score(y_test, y_pred_round))



In [None]:
#save the model
joblib.dump(grid_search, 'grid_search_secondNight.pkl')

In [83]:
#import grid_search_firstNight.pkl and run it on the test data
#predict the test data
grid = joblib.load('grid_search_firstNight.pkl')
y_pred = grid.predict(X_test)

#calculate accuracy using a threshold of 0.6

y_pred_round = np.round(y_pred)
accuracy_score(y_test, y_pred_round)
#get f score, recall    and precision
from sklearn.metrics import f1_score, recall_score, precision_score
print(f1_score(y_test, y_pred_round))
print(recall_score(y_test, y_pred_round))
print(precision_score(y_test, y_pred_round))


print(confusion_matrix(y_test, y_pred_round))

0.8339854667411962
0.7885835095137421
0.8849347568208779
[[3718  194]
 [ 400 1492]]


In [85]:
#now let's preform pca and see if we can get a better model
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
pca.fit(X_train)
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)

#lets run a grid search
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

# Create a based model
rf = RandomForestRegressor()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid,
                            cv = 3, n_jobs = -1, verbose = 2)

grid_search.fit(X_train_pca, y_train)

#predict the test data
y_pred = grid_search.predict(X_test_pca)

#calculate accuracy using a threshold of 0.6

y_pred_round = np.round(y_pred)
accuracy_score(y_test, y_pred_round)
#get f score, recall    and precision
print(f1_score(y_test, y_pred_round))
print(recall_score(y_test, y_pred_round))
print(precision_score(y_test, y_pred_round))


Fitting 3 folds for each of 24 candidates, totalling 72 fits
0.6975782634376846
0.6242071881606766
0.7904953145917001


In [86]:
#look at counts of each class
y_train.value_counts()

0    15588
1     7628
Name: isCanceled, dtype: int64

In [88]:
df.columns

NameError: name 'df' is not defined

In [87]:
#let's tale the year , month and day and make them into a single column
df['date'] = df['year'].astype(str) + df['month'].astype(str) + df['day'].astype(str)

NameError: name 'df' is not defined