# Prediction of customers' travel pattern

- https://machinelearningmastery.com/random-oversampling-and-undersampling-for-imbalanced-classification/

- https://towardsdatascience.com/predicting-hotel-bookings-with-user-search-parameters-8c570ab24805

# 1)-Importing key modules

In [1]:
import warnings
warnings.filterwarnings('ignore')
# For processing
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import scipy
import datetime as dt
from datetime import datetime
import seaborn as sns
plt.rcParams["figure.figsize"] = (16, 10)
plt.rcParams["xtick.labelsize"] = 10
plt.figure(figsize=(16,10)) # this creates a figure 16 inch wide, 10 inch high
from pprint import pprint
%matplotlib inline
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
# For model building and tuning
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
#from sklearn.preprocessing import Imputer
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [3]:
# For deep learning, if time permits

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.utils.np_utils import to_categorical

Using TensorFlow backend.


In [4]:
# for evaluation

from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [5]:
from datetime import date
import datetime as dt

# 2)-Loading data

In [6]:
# Load the raw event-level data set from the working directory.
df = pd.read_csv('model_data.csv')
# Bare expression: displayed as the cell output (rows, columns).
df.shape

(45805, 8)

In [7]:
df.columns

Index(['event_type', 'origin', 'destination', 'distance', 'num_family',
       'ts_datetime', 'len_jour', 'ts_hour'],
      dtype='object')

In [8]:
df.head()

Unnamed: 0,event_type,origin,destination,distance,num_family,ts_datetime,len_jour,ts_hour
0,search,PAR,NYC,5834.154716,7,2017-04-27 11:06:51,6.0,11
1,book,FRA,WAS,6525.926149,4,2017-04-27 20:15:27,21.0,20
2,book,BER,CGN,469.781624,2,2017-04-27 23:03:43,3.0,23
3,book,BER,BCN,1498.817537,1,2017-04-27 15:17:50,3.0,15
4,book,DEL,BKK,2921.339028,4,2017-04-27 22:51:57,6.0,22


### a. creating 3-feature dataset

In [9]:
# Label plus the three numeric predictors. Take an explicit copy so that the
# later in-place encoding of `event_type` writes to an independent frame
# instead of a slice of `df` (avoids pandas' SettingWithCopyWarning and any
# ambiguity about whether `df` is modified).
df_three_feat = df[["event_type", "distance", "num_family", "len_jour"]].copy()

In [10]:
df_three_feat.head(3)

Unnamed: 0,event_type,distance,num_family,len_jour
0,search,5834.154716,7,6.0
1,book,6525.926149,4,21.0
2,book,469.781624,2,3.0


In [15]:
# Encode the label numerically: search -> 0, book -> 1.
# NOTE(review): Series.map yields NaN for values missing from the dict, so
# re-running this cell on the already-encoded 0/1 column would turn every
# label into NaN; re-create df_three_feat before re-running.
df_three_feat['event_type'] = df_three_feat.event_type.map({'search':0, 'book':1})
df_three_feat.head(3)

Unnamed: 0,event_type,distance,num_family,len_jour
0,0,5834.154716,7,6.0
1,1,6525.926149,4,21.0
2,1,469.781624,2,3.0


### b. Creating all feature dataset

In [11]:
# Load the wide data set that also carries the origin_*/dest_* indicator columns.
df_all = pd.read_csv('all_features.csv')
# Bare expression: displayed as the cell output (rows, columns).
df_all.shape

(45805, 518)

In [12]:
df_all.head(3)

Unnamed: 0,event_type,origin,destination,distance,num_family,ts_datetime,len_jour,ts_hour,origin_ADB,origin_ADL,...,dest_YEG,dest_YMQ,dest_YOW,dest_YTO,dest_YUL,dest_YVR,dest_YWG,dest_YYC,dest_YYZ,dest_ZRH
0,0,PAR,NYC,5834.154716,7,2017-04-27 11:06:51,6.0,11,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,FRA,WAS,6525.926149,4,2017-04-27 20:15:27,21.0,20,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,BER,CGN,469.781624,2,2017-04-27 23:03:43,3.0,23,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# Discard the raw categorical/timestamp columns; their information is carried
# by the origin_*/dest_* indicator columns and by ts_hour.
df_all_feat = df_all.drop(columns=['origin', 'destination', 'ts_datetime'])

In [14]:
df_all_feat.head(3)

Unnamed: 0,event_type,distance,num_family,len_jour,ts_hour,origin_ADB,origin_ADL,origin_AER,origin_AGP,origin_AKL,...,dest_YEG,dest_YMQ,dest_YOW,dest_YTO,dest_YUL,dest_YVR,dest_YWG,dest_YYC,dest_YYZ,dest_ZRH
0,0,5834.154716,7,6.0,11,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,6525.926149,4,21.0,20,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,469.781624,2,3.0,23,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Now we have both datasets aligned. So, let's concentrate on our problem of imbalanced classes

In [16]:
df_three_feat.event_type.value_counts()

0    43997
1     1808
Name: event_type, dtype: int64

In [19]:
df_all_feat.event_type.value_counts(normalize=True)

0    0.960528
1    0.039472
Name: event_type, dtype: float64

This is the problem: only 3.9% of our data belongs to the booking class.

# under-sampling method

In [None]:
# Random under-sampling: keep every booking row and draw an equally sized
# random subset of search rows, so that both classes have the same size.
# The frames in this notebook have no `is_booking` column; the binary label
# is `event_type` on df_three_feat (0 = search, 1 = book), so it must already
# be encoded when this cell runs.
booking_indices = df_three_feat[df_three_feat.event_type == 1].index
# Sampling all bookings without replacement only shuffles their order.
random_indices = np.random.choice(booking_indices, len(booking_indices), replace=False)
booking_sample = df_three_feat.loc[random_indices]

not_booking = df_three_feat[df_three_feat.event_type == 0].index
# Draw exactly as many search rows as there are booking rows.
random_indices = np.random.choice(not_booking, len(booking_indices), replace=False)
not_booking_sample = df_three_feat.loc[random_indices]




### rough

In [20]:
# Row labels of every booking event (event_type == 1).
booking_indices = df_three_feat.loc[df_three_feat['event_type'] == 1].index
# All bookings are drawn without replacement, i.e. their order is shuffled.
random_indices = np.random.choice(
    booking_indices,
    size=len(booking_indices),
    replace=False,
)
booking_sample = df_three_feat.loc[random_indices]

In [22]:
booking_sample

Unnamed: 0,event_type,distance,num_family,len_jour
2635,1,1475.489132,2,3.0
2972,1,1864.628098,1,8.0
7885,1,1752.175509,2,0.0
1229,1,586.984689,2,0.0
5269,1,315.891059,5,0.0
...,...,...,...,...
8215,1,2201.277004,2,0.0
1661,1,868.135723,4,7.0
3321,1,915.021269,1,0.0
45522,1,2239.218982,1,0.0


In [24]:
# Row labels of every search event (event_type == 0).
not_booking = df_three_feat[df_three_feat.event_type == 0].index
# Draw as many search rows as there are bookings. The label is 0/1, so its sum
# is the booking count. Use the vectorised Series.sum() rather than the bare
# `sum(...)` name, which iterates element by element and which `%pylab inline`
# may shadow with NumPy's version.
random_indices = np.random.choice(
    not_booking,
    size=int(df_three_feat['event_type'].sum()),
    replace=False,
)
not_booking_sample = df_three_feat.loc[random_indices]

In [25]:
not_booking_sample

Unnamed: 0,event_type,distance,num_family,len_jour
38823,0,486.032983,1,11.0
42065,0,1211.708227,1,8.0
24606,0,837.034192,2,1.0
26810,0,1548.354316,2,6.0
3823,0,6198.683160,2,5.0
...,...,...,...,...
30957,0,1211.708227,1,13.0
23559,0,10259.110522,2,4.0
41215,0,1772.249505,4,8.0
20026,0,1187.537105,1,7.0


In [26]:
# Stack the two equally sized samples row-wise (search rows first, then
# bookings); the original row labels are kept.
df_new = pd.concat((not_booking_sample, booking_sample))

In [28]:
# Class balance of the resampled frame. The printed values are fractions in
# [0, 1] (rows of the class divided by all rows), not percentages.
print("Percentage of search clicks: ", len(df_new[df_new.event_type == 0])/len(df_new))
print("Percentage of booking clicks: ", len(df_new[df_new.event_type == 1])/len(df_new))
print("Total number of records in resampled data: ", len(df_new))

Percentage of search clicks:  0.5
Percentage of booking clicks:  0.5
Total number of records in resampled data:  3616


### 3.6)-Separate features

In [None]:
# Model on the wide frame `df_all`, not on the raw 8-column `df`: the cells
# below drop origin/destination because their dummy columns are present, and
# `df_all.event_type` is already encoded as 0/1, which the metric calls (e.g.
# recall_score with its default positive label 1) expect. `df` still holds
# the string labels 'search'/'book' and has no dummy columns.
df_model = df_all.copy()

In [None]:
target=df_model["event_type"]

Remove extra variables such as origin, destination as we have got dummy variables

ts_datetime as we have got ts_hour

Also event_type as it is stored in target variable

In [None]:
# Everything except the label and the raw categorical/timestamp columns is a
# model input.
non_feature_columns = ['event_type', 'origin', 'destination', 'ts_datetime']
features = df_model.drop(columns=non_feature_columns)

In [None]:
print(target.shape)
print(features.shape)

In [None]:
features.head(2)

### 3.7)-Normalize data

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise every feature column.
# NOTE(review): the scaler is fitted on the full feature matrix before the
# train/test split, so test-set statistics leak into training; consider
# fitting on the training part only and reusing that scaler on the test part.
X = StandardScaler().fit_transform(features)

# 4)-Model Building

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; random_state=0 keeps the split reproducible.
# NOTE(review): the split is not stratified, so the share of the rare booking
# class in each part is left to chance.
X_train, X_test, y_train, y_test = train_test_split(X, target, test_size=0.3, random_state=0)

In [None]:
print(X_train.shape)
print(X_test.shape)

In [None]:
print(y_train.shape)
print(y_test.shape)

In [None]:
X_train

In [None]:
# Logistic classifier
# C is scikit-learn's inverse regularisation strength, so C=1e5 means the
# model is almost unregularised.
logreg = LogisticRegression(C=1e5)
logreg.fit(X_train, y_train)
# Hard class predictions for the held-out rows.
predictions_LR = logreg.predict(X_test)

In [None]:
predictions_LR[:5]

In [None]:
print(accuracy_score(y_test,predictions_LR))

In [None]:
print(recall_score(y_test,predictions_LR))

In [None]:
print(classification_report(y_test,predictions_LR))

# 5)-Feature Selection

In [None]:
df = pd.read_csv('model_data.csv')

In [None]:
df.head(2)

In [None]:
my_features=df[["distance","num_family","len_jour"]]

In [None]:
# Encode the label in place on `df`: search -> 0, book -> 1.
# NOTE(review): Series.map yields NaN for values missing from the dict, so
# re-running this cell without reloading `df` would turn every label into NaN.
df['event_type'] = df.event_type.map({'search':0, 'book':1})
# The encoded label is the target of the 3-feature model.
Target=df['event_type']

In [None]:
print(Target.shape)
print(my_features.shape)

In [None]:
# Standardise the three selected features.
# NOTE(review): fitted on all rows before the train/test split, so test-set
# statistics leak into training; consider fitting on the training part only.
X = StandardScaler().fit_transform(my_features)

In [None]:
# Same 70/30 split and seed as for the full-feature model, so that both models
# are split with identical settings.
X_train, X_test, y_train, y_test = train_test_split(X, Target, test_size=0.3, random_state=0)

In [None]:
print(X_train.shape)
print(X_test.shape)

In [None]:
print(y_train.shape)
print(y_test.shape)

In [None]:
# Logistic classifier
# Same settings as for the full-feature model (C=1e5, i.e. almost no
# regularisation).
logreg = LogisticRegression(C=1e5)
logreg.fit(X_train, y_train)
# Hard class predictions for the held-out rows.
predictions_LR = logreg.predict(X_test)

In [None]:
print(accuracy_score(y_test,predictions_LR))

In [None]:
print(classification_report(y_test,predictions_LR))

If we compare the two models, one with 514 features and one with 3 features, we see better accuracy with the 3-feature model.

But look at the precision, recall and f1-score of class 1, i.e. booking: they are all 0. In the total dataset we have 1808 booking samples. In the test set we got 525, which is enough to get at least some results. On the surface, both of our models look amazing, with 95.8% and 96.1% accuracy. Clearly, these models are no good for our problem.

We want to predict the conversion likelihood of a user, i.e. how many of those who searched actually booked.

**There are two problems here.**

1. Imbalanced classes
2. Selecting the correct evaluation metric

And we also still need to know which model is better - the one with 3-features or the one with 514 features

**END OF NOTEBOOK3**