# Prediction of customers' travel pattern

- https://machinelearningmastery.com/random-oversampling-and-undersampling-for-imbalanced-classification/

- https://towardsdatascience.com/predicting-hotel-bookings-with-user-search-parameters-8c570ab24805

# 1)-Importing key modules

In [1]:
import warnings
warnings.filterwarnings('ignore')
# For processing
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import scipy
import datetime as dt
from datetime import datetime
import seaborn as sns
plt.rcParams["figure.figsize"] = (16, 10)
plt.rcParams["xtick.labelsize"] = 10
plt.figure(figsize=(16,10)) # this creates a figure 16 inch wide, 10 inch high
from pprint import pprint
%matplotlib inline
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
# For modeling building and tunning
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
#from sklearn.preprocessing import Imputer
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [3]:
# for deep learning if I will have time

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.utils.np_utils import to_categorical

Using TensorFlow backend.


In [4]:
# for evaluation

from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [5]:
from datetime import date
import datetime as dt

# 2)-Loading data

In [6]:
df = pd.read_csv('model_data.csv')
df.shape

(45805, 8)

In [7]:
df.columns

Index(['event_type', 'origin', 'destination', 'distance', 'num_family',
       'ts_datetime', 'len_jour', 'ts_hour'],
      dtype='object')

In [8]:
df.head()

Unnamed: 0,event_type,origin,destination,distance,num_family,ts_datetime,len_jour,ts_hour
0,search,PAR,NYC,5834.154716,7,2017-04-27 11:06:51,6.0,11
1,book,FRA,WAS,6525.926149,4,2017-04-27 20:15:27,21.0,20
2,book,BER,CGN,469.781624,2,2017-04-27 23:03:43,3.0,23
3,book,BER,BCN,1498.817537,1,2017-04-27 15:17:50,3.0,15
4,book,DEL,BKK,2921.339028,4,2017-04-27 22:51:57,6.0,22


### a. creating 3-feature dataset

In [9]:
df_three_feat=df[["event_type","distance","num_family","len_jour"]]

In [10]:
df_three_feat.head(3)

Unnamed: 0,event_type,distance,num_family,len_jour
0,search,5834.154716,7,6.0
1,book,6525.926149,4,21.0
2,book,469.781624,2,3.0


In [11]:
df_three_feat['event_type'] = df_three_feat.event_type.map({'search':0, 'book':1})
df_three_feat.head(3)

Unnamed: 0,event_type,distance,num_family,len_jour
0,0,5834.154716,7,6.0
1,1,6525.926149,4,21.0
2,1,469.781624,2,3.0


### b. Creating all feature dataset

In [12]:
df_all = pd.read_csv('all_features.csv')
df_all.shape

(45805, 518)

In [13]:
df_all.head(3)

Unnamed: 0,event_type,origin,destination,distance,num_family,ts_datetime,len_jour,ts_hour,origin_ADB,origin_ADL,...,dest_YEG,dest_YMQ,dest_YOW,dest_YTO,dest_YUL,dest_YVR,dest_YWG,dest_YYC,dest_YYZ,dest_ZRH
0,0,PAR,NYC,5834.154716,7,2017-04-27 11:06:51,6.0,11,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,FRA,WAS,6525.926149,4,2017-04-27 20:15:27,21.0,20,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,BER,CGN,469.781624,2,2017-04-27 23:03:43,3.0,23,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
df_all_feat=df_all.drop(['origin','destination','ts_datetime'], axis=1)

In [15]:
df_all_feat.head(3)

Unnamed: 0,event_type,distance,num_family,len_jour,ts_hour,origin_ADB,origin_ADL,origin_AER,origin_AGP,origin_AKL,...,dest_YEG,dest_YMQ,dest_YOW,dest_YTO,dest_YUL,dest_YVR,dest_YWG,dest_YYC,dest_YYZ,dest_ZRH
0,0,5834.154716,7,6.0,11,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,6525.926149,4,21.0,20,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,469.781624,2,3.0,23,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Now we have both datasets aligned. So, let's concentrate on our problem of imbalanced classes

In [16]:
df_three_feat.event_type.value_counts()

0    43997
1     1808
Name: event_type, dtype: int64

In [17]:
df_all_feat.event_type.value_counts(normalize=True)

0    0.960528
1    0.039472
Name: event_type, dtype: float64

This is the problem. We have only 3.9% of our data is of booking class.

# 3)- Under-sampling method

In [19]:
booking_indices = df_three_feat[df_three_feat.event_type == 1].index
random_indices = np.random.choice(booking_indices, len(df_three_feat.loc[df_three_feat.event_type == 1]), replace=False)
booking_sample = df_three_feat.loc[random_indices]

In [20]:
booking_sample

Unnamed: 0,event_type,distance,num_family,len_jour
174,1,820.508457,1,0.0
4522,1,1889.299488,4,7.0
8596,1,1512.993425,1,2.0
6965,1,1656.154773,1,6.0
2092,1,5075.727797,4,19.0
...,...,...,...,...
4838,1,1504.993992,1,4.0
8339,1,1552.740086,2,7.0
2976,1,1656.154773,3,10.0
5763,1,9627.195593,1,0.0


In [21]:
not_booking = df_three_feat[df_three_feat.event_type == 0].index
random_indices = np.random.choice(not_booking, sum(df_three_feat['event_type']), replace=False)
not_booking_sample = df_three_feat.loc[random_indices]

In [22]:
not_booking_sample

Unnamed: 0,event_type,distance,num_family,len_jour
27545,0,2201.277004,5,8.0
40591,0,8203.682870,1,17.0
19894,0,10892.524642,1,0.0
6591,0,6107.803762,1,7.0
17390,0,505.878975,2,2.0
...,...,...,...,...
16015,0,1656.154773,1,0.0
26586,0,9600.210603,1,126.0
40832,0,614.303469,1,0.0
27413,0,1984.857664,1,4.0


In [23]:
imb_three_feat = pd.concat([not_booking_sample, booking_sample], axis=0)

In [24]:
print("Percentage of search clicks: ", len(imb_three_feat[imb_three_feat.event_type == 0])/len(imb_three_feat))
print("Percentage of booking clicks: ", len(imb_three_feat[imb_three_feat.event_type == 1])/len(imb_three_feat))
print("Total number of records in resampled data: ", len(imb_three_feat))

Percentage of search clicks:  0.5
Percentage of booking clicks:  0.5
Total number of records in resampled data:  3616


In [26]:
#save dataset
imb_three_feat.to_csv('imb_three_feat.csv',index=False)

# 4)- Model Building

### 4.1)-Separate features

In [27]:
imb_three_feat.head(3)

Unnamed: 0,event_type,distance,num_family,len_jour
27545,0,2201.277004,5,8.0
40591,0,8203.68287,1,17.0
19894,0,10892.524642,1,0.0


In [29]:
imb_three_feat.event_type.value_counts()

1    1808
0    1808
Name: event_type, dtype: int64

In [30]:
target=imb_three_feat["event_type"]

In [31]:
features=imb_three_feat[["distance","num_family","len_jour"]]

In [32]:
print(target.shape)
print(features.shape)

(3616,)
(3616, 3)


### 4.2)-Normalize data

In [34]:
from sklearn.preprocessing import StandardScaler
X = StandardScaler().fit_transform(features)

### 4.3)-train_test_split

In [35]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, target, test_size=0.3, random_state=0)

In [36]:
print(X_train.shape)
print(X_test.shape)

(2531, 3)
(1085, 3)


In [37]:
print(y_train.shape)
print(y_test.shape)

(2531,)
(1085,)


In [38]:
X_train

array([[ 2.23167033, -0.75362021,  0.48237703],
       [-0.41433148, -0.75362021, -0.14995044],
       [-0.34979924, -0.75362021, -0.40288143],
       ...,
       [-0.58431562, -0.75362021, -0.52934693],
       [-0.28709194,  0.11203757, -0.33964868],
       [-0.45617284,  0.97769535, -0.33964868]])

In [39]:
# Logistic Classifeir
logreg = LogisticRegression(C=1e5)
logreg.fit(X_train, y_train)
predictions_LR = logreg.predict(X_test)

In [40]:
predictions_LR[:5]

array([1, 1, 1, 1, 0])

In [41]:
print(accuracy_score(y_test,predictions_LR))

0.543778801843318


In [42]:
print(recall_score(y_test,predictions_LR))

0.792910447761194


In [43]:
print(classification_report(y_test,predictions_LR))

              precision    recall  f1-score   support

           0       0.60      0.30      0.40       549
           1       0.53      0.79      0.63       536

    accuracy                           0.54      1085
   macro avg       0.56      0.55      0.52      1085
weighted avg       0.56      0.54      0.51      1085



And here we are, we have lower accuracy(54.37%) than our previous model(96.17%).

But, we have got some valuable results for our booking class now.

# 5)-Model with ALL_Features

# 6)- Solution 2:

# 7)- Interpreting Evaluation part

**END OF NOTEBOOK3**