# Prediction of customers' travel pattern

# 1)-Importing key modules

In [1]:
import warnings
warnings.filterwarnings('ignore')
# For processing
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import scipy
import datetime as dt
from datetime import datetime
import seaborn as sns
plt.rcParams["figure.figsize"] = (16, 10)
plt.rcParams["xtick.labelsize"] = 10
plt.figure(figsize=(16,10)) # this creates a figure 16 inch wide, 10 inch high
from pprint import pprint
%matplotlib inline
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
# For modeling building and tunning
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
#from sklearn.preprocessing import Imputer
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [3]:
# for deep learning if I will have time

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.utils.np_utils import to_categorical

Using TensorFlow backend.


In [4]:
# for evaluation

from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [5]:
from datetime import date
import datetime as dt

# 2)-Loading data

In [6]:
df = pd.read_csv('model_data.csv')
df.shape

(45805, 8)

In [7]:
df.columns

Index(['event_type', 'origin', 'destination', 'distance', 'num_family',
       'ts_datetime', 'len_jour', 'ts_hour'],
      dtype='object')

In [8]:
df.head()

Unnamed: 0,event_type,origin,destination,distance,num_family,ts_datetime,len_jour,ts_hour
0,search,PAR,NYC,5834.154716,7,2017-04-27 11:06:51,6.0,11
1,book,FRA,WAS,6525.926149,4,2017-04-27 20:15:27,21.0,20
2,book,BER,CGN,469.781624,2,2017-04-27 23:03:43,3.0,23
3,book,BER,BCN,1498.817537,1,2017-04-27 15:17:50,3.0,15
4,book,DEL,BKK,2921.339028,4,2017-04-27 22:51:57,6.0,22


### 3)- Quick data check

### 3.1)- Checking data type

In [9]:
# Checking all variables with their data-type
def _tbl_dtype(dataset):
    sum_dtype = pd.DataFrame(dataset.dtypes).sort_values(0).rename(columns = {0:'Data Type'})
    return sum_dtype

table_dtype = _tbl_dtype(df)
table_dtype

Unnamed: 0,Data Type
num_family,int64
ts_hour,int64
distance,float64
len_jour,float64
event_type,object
origin,object
destination,object
ts_datetime,object


In [10]:
df["ts_datetime"]=pd.to_datetime(df.ts_datetime)

In [11]:
table_dtype = _tbl_dtype(df)
table_dtype

Unnamed: 0,Data Type
num_family,int64
distance,float64
ts_datetime,datetime64[ns]
ts_hour,int64
len_jour,float64
event_type,object
origin,object
destination,object


### 3.2)- Checking missing values

In [12]:
df.isnull().sum()

event_type     0
origin         0
destination    0
distance       0
num_family     0
ts_datetime    0
len_jour       0
ts_hour        0
dtype: int64

### 3.3-Checking duplicate values

In [13]:
df.duplicated().sum()

0

### 3.4)- Encoding categorical Data

origin and destination

In [14]:
pd.get_dummies(df['origin']).iloc[:,1:].head()

Unnamed: 0,ADB,ADL,AER,AGP,AKL,ALA,ALC,ALG,AMS,ANK,...,YEA,YMQ,YOW,YTO,YUL,YVR,YWG,YYC,YYZ,ZRH
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


We can see that encoding has created a unique identifier for each airport destination

In [15]:
origin_dummy=pd.get_dummies(df['origin'],prefix='origin').iloc[:,1:]

In [16]:
df=pd.concat([df,origin_dummy],axis=1)

In [17]:
df.shape

(45805, 243)

In [18]:
df.head(2)

Unnamed: 0,event_type,origin,destination,distance,num_family,ts_datetime,len_jour,ts_hour,origin_ADB,origin_ADL,...,origin_YEA,origin_YMQ,origin_YOW,origin_YTO,origin_YUL,origin_YVR,origin_YWG,origin_YYC,origin_YYZ,origin_ZRH
0,search,PAR,NYC,5834.154716,7,2017-04-27 11:06:51,6.0,11,0,0,...,0,0,0,0,0,0,0,0,0,0
1,book,FRA,WAS,6525.926149,4,2017-04-27 20:15:27,21.0,20,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
pd.get_dummies(df['destination']).iloc[:,1:].head(2)

Unnamed: 0,ADB,ADD,ADL,AER,AGP,AKL,ALA,ALC,ALG,AMS,...,YEG,YMQ,YOW,YTO,YUL,YVR,YWG,YYC,YYZ,ZRH
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
dest_dummy=pd.get_dummies(df['destination'],prefix='dest').iloc[:,1:]
df=pd.concat([df,dest_dummy],axis=1)
df.shape


(45805, 518)

In [21]:
df.head(3)

Unnamed: 0,event_type,origin,destination,distance,num_family,ts_datetime,len_jour,ts_hour,origin_ADB,origin_ADL,...,dest_YEG,dest_YMQ,dest_YOW,dest_YTO,dest_YUL,dest_YVR,dest_YWG,dest_YYC,dest_YYZ,dest_ZRH
0,search,PAR,NYC,5834.154716,7,2017-04-27 11:06:51,6.0,11,0,0,...,0,0,0,0,0,0,0,0,0,0
1,book,FRA,WAS,6525.926149,4,2017-04-27 20:15:27,21.0,20,0,0,...,0,0,0,0,0,0,0,0,0,0
2,book,BER,CGN,469.781624,2,2017-04-27 23:03:43,3.0,23,0,0,...,0,0,0,0,0,0,0,0,0,0


We have prefix with airport name so, we can distinguish between origin and destination

### 3.5)- Target Variable

In [22]:
df['event_type'] = df.event_type.map({'search':0, 'book':1})

In [23]:
df.event_type.value_counts()

0    43997
1     1808
Name: event_type, dtype: int64

### 3.6)-Separate features

In [24]:
df_model=df.copy()

In [25]:
target=df_model["event_type"]

Remove extra variables such as origin, destination as we have got dummy variables

ts_datetime as we have got ts_hour

Also event_type as it is stored in target variable

In [26]:
features=df_model.drop(['event_type','origin','destination','ts_datetime'], axis=1)

In [27]:
print(target.shape)
print(features.shape)

(45805,)
(45805, 514)


In [28]:
features.head(2)

Unnamed: 0,distance,num_family,len_jour,ts_hour,origin_ADB,origin_ADL,origin_AER,origin_AGP,origin_AKL,origin_ALA,...,dest_YEG,dest_YMQ,dest_YOW,dest_YTO,dest_YUL,dest_YVR,dest_YWG,dest_YYC,dest_YYZ,dest_ZRH
0,5834.154716,7,6.0,11,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,6525.926149,4,21.0,20,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### 3.6)-Normalize data

In [29]:
from sklearn.preprocessing import StandardScaler
X = StandardScaler().fit_transform(features)

# 4)-Model Building

In [30]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, target, test_size=0.3, random_state=0)

In [31]:
print(X_train.shape)
print(X_test.shape)

(32063, 514)
(13742, 514)


In [32]:
print(y_train.shape)
print(y_test.shape)

(32063,)
(13742,)


In [33]:
X_train

array([[-0.35177902,  1.0152115 , -0.61244991, ..., -0.03371257,
        -0.03135908, -0.08173844],
       [-0.50221793,  0.15374868, -0.15216897, ..., -0.03371257,
        -0.03135908, -0.08173844],
       [-0.69762774, -0.70771413, -0.21792339, ..., -0.03371257,
        -0.03135908, -0.08173844],
       ...,
       [ 1.98983893,  1.0152115 ,  3.53007847, ..., -0.03371257,
        -0.03135908, -0.08173844],
       [-0.50391064, -0.70771413,  0.76839289, ..., -0.03371257,
        -0.03135908, -0.08173844],
       [-0.73094009,  0.15374868, -0.34943223, ..., -0.03371257,
        -0.03135908, -0.08173844]])

In [34]:
# Logistic Classifeir
logreg = LogisticRegression(C=1e5)
logreg.fit(X_train, y_train)
predictions_LR = logreg.predict(X_test)

In [35]:
predictions_LR[:5]

array([0, 0, 0, 0, 0])

In [36]:
print(accuracy_score(y_test,predictions_LR))

0.9598311745015282


In [37]:
print(recall_score(y_test,predictions_LR))

0.0019047619047619048


In [38]:
print(classification_report(y_test,predictions_LR))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98     13217
           1       0.03      0.00      0.00       525

    accuracy                           0.96     13742
   macro avg       0.50      0.50      0.49     13742
weighted avg       0.93      0.96      0.94     13742



# 5)-Feature Selection

In [39]:
df = pd.read_csv('model_data.csv')

In [40]:
df.head(2)

Unnamed: 0,event_type,origin,destination,distance,num_family,ts_datetime,len_jour,ts_hour
0,search,PAR,NYC,5834.154716,7,2017-04-27 11:06:51,6.0,11
1,book,FRA,WAS,6525.926149,4,2017-04-27 20:15:27,21.0,20


In [41]:
my_features=df[["distance","num_family","len_jour"]]

In [42]:
df['event_type'] = df.event_type.map({'search':0, 'book':1})
Target=df['event_type']

In [43]:
print(Target.shape)
print(my_features.shape)

(45805,)
(45805, 3)


In [44]:
X = StandardScaler().fit_transform(my_features)

In [45]:
X_train, X_test, y_train, y_test = train_test_split(X, Target, test_size=0.3, random_state=0)

In [46]:
print(X_train.shape)
print(X_test.shape)

(32063, 3)
(13742, 3)


In [47]:
print(y_train.shape)
print(y_test.shape)

(32063,)
(13742,)


In [48]:
# Logistic Classifeir
logreg = LogisticRegression(C=1e5)
logreg.fit(X_train, y_train)
predictions_LR = logreg.predict(X_test)

In [49]:
print(accuracy_score(y_test,predictions_LR))

0.9617959540096056


In [50]:
print(classification_report(y_test,predictions_LR))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98     13217
           1       0.00      0.00      0.00       525

    accuracy                           0.96     13742
   macro avg       0.48      0.50      0.49     13742
weighted avg       0.93      0.96      0.94     13742



If we compare two model with 514 features versus 3 features, we see better results in terms of accuracy in 3-feature model.

But, look at precision , recall and f1-score of class 1 i.e booking. It is 0. In total dataset, we have 1808 of booking samples. In test set, we got 525 and this is good enough to get atleast some results. Our both models on surface looks amazing with 95.8% and 96.1% accuracy. Clearly, these models are no good for our problem.

We want to predict conversion-likelihood of user i.e how many of those who searched actually booked.

**There are two problems here.**

1- Imbalnced class
2- Selecting correct evaluation matric 

And we also still need to know which model is better - the one with 3-features or the one with 514 features

**END OF NOTEBOOK3**