# Binary Classification Problem

https://towardsdatascience.com/predicting-hotel-bookings-with-user-search-parameters-8c570ab24805


- https://gist.github.com/rochacbruno/2883505

- https://kanoki.org/2019/12/27/how-to-calculate-distance-in-python-and-pandas-using-scipy-spatial-and-distance-functions/

# 1)-Importing key modules

In [1]:
import warnings
warnings.filterwarnings('ignore')
# For processing
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import scipy
import datetime as dt
from datetime import datetime
import seaborn as sns
plt.rcParams["figure.figsize"] = (16, 10)
plt.rcParams["xtick.labelsize"] = 10
plt.figure(figsize=(16,10)) # this creates a figure 16 inch wide, 10 inch high
from pprint import pprint
%matplotlib inline
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
# For model building and tuning
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
#from sklearn.preprocessing import Imputer
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [3]:
# For deep learning, if I have time

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.utils.np_utils import to_categorical

Using TensorFlow backend.


In [4]:
# for evaluation

from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [5]:
from datetime import date
import datetime as dt

# 2)-Loading data

In [6]:
df_event = pd.read_csv('events.csv')
df_event.shape

(47007, 9)

In [7]:
df_event.columns

Index(['ts', 'event_type', 'user_id', 'date_from', 'date_to', 'origin',
       'destination', 'num_adults', 'num_children'],
      dtype='object')

In [8]:
df_event.head()

Unnamed: 0,ts,event_type,user_id,date_from,date_to,origin,destination,num_adults,num_children
0,2017-04-27 11:06:51,search,60225f,2017-06-01,2017-06-07,PAR,NYC,6,1
1,2017-04-27 20:15:27,book,e5d69e,2017-08-12,2017-09-02,FRA,WAS,3,1
2,2017-04-27 23:03:43,book,f953f0,2017-10-08,2017-10-11,BER,CGN,2,0
3,2017-04-27 15:17:50,book,794d35,2017-04-28,2017-05-01,BER,BCN,1,0
4,2017-04-27 22:51:57,book,ca4f94,2017-05-16,2017-05-22,DEL,BKK,4,0


In [9]:
df_event.event_type.unique()

array(['search', 'book'], dtype=object)

So we have a binary classification problem: our goal is to predict the conversion likelihood of a user (whether a search turns into a booking).

In [10]:
df_geo = pd.read_csv('iata_1_1.csv')

df_geo.shape

(500, 3)

In [11]:
df_geo.head()

Unnamed: 0,iata_code,lat,lon
0,LON,51.5,-0.17
1,MOW,55.75,37.7
2,NYC,40.71,-74.01
3,PAR,48.85,2.35
4,IST,41.01,28.95


In [12]:
df_geo[df_geo.iata_code == "PAR"]

Unnamed: 0,iata_code,lat,lon
3,PAR,48.85,2.35


In [13]:
df_geo[df_geo.iata_code == "NYC"]

Unnamed: 0,iata_code,lat,lon
2,NYC,40.71,-74.01


# 3)-Combining dataset

### a)- Working with origin

In [14]:
# Start the origin lookup table from the events' origin codes (same index as df_event),
# with a placeholder latitude column to be filled in by the next cell.
df_origin = df_event[['origin']].copy()
df_origin['origin_lat'] = np.nan
df_origin.head(2)

Unnamed: 0,origin,origin_lat
0,PAR,
1,FRA,


In [15]:
# Build an IATA-code -> latitude lookup once. The first row wins when a code is
# duplicated in df_geo, matching the previous "take the first match" behaviour.
origin_lat_by_code = df_geo.drop_duplicates(subset='iata_code').set_index('iata_code')['lat']

# Fail loudly if an origin code has no row in df_geo rather than leaving a silent NaN.
unknown_origin_codes = df_origin.loc[~df_origin['origin'].isin(origin_lat_by_code.index), 'origin'].unique()
if len(unknown_origin_codes) > 0:
    raise KeyError(f"No row in df_geo for origin codes: {list(unknown_origin_codes)}")

# Single vectorized assignment: avoids chained indexing (df['col'][k] = ...), which
# is not guaranteed to write into df_origin, and avoids scanning df_geo once per row.
df_origin['origin_lat'] = df_origin['origin'].map(origin_lat_by_code)

In [16]:
df_origin.head(2)

Unnamed: 0,origin,origin_lat
0,PAR,48.85
1,FRA,50.12


In [17]:
df_origin['origin_lon']=np.nan

In [18]:
# Build an IATA-code -> longitude lookup once. The first row wins when a code is
# duplicated in df_geo, matching the previous "take the first match" behaviour.
origin_lon_by_code = df_geo.drop_duplicates(subset='iata_code').set_index('iata_code')['lon']

# Fail loudly if an origin code has no row in df_geo rather than leaving a silent NaN.
unknown_origin_codes = df_origin.loc[~df_origin['origin'].isin(origin_lon_by_code.index), 'origin'].unique()
if len(unknown_origin_codes) > 0:
    raise KeyError(f"No row in df_geo for origin codes: {list(unknown_origin_codes)}")

# Single vectorized assignment: avoids chained indexing (df['col'][k] = ...), which
# is not guaranteed to write into df_origin, and avoids scanning df_geo once per row.
df_origin['origin_lon'] = df_origin['origin'].map(origin_lon_by_code)

In [19]:
df_origin.head(2)

Unnamed: 0,origin,origin_lat,origin_lon
0,PAR,48.85,2.35
1,FRA,50.12,8.68


In [20]:
df_origin.shape

(47007, 3)

### b)-working with destination 

In [21]:
df_destination=pd.DataFrame(columns=["destination"])

In [22]:
df_destination['destination']=df_event['destination']
df_destination.head(2)

Unnamed: 0,destination
0,NYC
1,WAS


In [23]:
df_destination['destination_lat']=np.nan
df_destination['destination_lon']=np.nan

In [24]:
# Index df_geo by IATA code once. The first row wins when a code is duplicated,
# matching the previous "take the first match" behaviour.
destination_geo_by_code = df_geo.drop_duplicates(subset='iata_code').set_index('iata_code')

# Fail loudly if a destination code has no row in df_geo rather than leaving a silent NaN.
unknown_destination_codes = df_destination.loc[
    ~df_destination['destination'].isin(destination_geo_by_code.index), 'destination'
].unique()
if len(unknown_destination_codes) > 0:
    raise KeyError(f"No row in df_geo for destination codes: {list(unknown_destination_codes)}")

# Vectorized assignments: avoid chained indexing (df['col'][k] = ...), which is not
# guaranteed to write into df_destination, and avoid scanning df_geo twice per row.
df_destination['destination_lat'] = df_destination['destination'].map(destination_geo_by_code['lat'])
df_destination['destination_lon'] = df_destination['destination'].map(destination_geo_by_code['lon'])

In [25]:
df_destination.head(2)

Unnamed: 0,destination,destination_lat,destination_lon
0,NYC,40.71,-74.01
1,WAS,38.9,-77.04


**Adding to main dataset**

In [26]:
df_event['origin_lat'] = df_origin['origin_lat']
df_event['origin_lon'] = df_origin['origin_lon']

In [27]:
df_event['destination_lat']= df_destination['destination_lat']
df_event['destination_lon']=df_destination['destination_lon']

In [28]:
df_event.head(2)

Unnamed: 0,ts,event_type,user_id,date_from,date_to,origin,destination,num_adults,num_children,origin_lat,origin_lon,destination_lat,destination_lon
0,2017-04-27 11:06:51,search,60225f,2017-06-01,2017-06-07,PAR,NYC,6,1,48.85,2.35,40.71,-74.01
1,2017-04-27 20:15:27,book,e5d69e,2017-08-12,2017-09-02,FRA,WAS,3,1,50.12,8.68,38.9,-77.04


# 4)- Calculating distance

Haversine distance between two points 

Ref: https://www.movable-type.co.uk/scripts/latlong.html
- https://kanoki.org/2019/12/27/how-to-calculate-distance-in-python-and-pandas-using-scipy-spatial-and-distance-functions/

In [29]:
def haversine_vectorize(lon1, lat1, lon2, lat2):
    """Haversine (great-circle) distance in kilometres between two points.

    Parameters
    ----------
    lon1, lat1 : longitude and latitude of the first point(s), in degrees.
    lon2, lat2 : longitude and latitude of the second point(s), in degrees.

    Inputs are passed through ``np.radians`` and combined element-wise, so
    scalars, NumPy arrays and pandas Series all work.

    Returns
    -------
    The distance(s) in kilometres, using an Earth radius of 6367 km
    (use 3958 instead for miles).
    """
    lon1 = np.radians(lon1)
    lat1 = np.radians(lat1)
    lon2 = np.radians(lon2)
    lat2 = np.radians(lat2)

    delta_lon = lon2 - lon1
    delta_lat = lat2 - lat1

    # Haversine term: sin^2(dlat/2) + cos(lat1) * cos(lat2) * sin^2(dlon/2)
    hav = np.sin(delta_lat/2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(delta_lon/2.0)**2

    central_angle = 2 * np.arcsin(np.sqrt(hav))
    return 6367 * central_angle

In [30]:
haversine_vectorize(df_event['origin_lon'],df_event['origin_lat'],df_event['destination_lon'],
                   df_event['destination_lat'])

0        5834.154716
1        6525.926149
2         469.781624
3        1498.817537
4        2921.339028
            ...     
47002     856.002808
47003     395.323222
47004     730.610705
47005     174.707494
47006     614.303469
Length: 47007, dtype: float64

In [31]:
df_event['distance'] = haversine_vectorize(df_event['origin_lon'],df_event['origin_lat'],df_event['destination_lon'],
                   df_event['destination_lat'])

In [32]:
df_event.head(2)

Unnamed: 0,ts,event_type,user_id,date_from,date_to,origin,destination,num_adults,num_children,origin_lat,origin_lon,destination_lat,destination_lon,distance
0,2017-04-27 11:06:51,search,60225f,2017-06-01,2017-06-07,PAR,NYC,6,1,48.85,2.35,40.71,-74.01,5834.154716
1,2017-04-27 20:15:27,book,e5d69e,2017-08-12,2017-09-02,FRA,WAS,3,1,50.12,8.68,38.9,-77.04,6525.926149


**This will be one of our key features for analysis**

# 5)- Creating new feature

num_family

In [33]:
df_event["num_family"]=df_event['num_adults']+df_event['num_children']

In [34]:
df_event.head(2)

Unnamed: 0,ts,event_type,user_id,date_from,date_to,origin,destination,num_adults,num_children,origin_lat,origin_lon,destination_lat,destination_lon,distance,num_family
0,2017-04-27 11:06:51,search,60225f,2017-06-01,2017-06-07,PAR,NYC,6,1,48.85,2.35,40.71,-74.01,5834.154716,7
1,2017-04-27 20:15:27,book,e5d69e,2017-08-12,2017-09-02,FRA,WAS,3,1,50.12,8.68,38.9,-77.04,6525.926149,4


Drop the extra columns for clarity. If we keep them, they will also cause multicollinearity.

In [35]:
df_event.drop(['num_adults', 'num_children','origin_lat','origin_lon','destination_lat','destination_lon'], axis=1, inplace=True)

In [36]:
df_event.head(2)

Unnamed: 0,ts,event_type,user_id,date_from,date_to,origin,destination,distance,num_family
0,2017-04-27 11:06:51,search,60225f,2017-06-01,2017-06-07,PAR,NYC,5834.154716,7
1,2017-04-27 20:15:27,book,e5d69e,2017-08-12,2017-09-02,FRA,WAS,6525.926149,4


# 6)-Working with Time Series

Let's suppose we want to see in which year we have the fewest travel attempts (search or book).

In [37]:
# Checking all variables with their data-type
def _tbl_dtype(dataset):
    sum_dtype = pd.DataFrame(dataset.dtypes).sort_values(0).rename(columns = {0:'Data Type'})
    return sum_dtype

table_dtype = _tbl_dtype(df_event)
table_dtype

Unnamed: 0,Data Type
num_family,int64
distance,float64
ts,object
event_type,object
user_id,object
date_from,object
date_to,object
origin,object
destination,object


### 6.1)-convert object to datetime

In [38]:
df_event["ts_datetime"]=pd.to_datetime(df_event.ts)
df_event["date_from_datetime"]=pd.to_datetime(df_event.date_from)
df_event["date_to_datetime"]=pd.to_datetime(df_event.date_to)

In [39]:
table_dtype = _tbl_dtype(df_event)
table_dtype

Unnamed: 0,Data Type
num_family,int64
distance,float64
ts_datetime,datetime64[ns]
date_from_datetime,datetime64[ns]
date_to_datetime,datetime64[ns]
ts,object
event_type,object
user_id,object
date_from,object
date_to,object


In [40]:
df_event.head(2)

Unnamed: 0,ts,event_type,user_id,date_from,date_to,origin,destination,distance,num_family,ts_datetime,date_from_datetime,date_to_datetime
0,2017-04-27 11:06:51,search,60225f,2017-06-01,2017-06-07,PAR,NYC,5834.154716,7,2017-04-27 11:06:51,2017-06-01,2017-06-07
1,2017-04-27 20:15:27,book,e5d69e,2017-08-12,2017-09-02,FRA,WAS,6525.926149,4,2017-04-27 20:15:27,2017-08-12,2017-09-02


**drop extra columns**

In [41]:
df_event.drop(['ts', 'date_to','date_from'], axis=1, inplace=True)

In [42]:
df_event.head(2)

Unnamed: 0,event_type,user_id,origin,destination,distance,num_family,ts_datetime,date_from_datetime,date_to_datetime
0,search,60225f,PAR,NYC,5834.154716,7,2017-04-27 11:06:51,2017-06-01,2017-06-07
1,book,e5d69e,FRA,WAS,6525.926149,4,2017-04-27 20:15:27,2017-08-12,2017-09-02


### 6.2)-Adding extra Time Feature

Flight_length

In [None]:
df_event['date_from_month']=df_event.date_from_datetime.dt.month
df_event['date_from_day']=df_event.date_from_datetime.dt.day
df_event['date_from_dayofweek']=df_event.date_from_datetime.dt.dayofweek
df_event['date_from_hours']=df_event.date_from_datetime.dt.hour

In [None]:
df_event.head()

In [None]:
# Calendar features derived from the return date (date_to_datetime).
df_event['date_to_month']=df_event.date_to_datetime.dt.month
df_event['date_to_day']=df_event.date_to_datetime.dt.day
# The weekday gets its own column so it does not overwrite the day-of-month above
# (mirrors the existing 'date_from_dayofweek' column).
df_event['date_to_dayofweek']=df_event.date_to_datetime.dt.dayofweek
df_event['date_to_hour']=df_event.date_to_datetime.dt.hour

### 6.3)-working with ts_datetime

In [None]:
df_event['ts_year']=df_event.ts_datetime.dt.year
df_event['ts_month']=df_event.ts_datetime.dt.month
df_event['ts_day']=df_event.ts_datetime.dt.day
df_event['ts_dayofweek']=df_event.ts_datetime.dt.dayofweek
df_event['ts_hour']=df_event.ts_datetime.dt.hour

In [None]:
df_event.head(2)

In [None]:
df_event.ts_year.unique()

In [None]:
df_event.ts_month.unique()

**Neither month nor year will provide much longitudinal (time-related) information. We can only see how many bookings occurred in the month of April vs. the month of May.**

In [None]:
df_event.ts_day.unique()

**Drop the extra columns for clarity. If we keep them, they will also cause multicollinearity.**

In [None]:
df_event.drop(['ts_datetime', 'ts_year','ts_month'], axis=1, inplace=True)

In [None]:
df_event.head(2)

### working with date_from_datetime

In [None]:
# NOTE(review): these three columns are already created from the same source column
# in the earlier "Adding extra Time Feature" cell, so this cell recomputes them with
# the same expressions; it is redundant and can likely be removed.
df_event['date_from_month']=df_event.date_from_datetime.dt.month
df_event['date_from_day']=df_event.date_from_datetime.dt.day
df_event['date_from_hours']=df_event.date_from_datetime.dt.hour

In [None]:
df_event.head(2)

In [None]:
df_event.date_from_month.unique()

In [None]:
df_event.date_from_day.unique()

### working with date_to_datetime

In [None]:
df_event.date_to_day.unique()

# num_family

num_family = num_adults + num_children

**END OF NOTEBOOK**