# Imports

In [1]:
!pip install ydata_profiling

Collecting ydata_profiling
  Downloading ydata_profiling-4.12.0-py2.py3-none-any.whl.metadata (20 kB)
Collecting visions<0.7.7,>=0.7.5 (from visions[type_image_path]<0.7.7,>=0.7.5->ydata_profiling)
  Downloading visions-0.7.6-py3-none-any.whl.metadata (11 kB)
Collecting htmlmin==0.1.12 (from ydata_profiling)
  Downloading htmlmin-0.1.12.tar.gz (19 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting phik<0.13,>=0.11.1 (from ydata_profiling)
  Downloading phik-0.12.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)
Collecting multimethod<2,>=1.4 (from ydata_profiling)
  Downloading multimethod-1.12-py3-none-any.whl.metadata (9.6 kB)
Collecting imagehash==4.3.1 (from ydata_profiling)
  Downloading ImageHash-4.3.1-py2.py3-none-any.whl.metadata (8.0 kB)
Collecting dacite>=1.8 (from ydata_profiling)
  Downloading dacite-1.8.1-py3-none-any.whl.metadata (15 kB)
Collecting PyWavelets (from imagehash==4.3.1->ydata_profiling)
  Downloading pywavelets-1.

In [2]:
# Importing libraries

import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
import sklearn
from ydata_profiling import ProfileReport

In [3]:
# Importing dataset

from google.colab import drive
drive.mount('/content/drive')

train_df = pd.read_csv("/content/drive/MyDrive/Datasets/titanic/train.csv")
test_df = pd.read_csv("/content/drive/MyDrive/Datasets/titanic/test.csv")


Mounted at /content/drive


In [4]:
features = test_df.columns

In [5]:
df = pd.concat([train_df[features], test_df])

# Exploratory data analysis

## Automated EDA using ydata

In [6]:
report = ProfileReport(df, title = "report")

In [7]:
report

Output hidden; open in https://colab.research.google.com to view.

In [9]:
train_report = ProfileReport(train_df, title = "train_report")
train_report

Output hidden; open in https://colab.research.google.com to view.

## Preprocessing

### Handling missing values

In [10]:
# Missing values

df.isnull().sum()

Unnamed: 0,0
PassengerId,0
Pclass,0
Name,0
Sex,0
Age,263
SibSp,0
Parch,0
Ticket,0
Fare,1
Cabin,1014


In [11]:
# Missing data in training dataset

train_df.isnull().sum()

Unnamed: 0,0
PassengerId,0
Survived,0
Pclass,0
Name,0
Sex,0
Age,177
SibSp,0
Parch,0
Ticket,0
Fare,0


In [12]:
# missing data in testing data

test_df.isnull().sum()

Unnamed: 0,0
PassengerId,0
Pclass,0
Name,0
Sex,0
Age,86
SibSp,0
Parch,0
Ticket,0
Fare,1
Cabin,327


#### Age

In [15]:
# Unmissed values
df[df["Age"].notnull()]["Sex"].value_counts()

Unnamed: 0_level_0,count
Sex,Unnamed: 1_level_1
male,658
female,388


In [None]:
# Missed values

df[df["orig_destination_distance"].isnull()]["is_booking"].value_counts()

Unnamed: 0_level_0,count
is_booking,Unnamed: 1_level_1
0,33396
1,2689


In [None]:
# Dropping "orig_destination_distance"

df.drop(["orig_destination_distance"], axis = 1, inplace = True)

#### srch_ci and srch_co

In [None]:
df[df["srch_co"].isnull()]["srch_ci"].isnull().sum()

# so 122 missing values of features srch_co and srch_ci from the same rows

122

In [None]:
df[df["srch_ci"].isnull()]["is_booking"].value_counts()

Unnamed: 0_level_0,count
is_booking,Unnamed: 1_level_1
0,122


In [None]:
# checking other values

df[df["srch_ci"].isnull()]["is_package"].value_counts()

Unnamed: 0_level_0,count
is_package,Unnamed: 1_level_1
0,107
1,15


In [None]:
df.dropna(subset = ["srch_co","srch_ci"],axis=0, inplace = True)

In [None]:
df.isnull().sum()

Unnamed: 0,0
Unnamed: 0,0
date_time,0
site_name,0
posa_continent,0
user_location_country,0
user_location_region,0
user_location_city,0
user_id,0
is_mobile,0
is_package,0


### Feature Extraction

#### Total number of booking days

In [None]:
# Creating new features  from date time data

from datetime import datetime

def helper(date_str):
    dformat = "%Y-%m-%d"
    return datetime.strptime(date_str, dformat)

# converting str datetime into datetime format
df["srch_co"] = df["srch_co"].apply(helper)
df["srch_ci"] = df["srch_ci"].apply(helper)



In [None]:
# feature extraction total_no_of_days

df["srch_days"] = df["srch_co"] - df["srch_ci"]

In [None]:
# converting into integer format for machine learning purpose

df["srch_days"] = df["srch_days"].apply(lambda x:x.days)

In [None]:
df["srch_days"]

Unnamed: 0,srch_days
0,4
1,1
2,7
3,1
4,5
...,...
99995,4
99996,1
99997,7
99998,2


In [None]:
# dropping unwanted features

df.drop(labels = ["srch_ci", "srch_co"], axis = 1, inplace = True)

In [None]:
df.columns

Index(['Unnamed: 0', 'date_time', 'site_name', 'posa_continent',
       'user_location_country', 'user_location_region', 'user_location_city',
       'user_id', 'is_mobile', 'is_package', 'channel', 'srch_adults_cnt',
       'srch_children_cnt', 'srch_rm_cnt', 'srch_destination_id',
       'srch_destination_type_id', 'is_booking', 'cnt', 'hotel_continent',
       'hotel_country', 'hotel_market', 'hotel_cluster', 'srch_days'],
      dtype='object')

#### Time stamp in days between datapoints


In [None]:
# Converting session times into days and minutes

def helper2(date_str):
    dformat = "%Y-%m-%d %H:%M:%S"
    return datetime.strptime(date_str, dformat)

In [None]:
# converting datetime str into datetime

df["date_time"] = df["date_time"].apply(helper2)

In [None]:
latest_session_time = df["date_time"].sort_values(ascending = False)[0]

In [None]:
# calculating the session period
df["session_day"] = latest_session_time - df["date_time"]
df["session_day"] = df["session_day"].apply(lambda x:x.days)
df["session_hour"] = latest_session_time -df["date_time"]
df["session_hour"] = df["session_hour"].apply(lambda x:x.seconds/3600)

In [None]:
# dropping datetime column

df.drop(labels = ["date_time", 'Unnamed: 0'], axis = 1, inplace = True)

## Encoding

# Classification

## train_test_split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
df.columns

Index(['site_name', 'posa_continent', 'user_location_country',
       'user_location_region', 'user_location_city', 'user_id', 'is_mobile',
       'is_package', 'channel', 'srch_adults_cnt', 'srch_children_cnt',
       'srch_rm_cnt', 'srch_destination_id', 'srch_destination_type_id',
       'is_booking', 'cnt', 'hotel_continent', 'hotel_country', 'hotel_market',
       'hotel_cluster', 'srch_days', 'session_day', 'session_hour'],
      dtype='object')

In [None]:
features = ['site_name', 'posa_continent', 'user_location_country',
       'user_location_region', 'user_location_city', 'user_id', 'is_mobile',
       'is_package', 'channel', 'srch_adults_cnt', 'srch_children_cnt',
       'srch_rm_cnt', 'srch_destination_id', 'srch_destination_type_id',
       'cnt', 'hotel_continent', 'hotel_country', 'hotel_market',
       'hotel_cluster', 'srch_days', 'session_day', 'session_hour']
target = 'is_booking'

In [None]:
x_train, x_test, y_train, y_test = train_test_split(df[features], df[target], test_size = 0.33, stratify = df[target])

In [None]:
!pip install lazypredict

Collecting lazypredict
  Downloading lazypredict-0.2.13-py2.py3-none-any.whl.metadata (12 kB)
Downloading lazypredict-0.2.13-py2.py3-none-any.whl (12 kB)
Installing collected packages: lazypredict
Successfully installed lazypredict-0.2.13


In [None]:
from lazypredict.Supervised import LazyClassifier
import lazypredict

In [None]:
len(lazypredict.Supervised.CLASSIFIERS)

29

In [None]:
classifiers = lazypredict.Supervised.CLASSIFIERS

In [None]:
classifiers.pop(12)

('LabelSpreading', sklearn.semi_supervised._label_propagation.LabelSpreading)

In [None]:
clf = LazyClassifier( classifiers = classifiers, verbose=0,ignore_warnings=True, custom_metric=None)
models,predictions = clf.fit(x_train, x_test, y_train, y_test)

'tuple' object has no attribute '__name__'
Invalid Classifier(s)


 97%|█████████▋| 28/29 [10:43<01:18, 78.21s/it] 

[LightGBM] [Info] Number of positive: 5355, number of negative: 61563
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003513 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2371
[LightGBM] [Info] Number of data points in the train set: 66918, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080023 -> initscore=-2.442030
[LightGBM] [Info] Start training from score -2.442030


100%|██████████| 29/29 [10:44<00:00, 22.22s/it]


In [None]:
print(models)

                               Accuracy  Balanced Accuracy  ROC AUC  F1 Score  \
Model                                                                           
QuadraticDiscriminantAnalysis      0.69               0.66     0.66      0.76   
GaussianNB                         0.66               0.65     0.65      0.74   
NearestCentroid                    0.57               0.65     0.65      0.67   
PassiveAggressiveClassifier        0.76               0.56     0.56      0.81   
Perceptron                         0.78               0.55     0.55      0.82   
DecisionTreeClassifier             0.85               0.54     0.54      0.86   
ExtraTreeClassifier                0.85               0.53     0.53      0.86   
KNeighborsClassifier               0.92               0.51     0.51      0.88   
BaggingClassifier                  0.92               0.50     0.50      0.88   
XGBClassifier                      0.92               0.50     0.50      0.88   
LGBMClassifier              

## Visualising Results

In [None]:
# Accuracy

line = px.line(data_frame = models, y = ["Accuracy"], markers=True)
line.update_xaxes(title = "Model", rangeslider_visible = False)
line.update_yaxes(title = "Accuracy")
line.update_traces(line_color="red")
line.update_layout(showlegend=True,
                   title = {
                       "text":"Accuracu vs model",
                       "y":0.95,
                       "x":0.5,
                       "xanchor":"center",
                       "yanchor":"top"
                   })
line.show()

In [None]:
# ROC and F1 Score

line = px.line(data_frame= models, y = ["ROC AUC", "F1 Score"], markers = True)
line.update_xaxes(title = "Model", rangeslider_visible = False)
line.update_yaxes(title = "ROC AUC Score")
line.update_layout(showlegend = True,
                   title= {
                       "text":"ROC AOC and F1 Score vs Model",
                       "x" :0.50,
                       "y":0.95,
                       "xanchor":"center",
                       "yanchor":"top"
                   })
line.show()

In [None]:
# Training Time

line = px.line(data_frame = models, y = ["Time Taken"], markers = True)
line.update_xaxes(title = "Model", rangeslider_visible = False)
line.update_yaxes(title = "Time Taken")
line.update_traces(line_color = "green")
line.update_layout(showlegend = True,
                   title = {
                       "text":"Training Time vs Model",
                       "x":0.50,
                       "y":0.95,
                       "xanchor":"center",
                       "yanchor":"top"
                   })
line.show()

In [None]:
models = pd.DataFrame(models).to_csv("/content/drive/MyDrive/Datasets/expedia-travel-dataset/models.csv")

In [None]:
predictions = pd.DataFrame(predictions).to_csv("/content/drive/MyDrive/Datasets/expedia-travel-dataset/predictions.csv")