# Import Necessory Libraries

In [1]:
# Dataframe manipulation and analysis libraries
import pandas as pd 
import numpy as np

# Data Visualization Library
import matplotlib.pyplot as plt
import seaborn as sns

#Library to filter warnings
import warnings
warnings.filterwarnings('ignore')


#importing machine learning models
from sklearn.linear_model import LogisticRegression,Lasso,RidgeClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier
import xgboost
from xgboost import XGBClassifier
from sklearn.cluster import KMeans

# Data Preparation libraries
from sklearn.model_selection import train_test_split,cross_val_score,RandomizedSearchCV,StratifiedKFold

#Model evaluation Metrics
from sklearn.metrics import accuracy_score,f1_score,recall_score,precision_score

# Feature Decomposition Library
from sklearn.feature_selection import RFE

import pickle

## Loading Dataset

In [2]:
df = pd.read_csv('train_data_evaluation_part_2.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,ID,Nationality,Age,DaysSinceCreation,AverageLeadTime,LodgingRevenue,OtherRevenue,BookingsCanceled,BookingsNoShowed,...,SRMediumFloor,SRBathtub,SRShower,SRCrib,SRKingSizeBed,SRTwinBed,SRNearElevator,SRAwayFromElevator,SRNoAlcoholInMiniBar,SRQuietRoom
0,0,1,PRT,51.0,150,45,371.0,105.3,1,0,...,0,0,0,0,0,0,0,0,0,0
1,1,2,PRT,,1095,61,280.0,53.0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,3,DEU,31.0,1095,0,0.0,0.0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,4,FRA,60.0,1095,93,240.0,60.0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,5,FRA,51.0,1095,0,0.0,0.0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
df = df[:3000]

## Problem statement

*  Train a machine learning model (preferably with a neural network) that predicts the customer who is going to be checked in. Once done, please test the prediction with below test data.


## Exploratory Data Analysis (EDA)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 30 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Unnamed: 0            3000 non-null   int64  
 1   ID                    3000 non-null   int64  
 2   Nationality           3000 non-null   object 
 3   Age                   2729 non-null   float64
 4   DaysSinceCreation     3000 non-null   int64  
 5   AverageLeadTime       3000 non-null   int64  
 6   LodgingRevenue        3000 non-null   float64
 7   OtherRevenue          3000 non-null   float64
 8   BookingsCanceled      3000 non-null   int64  
 9   BookingsNoShowed      3000 non-null   int64  
 10  BookingsCheckedIn     3000 non-null   int64  
 11  PersonsNights         3000 non-null   int64  
 12  RoomNights            3000 non-null   int64  
 13  DaysSinceLastStay     3000 non-null   int64  
 14  DaysSinceFirstStay    3000 non-null   int64  
 15  DistributionChannel  

Analysis :
1. there are total 82580 rows and 30 columns in dataset
2. there are null values in age column

In [5]:
df.nunique().to_frame(name = 'Number of unique values')

Unnamed: 0,Number of unique values
Unnamed: 0,3000
ID,3000
Nationality,67
Age,82
DaysSinceCreation,77
AverageLeadTime,124
LodgingRevenue,696
OtherRevenue,597
BookingsCanceled,3
BookingsNoShowed,3


In [6]:
df['MarketSegment'].value_counts()

Other                    869
Travel Agent/Operator    864
Groups                   560
Direct                   529
Corporate                142
Complementary             34
Aviation                   2
Name: MarketSegment, dtype: int64

In [7]:
df['DistributionChannel'].value_counts()

Travel Agent/Operator      2211
Direct                      575
Corporate                   206
Electronic Distribution       8
Name: DistributionChannel, dtype: int64

In [8]:
df['BookingsCheckedIn'].value_counts()

1     1874
0     1046
2       48
3        9
4        5
5        4
11       3
7        2
8        2
6        2
9        1
12       1
66       1
15       1
29       1
Name: BookingsCheckedIn, dtype: int64

In [9]:
df = df[df['Age'] > 0] 
df = df[df['AverageLeadTime'] >= 0] 
#df = df[df['BookingsCheckedIn'] < 2]
#df['CheckIn'] = df['BookingsCheckedIn'].replace({if > 1:1})

In [10]:
df['CheckIn'] = np.where(df['BookingsCheckedIn'] >= 1,1,0)

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2727 entries, 0 to 2999
Data columns (total 31 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Unnamed: 0            2727 non-null   int64  
 1   ID                    2727 non-null   int64  
 2   Nationality           2727 non-null   object 
 3   Age                   2727 non-null   float64
 4   DaysSinceCreation     2727 non-null   int64  
 5   AverageLeadTime       2727 non-null   int64  
 6   LodgingRevenue        2727 non-null   float64
 7   OtherRevenue          2727 non-null   float64
 8   BookingsCanceled      2727 non-null   int64  
 9   BookingsNoShowed      2727 non-null   int64  
 10  BookingsCheckedIn     2727 non-null   int64  
 11  PersonsNights         2727 non-null   int64  
 12  RoomNights            2727 non-null   int64  
 13  DaysSinceLastStay     2727 non-null   int64  
 14  DaysSinceFirstStay    2727 non-null   int64  
 15  DistributionChannel  

In [12]:
df['CheckIn'].value_counts()

1    1743
0     984
Name: CheckIn, dtype: int64

### Replacing Null Values with Median

In [13]:
#df['Age'].fillna(df['Age'].median(), inplace = True)

### Encoding Catagorical Features

In [14]:
df = pd.get_dummies(df,columns = ['DistributionChannel'])

In [15]:
df = pd.get_dummies(df,columns = ['MarketSegment'])

### Dropping Unnecessory Columns

In [16]:
df.drop(['Unnamed: 0','ID','Nationality'],axis=1,inplace=True)

## Descriptive Analysis

In [17]:
df.describe()

Unnamed: 0,Age,DaysSinceCreation,AverageLeadTime,LodgingRevenue,OtherRevenue,BookingsCanceled,BookingsNoShowed,BookingsCheckedIn,PersonsNights,RoomNights,...,DistributionChannel_Direct,DistributionChannel_Electronic Distribution,DistributionChannel_Travel Agent/Operator,MarketSegment_Aviation,MarketSegment_Complementary,MarketSegment_Corporate,MarketSegment_Direct,MarketSegment_Groups,MarketSegment_Other,MarketSegment_Travel Agent/Operator
count,2727.0,2727.0,2727.0,2727.0,2727.0,2727.0,2727.0,2727.0,2727.0,2727.0,...,2727.0,2727.0,2727.0,2727.0,2727.0,2727.0,2727.0,2727.0,2727.0,2727.0
mean,45.808948,1051.36267,22.235057,149.444822,54.795134,0.0044,0.0022,0.726073,3.544554,1.961496,...,0.185552,0.001834,0.745875,0.000733,0.008067,0.044738,0.173084,0.189219,0.285662,0.298497
std,15.614104,28.754344,32.981099,244.946675,107.516829,0.093727,0.054128,1.477935,4.219071,2.738878,...,0.388816,0.042788,0.435448,0.027077,0.089472,0.206766,0.378389,0.391754,0.451812,0.457682
min,1.0,150.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,35.0,1031.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,47.0,1047.0,4.0,112.44,21.0,0.0,0.0,1.0,2.0,2.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,57.0,1068.5,35.0,216.0,72.49,0.0,0.0,1.0,6.0,3.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
max,83.0,1095.0,162.0,8493.65,2587.5,3.0,2.0,66.0,75.0,95.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


### Building Custom Summery function for indepth EDA report

In [18]:
def custom_summary(mydf):
    cols = []
    for i in df.columns:
        if mydf[i].dtype != object:
            cols.append(i)
    result = pd.DataFrame(columns = cols,index=['datatype','count','min','Q1','Q2','Q3','Max','Mean','stddev','skew','kurt','range','IQR','skew_comment','kurt_comment','outlier_comment'])
    for i in result.columns:
        result.loc['datatype',i] = mydf[i].dtype
        result.loc['count',i] = mydf[i].count()
        result.loc['min',i] = mydf[i].min()
        result.loc['Q1',i] = mydf[i].quantile(0.25)
        result.loc['Q2',i] = mydf[i].quantile(0.50)
        result.loc['Q3',i] = mydf[i].quantile(0.75)
        result.loc['Max',i] = mydf[i].quantile(1)
        result.loc['Mean',i] = round(mydf[i].mean(),2)
        result.loc['stddev',i] = round(mydf[i].std(),2)
        result.loc['skew',i] = round(mydf[i].skew(),2)
        result.loc['kurt',i] = round(mydf[i].kurt(),2)
        result.loc['range',i] = mydf[i].quantile(1)-mydf[i].min()
        result.loc['IQR',i] = mydf[i].quantile(0.75)-mydf[i].quantile(0.25)
        
        #Adding Comments for skewness
        if result.loc['skew',i] < -1:
            sk_label = 'Highly negatively skewed'
        elif -1 < result.loc['skew',i] <= -0.5:
            sk_label = 'Moderately negatively skewed'
        elif -0.5 < result.loc['skew',i] <= 0:
            sk_label = 'Approximately normally distributed(-ve)'
        elif 0 < result.loc['skew',i] <= 0.5:
            sk_label = 'Approximately normally distributed(+ve)'
        elif 0.5 < result.loc['skew',i] <= 1:
            sk_label = 'Moderately Positively Skewd'
        elif result.loc['skew',i] > 1:
            sk_label = 'Highly Positively Skewd'
        else:
            sk_label = 'error'
        result.loc['skew_comment',i] = sk_label
        
        #Adding Comments for Kurtosis
        if result.loc['kurt',i] < -1:
            ku_label = 'Highly Platykurtic'
        elif -1 < result.loc['kurt',i] <= -0.5:
            ku_label = 'Moderately Platykurtic'
        elif -0.5 < result.loc['kurt',i] <= 0.5:
            ku_label = 'Mesokurtic'
        elif 0.5 < result.loc['kurt',i] <= 1:
            ku_label = 'Moderately Leptokurtic'
        elif result.loc['kurt',i] > 1:
            ku_label = 'Highly Leptokurtic'
        else:
            ku_label = 'error'
        result.loc['kurt_comment',i] = ku_label
        
        #adding comments for outliers
        LW = result.loc['Q1',i] - (1.5*result.loc['IQR',i])
        UW = result.loc['Q3',i] + (1.5*result.loc['IQR',i])
        
        if len([x for x in mydf[i] if x < LW or x> UW]) > 0:
            outlier_lable = "Have Outliers"
        else:
            outlier_lable = "No Outliers"
            
        result.loc['outlier_comment',i] = outlier_lable
        
    return result
            

In [19]:
custom_summary(df)

Unnamed: 0,Age,DaysSinceCreation,AverageLeadTime,LodgingRevenue,OtherRevenue,BookingsCanceled,BookingsNoShowed,BookingsCheckedIn,PersonsNights,RoomNights,...,DistributionChannel_Direct,DistributionChannel_Electronic Distribution,DistributionChannel_Travel Agent/Operator,MarketSegment_Aviation,MarketSegment_Complementary,MarketSegment_Corporate,MarketSegment_Direct,MarketSegment_Groups,MarketSegment_Other,MarketSegment_Travel Agent/Operator
datatype,float64,int64,int64,float64,float64,int64,int64,int64,int64,int64,...,uint8,uint8,uint8,uint8,uint8,uint8,uint8,uint8,uint8,uint8
count,2727,2727,2727,2727,2727,2727,2727,2727,2727,2727,...,2727,2727,2727,2727,2727,2727,2727,2727,2727,2727
min,1.0,150,0,0.0,0.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Q1,35.0,1031.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Q2,47.0,1047.0,4.0,112.44,21.0,0.0,0.0,1.0,2.0,2.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Q3,57.0,1068.5,35.0,216.0,72.49,0.0,0.0,1.0,6.0,3.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
Max,83.0,1095.0,162.0,8493.65,2587.5,3.0,2.0,66.0,75.0,95.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
Mean,45.81,1051.36,22.24,149.44,54.8,0.0,0.0,0.73,3.54,1.96,...,0.19,0.0,0.75,0.0,0.01,0.04,0.17,0.19,0.29,0.3
stddev,15.61,28.75,32.98,244.95,107.52,0.09,0.05,1.48,4.22,2.74,...,0.39,0.04,0.44,0.03,0.09,0.21,0.38,0.39,0.45,0.46
skew,-0.28,-11.0,1.68,15.73,8.56,26.61,27.66,32.6,2.94,15.21,...,1.62,23.3,-1.13,36.91,11.0,4.41,1.73,1.59,0.95,0.88


## Checking for Outlier using boxplot

In [20]:
def replace_outlier(mydf,col,method = 'Quartile', stratergy = 'median'):
    if method == 'Quartile':
        Q1 = mydf[col].quantile(0.25)
        Q2 = mydf[col].quantile(0.50)
        Q3 = mydf[col].quantile(0.75)
        IQR = Q3 - Q1
        LW = Q1 - (1.5 * IQR)
        UW = Q3 + (1.5 * IQR)
    elif method == 'std':
        mean = mydf[col].mean()
        std = mydf[col].std()
        LW = mean - (2 * std)
        UW = mean + (2 * std)
    else:
        print('Pass a corect method')
#printing all the outliers 
    outliers = mydf.loc[(mydf[col] < LW) | (mydf[col] > UW),col]
    outliers_density = round(len(outliers) / len(mydf),2)
    if len(outliers) == 0:
        print(f'feature {col} does not have any outliers')
    else:
        print(f'feature {col} has otliers')
        print(f'total no of outliers in {col} is {len(outliers)}')
        print(f'outliers percentage in {col} is {outliers_density*100}%')
    
    if stratergy == 'median':
#     mydf.loc[(mydf[col] < LW), col] = Q2 
#     mydf.loc[(mydf[col] > UW), col] = Q2 
        mydf.loc[(mydf[col] < LW), col] = Q1 
        mydf.loc[(mydf[col] > UW), col] = Q3 
    elif stratergy == 'mean':
        mydf.loc[(mydf[col] < LW), col] = mean 
        mydf.loc[(mydf[col] > UW), col] = mean
    else:
        print('pass the correct stratergy')
        
    return mydf

In [21]:
def odt_plots(mydf,col):
    f,(ax1,ax2) = plt.subplots(1,2,figsize=(25,8))
    #discriptive statistics box plot
    sns.boxplot(mydf[col],ax = ax1)
    ax1.set_title(col + ' boxplot')
    ax1.set_xlabel('values')
    ax1.set_ylabel('boxplot')
    #replacing the outliers
    mydf_out = replace_outlier(mydf,col)
    #plotting box plot without outliers
    sns.boxplot(mydf_out[col],ax = ax2)
    ax2.set_title(col + ' boxplot')
    ax2.set_xlabel('values')
    ax2.set_ylabel('boxplot')
    plt.show()

In [22]:
df.nunique().to_frame(name = 'Number of unique values')

Unnamed: 0,Number of unique values
Age,81
DaysSinceCreation,77
AverageLeadTime,123
LodgingRevenue,662
OtherRevenue,569
BookingsCanceled,3
BookingsNoShowed,3
BookingsCheckedIn,13
PersonsNights,26
RoomNights,19


In [23]:
#out_df = df[['Age', 'DaysSinceCreation','AverageLeadTime','LodgingRevenue','OtherRevenue']]
#for col in out_df.columns:
    #odt_plots(df,col)

In [24]:
df['BookingsCheckedIn'].value_counts()

1     1674
0      984
2       42
3        7
5        4
4        4
11       3
7        2
8        2
6        2
9        1
66       1
15       1
Name: BookingsCheckedIn, dtype: int64

In [25]:
df['CheckIn'].value_counts()

1    1743
0     984
Name: CheckIn, dtype: int64

# Model Building
1. Using train test split
2. Cross Validation
3. Hyperparameter Tuning

In [26]:
def train_and_test_split(data,t_col,testsize=0.3):
    x = data.drop(t_col,axis=1)
    y = data[t_col]
    return train_test_split(x,y,test_size = testsize,random_state = 40)

In [27]:
def model_builder(x,y,model_name,estimators):
    x_train,x_test,y_train,y_test = train_test_split(x,y, test_size=0.3, random_state = 1)
    model = model_name
    model.fit(x_train,y_train)
    y_pred = model.predict(x_test)
    result = accuracy_score(y_test,y_pred)
    return [estimators,result]

In [28]:
 def master_mb(x,y):
        list_model = [LogisticRegression(),DecisionTreeClassifier(),RidgeClassifier(),SVC(),KNeighborsClassifier(),AdaBoostClassifier(),GradientBoostingClassifier(),RandomForestClassifier(),XGBClassifier()]
        result = []
        model_name = []
        for model in list_model:
            temp_result = model_builder(x,y,model,estimators = type(model).__name__)
            result.append(temp_result)
        return result

In [29]:
master_mb(x=df.drop(['BookingsCheckedIn','CheckIn'],axis=1),y=df['CheckIn'])

[['LogisticRegression', 1.0],
 ['DecisionTreeClassifier', 1.0],
 ['RidgeClassifier', 1.0],
 ['SVC', 1.0],
 ['KNeighborsClassifier', 1.0],
 ['AdaBoostClassifier', 1.0],
 ['GradientBoostingClassifier', 1.0],
 ['RandomForestClassifier', 1.0],
 ['XGBClassifier', 1.0]]

In [30]:
df.head()

Unnamed: 0,Age,DaysSinceCreation,AverageLeadTime,LodgingRevenue,OtherRevenue,BookingsCanceled,BookingsNoShowed,BookingsCheckedIn,PersonsNights,RoomNights,...,DistributionChannel_Direct,DistributionChannel_Electronic Distribution,DistributionChannel_Travel Agent/Operator,MarketSegment_Aviation,MarketSegment_Complementary,MarketSegment_Corporate,MarketSegment_Direct,MarketSegment_Groups,MarketSegment_Other,MarketSegment_Travel Agent/Operator
0,51.0,150,45,371.0,105.3,1,0,3,8,5,...,0,0,0,0,0,1,0,0,0,0
2,31.0,1095,0,0.0,0.0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
3,60.0,1095,93,240.0,60.0,0,0,1,10,5,...,0,0,1,0,0,0,0,0,0,1
4,51.0,1095,0,0.0,0.0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
5,54.0,1095,58,230.0,24.0,0,0,1,4,2,...,0,0,1,0,0,0,0,0,1,0


In [31]:
Master_MB = pd.DataFrame(master_mb(x = df.drop(['BookingsCheckedIn','CheckIn'],axis=1), y = df['CheckIn']),columns=['Model Names','Accuraccy']).sort_values(by=['Accuraccy'],ascending = False)
Master_MB = Master_MB.reset_index(drop=True)
Master_MB

Unnamed: 0,Model Names,Accuraccy
0,LogisticRegression,1.0
1,DecisionTreeClassifier,1.0
2,RidgeClassifier,1.0
3,SVC,1.0
4,KNeighborsClassifier,1.0
5,AdaBoostClassifier,1.0
6,GradientBoostingClassifier,1.0
7,RandomForestClassifier,1.0
8,XGBClassifier,1.0


In [32]:
stf = StratifiedKFold(n_splits=10,shuffle=True,random_state=100)
stf.get_n_splits(X=df.drop(['BookingsCheckedIn'],axis=1),y=df['BookingsCheckedIn'])

10

In [33]:
X=df.drop(['BookingsCheckedIn'],axis=1)
Y=df['BookingsCheckedIn']

In [34]:
def startifiedKfoldcv(x,y,fold=10):
    score_lr = LogisticRegression()
    score_rg = RidgeClassifier()
    score_dt = DecisionTreeClassifier()
    score_kn = KNeighborsClassifier()
    score_sv = SVC()
    score_rf = RandomForestClassifier()
    score_ab = AdaBoostClassifier()
    score_gb = GradientBoostingClassifier()
   # score_xg = XGBClassifier()
    scores = [score_lr,score_rg,score_dt,score_kn,score_kn,score_sv,score_rf,score_ab,score_gb]
    score_mean = []
    for train_index,test_index in stf.split(X=X,y=Y):
        #print('train indices:',train_index,'test_indice',test_index)
        x1_train,x1_test = X.iloc[train_index],X.iloc[test_index]
        y1_train,y1_test = Y.iloc[train_index],Y.iloc[test_index]
        for model in scores:
            model.fit(x1_train,y1_train)
            y1_pred = model.predict(x1_test)
            acc = accuracy_score(y1_test,y1_pred)
            score_mean.append(acc)
    model_names = ['LogisticRegression','RidgeClassifier','DecisionTreeClassifier','KNeighborsClassifier','SVC','RandomForestClassifier','AdaBoostClassifier','GradientBoostingClassifier']
    
    result = []
    for i in range(len(model_names)):
        score = score_mean[i]
        m_names = model_names[i]
        temp = [m_names,score]
        result.append(temp)
    kfold_df = pd.DataFrame(result,columns = ['model_names','cv_score']).sort_values(by=['cv_score'], ascending=False)
    kfold_df = kfold_df.reset_index(drop=True)
    return kfold_df
        

In [35]:
startifiedKfoldcv(df.drop(['BookingsCheckedIn','CheckIn'],axis = 1),df['CheckIn'])

Unnamed: 0,model_names,cv_score
0,AdaBoostClassifier,0.992674
1,LogisticRegression,0.989011
2,DecisionTreeClassifier,0.985348
3,KNeighborsClassifier,0.981685
4,SVC,0.981685
5,RidgeClassifier,0.978022
6,RandomForestClassifier,0.978022
7,GradientBoostingClassifier,0.974359


In [36]:
def tuning(x,y,fold = 10):
    
    # Parameter Grids for Different Models
    
    param_rd = {'alpha':[1e-15,1e-13,1e-11,1e-9,1e-7,1e-5,1e-3,1e-1,0,1,2,3,4,5,6,7,8,9,10,20,30,40,50,60,70,80,90,100,200,300,400,500]}
    param_dtr = {'criterion':['gini','entropy','log_loss'],'max_depth':[3,5,7,9,11],'max_features':[1,2,3,4,5,6,7,'auto','log2','sqrt']}
    param_knr = {'weights':['uniform','distance'],'algorithm':['auto','ball_tree','kd_tree','brute']}
    param_svr = {'gamma':['scale','auto'],'C':[0.1,1,1.5,2]}
    param_rf = {'max_depth':[3,5,7,9,11],'max_features':[1,2,3,4,5,6,7,'auto','log2','sqrt'],'n_estimators':[50,100,150,200]}
    param_ad = {'n_estimators':[50,100,150,200],'learning_rate':[0.1,0.5,0.7,1,5,10,20,50,100]}
    param_gb = {'n_estimators':[50,100,150,200],'loss':['log_loss','deviance','exponential']}
    param_xgb = {'eta':[0.1,0.5,0.7,1,5,10 ,20,50,100],'max_depth':[3,5,7,9,11],'gamma':[0,10,20,50,100],'reg_lambda':[0,1,3,5,7,10],'alpha':[0,1,3,5,7,10]}
    
    # Creating Model Object
    tune_rd = RandomizedSearchCV(RidgeClassifier(),param_rd,cv = fold)
    tune_dtr = RandomizedSearchCV(DecisionTreeClassifier(),param_dtr,cv = fold)
    tune_knr = RandomizedSearchCV(KNeighborsClassifier(),param_knr,cv = fold)
    tune_svr = RandomizedSearchCV(SVC(),param_svr,cv = fold)
    tune_rf = RandomizedSearchCV(RandomForestClassifier(),param_rf,cv = fold)
    tune_ad = RandomizedSearchCV(AdaBoostClassifier(),param_ad,cv = fold)
    tune_gb = RandomizedSearchCV(GradientBoostingClassifier(),param_gb,cv = fold)
    tune_xgb = RandomizedSearchCV(XGBClassifier(),param_xgb,cv = fold)
    
    # Model Fitting
    tune_rd.fit(x,y)
    tune_dtr.fit(x,y)
    tune_knr.fit(x,y)
    tune_svr.fit(x,y)
    tune_rf.fit(x,y)
    tune_ad.fit(x,y)
    tune_gb.fit(x,y)
    tune_xgb.fit(x,y)
    
    tune = [tune_rd,tune_dtr,tune_knr,tune_svr,tune_rf,tune_ad,tune_gb,tune_xgb]
    #tune = [tune_knr,tune_ad,tune_xgb]
    models = ['RidgeClassifer','DecisionTreeClassifier','KNeighborsClassifier','SVC','RandomForestClassifier','AdaBoostClassifier','GradientBoostClassifier','XGBClassifier']
    #models = ['KNN Regressor','AdaBoost Regressor','XGB Regressor']
    for i in range(len(tune)):
        print('Model :',models[i])
        print('Best Params :',tune[i].best_params_)

In [37]:
#tuning(df.drop(['BookingsCheckedIn','CheckIn'],axis = 1),df['CheckIn'])

In [38]:
def CV_Post_HPT(x,y,fold = 10):
    score_lr = cross_val_score(LogisticRegression(),x,y,cv = fold)
    score_drt = cross_val_score(DecisionTreeClassifier(criterion = 'entropy',max_depth = 11,max_features = 7),x,y,cv = fold)
    score_rid = cross_val_score(RidgeClassifier(alpha = 1),x,y,cv = fold)
    score_knr = cross_val_score(KNeighborsClassifier(weights = 'uniform',algorithm = 'auto'),x,y,cv = fold)
    score_svr = cross_val_score(SVC(gamma = 'scale',C = 0.1),x,y,cv = fold)
    score_rfr = cross_val_score(RandomForestClassifier(max_depth = 5,max_features = 7,n_estimators = 50),x,y,cv = fold)
    score_ada = cross_val_score(AdaBoostClassifier(n_estimators = 50,learning_rate = 50),x,y,cv = fold)
    score_gb = cross_val_score(GradientBoostingClassifier(n_estimators = 50,loss ='deviance'),x,y,cv = fold)
    score_xgb = cross_val_score(XGBClassifier(eta =0.1,max_depth = 7,gamma = 100,reg_lambda = 7,alpha = 7),x,y,cv = fold)
    
    model_names = ['LogisticRegression','DecisionTreeClassifier','RidgeClassifier','KNeighborsClassifier','SVC','RandomForestClassifier','AdaBoostClassifier','GradientBoostClassifier','XGBClassifier']
    scores = [score_lr,score_drt,score_rid,score_knr,score_svr,score_rfr,score_ada,score_gb,score_xgb]
    result = []
    for i in range(len(model_names)):
        score_mean = np.mean(scores[i])
        score_stdev = np.std(scores[i])
        m_names = model_names[i]
        temp = [m_names,score_mean,score_stdev]
        result.append(temp)
    KFold_df = pd.DataFrame(result,columns = ['Model Name','CV Score','CV Stdev']).sort_values(by = 'CV Score',ascending = False)  
    KFold_df = KFold_df.reset_index(drop=True)
    return KFold_df

In [39]:
CV_Post_HPT(df.drop(['BookingsCheckedIn','CheckIn'],axis = 1),df['CheckIn'])

Unnamed: 0,Model Name,CV Score,CV Stdev
0,LogisticRegression,1.0,0.0
1,DecisionTreeClassifier,1.0,0.0
2,RandomForestClassifier,1.0,0.0
3,AdaBoostClassifier,1.0,0.0
4,GradientBoostClassifier,1.0,0.0
5,XGBClassifier,1.0,0.0
6,RidgeClassifier,0.999266,0.001468
7,KNeighborsClassifier,0.999266,0.001468
8,SVC,0.999266,0.001468


# RFE

In [40]:
rfe_dfe = df
rfe = RFE(estimator= XGBClassifier())
rfe.fit(df.drop(['BookingsCheckedIn','CheckIn'],axis=1),df['CheckIn'])
rfe.support_

array([False, False, False, False,  True,  True,  True,  True,  True,
        True,  True,  True, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True])

In [41]:
df.columns

Index(['Age', 'DaysSinceCreation', 'AverageLeadTime', 'LodgingRevenue',
       'OtherRevenue', 'BookingsCanceled', 'BookingsNoShowed',
       'BookingsCheckedIn', 'PersonsNights', 'RoomNights', 'DaysSinceLastStay',
       'DaysSinceFirstStay', 'SRHighFloor', 'SRLowFloor', 'SRAccessibleRoom',
       'SRMediumFloor', 'SRBathtub', 'SRShower', 'SRCrib', 'SRKingSizeBed',
       'SRTwinBed', 'SRNearElevator', 'SRAwayFromElevator',
       'SRNoAlcoholInMiniBar', 'SRQuietRoom', 'CheckIn',
       'DistributionChannel_Corporate', 'DistributionChannel_Direct',
       'DistributionChannel_Electronic Distribution',
       'DistributionChannel_Travel Agent/Operator', 'MarketSegment_Aviation',
       'MarketSegment_Complementary', 'MarketSegment_Corporate',
       'MarketSegment_Direct', 'MarketSegment_Groups', 'MarketSegment_Other',
       'MarketSegment_Travel Agent/Operator'],
      dtype='object')

In [42]:
rfe_df = df[['OtherRevenue', 'BookingsCanceled', 'BookingsNoShowed', 'PersonsNights', 'RoomNights', 'DaysSinceLastStay','DaysSinceFirstStay','DistributionChannel_Electronic Distribution','DistributionChannel_Travel Agent/Operator', 'MarketSegment_Aviation','MarketSegment_Complementary', 'MarketSegment_Corporate','MarketSegment_Direct', 'MarketSegment_Groups', 'MarketSegment_Other','MarketSegment_Travel Agent/Operator']]

In [43]:
CV_Post_HPT(rfe_df,df['CheckIn'])

Unnamed: 0,Model Name,CV Score,CV Stdev
0,DecisionTreeClassifier,1.0,0.0
1,RandomForestClassifier,1.0,0.0
2,AdaBoostClassifier,1.0,0.0
3,GradientBoostClassifier,1.0,0.0
4,XGBClassifier,1.0,0.0
5,LogisticRegression,0.999634,0.001099
6,RidgeClassifier,0.999266,0.001468
7,KNeighborsClassifier,0.999266,0.001468
8,SVC,0.999266,0.001468


In [44]:
file = open('hotel_booking.pkl','wb')
#dump information to that file
pickle.dump('xgb',file)

In [45]:
rfe_df.columns

Index(['OtherRevenue', 'BookingsCanceled', 'BookingsNoShowed', 'PersonsNights',
       'RoomNights', 'DaysSinceLastStay', 'DaysSinceFirstStay',
       'DistributionChannel_Electronic Distribution',
       'DistributionChannel_Travel Agent/Operator', 'MarketSegment_Aviation',
       'MarketSegment_Complementary', 'MarketSegment_Corporate',
       'MarketSegment_Direct', 'MarketSegment_Groups', 'MarketSegment_Other',
       'MarketSegment_Travel Agent/Operator'],
      dtype='object')

In [46]:
df['DaysSinceLastStay'].value_counts()

-1       984
 1046     98
 1025     67
 1032     52
 1059     50
        ... 
 758       1
 1015      1
 96        1
 101       1
 151       1
Name: DaysSinceLastStay, Length: 136, dtype: int64

In [47]:
#model = pickle.load(open('hotel_booking.pickle', 'rb'))

In [52]:
df7 = df.rename(columns={'DistributionChannel_Electronic Distribution':'DistributionChannel_Electronic_Distribution','DistributionChannel_Travel Agent/Operator':'DistributionChannel_Travel_Agent_Operator', 'MarketSegment_Travel Agent/Operator':'MarketSegment_Travel_Agent_Operator'})


In [49]:
df.head()

Unnamed: 0,Age,DaysSinceCreation,AverageLeadTime,LodgingRevenue,OtherRevenue,BookingsCanceled,BookingsNoShowed,BookingsCheckedIn,PersonsNights,RoomNights,...,DistributionChannel_Direct,DistributionChannel_Electronic_Distribution,DistributionChannel_Travel_Agent_Operator,MarketSegment_Aviation,MarketSegment_Complementary,MarketSegment_Corporate,MarketSegment_Direct,MarketSegment_Groups,MarketSegment_Other,MarketSegment_Travel_Agent_Operator
0,51.0,150,45,371.0,105.3,1,0,3,8,5,...,0,0,0,0,0,1,0,0,0,0
2,31.0,1095,0,0.0,0.0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
3,60.0,1095,93,240.0,60.0,0,0,1,10,5,...,0,0,1,0,0,0,0,0,0,1
4,51.0,1095,0,0.0,0.0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
5,54.0,1095,58,230.0,24.0,0,0,1,4,2,...,0,0,1,0,0,0,0,0,1,0


In [53]:
df7

Unnamed: 0,Age,DaysSinceCreation,AverageLeadTime,LodgingRevenue,OtherRevenue,BookingsCanceled,BookingsNoShowed,BookingsCheckedIn,PersonsNights,RoomNights,...,DistributionChannel_Direct,DistributionChannel_Electronic_Distribution,DistributionChannel_Travel_Agent_Operator,MarketSegment_Aviation,MarketSegment_Complementary,MarketSegment_Corporate,MarketSegment_Direct,MarketSegment_Groups,MarketSegment_Other,MarketSegment_Travel_Agent_Operator
0,51.0,150,45,371.0,105.3,1,0,3,8,5,...,0,0,0,0,0,1,0,0,0,0
2,31.0,1095,0,0.0,0.0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
3,60.0,1095,93,240.0,60.0,0,0,1,10,5,...,0,0,1,0,0,0,0,0,0,1
4,51.0,1095,0,0.0,0.0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
5,54.0,1095,58,230.0,24.0,0,0,1,4,2,...,0,0,1,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2994,31.0,1020,41,229.0,87.0,0,0,1,6,3,...,0,0,1,0,0,0,0,1,0,0
2995,48.0,1020,41,229.0,103.5,0,0,1,6,3,...,0,0,1,0,0,0,0,1,0,0
2996,45.0,1020,41,229.0,126.5,0,0,1,6,3,...,0,0,1,0,0,0,0,1,0,0
2998,42.0,1020,41,197.0,41.0,0,0,1,3,3,...,0,0,1,0,0,0,0,1,0,0
