# Machine Learning in Python

## General Setup

In [34]:
# Display plots inline
%matplotlib inline

# Data libraries
import pandas as pd
import numpy as np
import math 

# Plotting libraries
import matplotlib.pyplot as plt
import seaborn as sns

# sklearn modules
import sklearn

In [3]:
# Plotting defaults
plt.rcParams['figure.figsize'] = (8,5)
plt.rcParams['figure.dpi'] = 80

In [74]:
# Load data
d = pd.read_csv("hotel.csv")
d.head()

Unnamed: 0,is_canceled,hotel,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,assigned_room_type,booking_changes,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests
0,0,Resort Hotel,342,2015,July,27,1,0,0,2,...,C,3,No Deposit,,,0,Transient,0.0,0,0
1,0,Resort Hotel,737,2015,July,27,1,0,0,2,...,C,4,No Deposit,,,0,Transient,0.0,0,0
2,0,Resort Hotel,7,2015,July,27,1,0,1,1,...,C,0,No Deposit,,,0,Transient,75.0,0,0
3,0,Resort Hotel,13,2015,July,27,1,0,1,1,...,A,0,No Deposit,304.0,,0,Transient,75.0,0,0
4,0,Resort Hotel,14,2015,July,27,1,0,2,2,...,A,0,No Deposit,240.0,,0,Transient,98.0,0,1


## 1. Introduction

In [54]:
d.describe()

Unnamed: 0,is_canceled,lead_time,arrival_date_year,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,booking_changes,agent,company,days_in_waiting_list,adr,required_car_parking_spaces,total_of_special_requests
count,119390.0,119390.0,119390.0,119390.0,119390.0,119390.0,119390.0,119390.0,119386.0,119390.0,119390.0,119390.0,119390.0,119390.0,103050.0,6797.0,119390.0,119390.0,119390.0,119390.0
mean,0.370416,104.011416,2016.156554,27.165173,15.798241,0.927599,2.500302,1.856403,0.10389,0.007949,0.031912,0.087118,0.137097,0.221124,86.693382,189.266735,2.321149,101.831122,0.062518,0.571363
std,0.482918,106.863097,0.707476,13.605138,8.780829,0.998613,1.908286,0.579261,0.398561,0.097436,0.175767,0.844336,1.497437,0.652306,110.774548,131.655015,17.594721,50.53579,0.245291,0.792798
min,0.0,0.0,2015.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,6.0,0.0,-6.38,0.0,0.0
25%,0.0,18.0,2016.0,16.0,8.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,62.0,0.0,69.29,0.0,0.0
50%,0.0,69.0,2016.0,28.0,16.0,1.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,14.0,179.0,0.0,94.575,0.0,0.0
75%,1.0,160.0,2017.0,38.0,23.0,2.0,3.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,229.0,270.0,0.0,126.0,0.0,1.0
max,1.0,737.0,2017.0,53.0,31.0,19.0,50.0,55.0,10.0,10.0,1.0,26.0,72.0,21.0,535.0,543.0,391.0,5400.0,8.0,5.0


## 2. Exploratory Data Analysis and Feature Engineering

*A detailed discussion of the data with a particular emphasis on the features of the data that are relevant for the subsequent modeling.* 


*Additionally, this section implements and describe any preprocessing / feature engineering of the data.*



In [6]:
# print number of mising values in data 
for col in d.columns:
    null_num = np.sum(d[col].isnull())
    if null_num > 0:
        print(col,null_num)



children 4
country 488
agent 16340
company 112593


In [8]:
# drop duplicated data
drop_data = d.copy()
drop_data.drop_duplicates(inplace=True)
# Report the number of dropped rows due to duplicates
print(f'Dropped {len(drop_data.index) - len(drop_data.drop_duplicates().index)} rows due to duplicates')

Dropped 0 rows due to duplicates


In [65]:
def high_cancel_agent(col, threshold):
    ''' 
    function to output an array with ids of agents with a cancellation rate of higher than the threshold,
    col ; agents or company
    threshold ; proportion
    '''
    n = col.size
    agent_id = np.array(col.unique()) # array containing unique values in the agent column
    agent_id = agent_id[~np.isnan(agent_id)] # remove nan value

    num_of_cancels = np.zeros(agent_id.size)
    num_of_bookings = np.zeros(agent_id.size)

    # to find the total number of bookings per agent
    for i in range(n):
        agent = col[i]
        if math.isnan(agent):
                k = 1
        else:
            num_of_bookings[list(agent_id).index(agent)] += 1

    # to find the number of cancellations per agent
    for i in range(n):
        if d.is_canceled[i] == 1:
            agent = col[i]
            if math.isnan(agent):
                k = 1
            else:
                num_of_cancels[list(agent_id).index(agent)] += 1

    cancel_rate = num_of_cancels / num_of_bookings
    count = 0
    agent_id_high = []
    agent_bookings_high = []
    agent_rate_high = []

    for i in range(cancel_rate.size):
        if cancel_rate[i] >= threshold:
            agent_id_high.append(agent_id[i])
            agent_bookings_high.append(num_of_bookings[i])
            agent_rate_high.append(cancel_rate[i])

    return agent_id_high, agent_bookings_high, agent_rate_high



In [73]:
agent_high = high_cancel_agent(d.agent, 0.5)[0] # agents with cancellation rate higher than 50%
company_high = high_cancel_agent(d.company, 0.5)[0] # companies with cancellation rate higher than 50%

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d.agent[i] = 0


In [85]:
# assigning 0 to non-significant agents
d_copy = d.copy()

for i in range(d.agent.size):
    if d.agent[i] in agent_high:
        d_copy.agent[i] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d_copy.agent[i] = 0


In [101]:
# assigning 0 to non-significant companies
for i in range(d.company.size):
    if d.company[i] in company_high:
        d_copy.company[i] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d_copy.company[i] = 0


## 3. Model Fitting and Tuning

*Choice of model and describe the process used to refine and fit that model.* (e.g. logistic regression, classification trees, SVC, etc.)

## 4. Discussion & Conclusions


*A general overview of the final model, its performance and reliability*


## 5. References
