In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the raw hotel-bookings table from the notebook's working directory.
df = pd.read_csv('hotel_bookings.csv')
# Show the loaded frame as the cell output.
df

In [None]:
# (number of rows, number of columns) of the raw frame.
df.shape

In [None]:
# Number of missing values in each column, before any cleaning.
df.isna().sum()

In [None]:
def rem_null(df):
    """Replace every missing value of ``df`` with 0 and print a check.

    The frame is modified in place and nothing is returned. After filling,
    the per-column count of values that are still missing is printed.
    """
    df.fillna(value=0, inplace=True)
    remaining_nulls = df.isna().sum()
    print(remaining_nulls)
# Fill the missing values of the raw frame in place (df itself is changed).
rem_null(df)

In [None]:
# List the available column names.
df.columns

In [None]:
# Guest-count columns whose distinct values are inspected.
ACB = ['adults', 'children', 'babies']
for col_name in ACB:
    distinct_values = df[col_name].unique()
    print(f'{col_name} has unique value as {distinct_values}')

In [None]:
# Let pandas display up to 32 columns before truncating wide frames.
pd.set_option('display.max_columns',32)

In [None]:
# Rows where adults, children and babies are all zero, i.e. bookings without any guest.
guest_counts = df[['children', 'adults', 'babies']]
filtereddata = (guest_counts == 0).all(axis=1)
df.loc[filtereddata]

In [None]:
# Keep only the bookings that have at least one guest.
has_guests = ~filtereddata
data = df[has_guests]
data

In [None]:
# Column names again, for reference while building the analyses below.
df.columns

# Analysing Home country of guests

In [None]:
# Number of bookings per home country.
# 'No of guests' is never computed anywhere before this cell, so selecting it
# by name from `data` would only produce a column of NaN; it has to be derived.
# Only bookings that were not canceled are counted, matching how guests are
# selected in the rest of the notebook (is_canceled == 0).
countrywise_data = (
    data.loc[data['is_canceled'] == 0, 'country']
    .value_counts()
    .rename_axis('country')
    .reset_index(name='No of guests')
)

In [None]:
# Show the per-country table.
countrywise_data

## Folium Map

In [None]:
import folium
from folium.plugins import HeatMap

# Build a base map (no location given) at zoom level 12 and write it to an HTML file.
basemap = folium.Map(zoom_start=12)
basemap.save("foliummap.html", close_file=True)

In [None]:
# Re-create the base map with all-default settings (this replaces the map saved
# in the previous cell) and render it inline as the cell output.
basemap = folium.Map()

basemap

In [None]:
# Plotly is an advanced-level data visualisation library that is widely used for deployment-level visuals.

import plotly.express as px
import plotly.offline as pyo
import plotly.graph_objs as go
# Switch Plotly to offline notebook mode so figures render inside the notebook.
pyo.init_notebook_mode()

## Plotly Map

In [None]:
# World choropleth coloured by the guest count of each country; the figure is
# also exported to a standalone HTML file before being shown inline.
choropleth_options = dict(
    locations='country',
    color='No of guests',
    color_continuous_scale="ylorrd",
    hover_name='country',
    title='Nationality of Guests',
)
map_cg = px.choropleth(countrywise_data, **choropleth_options)
map_cg.write_html("plotlymap.html")
map_cg.show()

## Countries that are not colored do not appear in the data

In [None]:
# Bookings that were not canceled.
attended_mask = data['is_canceled'].eq(0)
attenddata = data.loc[attended_mask]

In [None]:
# Columns available for the non-canceled bookings.
attenddata.columns

In [None]:
# Distribution of the nightly rate (adr) per reserved room type, split by hotel.
fig, ax = plt.subplots(figsize=(15, 5))
sns.boxplot(x='reserved_room_type', y='adr', hue='hotel', data=attenddata, ax=ax)
# `adr` is plotted as-is (it is not divided by the number of guests anywhere in
# this notebook), so the title must not claim a per-person price.
ax.set_title('Price of room types per night')
ax.set_xlabel('Room type')
ax.set_ylabel('Price(Euro)')
# seaborn already draws the legend for `hue`; re-creating it with a bare
# plt.legend() call would only replace it with a legend that has no title.
plt.show()


### Blue = Resort hotel , Orange = City hotel

1. For a blue box, the top edge is the 75th percentile of the data, the middle line is the 50th percentile (the median) and the bottom edge is the 25th percentile.

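2. The lines extending out of the box are the whiskers. With the default settings used here, each whisker ends at the most extreme data point that lies within 1.5 times the box height (the interquartile range) of the box edge. The end caps are the 100th and 0th percentiles only when there are no outliers beyond them.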

The same reading applies to the orange box.

3. The data points drawn above the upper whisker are unusually high values, and the data points drawn below the lower whisker are unusually low values; both kinds are outliers.

#### An outlier is a data point that is noticeably different from the rest

#### Outliers are stragglers that are extremely high or extremely low values in a data set that can throw off your stats.

### We can conclude from this that, for the CITY HOTEL, the best distribution tends to be that of room type G, and for the RESORT HOTEL that of room type H

# Analysing Prices of Hotels across year

In [None]:
# Non-canceled bookings, separated by hotel type.
not_canceled = data['is_canceled'] == 0
data_rh = data[not_canceled & (data['hotel'] == 'Resort Hotel')]
data_ch = data[not_canceled & (data['hotel'] == 'City Hotel')]

In [None]:
# Preview the first 20 non-canceled resort bookings.
data_rh.head(20)

In [None]:
# Mean nightly rate (adr) of the resort hotel for each arrival month.
resort_by_month = data_rh.groupby('arrival_date_month', as_index=False)
resort_hotel = resort_by_month['adr'].mean()
resort_hotel

In [None]:
# Mean nightly rate (adr) of the city hotel for each arrival month.
city_by_month = data_ch.groupby('arrival_date_month', as_index=False)
city_hotel = city_by_month['adr'].mean()
city_hotel

In [None]:
# Monthly prices of both hotels side by side. The merge suffixes the two 'adr'
# columns with _x (left = resort) and _y (right = city) before they are renamed.
monthly_prices = pd.merge(resort_hotel, city_hotel, on='arrival_date_month')
final = monthly_prices.rename(
    columns={
        'arrival_date_month': 'month',
        'adr_x': 'price_resort',
        'adr_y': 'price_cityhotel',
    }
)
final

In [None]:
pip install sorted-months-weekdays
pip install sort-dataframeby-monthorweek

import sort_dataframeby_monthorweek as sd

In [None]:
def sort_data(df, colname):
    """Return ``df`` ordered by the month names found in column ``colname``.

    Thin wrapper that delegates to ``sd.Sort_Dataframeby_Month`` and hands
    back whatever that helper returns.
    """
    ordered = sd.Sort_Dataframeby_Month(df, colname)
    return ordered


# Put the monthly price table into month order.
final = sort_data(final, 'month')

In [None]:
# One line per hotel: mean nightly price for each month.
price_columns = ['price_resort', 'price_cityhotel']
px.line(
    final,
    x='month',
    y=price_columns,
    title='Room price per night over the month',
)

#### We can conclude that prices at the Resort hotel are considerably higher in summer

#### The City hotel is much more expensive than the Resort hotel in spring and autumn

#### The Resort hotel costs comparatively less than the City hotel except in summer, whereas the City hotel is high in cost

# Analysing Demand Of hotels

In [None]:
# Non-canceled resort bookings, used below to measure monthly demand.
data_rh

In [None]:
# Number of non-canceled resort bookings per arrival month.
rush_rh = data_rh['arrival_date_month'].value_counts().reset_index()
# Columns are relabelled by position: first the month, then its count.
rush_rh.columns = ['Month', 'No of guests in resort']
rush_rh

In [None]:
# Number of non-canceled city-hotel bookings per arrival month.
city_month_counts = data_ch['arrival_date_month'].value_counts()
rush_ch = city_month_counts.reset_index()
# Columns are relabelled by position: first the month, then its count.
rush_ch.columns = ['Month', 'No of guests in city hotel']
rush_ch

In [None]:
# Monthly booking counts of both hotels in one table (city first, resort second).
combined_counts = pd.merge(rush_ch, rush_rh, on='Month')
combined_counts.columns = ['month', 'guests rush in city hotel', 'guests rush in resort']
rush = combined_counts
rush

In [None]:
# Put the demand table into month order before plotting.
rush = sort_data(rush,'month')

In [None]:
# One line per hotel: number of bookings in each month.
rush_graph = px.line(
    rush,
    x='month',
    y=['guests rush in resort', 'guests rush in city hotel'],
    title='No of guests in each month',
)
rush_graph

#### For both Resort and City hotel the Guests are fewer in Winter
#### Guests prefer City hotels in Summer
#### City hotels get more guests every month compared to Resorts
#### Peak time for both City hotels and Resorts is Summer

## How long do people stay at the hotels?

In [None]:
# Bookings that were not canceled.
# The mask has a descriptive name so that the builtin `filter` is not shadowed
# for every later cell, and the subset is copied so that adding columns to it
# does not write into a slice of `data`.
not_canceled_mask = data['is_canceled'] == 0
clean_data = data[not_canceled_mask].copy()

In [None]:
# Preview the non-canceled bookings.
clean_data.head()

In [None]:
# Columns available for the length-of-stay analysis.
clean_data.columns


In [None]:
# Preview again, before the total_nights column is added.
clean_data.head()

In [None]:
# Total length of stay = weekend nights + week nights.
# `assign` returns a new frame, so no column is written into what may be a
# slice of `data` (which would raise SettingWithCopyWarning), and re-running
# the cell gives the same result.
clean_data = clean_data.assign(
    total_nights=clean_data["stays_in_weekend_nights"] + clean_data["stays_in_week_nights"]
)

In [None]:
# Number of bookings for every (length of stay, hotel) combination.
# Only 'is_canceled' is counted, selected by name: this avoids aggregating
# every column of the frame and then depending on 'is_canceled' sitting at a
# fixed position. The result keeps the columns total_nights, hotel, is_canceled.
stay = (
    clean_data
    .groupby(['total_nights', 'hotel'])['is_canceled']
    .count()
    .reset_index()
)
stay

In [None]:
# The count column holds the number of bookings, so label it accordingly.
column_labels = {'is_canceled': 'Number of stays'}
stay = stay.rename(columns=column_labels)
stay

In [None]:
# Bars per length of stay, one colour per hotel (city first, resort second).
fig, ax = plt.subplots(figsize=(20, 8))
sns.barplot(
    data=stay,
    x="total_nights",
    y="Number of stays",
    hue="hotel",
    hue_order=["City Hotel", "Resort Hotel"],
    ax=ax,
)

# Correlation

In [None]:
# Preview the cleaned bookings used for the correlation analysis.
data.head()

In [None]:
# Pairwise correlation of the numeric columns.
# `data` also holds text columns (e.g. 'hotel'); on pandas >= 2.0 a bare
# DataFrame.corr() raises on them, so the numeric columns are selected first.
data.select_dtypes(include='number').corr()

In [None]:
# Correlation of every numeric column with the cancellation flag.
# Text columns are excluded up front because a bare DataFrame.corr() raises on
# them on pandas >= 2.0.
corelation = data.select_dtypes(include='number').corr()['is_canceled']
corelation

In [None]:
# Rank the columns by the strength (absolute value) of their correlation, strongest first.
corelation.abs().sort_values(ascending=False)

In [None]:
# Same ranking without its first entry (presumably is_canceled itself, which
# correlates perfectly with itself).
ranked = corelation.abs().sort_values(ascending=False)
ranked.iloc[1:]

In [None]:
# Column names, for choosing the features below.
data.columns

####    From this list it is apparent that lead_time, total_of_special_requests, required_car_parking_spaces, booking_changes and previous_cancellations are the 5 most important numerical features.
####    However, to predict whether or not a booking will be canceled, the number of booking changes is a possible source of leakage, because this information can change over time.
####    I will also not include days_in_waiting_list, booking_changes and arrival_date_year.

####    The most important feature to exclude is the "reservation_status":

In [None]:
# Count the reservation_status values within each is_canceled group, to show
# how the two columns relate.
data.groupby('is_canceled')['reservation_status'].value_counts()

In [None]:
# Numerical columns left out of the feature set.
# 'booking_changes' belongs here as well: the notes above flag it as a possible
# source of leakage and list it among the numerical features, so naming it only
# in the categorical exclusion list would leave it in num_features.
list_not = ['days_in_waiting_list', 'arrival_date_year', 'booking_changes']

In [None]:
# Every non-object (numerical) column that is not explicitly excluded.
excluded_numeric = set(list_not)
num_features = [
    name
    for name, dtype in data.dtypes.items()
    if dtype != 'O' and name not in excluded_numeric
]
num_features

In [None]:
# Columns to leave out of the categorical feature list.
# NOTE(review): this list is only applied to object-dtype columns (see the
# comprehension that builds cat_features), so any numerical name in it has no
# effect there — confirm which entries are intended.
cat_not=['arrival_date_year', 'assigned_room_type', 'booking_changes', 'reservation_status', 'country','days_in_waiting_list']

In [None]:
# Every object-dtype (categorical) column that is not explicitly excluded.
excluded_categorical = set(cat_not)
cat_features = [
    name
    for name, dtype in data.dtypes.items()
    if dtype == 'O' and name not in excluded_categorical
]
cat_features

In [None]:
# Sub-frame holding only the selected categorical features.
data_cat = data.loc[:, cat_features]

In [None]:
# Preview the categorical feature frame.
data_cat.head()