In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import pickle
import os
import math
import random
import numpy as np    
import seaborn as sns
import missingno as msno

# Configure Visualization Defaults
%matplotlib inline
plt.rcParams['figure.figsize'] = 12,8

### Exploratory Data Analysis

In [None]:
# Read only the first 3000 rows of the training set (presumably to keep the
# exploration quick); the commented-out line below reads the complete file.
df = pd.read_csv("../data/training_set_VU_DM.csv", nrows=3000)
# df = pd.read_csv("../data/training_set_VU_DM.csv")

In [None]:
# Preview the first rows of the loaded frame.
df.head()

### The dataset contains 129,113 unique hotels

In [None]:
# Parse the timestamp column once, then derive the calendar year and month from it
parsed_date_time = pd.to_datetime(df["date_time"])
df["date_time"] = parsed_date_time
df["year"] = parsed_date_time.dt.year
df["month"] = parsed_date_time.dt.month

In [None]:
def correlation_heatmap(df):
    """Draw an annotated correlation heatmap for the columns of ``df``.

    The heatmap is drawn on a newly created 9x9 inch figure. That figure is
    the only one this function creates, so it is still the current matplotlib
    figure when the function returns and a following ``plt.savefig`` call
    stores it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose pairwise column correlations (``df.corr()``) are plotted.
    """
    sns.set(font_scale=1)
    # Create exactly one figure: an extra plt.figure() call here would leave
    # an empty figure behind that is rendered next to the heatmap.
    fig, ax = plt.subplots(figsize=(9, 9))
    cmap = sns.cubehelix_palette(light=1.2, as_cmap=True)
    ax.set_title('Pearson correlation matrix of property features',
                 fontdict={'fontsize': 16})
    # Separate axes for the colour bar, placed just right of the heatmap
    # ([left, bottom, width, height] as fractions of the figure).
    cbar_ax = fig.add_axes([.905, .3, .05, .5])
    sns.heatmap(
        df.corr(),
        cmap=cmap,
        square=True,
        ax=ax,
        annot=True,
        linewidths=0.1, vmax=1.0, linecolor='white',
        annot_kws={'fontsize': 10},
        cbar_ax=cbar_ax)

In [None]:
# Positional (iloc) column indices for each group of attributes
visitor = [3, 4, 5]
# Column 14 is left out of the property group ...
prop = [idx for idx in range(6, 17) if idx != 14]
srch = list(range(17, 24))
# ... and appended to the target group instead
target = [51, 52, 53, 14]

heatmap_columns = prop + target
correlation_heatmap(df.iloc[:, heatmap_columns])

plt.savefig("../plots/correlation.pdf", bbox_inches='tight')

In [None]:
from matplotlib.ticker import PercentFormatter

sns.set()

# Split the rows by random_bool, then keep the positions of the clicked results
df_random = df[df['random_bool'] == 1]
df_nonrandom = df[df['random_bool'] == 0]
hist1 = df_random[df_random['click_bool'] == 1]['position']
hist2 = df_nonrandom[df_nonrandom['click_bool'] == 1]['position']

fig, ax = plt.subplots(figsize=(6, 3))

# Weight every click by 1/N so the bar heights of each group sum to 1
random_weights = np.ones(len(hist1)) / len(hist1)
nonrandom_weights = np.ones(len(hist2)) / len(hist2)

# The second histogram reuses the bin edges of the first one
a_heights, a_bins = np.histogram(hist1, bins=40, weights=random_weights)
b_heights, b_bins = np.histogram(hist2, bins=a_bins, weights=nonrandom_weights)

width = a_bins[1] - a_bins[0]

ax.bar(a_bins[:-1], a_heights, width=width, facecolor='cornflowerblue')
# Second series is stacked on top of the first and shifted by (width - 0.93)
ax.bar(b_bins[:-1] + width - 0.93, b_heights, width=width,
       facecolor='seagreen', bottom=a_heights)

ax.legend(["Random order", "Non-random order"])
ax.set_title("Number of clicks per position for random and non random order")
ax.set_ylabel("Clicks")
ax.set_xlabel("Position")
ax.yaxis.set_major_formatter(PercentFormatter(1))
plt.show()


fig.savefig("../plots/bookings_per_pos.pdf", bbox_inches='tight')

In [None]:
import seaborn as sns
sns.set()

fig, ax = plt.subplots(figsize=(14,3))
# Share of missing entries per column in percent, sorted ascending
missing = (df.isnull().sum()/len(df)*100).sort_values()
plt.xticks(rotation=90)
ax.set_ylabel("Percentage of missing values")
ax.set_title("Overview of missing values for the trainingset")
ax.bar(missing.index, height=missing, color="seagreen")
fig.savefig("../plots/missingvalues.pdf", bbox_inches = 'tight')

In [None]:
# msno.matrix(df, labels=True, figsize=(20, 5))
# msno.bar(df2, labels=True, figsize=(20, 5))
# dendo = msno.dendrogram(df)
# fig_copy = dendo.get_figure()
# fig_copy.savefig("../plots/dendogram.pdf")

## Analyse comp_rate_diff --> difference in price for Expedia competitor and difference in availability

In [None]:
import seaborn as sns
sns.set()

# Names of the eight comp<N>_rate columns
compare_cols = ["comp{}_rate".format(number) for number in range(1, 9)]

# Mean of every comp<N>_rate column, in the same order as compare_cols
total_diff = [np.mean(df[col]) for col in compare_cols]

def showcompareplot(compare_cols, total_diff, title):
    """Draw a bar plot with one bar per entry of ``compare_cols``.

    Parameters
    ----------
    compare_cols : list of str
        Labels placed on the x-axis, one per bar.
    total_diff : list of float
        Bar heights, in the same order as ``compare_cols``.
    title : str
        Title shown above the plot.
    """
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.set(ylabel='Mean of the whole column', title=title)
    # x and y are passed by keyword because seaborn >= 0.12 accepts only
    # `data` positionally; `ax` pins the bars to the axes created above.
    sns.barplot(x=compare_cols, y=total_diff, ax=ax)

# The title is built from adjacent string literals: a backslash continuation
# inside a single literal would embed the next line's indentation in the title.
# All eight competitor columns are plotted, so the title names all of them.
showcompareplot(
    compare_cols,
    total_diff,
    title="Price in comparison to competitors 1-8 for all data; "
          "positive = lower price!",
)

### On average, Expedia has a lower price than competitor 1 for the hotel. For competitor 4, Expedia has a higher price (on average!). However, when we look at availability, we see that it is slightly higher for competitor 4 than for the other competitors. On the other hand, this is only about 10%. 

In [None]:
# Names of the eight comp<N>_inv columns
compare_cols_inv = ["comp{}_inv".format(number) for number in range(1, 9)]

# Mean of every comp<N>_inv column, in the same order as compare_cols_inv
total_diff = [np.mean(df[col]) for col in compare_cols_inv]

showcompareplot(compare_cols_inv, total_diff, "Availability rates for all dataset")

In [None]:
# Add comp_inv and comp_rate together, giving one combined column per competitor.
# The column names are collected while the columns are created, so the list
# cannot drift from the naming used in the loop.
compare_cols_total = []
for number, (inv_col, rate_col) in enumerate(zip(compare_cols_inv, compare_cols), start=1):
    combined_col = "combine_inv_rate" + str(number)
    df[combined_col] = df[inv_col] + df[rate_col]
    compare_cols_total.append(combined_col)

# Mean of every combined column, in the same order as compare_cols_total
total_diff = [np.mean(df[col]) for col in compare_cols_total]

# This plot shows the summed rate + inv columns, so it gets its own title
# rather than reusing the one of the availability plot.
showcompareplot(compare_cols_total, total_diff,
                "Mean of combined price and availability indicators for all data")

In [None]:
# Make a new column for when there exists a competitor.
# compare_cols_total holds the columns that combine comp_rate and comp_inv,
# i.e. competitor price and room availability. A row is flagged with 1 when
# at least one of these combined columns equals exactly 1, otherwise with 0.
#
# The comparison runs column-wise over the whole frame at once, not row by
# row in Python. Missing values never compare equal to 1, so they count as
# "no match", exactly as in a per-row `== 1` check.
df["competitor_bool"] = (df[compare_cols_total] == 1).any(axis=1).astype(int)

In [None]:
# Count of rows per competitor_bool value on a small square figure
plt.figure(figsize=(4, 4))
competitor_ax = sns.countplot(data=df, x="competitor_bool")
competitor_ax.set_title("Existence of a competitor; 0 = False, 1 = True")

In [None]:
# Remove the comp_inv, comp_rate and combined columns in a single drop
competitor_columns = compare_cols_inv + compare_cols + compare_cols_total
df = df.drop(competitor_columns, axis=1)

### Most people search for a one night stay and with two people (adults) and no children. 

In [None]:
from matplotlib.ticker import ScalarFormatter

# Frequency of each requested length of stay
plt.figure(figsize=(7, 4))
countplot = sns.countplot(x="srch_length_of_stay", data=df)
plt.xlabel('Length of stay')
plt.ylabel('Frequency')
plt.title("Number of search per length of stay")
# Restrict the x-axis to the first bars; longer stays are cut off
countplot.set_xlim(-1,14)

plt.show()
# Save into "../plots/" like every other figure in this notebook; the bare
# "plots/" path pointed at a different directory.
countplot.figure.savefig("../plots/srch_length_of_stay.pdf", bbox_inches='tight')


# Frequency of the number of adults per search
plt.figure(figsize=(11, 4))
sns.countplot(x="srch_adults_count", data=df)

# Frequency of the number of children per search
plt.figure(figsize=(11, 4))
sns.countplot(x="srch_children_count", data=df)

### Most people visit the website from the US (id = 219).

In [None]:
# Print the five most frequent visitor country ids, most frequent first
country_counts = df['visitor_location_country_id'].value_counts()
print(country_counts.head(5).index.tolist())

# Distribution plot over the raw country id values
sns.distplot(df['visitor_location_country_id'], label="User country")
plt.legend()

## Clean price data

#### There are a lot of prices between 7 and 240, then outlying category between 240 and 554655:
#### Categories (7, interval[float64]): [(6.0889999999999995, 69.0] < (69.0, 90.0] < (90.0, 110.0] < (110.0, 136.0] < (136.0, 170.077] < (170.077, 239.0] < (239.0, 554655.0]]

## Note that different countries have different conventions regarding displaying taxes and fees and the value may be per night or for the whole stay

## For this reason, the choice was made to only select the US 

In [None]:
# Keep only the rows with visitor country id 219, as an independent copy
is_country_219 = df['visitor_location_country_id'] == 219
df_us = df.loc[is_country_219].copy()

### There are a lot less clicks for the hotels with higher price (as expected)

In [None]:
# Divide the price by the length of stay to correct for the number of nights
nights = df_us["srch_length_of_stay"]
df_us["price_correction"] = df_us["price_usd"].div(nights)

# Summary statistics of the uncorrected price, split by click_bool
df_us.groupby('click_bool')['price_usd'].describe()

In [None]:
# Cut the corrected price into 7 quantile-based intervals and show the first ten
price_bands = pd.qcut(df_us["price_correction"], q=7)
df_us['PriceBand'] = price_bands
df_us['PriceBand'].head(10)

In [None]:
import seaborn as sns
sns.set()

# `plt` stays bound to matplotlib.pyplot from the notebook's import cell;
# it is deliberately not re-imported from matplotlib.pylab here.

# Set the font sizes before the axes are created so they apply to this figure
plt.rcParams["axes.labelsize"] = 17
plt.rcParams["axes.titlesize"] = 17

# One figure with one axes, kept as explicit handles for plotting and saving
fig, ax = plt.subplots(figsize=(14, 4))
sns.distplot(df['price_usd'], bins=500, label="Distribution of prices", color='blue', ax=ax)
# Show only prices up to 900; higher prices are cut off
ax.set_xlim(0, 900)
# NOTE(review): distplot appears to draw a normalised density here, so
# "Percentage of queries" may not describe the y-axis accurately; confirm.
ax.set(xlabel='Price in dollars', ylabel='Percentage of queries', title="Distribution of prices")
plt.show()
fig.savefig("../plots/prices.pdf", bbox_inches='tight')

In [None]:
# Replace the interval categories by the integer labels 0..6
band_labels = list(range(7))
df_us['PriceBand'] = pd.qcut(df_us["price_correction"], 7, labels=band_labels)

### Add column that tells us whether someone has visited a hotel before

In [None]:
# True where the respective visitor history field is missing
hist_starrating = df.visitor_hist_starrating.isna()
hist_adr = df.visitor_hist_adr_usd.isna()

# total_visited is 1 only when both history fields are present and 0 as soon
# as either one is missing. The flag is computed for all rows at once, which
# avoids a slow Python loop over the rows.
df["total_visited"] = (~(hist_starrating | hist_adr)).astype(int)

In [None]:
# Count of rows per total_visited value on a small square figure
plt.figure(figsize=(4, 4))
visited_ax = sns.countplot(data=df, x="total_visited")
visited_ax.set_title("Visited hotel before or not; 0 = False, 1 = True")

In [None]:
# Distribution plot of the prop_location_score1 column.
sns.distplot(df['prop_location_score1'], label="prop_location_score")

In [None]:
from matplotlib.ticker import PercentFormatter


def _plot_click_split_hist(ax, frame, column, title, no_click_bins, click_bins,
                           click_label, xlim=None, ylabel=None):
    """Overlay weighted histograms of ``frame[column]`` for non-clicked and clicked rows.

    Parameters
    ----------
    ax : matplotlib.axes.Axes
        Axes to draw on.
    frame : pandas.DataFrame
        Data containing ``click_bool`` and ``column``.
    column : str
        Name of the column to plot.
    title : str
        Title of the panel.
    no_click_bins, click_bins : int
        Number of bins for the non-clicked and the clicked histogram.
    click_label : str
        Legend label of the clicked histogram.
    xlim : tuple of (float, float), optional
        x-axis limits, applied before plotting.
    ylabel : str, optional
        y-axis label; omitted when None.
    """
    if xlim is not None:
        ax.set_xlim(*xlim)

    # Missing values are dropped first so the weights always match the data length
    no_click = frame.loc[frame['click_bool'] == 0, column].dropna()
    click = frame.loc[frame['click_bool'] == 1, column].dropna()

    no_click.hist(ax=ax, color='cornflowerblue', label='No click',
                  weights=np.ones(len(no_click)) / len(no_click), bins=no_click_bins)
    # NOTE(review): the clicked bars are divided by the number of NON-clicked
    # rows, so they do not sum to 100%. This is kept as found in every panel;
    # confirm whether len(click) was intended instead.
    click.hist(ax=ax, color='seagreen', label=click_label,
               weights=np.ones(len(click)) / len(no_click), bins=click_bins)

    ax.yaxis.set_major_formatter(PercentFormatter(1))
    ax.legend(loc='best')
    ax.set_title(title)
    if ylabel is not None:
        ax.set_ylabel(ylabel)
    ax.set_xlabel("Score")


sns.set()

# Font sizes are set before the axes are created so they apply to all panels
plt.rcParams["axes.labelsize"] = 14
plt.rcParams["axes.titlesize"] = 14

fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# One entry per panel, in row-major order (top-left, top-right, bottom-left, bottom-right)
panels = [
    dict(column="prop_location_score1", title="Property locationscore (1)",
         no_click_bins=7, click_bins=7, click_label='Click',
         ylabel="Percentage of responses"),
    dict(column="prop_location_score2", title="Property locationscore (2)",
         no_click_bins=11, click_bins=10, click_label='Click',
         xlim=(0, 1)),
    # 10 bins is the pandas default that applies when no bin count is passed
    dict(column="prop_review_score", title="Property review score",
         no_click_bins=10, click_bins=10, click_label='Clicked',
         xlim=(0, 5), ylabel="Percentage of responses"),
    dict(column="prop_starrating", title="Property starrating",
         no_click_bins=5, click_bins=5, click_label='Clicked'),
]

for panel_ax, panel in zip(axes.flat, panels):
    _plot_click_split_hist(panel_ax, df, **panel)

fig.subplots_adjust(left=None, bottom=0, right=None, top=None, wspace=0.25, hspace=None)

plt.show()

fig.savefig("../plots/overviewratings.pdf", bbox_inches='tight')


In [None]:
sns.set()
# Density curve only (no histogram bars) of the booking window
booking_ax = sns.distplot(df['srch_booking_window'], hist=False, label='Booking window')
booking_ax.set_xlabel('dist')
sns.despine()