In [1]:
#import all necessary libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Read the down-sampled training set from the working directory into a DataFrame.
train = pd.read_csv(filepath_or_buffer='train_downsampled.csv')

In [3]:
# Preview the first five rows to sanity-check the load.
train.head(n=5)

Unnamed: 0,date_time,site_name,posa_continent,user_location_country,user_location_region,user_location_city,orig_destination_distance,user_id,is_mobile,is_package,...,srch_children_cnt,srch_rm_cnt,srch_destination_id,srch_destination_type_id,is_booking,cnt,hotel_continent,hotel_country,hotel_market,hotel_cluster
0,2014-02-27 07:24:03,2,3,66,174,23793,1780.4772,8654,1,0,...,2,1,5438,3,0,2,2,50,637,47
1,2014-08-30 16:10:42,2,3,66,174,13277,133.1612,8654,1,0,...,1,1,8278,1,0,1,2,50,970,80
2,2014-08-30 16:25:16,2,3,66,174,13277,124.8268,8654,1,0,...,1,1,8278,1,0,1,2,50,368,10
3,2014-08-30 16:26:45,2,3,66,174,13277,112.2581,8654,1,0,...,1,1,8278,1,0,1,2,50,368,21
4,2014-08-30 16:29:57,2,3,66,174,13277,117.6094,8654,1,0,...,1,1,8278,1,0,3,2,50,368,95


In [4]:
# Print each column's dtype and non-null count to spot non-numeric columns and missing values.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 290413 entries, 0 to 290412
Data columns (total 24 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   date_time                  290413 non-null  object 
 1   site_name                  290413 non-null  int64  
 2   posa_continent             290413 non-null  int64  
 3   user_location_country      290413 non-null  int64  
 4   user_location_region       290413 non-null  int64  
 5   user_location_city         290413 non-null  int64  
 6   orig_destination_distance  290413 non-null  float64
 7   user_id                    290413 non-null  int64  
 8   is_mobile                  290413 non-null  int64  
 9   is_package                 290413 non-null  int64  
 10  channel                    290413 non-null  int64  
 11  srch_ci                    290413 non-null  object 
 12  srch_co                    290413 non-null  object 
 13  srch_adults_cnt            29

Most of the columns are numeric codes, so we can't do a lot of feature engineering. For example, user_location_country doesn't tell us which country each number represents, so we can't transform the data into features that better represent the predictor variables.



In [5]:
# Pearson correlation of every numeric column with the target.
# numeric_only=True is required because the frame still holds object columns
# (date_time, srch_ci, srch_co): older pandas dropped them with a FutureWarning,
# while pandas >= 2.0 raises when it tries to convert them to float.
train.corr(numeric_only=True)['hotel_cluster']

  train.corr()['hotel_cluster']


site_name                   -0.024835
posa_continent               0.019831
user_location_country       -0.015561
user_location_region         0.013036
user_location_city          -0.001013
orig_destination_distance    0.011047
user_id                      0.001739
is_mobile                    0.009824
is_package                   0.042279
channel                     -0.003721
srch_adults_cnt              0.010422
srch_children_cnt            0.011673
srch_rm_cnt                 -0.007727
srch_destination_id         -0.016216
srch_destination_type_id    -0.036056
is_booking                  -0.023270
cnt                          0.002919
hotel_continent             -0.012584
hotel_country               -0.021196
hotel_market                 0.034559
hotel_cluster                1.000000
Name: hotel_cluster, dtype: float64

Since none of the columns shows a meaningful linear correlation with hotel_cluster (the largest absolute coefficient is about 0.04), linear regression won't work well on our data.



In [7]:
# Parse the search timestamp and derive calendar features from it.
train['date_time'] = pd.to_datetime(train['date_time'])
train['year'] = train['date_time'].dt.year
train['month'] = train['date_time'].dt.month
# `day` appears in this cell's displayed output but was never created by the
# visible code (leftover kernel state); create it explicitly so that a fresh
# Restart & Run All reproduces the same frame.
train['day'] = train['date_time'].dt.day
train['dayofweek'] = train['date_time'].dt.dayofweek  # Monday=0 ... Sunday=6

# Check-in / check-out dates; values that do not match YYYY-MM-DD become NaT.
train["srch_ci"] = pd.to_datetime(train["srch_ci"], format='%Y-%m-%d', errors="coerce")
train["srch_co"] = pd.to_datetime(train["srch_co"], format='%Y-%m-%d', errors="coerce")

# Length of stay in hours as a float (NaN where either date is NaT).
# Dividing by a one-hour Timedelta replaces .astype('timedelta64[h]'), which
# pandas >= 2.0 rejects (only s/ms/us/ns resolutions are supported).
train["stay_span"] = (train["srch_co"] - train["srch_ci"]) / pd.Timedelta(hours=1)
train.head()

Unnamed: 0,date_time,site_name,posa_continent,user_location_country,user_location_region,user_location_city,orig_destination_distance,user_id,is_mobile,is_package,...,cnt,hotel_continent,hotel_country,hotel_market,hotel_cluster,year,month,day,dayofweek,stay_span
0,2014-02-27 07:24:03,2,3,66,174,23793,1780.4772,8654,1,0,...,2,2,50,637,47,2014,2,27,3,48.0
1,2014-08-30 16:10:42,2,3,66,174,13277,133.1612,8654,1,0,...,1,2,50,970,80,2014,8,30,5,48.0
2,2014-08-30 16:25:16,2,3,66,174,13277,124.8268,8654,1,0,...,1,2,50,368,10,2014,8,30,5,24.0
3,2014-08-30 16:26:45,2,3,66,174,13277,112.2581,8654,1,0,...,1,2,50,368,21,2014,8,30,5,24.0
4,2014-08-30 16:29:57,2,3,66,174,13277,117.6094,8654,1,0,...,3,2,50,368,95,2014,8,30,5,24.0


In [8]:
# Count rows per month within each year to see how the sample is spread over time.
month_by_year = train.groupby('year')['month']
month_by_year.value_counts()


year  month
2013  7         8464
      3         7724
      6         7553
      8         7310
      5         7308
      9         7248
      4         7031
      12        6693
      10        6679
      2         6259
      11        6208
      1         5780
2014  12       23371
      7        22166
      8        21950
      11       21348
      9        20831
      10       20362
      6        17892
      5        13963
      4        13395
      3        13081
      1         9089
      2         8708
Name: month, dtype: int64