In [1]:
import pandas as pd
import numpy as np
import pickle
import time
from datetime import date

import matplotlib.pyplot as plt
import seaborn as sns

#### Data cleaning functions

In [2]:
def column_rename(df):
    """Rename the raw ``srch_*`` / ``cnt`` columns of ``df`` to shorter names.

    The frame is modified in place and nothing is returned. Columns listed in
    the mapping that are absent from ``df`` are simply left out of the rename.
    """
    new_names = {
        'srch_ci': 'check_in',
        'srch_co': 'check_out',
        'srch_adults_cnt': 'adult_count',
        'srch_children_cnt': 'child_count',
        'srch_rm_cnt': 'room_count',
        'srch_destination_id': 'destination_id',
        'srch_destination_type_id': 'destination_type_id',
        'cnt': 'similar_events',
    }
    df.rename(columns=new_names, inplace=True)

In [3]:
def feature_engine(df):
    """Parse the date columns of ``df`` and derive two day-count features, in place.

    Expects the columns ``date_time``, ``check_in`` and ``check_out`` (i.e. the
    frame must already have been passed through ``column_rename``).

    Adds / overwrites:
      * ``date_time``, ``check_in``, ``check_out`` -- converted to datetimes;
        unparseable check-in/check-out values become ``NaT``.
      * ``stay_duration`` -- whole days between check-in and check-out.
      * ``plan_time``     -- whole days between the search timestamp and check-in.

    Both derived columns are float64, with ``NaN`` wherever a date is missing.
    Returns ``None``.
    """
    df['date_time'] = pd.to_datetime(df['date_time'])
    df['check_in'] = pd.to_datetime(df['check_in'], errors='coerce')
    df['check_out'] = pd.to_datetime(df['check_out'], errors='coerce')

    # ``.astype('timedelta64[D]')`` is rejected by pandas >= 2.0 (only s/ms/us/ns
    # resolutions are supported). ``.dt.days`` yields the same whole-day count;
    # the explicit float cast keeps the dtype stable whether or not NaT is present.
    df['stay_duration'] = (df['check_out'] - df['check_in']).dt.days.astype('float64')
    df['plan_time'] = (df['check_in'] - df['date_time']).dt.days.astype('float64')
#     df['day_of_week'] = df['date_time'].dt.day_name()

In [4]:
def fillna(df):
    """Replace missing values in three numeric columns with that column's mean, in place.

    ``orig_destination_distance`` is first cast to float64; then it,
    ``stay_duration`` and ``plan_time`` have their NaNs filled with the mean of
    the non-missing values in the same column. Returns ``None``.
    """
    df['orig_destination_distance'] = df['orig_destination_distance'].astype(np.float64)

    for column in ('orig_destination_distance', 'stay_duration', 'plan_time'):
        # Assign the filled Series back rather than calling
        # ``df[column].fillna(..., inplace=True)``: that form operates on an
        # intermediate object, is deprecated, and leaves ``df`` unchanged when
        # pandas Copy-on-Write is active.
        df[column] = df[column].fillna(df[column].mean())

### Train 2m dataset

In [5]:
# Load the training extract from 'train_2m.csv' (path is relative to the working directory).
train = pd.read_csv('train_2m.csv')

In [6]:
# Preview the first rows of the raw frame, before any renaming or cleaning.
train.head()

Unnamed: 0,date_time,site_name,posa_continent,user_location_country,user_location_region,user_location_city,orig_destination_distance,user_id,is_mobile,is_package,...,srch_children_cnt,srch_rm_cnt,srch_destination_id,srch_destination_type_id,is_booking,cnt,hotel_continent,hotel_country,hotel_market,hotel_cluster
0,2014-08-11 07:46:59,2,3,66,348,48862,2234.2641,12,0,1,...,0,1,8250,1,0,3,2,50,628,1
1,2014-08-11 08:22:12,2,3,66,348,48862,2234.2641,12,0,1,...,0,1,8250,1,1,1,2,50,628,1
2,2014-08-11 08:24:33,2,3,66,348,48862,2234.2641,12,0,0,...,0,1,8250,1,0,1,2,50,628,1
3,2014-08-09 18:05:16,2,3,66,442,35390,913.1932,93,0,0,...,0,1,14984,1,0,1,2,50,1457,80
4,2014-08-09 18:08:18,2,3,66,442,35390,913.6259,93,0,0,...,0,1,14984,1,0,1,2,50,1457,21


In [7]:
# Inspect column dtypes; the date columns are still plain strings (object) at this point.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1999999 entries, 0 to 1999998
Data columns (total 24 columns):
date_time                    object
site_name                    int64
posa_continent               int64
user_location_country        int64
user_location_region         int64
user_location_city           int64
orig_destination_distance    float64
user_id                      int64
is_mobile                    int64
is_package                   int64
channel                      int64
srch_ci                      object
srch_co                      object
srch_adults_cnt              int64
srch_children_cnt            int64
srch_rm_cnt                  int64
srch_destination_id          int64
srch_destination_type_id     int64
is_booking                   int64
cnt                          int64
hotel_continent              int64
hotel_country                int64
hotel_market                 int64
hotel_cluster                int64
dtypes: float64(1), int64(20), object(3)
m

In [8]:
# Clean up the raw frame in place. The order matters: feature_engine reads the
# 'check_in'/'check_out' names that column_rename introduces, and fillna fills
# the 'stay_duration'/'plan_time' columns that feature_engine creates.
column_rename(train)
feature_engine(train)
fillna(train)

In [9]:
# The ten most frequent hotel clusters; value_counts() orders by descending count.
most_common_clusters = train['hotel_cluster'].value_counts().index[:10].tolist()
most_common_clusters

[91, 41, 48, 64, 5, 65, 98, 59, 70, 42]

In [10]:
# Keep only the rows whose hotel_cluster is one of the most common clusters.
train = train[train['hotel_cluster'].isin(most_common_clusters)]

In [11]:
# Remove the datetime columns. Rebind the result instead of using inplace=True:
# `train` is a filtered selection of the original frame at this point, and an
# in-place drop on such a selection triggers SettingWithCopyWarning.
train = train.drop(columns=['date_time', 'check_in', 'check_out'])

In [12]:
# with open('train_cleaned.pickle', 'wb') as to_write:
#     pickle.dump(train, to_write)

In [13]:
# Persist the cleaned, filtered frame as CSV in the working directory.
# NOTE(review): to_csv writes the row index as an unnamed leading column by
# default, and after the cluster filter that index is no longer contiguous --
# confirm downstream readers expect it (otherwise pass index=False).
train.to_csv('train_cleaned.csv')