## Initalise

In [1]:
# Set full page width
from IPython.core.display import HTML
HTML("""
<style>
.container {
    width: 100%;
}
</style>
""")

In [2]:
import graphlab as gl
gl.canvas.set_target('ipynb')
import datetime
import os as os

%matplotlib inline
import matplotlib.pyplot as plt

import seaborn as sns
sns.set()

# for large number of cores in a machine
gl.set_runtime_config('GRAPHLAB_DEFAULT_NUM_PYLAMBDA_WORKERS', 32)

2016-05-03 23:47:21,732 [INFO] graphlab.cython.cy_server, 176: GraphLab Create v1.9 started. Logging: /tmp/graphlab_server_1462319240.log


This non-commercial license of GraphLab Create is assigned to kevin.mcisaac@gmail.com and will expire on November 06, 2016. For commercial licensing options, visit https://dato.com/buy/.




# Load Data and save in fast binary format

The training data contains 37.6M rows. 8% are bookings, the remainder are clicks. The test data contains 2.5M rows where all are bookings.

Missing data
- 47k search checkin and check out dates. These are dropped.
- 167k rows are missing d1-d149 from destinations
- 36% of the distances are missing

In [None]:
train = gl.SFrame('Data/train.csv')

In [None]:
test = gl.SFrame('Data/test.csv')
destinations = gl.SFrame('Data/destinations.csv')

print "Train:", len(train), "Destinations:", len(destinations)

In [None]:
train.save('Data/train_raw')
test.save('Data/test_raw')
destinations.save('Data/destinations')

## Exploration of the data

In [7]:
train = gl.SFrame('Data/train_raw')
test = gl.SFrame('Data/test_raw')
print "Train:", len(train)

Train: 37670293


In [4]:
train['srch_destination_id', 'is_booking', 'cnt']

srch_destination_id,is_booking,cnt
8250,0,3
8250,1,1
8250,0,1
14984,0,1
14984,0,1
14984,0,1
8267,0,2
8267,0,1
8267,0,1
8267,0,1


In [11]:
set(train.column_names()) - set(test.column_names())

{'cnt', 'hotel_cluster', 'is_booking'}

In [6]:
train.filter_by(12, 'user_id')

date_time,site_name,posa_continent,user_location_country,user_location_region,user_location_city
2014-08-11 07:46:59,2,3,66,348,48862
2014-08-11 08:22:12,2,3,66,348,48862
2014-08-11 08:24:33,2,3,66,348,48862

orig_destination_distance,user_id,is_mobile,is_package,channel,srch_ci,srch_co,srch_adults_cnt,srch_children_cnt
2234.2641,12,0,1,9,2014-08-27,2014-08-31,2,0
2234.2641,12,0,1,9,2014-08-29,2014-09-02,2,0
2234.2641,12,0,0,9,2014-08-29,2014-09-02,2,0

srch_rm_cnt,srch_destination_id,srch_destination_type_id,is_booking,cnt,hotel_continent,hotel_country
1,8250,1,0,3,2,50
1,8250,1,1,1,2,50
1,8250,1,0,1,2,50

hotel_market,hotel_cluster
628,1
628,1
628,1


# Feature Engineering

In [None]:
train = gl.SFrame('Data/train_raw')
print "Train:", len(train)

In [15]:
# drop rows with blank check in or out dates
train = train[train['srch_ci'] != '']
train = train[train['srch_co'] != '']

#convert dates to datetime format to simplify feature engineering
train['date_time'] = train['date_time'].str_to_datetime(str_format='%Y-%m-%d %H:%M:%S')
train['srch_co'] = train['srch_ci'].str_to_datetime(str_format='%Y-%m-%d')
train['srch_ci'] = train['srch_ci'].str_to_datetime(str_format='%Y-%m-%d')

#split dates  into components
train.add_columns(train['date_time'].split_datetime(column_name_prefix='date_time',limit=['year', 'month', 'day', 'hour', 'weekday']));
train.add_columns(train['srch_co'].split_datetime(column_name_prefix='srch_co',limit=['year', 'month', 'day', 'weekday']))
train.add_columns(train['srch_ci'].split_datetime(column_name_prefix='srch_ci',limit=['year', 'month', 'day', 'weekday']));

#calculate days till check in and duration of stay
train['days_till_ci'] = train.apply(lambda row: (row['srch_ci'] - row['date_time']).days)
train['days_stay'] = train.apply(lambda row: (row['srch_co'] - row['srch_ci']).days)

train.save('Data/train_fe')

In [None]:
#joining the data with destinations  massively increases the size of the data set, making it hard to work with 
#train = train.join(destinations, on='srch_destination_id', how='left')

# Modeling 

In [16]:
train = gl.SFrame('Data/train_fe')

In [18]:
features  = set(train.column_names())
features -= set(['hotel_cluster', 'date_time', 'srch_co', 'srch_ci'])
features -= set(['is_booking', 'cnt'])
print list(features)

['site_name', 'user_location_country', 'srch_co.day', 'srch_adults_cnt', 'srch_ci.month', 'posa_continent', 'srch_ci.day', 'hotel_country', 'user_location_region', 'hotel_continent', 'srch_destination_id', 'date_time.month', 'user_id', 'date_time.year', 'srch_co.weekday', 'srch_destination_type_id', 'is_mobile', 'srch_ci.year', 'channel', 'hotel_market', 'days_till_ci', 'srch_children_cnt', 'days_stay', 'date_time.weekday', 'date_time.day', 'orig_destination_distance', 'srch_co.month', 'srch_ci.weekday', 'date_time.hour', 'srch_co.year', 'is_package', 'srch_rm_cnt', 'user_location_city']


In [19]:
train_bookings = train[train['is_booking'] == 1]
len(train_bookings)

3000693

In [22]:
train_bookings['hotel_cluster']=train_bookings['hotel_cluster'].astype(str)

In [None]:
train_small, rest = train.random_split(fraction=0.01, seed=1983)
valdate_small, rest = rest.random_split(fraction=0.0001, seed=1983)

print len(train_small), len(valdate_small)

### Spot check
Boosted tree wins

In [None]:
model = gl.classifier.create(train_small, target='hotel_cluster', features=features)

## Boosted Tree

In [23]:
model = gl.boosted_trees_classifier.create(train_bookings, target='hotel_cluster', features=features)

PROGRESS: Creating a validation set from 5 percent of training data. This may take a while.
          You can set ``validation_set=None`` to disable validation tracking.



In [24]:
model.save('Models/model_is_booked')

In [25]:
pred = model.predict_topk(test[0:5],output_type = 'rank', k=5)
pred

id,class,rank
0,13,0
0,33,1
0,41,2
0,10,3
0,1,4
1,13,0
1,99,1
1,16,2
1,43,3
1,1,4


In [27]:
def create_predictions(test):
    k = 5
    pred = model.predict_topk(test, output_type = 'rank', k=k)
    pred = pred.groupby('id', {'hotel_cluster':gl.aggregate.CONCAT('rank', 'class')})
    pred['hotel_cluster'] = pred['hotel_cluster'].apply(lambda d: ' '.join([str(d[r]) for r in range(0,k)]))
    return pred

In [28]:
create_predictions(test[0:5])

id,hotel_cluster
0,13 33 41 10 1
1,13 99 16 43 1
3,1 13 19 39 0
2,56 13 39 76 1
4,39 21 76 16 1


In [None]:
import ml_metrics

# More features

In [None]:
train['score'] = train['is_booking'] + train['cnt'] *0.1

In [None]:
test.column_names()