## Initialise

In [24]:
# Set full page width
# HTML is imported from the public `IPython.display` module rather than the
# internal `IPython.core.display` path, which newer IPython releases deprecate.
from IPython.display import HTML
# The HTML object is the cell's last expression, so the notebook renders it
# and the <style> rule below is applied to the page.
HTML("""
<style>
.container {
    width: 100%;
}
</style>
""")

In [25]:
# GraphLab Create is used throughout the notebook under the alias `gl`.
import graphlab as gl
# Point GraphLab Canvas at the 'ipynb' target (presumably so visualisations
# render inside the notebook -- confirm against the GraphLab docs).
gl.canvas.set_target('ipynb')
import datetime
import os as os

# IPython magic: draw matplotlib figures inline in the notebook output.
%matplotlib inline
import matplotlib.pyplot as plt

import seaborn as sns
# Apply seaborn's default plot settings.
sns.set()

# Set GRAPHLAB_DEFAULT_NUM_PYLAMBDA_WORKERS to 32; intended for machines with a
# large number of cores.
gl.set_runtime_config('GRAPHLAB_DEFAULT_NUM_PYLAMBDA_WORKERS', 32)

## Grouping/reduction

In [3]:
# Load the raw training data from the SFrame stored at 'Data/train_raw'.
train = gl.SFrame('Data/train_raw')
# Report the number of rows. The single-argument print(...) form emits the same
# text on Python 2 and Python 3, unlike the Python 2-only print statement.
print("Train: %d" % len(train))

Train: 37670293


In [10]:
# Columns whose value is carried through a group-by with SELECT_ONE.
# (An earlier set-difference assignment to `others` was overwritten by this
# list before ever being read, so it has been dropped as dead code.)
others = ['hotel_continent', 'hotel_country', 'hotel_market', 'hotel_cluster']
aggs = {col: gl.aggregate.SELECT_ONE(col) for col in others}

# Add the numeric aggregates: summed 'is_booking', the row count per group and
# summed 'cnt'.
aggs.update({
    'bookings': gl.aggregate.SUM('is_booking'),
    'count': gl.aggregate.COUNT(),
    'cnt': gl.aggregate.SUM('cnt'),
})

In [5]:
# Aggregate the training rows per (destination, hotel cluster) pair:
#   bookings -- sum of the 'is_booking' column
#   clicks   -- number of rows in the group
summary = train.groupby(
    key_columns=['srch_destination_id', 'hotel_cluster'],
    operations={
        'bookings': gl.aggregate.SUM('is_booking'),
        'clicks': gl.aggregate.COUNT(),
    }
)
summary

hotel_cluster,srch_destination_id,bookings,clicks
30,45970,4,23
63,43781,2,40
60,38280,0,1
25,31932,1,3
48,61467,0,4
82,23337,1,7
58,64858,0,1
89,17319,0,13
43,39387,0,10
47,35319,0,2


In [6]:
# Weight of one non-booking row relative to one booking in the relevance score.
CLICK_WEIGHT = 0.05

# 'clicks' starts out as the total row count of the group (bookings included),
# so bookings are subtracted to leave only non-booking rows. The subtraction
# overwrites 'clicks' in place; it is guarded on the presence of 'relevance' so
# that re-running this cell does not subtract the bookings a second time.
if 'relevance' not in summary.column_names():
    summary['clicks'] = summary['clicks'] - summary['bookings']

# relevance = bookings + CLICK_WEIGHT * (non-booking clicks); recomputed on
# every run so that a changed CLICK_WEIGHT takes effect.
summary['relevance'] = summary['bookings'] + CLICK_WEIGHT * summary['clicks']
summary

hotel_cluster,srch_destination_id,bookings,clicks,relevance
30,45970,4,19,4.95
63,43781,2,38,3.9
60,38280,0,1,0.05
25,31932,1,2,1.1
48,61467,0,4,0.2
82,23337,1,6,1.3
58,64858,0,1,0.05
89,17319,0,13,0.65
43,39387,0,10,0.5
47,35319,0,2,0.1


In [8]:
# Spot-check: show the summary rows whose srch_destination_id equals 1.
summary.filter_by(values=1, column_name='srch_destination_id')

hotel_cluster,srch_destination_id,bookings,clicks,relevance
57,1,0,1,0.05
60,1,0,17,0.85
30,1,2,20,3.0
20,1,4,22,5.1


In [26]:
# Persist the per-(destination, cluster) summary to 'Data/summary'.
summary.save(filename='Data/summary')

# Find most popular hotel clusters by destination
Define a function that returns the most popular hotel clusters for a destination group.
Using `nlargest()` saves time because it avoids fully sorting large groups.


In [None]:
def most_popular(group, n_max=5):
    """Return the most relevant hotel clusters of a group as one string.

    Parameters
    ----------
    group : object exposing `relevance` and `hotel_cluster` Series that share
        an index (assumed to be a pandas DataFrame for one destination --
        TODO confirm against the caller).
    n_max : int, optional
        Maximum number of clusters to return (default 5).

    Returns
    -------
    str
        The `hotel_cluster` values of the `n_max` rows with the largest
        `relevance`, most relevant first, separated by single spaces.
    """
    # nlargest picks the top rows without sorting the whole group.
    top_index = group.relevance.nlargest(n_max).index
    top_clusters = group.hotel_cluster.loc[top_index].values
    # A plain join is used instead of np.array_str: array_str pads elements to
    # a common width (and wraps long arrays), and numpy is not imported in this
    # notebook.
    return ' '.join(str(cluster) for cluster in top_clusters)


In [42]:
# Collapse the summary to one row per destination. CONCAT('relevance',
# 'hotel_cluster') builds a dict per destination that maps each relevance value
# to a hotel cluster.
# NOTE(review): because relevance is the dict key, clusters that share the same
# relevance within a destination end up as a single entry -- confirm whether
# losing those ties is acceptable.
pred = summary.groupby('srch_destination_id',{'hotel_cluster': gl.aggregate.CONCAT('relevance', 'hotel_cluster')})
pred

srch_destination_id,hotel_cluster
49249,"{0.25: 15, 1.25: 89, 1.45: 33, 0.1: 61, 0.2: ..."
51146,{0.05: 99}
24527,"{0.25: 29, 3.45: 53, 1.1: 38, 0.2: 44, 0.05: 35, ..."
30459,"{0.1: 18, 0.05: 95}"
15376,"{0.25: 3, 0.75: 67, 0.1: 44, 0.2: 57, 1.1: 61, ..."
26613,"{2.6: 42, 2.9: 63, 0.2: 54, 0.55: 91, 1.35: 31, ..."
21855,"{1.75: 3, 0.2: 53, 0.4: 60, 2.6: 93, 1.45: 62, ..."
4441,{0.05: 32}
27161,"{1.25: 53, 2.75: 8, 0.05: 30, 0.1: 61, 41.45: 43, ..."
42644,"{1.1: 5, 1.05: 50, 0.1: 29, 0.05: 85} ..."


In [43]:
# Replace each destination's {relevance: cluster} dict with the clusters of its
# five largest relevance keys. `arg_max` is defined in a later cell of this
# notebook, so that cell has to be executed before this one.
pred['hotel_cluster'] = pred['hotel_cluster'].apply(
    lambda by_relevance: arg_max(by_relevance, k=5)
)
pred.print_rows(num_rows=5, max_column_width=40)

+---------------------+--------------------------------+
| srch_destination_id |         hotel_cluster          |
+---------------------+--------------------------------+
|        49249        | [43.0, 77.0, 48.0, 16.0, 40.0] |
|        51146        |             [99.0]             |
|        24527        | [82.0, 81.0, 53.0, 36.0, 62.0] |
|        30459        |          [18.0, 95.0]          |
|        15376        | [20.0, 38.0, 8.0, 78.0, 85.0]  |
+---------------------+--------------------------------+
[59455 rows x 2 columns]



In [10]:
# Join d[0] .. d[k-1] of every row into one space-separated string.
# `k` is not defined in this cell; it must already exist in the kernel.
# NOTE(review): the recorded output of this cell is `KeyError: 0`. Presumably
# the column still held the relevance-keyed dicts when it ran, and those have
# no key 0 -- TODO confirm the intended input and fix or remove this cell.
pred['hotel_cluster'] = pred['hotel_cluster'].apply(lambda d: ' '.join([str(d[r]) for r in range(0,k)]))
pred

KeyError: 0

In [31]:
def arg_max(d, k=3):
    """Return the values of `d` that belong to its `k` largest keys.

    The keys are sorted in descending order and the values are returned in
    that same order. If `d` has fewer than `k` entries, all values are
    returned.
    """
    ranked_keys = sorted(d, reverse=True)
    return [d[key] for key in ranked_keys[:k]]

In [None]:
def create_predictions(test, k=5):
    """Build one space-separated string of top-ranked classes per test row.

    Relies on a module-level `model` (not defined in this cell) that provides
    `predict_topk`.

    Parameters
    ----------
    test : data passed straight to `model.predict_topk` (presumably an SFrame
        of test rows -- TODO confirm).
    k : int, optional
        Number of top classes requested per row (default 5, the value that was
        previously hard-coded).

    Returns
    -------
    The grouped result with one row per 'id' and a 'hotel_cluster' column
    holding the predicted classes, best rank first, joined by single spaces.
    """
    def join_top_classes(rank_to_class):
        # Walk the ranks that are actually present, in ascending order. This
        # gives the same string as indexing 0..k-1 when all of those ranks
        # exist, but does not raise KeyError when a row has fewer than k
        # ranks or the ranks do not start at 0.
        ranks = sorted(rank_to_class)[:k]
        return ' '.join(str(rank_to_class[rank]) for rank in ranks)

    ranked = model.predict_topk(test, output_type='rank', k=k)
    # Collect each id's predictions into a {rank: class} dict.
    pred = ranked.groupby('id', {'hotel_cluster': gl.aggregate.CONCAT('rank', 'class')})
    pred['hotel_cluster'] = pred['hotel_cluster'].apply(join_top_classes)
    return pred