# Data Anlysis

In [1]:
import pandas as pd
import random

In [2]:
dest = pd.read_csv("destinations.csv")
test = pd.read_csv("test.csv")
train = pd.read_csv("train.csv")

In [3]:
# The size of training & testing data
print train.shape
print test.shape

(37670293, 24)
(2528243, 22)


In [4]:
# There is no relationship between hotelcluster and # of items
train["hotel_cluster"].value_counts()

91    1043720
41     772743
48     754033
64     704734
65     670960
5      620194
98     589178
59     570291
42     551605
21     550092
70     545572
18     545284
83     534132
46     534038
25     530591
62     518809
95     509266
28     507016
68     503797
82     503755
37     496061
50     489892
30     489287
9      488328
58     483253
97     479446
16     477868
72     457463
1      452694
99     444887
       ...   
19     282893
84     278264
66     273505
38     269246
87     260398
23     259233
12     259022
31     257587
67     255946
43     253578
7      252447
54     250745
92     244343
89     243560
45     241408
49     240124
3      225250
80     220218
60     217919
71     216054
93     214293
86     209054
14     192299
75     165226
24     164127
35     139122
53     134812
88     107784
27     105040
74      48355
Name: hotel_cluster, dtype: int64

In [6]:
# test_id is the subset of train_id
test_id = set(test.user_id.unique())
train_id = set(train.user_id.unique())
len(test_id&train_id) == len(test_id)

True

# Prepocessing

In [7]:
# Divide date_time into month and year fields
train["date_time"] = pd.to_datetime(train["date_time"])
train["year"] = train["date_time"].dt.year
train["month"] = train["date_time"].dt.month

In [8]:
# Randomly select 10k users
uni_users = train.user_id.unique()

part_user_ids = [uni_users[i] for i in sorted(random.sample(range(len(uni_users)),10000))]
part_train = train[train.user_id.isin(part_user_ids)] # part_train only contains the data of 10k users

In [9]:
# New training set and testing set
Ntrain = part_train[((part_train.year == 2013) | ((part_train.year == 2014) & (part_train.month < 8)))]
Ntest = part_train[((part_train.year == 2014) & (part_train.month >= 8))]

In [10]:
# Remove all the non-booking events
Ntest = Ntest[Ntest.is_booking == True]

# Naive ALG_Most_comm

In [12]:
most_common_clusters = list(train.hotel_cluster.value_counts().head().index)
pred = [most_common_clusters for i in range(Ntest.shape[0])]

In [13]:
import ml_metrics as metrics

In [14]:
# Evaluation
target = [[l] for l in Ntest["hotel_cluster"]]
metrics.mapk(target, pred, k=5)

0.060460277427490536

# Correclation to hotel_cluster

In [15]:
train.corr()["hotel_cluster"]

site_name                   -0.022408
posa_continent               0.014938
user_location_country       -0.010477
user_location_region         0.007453
user_location_city           0.000831
orig_destination_distance    0.007260
user_id                      0.001052
is_mobile                    0.008412
is_package                   0.038733
channel                      0.000707
srch_adults_cnt              0.012309
srch_children_cnt            0.016261
srch_rm_cnt                 -0.005954
srch_destination_id         -0.011712
srch_destination_type_id    -0.032850
is_booking                  -0.021548
cnt                          0.002944
hotel_continent             -0.013963
hotel_country               -0.024289
hotel_market                 0.034205
hotel_cluster                1.000000
year                        -0.001050
month                       -0.000560
Name: hotel_cluster, dtype: float64

# Generating Features

In [20]:
from sklearn.decomposition import PCA

pca = PCA(n_components=3)
dest_small = pca.fit_transform(dest[["d{0}".format(i + 1) for i in range(149)]])
dest_small = pd.DataFrame(dest_small)
dest_small["srch_destination_id"] = dest["srch_destination_id"]

In [31]:
def new_feature(df):
    df["date_time"] = pd.to_datetime(df["date_time"])
    df["srch_ci"] = pd.to_datetime(df["srch_ci"], format='%Y-%m-%d', errors="coerce")
    df["srch_co"] = pd.to_datetime(df["srch_co"], format='%Y-%m-%d', errors="coerce")
    
    temp = {}
    for prop in ["month", "day", "hour", "minute", "dayofweek", "quarter"]:
        temp[prop] = getattr(df["date_time"].dt, prop)
    
    carryover = [p for p in df.columns if p not in ["date_time", "srch_ci", "srch_co"]]
    for prop in carryover:
        temp[prop] = df[prop]
    
    date_temp = ["month", "day", "dayofweek", "quarter"]
    for prop in date_temp:
        temp["ci_{0}".format(prop)] = getattr(df["srch_ci"].dt, prop)
        temp["co_{0}".format(prop)] = getattr(df["srch_co"].dt, prop)
    temp["stay_span"] = (df["srch_co"] - df["srch_ci"]).astype('timedelta64[h]')
        
    ret = pd.DataFrame(temp)
    
    ret = ret.join(dest_small, on="srch_destination_id", how='left', rsuffix="dest")
    ret = ret.drop("srch_destination_iddest", axis=1)
    return ret

df = new_feature(Ntrain)
df.fillna(df.mean(), inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,channel,ci_day,ci_dayofweek,ci_month,ci_quarter,cnt,co_day,co_dayofweek,co_month,co_quarter,...,srch_rm_cnt,stay_span,user_id,user_location_city,user_location_country,user_location_region,year,0,1,2
368,9,18.0,4.0,4.0,2.0,1,21.0,0.0,4.0,2.0,...,1,72.0,3313,16634,66,174,2014,0.520591,0.029306,0.009437
369,9,18.0,4.0,4.0,2.0,1,21.0,0.0,4.0,2.0,...,1,72.0,3313,16634,66,174,2014,0.527879,0.025604,0.056107
370,9,18.0,4.0,4.0,2.0,5,21.0,0.0,4.0,2.0,...,1,72.0,3313,16634,66,174,2014,0.527879,0.025604,0.056107
371,1,19.0,5.0,4.0,2.0,1,21.0,0.0,4.0,2.0,...,1,48.0,3313,16634,66,174,2014,0.527879,0.025604,0.056107
372,1,19.0,5.0,4.0,2.0,1,21.0,0.0,4.0,2.0,...,1,48.0,3313,16634,66,174,2014,0.527879,0.025604,0.056107
373,0,23.0,2.0,7.0,3.0,1,26.0,5.0,7.0,3.0,...,1,72.0,3313,16634,66,174,2014,-0.418300,0.059863,-0.277964
374,0,18.0,4.0,7.0,3.0,3,20.0,6.0,7.0,3.0,...,1,48.0,3313,46432,66,174,2014,0.382071,0.034341,-0.021062
375,0,18.0,4.0,7.0,3.0,1,21.0,0.0,7.0,3.0,...,1,72.0,3313,46432,66,174,2014,0.382071,0.034341,-0.021062
376,0,18.0,4.0,7.0,3.0,1,20.0,6.0,7.0,3.0,...,1,48.0,3313,46432,66,174,2014,-0.668314,-0.128175,-0.017363
377,0,18.0,4.0,7.0,3.0,1,20.0,6.0,7.0,3.0,...,1,48.0,3313,46432,66,174,2014,-0.668314,-0.128175,-0.017363


# Topest clusters for each destinations

In [32]:
def make_key(item):
    return "_".join([str(i) for i in item])

match_col = ["srch_destination_id"]
cluster_col = match_col + ['hotel_cluster']
groups = Ntrain.groupby(cluster_col)
top_clusters = {}
for name, group in groups:
    click = len(group.is_booking[group.is_booking == False])
    booking = len(group.is_booking[group.is_booking == True])
    
    score = 1.0 * booking + 0.15 * click
    
    clus_name = make_key(name[:len(match_col)])
    if clus_name not in top_clusters:
        top_clusters[clus_name] = {}
    top_clusters[clus_name][name[-1]] = score

In [33]:
import operator

cluster_dict = {}
for n in top_clusters:
    tc = top_clusters[n]
    top = [l[0] for l in sorted(tc.items(), key=operator.itemgetter(1), reverse=True)[:5]]
    cluster_dict[n] = top

In [36]:
# Make prediction
preds = []
for index, row in Ntest.iterrows():
    key = make_key([row[m] for m in match_col])
    if key in cluster_dict:
        preds.append(cluster_dict[key])
    else:
        preds.append([])

In [38]:
metrics.mapk([[l] for l in Ntest["hotel_cluster"]], preds, k=5)

0.22811475409836066

# Exact Matching

In [40]:
match_cols = ['user_location_country', 'user_location_region', 'user_location_city', 'hotel_market', 'orig_destination_distance']

groups = Ntrain.groupby(match_cols)
    
def generate_exact_matches(row, match_cols):
    index = tuple([row[t] for t in match_cols])
    try:
        group = groups.get_group(index)
    except Exception:
        return []
    clus = list(set(group.hotel_cluster))
    return clus

exact_matches = []
for i in range(Ntest.shape[0]):
    exact_matches.append(generate_exact_matches(Ntest.iloc[i], match_cols))

# Merge Three ALG (Exact Matching+preds+Most_Comm)

In [42]:
def f5(seq, idfun=None): 
    if idfun is None:
        def idfun(x): return x
    seen = {}
    result = []
    for item in seq:
        marker = idfun(item)
        if marker in seen: continue
        seen[marker] = 1
        result.append(item)
    return result
    
full_preds = [f5(exact_matches[p] + preds[p] + most_common_clusters)[:5] for p in range(len(preds))]
metrics.mapk([[l] for l in Ntest["hotel_cluster"]], full_preds, k=5)

0.26523118957545189

# Following: Form of submission file

In [32]:
write_p = [" ".join([str(l) for l in p]) for p in full_preds]
write_frame = ["{0},{1}".format(t2["id"][i], write_p[i]) for i in range(len(full_preds))]
write_frame = ["id,hotel_clusters"] + write_frame
with open("predictions.csv", "w+") as f:
    f.write("\n".join(write_frame))

KeyError: 'id'