In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats
import seaborn as sns
from matplotlib import rcParams

from sklearn.ensemble import RandomForestClassifier

import datetime
from operator import itemgetter
import csv

#show up to 50 columns and 50 rows whenever a DataFrame is displayed
pd.set_option('display.max_columns',50)
pd.set_option('display.max_rows',50)

%matplotlib inline

## Sites to look at

https://www.dataquest.io/blog/kaggle-tutorial/

scikit-learn's random forest cannot handle categorical columns directly, so our categorical data needs one-hot encoding if we use it:

https://roamanalytics.com/2016/10/28/are-categorical-variables-getting-lost-in-your-random-forests/

## K-fold cross-validation of Random Forest Classifier with added features

##### Load dataset

In [None]:
#location of the training data, relative to the notebook's working directory
training_csv = '../../training_set_VU_DM_2014.csv'
df = pd.read_csv(training_csv)

##### Convert time-fields to usable features

In [None]:
#parse the raw search timestamp into a datetime column
df["date_time"] = pd.to_datetime(df["date_time"])

#stay start = search moment + booking window (in days)
#stay end   = stay start + length of stay (in days)
booking_window = pd.to_timedelta(df['srch_booking_window'], unit='D')
stay_length = pd.to_timedelta(df['srch_length_of_stay'], unit='D')
df["book_start"] = df["date_time"] + booking_window
df["book_end"] = df["book_start"] + stay_length

#derive weekday/month/quarter/year features for the search moment and for both
#ends of the stay; new columns are named <prefix>_<part>
calendar_parts = ("weekday", "month", "quarter", "year")
stamp_sources = (("srch", "date_time"),
                 ("book_start", "book_start"),
                 ("book_end", "book_end"))
for prefix, stamp_col in stamp_sources:
    for part in calendar_parts:
        df["%s_%s"%(prefix, part)] = getattr(df[stamp_col].dt, part)


##### Some feature engineering with competitor columns

In [None]:
#Row-wise minimum over the eight comp#_rate columns.
#Per the original note: when every competitor value is 1 (Expedia cheaper) the result is 1,
#and a single -1 (a cheaper competitor) pulls the result down to -1,
#which should mean a lower chance of the hotel being booked at Expedia.
comp_rate_cols = ["comp{}_rate".format(k) for k in range(1, 9)]
df["comp"] = df[comp_rate_cols].min(axis=1)

In [None]:
#collapse each competitor's rate flag and percentage difference into one
#column comp<k> (their product), then discard the two source columns
for k in range(1, 9):
    rate_col = "comp%d_rate"%k
    diff_col = "comp%d_rate_percent_diff"%k
    df["comp%d"%k] = df[rate_col] * df[diff_col]
    df.drop(columns=[rate_col, diff_col], inplace=True)

In [None]:
#https://stackoverflow.com/a/30949063

#mean of each property attribute within its search (srch_id), broadcast back
#onto every row of that search as avg_<column>
averaged_cols = ['prop_starrating', 'prop_location_score1', 'prop_location_score2', 'price_usd']
search_means = df.groupby('srch_id')[averaged_cols].transform('mean')
for col in averaged_cols:
    df['avg_' + col] = search_means[col]


In [None]:
#difference between each row's value and the average of its search:
#value - average for the property attributes ...
for col in ('prop_starrating', 'prop_location_score1', 'prop_location_score2'):
    df['avg_%s_diff'%col] = df[col] - df['avg_%s'%col]
#... but average - value for the price, so that a cheaper hotel gets a higher number
df['avg_price_usd_diff'] = df['avg_price_usd'] - df['price_usd']

##### Split training set into training and test set for evaluating performance

The provided test set has no `booking_bool` column, so we split the training set ourselves to estimate performance.

In [None]:
#number of distinct srch_id values in the loaded data
n_searches = len(df["srch_id"].unique())
print("TOTAL SEARCHES IN DATASET: %d"%n_searches)

In [None]:
#perform 10-fold cross-validation over a subset of the data
splits = 10
split_seed = 1 #fixed so the fold assignment is identical on every run

#columns kept out of the feature matrix: the three datetime columns plus the
#columns the benchmark code removes (booking_bool is the target)
excluded_columns = ("date_time", "book_start", "book_end", "position",
                    "click_bool", "gross_bookings_usd", "random_bool", "booking_bool")
missing_columns = [c for c in excluded_columns if c not in df.columns]
if missing_columns:
    raise ValueError("expected columns not found in df: %s"%missing_columns)
#training and test folds are slices of the same frame, so one list serves both
feature_names = [c for c in df.columns if c not in excluded_columns]

#shuffle with a private, seeded generator instead of the global numpy state
ids = np.random.RandomState(split_seed).permutation(df["srch_id"].unique())
ids = ids[0:(len(ids)//splits)*splits] #drop some srch_ids to keep test set sizes equal
split_ids = np.split(ids,splits)

#rows whose search survived the truncation above; does not change between folds
in_subset = df["srch_id"].isin(ids)

for n,fold_ids in enumerate(split_ids):

    in_fold = df["srch_id"].isin(fold_ids)

    #do something with NaN? For now, just set to zero
    #(fillna returns new frames, so no in-place write on a slice of df and no
    #SettingWithCopyWarning; only the columns that are actually used are copied)

    #this code is based on the benchmark code given at
    #https://github.com/benhamner/ExpediaPersonalizedSortCompetition/

    test_set = df.loc[in_fold, feature_names + ["booking_bool", "click_bool"]].fillna(0)
    training_set = df.loc[in_subset & ~in_fold, feature_names + ["booking_bool"]].fillna(0)

    #train model
    features = training_set[feature_names].values
    target = training_set["booking_bool"].values
    classifier = RandomForestClassifier(n_estimators=50,
                                        verbose=1,
                                        n_jobs=4,
                                        min_samples_split=10,
                                        random_state=1)
    classifier.fit(features, target)

    #test model
    features = test_set[feature_names].values

    #negate the booking probability so an ascending sort puts the most likely booking first
    predictions = classifier.predict_proba(features)[:,1]
    predictions = list(-1.0*predictions)
    recommendations = zip(test_set["srch_id"], test_set["prop_id"], test_set["booking_bool"], test_set["click_bool"], predictions)

    #order by search id, then by descending predicted probability; the score itself is not written out
    rows = [(srch_id, prop_id,booking_bool,click_bool)
        for srch_id, prop_id,booking_bool,click_bool, rank_float
        in sorted(recommendations, key=itemgetter(0,4))]

    #the with-block closes (and therefore flushes) the file before the scoring cell reads it;
    #newline="" is what the csv module expects of files it writes to
    with open("predict%d.csv"%n, "w", newline="") as predictions_file:
        writer = csv.writer(predictions_file, lineterminator="\n")
        writer.writerow(("SearchId", "PropertyId", "BookingBool","ClickBool"))
        writer.writerows(rows)

In [None]:
def relevance(row):
    """Return the relevance grade of one prediction row.

    Args:
        row: mapping-like object (e.g. a DataFrame row) with a "BookingBool"
            entry and, when that one is falsy, a "ClickBool" entry.
    Returns:
        5 if the row was booked, 1 if it was only clicked, 0 otherwise.
    """
    if row["BookingBool"]:
        return 5
    if row["ClickBool"]:
        return 1
    return 0

#https://www.kaggle.com/wendykan/ndcg-example
""" Reference from https://gist.github.com/bwhite/3726239
"""
def dcg_at_k(r, k, method=0):
    """Score is discounted cumulative gain (dcg)
    Relevance is positive real values.  Can use binary
    as the previous methods.
    Example from
    http://www.stanford.edu/class/cs276/handouts/EvaluationNew-handout-6-per.pdf
    >>> r = [3, 2, 3, 0, 0, 1, 2, 2, 3, 0]
    >>> dcg_at_k(r, 1)
    3.0
    >>> dcg_at_k(r, 1, method=1)
    3.0
    >>> dcg_at_k(r, 2)
    5.0
    >>> dcg_at_k(r, 2, method=1)
    4.2618595071429155
    >>> dcg_at_k(r, 10)
    9.6051177391888114
    >>> dcg_at_k(r, 11)
    9.6051177391888114
    Args:
        r: Relevance scores (list or numpy) in rank order
            (first element is the first item)
        k: Number of results to consider
        method: If 0 then weights are [1.0, 1.0, 0.6309, 0.5, 0.4307, ...]
                If 1 then weights are [1.0, 0.6309, 0.5, 0.4307, ...]
    Returns:
        Discounted cumulative gain; 0.0 when no scores are left after
        truncating to the first k entries
    Raises:
        ValueError: if there is at least one score and method is not 0 or 1
    """
    # np.asfarray no longer exists in NumPy 2.0; asarray with an explicit
    # float dtype performs the same conversion on old and new versions
    r = np.asarray(r, dtype=float)[:k]
    if r.size:
        if method == 0:
            # first two positions are undiscounted: log2(2) == 1 for position 2
            return r[0] + np.sum(r[1:] / np.log2(np.arange(2, r.size + 1)))
        elif method == 1:
            # discount starts at the second position: log2(position + 1)
            return np.sum(r / np.log2(np.arange(2, r.size + 2)))
        else:
            raise ValueError('method must be 0 or 1.')
    return 0.


def ndcg_at_k(r, k, method=0):
    """Normalized discounted cumulative gain (ndcg) of a ranked list.

    The dcg of the list as given is divided by the dcg of the same scores
    sorted from high to low, i.e. the best ordering achievable.
    Example from
    http://www.stanford.edu/class/cs276/handouts/EvaluationNew-handout-6-per.pdf
    >>> r = [3, 2, 3, 0, 0, 1, 2, 2, 3, 0]
    >>> ndcg_at_k(r, 1)
    1.0
    >>> r = [2, 1, 2, 0]
    >>> ndcg_at_k(r, 4)
    0.9203032077642922
    >>> ndcg_at_k(r, 4, method=1)
    0.96519546960144276
    >>> ndcg_at_k([0], 1)
    0.0
    >>> ndcg_at_k([1], 2)
    1.0
    Args:
        r: Relevance scores (list or numpy) in rank order
            (first element is the first item)
        k: Number of results to consider
        method: If 0 then weights are [1.0, 1.0, 0.6309, 0.5, 0.4307, ...]
                If 1 then weights are [1.0, 0.6309, 0.5, 0.4307, ...]
    Returns:
        Normalized discounted cumulative gain; 0.0 when the ideal dcg is zero
    """
    ideal_ranking = sorted(r, reverse=True)
    best_possible = dcg_at_k(ideal_ranking, k, method)
    if best_possible:
        return dcg_at_k(r, k, method) / best_possible
    # nothing relevant in the list: avoid dividing by zero
    return 0.

#use NDCG@38 as per Kaggle site
NDCG_K = 38

#mean NDCG over all searches of each split, read back from the predict<n>.csv files
score = []
for n in range(splits):
    result = pd.read_csv('predict%d.csv'%n)
    result['rel'] = result.apply(relevance,axis=1)
    #group once instead of masking the whole frame again for every SearchId;
    #sort=False keeps the searches in file order and groupby keeps the rows of
    #each search in file order, which is the predicted ranking
    per_search = [ndcg_at_k(rel.values, NDCG_K)
                  for _, rel in result.groupby("SearchId", sort=False)["rel"]]
    score.append(sum(per_search) / len(per_search))
print(score)
print("AVERAGED SCORE: %s"%np.mean(score))

In [None]:
#one line per split, then the mean with 1.96 standard deviations of the split scores
for split_no, split_score in enumerate(score):
    print("SPLIT %d: %.16f"%(split_no, split_score))
mean_score = np.mean(score)
spread = 1.96*np.std(score)
print("AVERAGED SCORE: %.16f +/- %.16f"%(mean_score, spread))

# Exploration

In [None]:
#preview the first 50 rows
df.head(50)

In [None]:
#print the array of all column names in df
print(df.columns.values)

In [None]:
#dtype of every column
df.dtypes

##### show correlations with the target booking_bool

In [None]:
#correlation of every numeric column with booking_bool.
#df also holds datetime columns (date_time, book_start, book_end); pandas >= 2.0 no longer
#drops those silently in corr(), so restrict the frame to numeric/bool columns first
df.select_dtypes(include=["number", "bool"]).corr()["booking_bool"]

##### comp#_inv has some weird values

In [None]:
#open questions: what do the -1 values mean? Expedia has no availability but the
#competitor does? Why would Expedia show the hotel in that case?
#(idea: create a new feature based on availability?)
comp1_inv = df["comp1_inv"]
print(comp1_inv.value_counts())
print("Number of NaNs: %d"%comp1_inv.isnull().sum())

##### Not all searches lead to a booking, but all do have a clicked item!

In [None]:
#sets of search ids: all searches, those with at least one booked row,
#and those with at least one clicked row
search_ids = df["srch_id"]
ids = set(search_ids.unique())
id_book = set(search_ids[df["booking_bool"]==1].unique())
id_click = set(search_ids[df["click_bool"]==1].unique())

print("Number of unique searches: %d"%len(ids))
print("Number of unique searches resulting in booking: %d"%len(id_book))
print("Number of unique searches without booking: %d"%len(ids-id_book))
print("Number of unique searches with clicks: %d"%len(id_click))
print("Number of unique searches without clicks: %d"%len(ids-id_click))

##### show histogram of all columns

In [None]:
#one histogram per column, stacked in a single tall figure (10 inches of height per column);
#the trailing semicolon suppresses the array-of-axes repr
n_columns = len(df.columns)
df.hist(column=df.columns.values, figsize=(8, 10*n_columns), layout=(n_columns, 1));