In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats
import seaborn as sns
from matplotlib import rcParams

from sklearn.ensemble import RandomForestClassifier

import datetime
from operator import itemgetter
import csv

pd.set_option('display.max_columns',75)
pd.set_option('display.max_rows',75)

%matplotlib inline

## Sites to look at

https://www.dataquest.io/blog/kaggle-tutorial/

We need one-hot encoding for our categorical data if we use scikit-learn's RandomForestClassifier:

https://roamanalytics.com/2016/10/28/are-categorical-variables-getting-lost-in-your-random-forests/

## Prepare training set

##### Load dataset

In [None]:
#location of the training data, relative to the working directory of the notebook
training_csv = '../../training_set_VU_DM_2014.csv'
df = pd.read_csv(training_csv)

##### Convert time-fields to usable features

In [None]:
def convert_date_time(df):
    """
    Adds booking dates and calendar features to df (modifies df in place)

    "date_time" is parsed to datetime, "book_start" is date_time plus
    srch_booking_window days and "book_end" is book_start plus
    srch_length_of_stay days. For each of the three timestamps the weekday,
    month, quarter and year are stored in separate columns, prefixed with
    "srch", "book_start" and "book_end" respectively.

    Returns nothing.
    """
    #parse the moment of the search
    df["date_time"] = pd.to_datetime(df["date_time"])

    #the stay starts srch_booking_window days after the search
    #and ends srch_length_of_stay days after it started
    days_ahead = pd.to_timedelta(df['srch_booking_window'], unit='D')
    df["book_start"] = df["date_time"] + days_ahead
    stay_length = pd.to_timedelta(df['srch_length_of_stay'], unit='D')
    df["book_end"] = df["book_start"] + stay_length

    #extract usable features: (column prefix, timestamp column) pairs
    timestamps = (("srch", "date_time"),
                  ("book_start", "book_start"),
                  ("book_end", "book_end"))
    for prefix, source in timestamps:
        for part in ("weekday", "month", "quarter", "year"):
            df["%s_%s"%(prefix, part)] = getattr(df[source].dt, part)

In [None]:
#add the booking dates and calendar features to the training set (df is modified in place)
convert_date_time(df)

##### Some feature engineering with competitor columns

In [None]:
def add_comp_columns(df):
    """
    Condenses the competitor rate columns of df (modifies df in place)

    Adds "comp", the row-wise minimum over comp1_rate..comp8_rate, and
    "comp1".."comp8", the product of each competitor's rate and
    rate_percent_diff. The comp#_rate and comp#_rate_percent_diff columns
    are dropped afterwards.

    Returns nothing.
    """
    competitors = range(1,9)
    rate_cols = ["comp%d_rate"%c for c in competitors]
    diff_cols = ["comp%d_rate_percent_diff"%c for c in competitors]

    #Minimum of the comp#_rate columns: 1 if Expedia is cheaper than all competitors (all are 1),
    #-1 as soon as one competitor is cheaper (less chance of being booked at Expedia!)
    df["comp"] = df[rate_cols].min(axis=1)

    #signed price difference per competitor: rate times percentage difference
    for c, rate_col, diff_col in zip(competitors, rate_cols, diff_cols):
        df["comp%d"%c] = df[rate_col] * df[diff_col]

    #the source columns are no longer needed
    df.drop(rate_cols + diff_cols,axis=1,inplace=True)

def add_averages_and_diffs(df):
    """
    Adds per-search averages and deviations from them (modifies df in place)

    For prop_starrating, prop_location_score1, prop_location_score2 and
    price_usd an "avg_<column>" column holds the mean over all rows with the
    same srch_id, and an "avg_<column>_diff" column holds the difference
    between the row value and that mean. For the price the difference is
    average minus value, for the other columns value minus average.

    Returns nothing.
    """
    compared = ('prop_starrating', 'prop_location_score1',
                'prop_location_score2', 'price_usd')

    #first pass: mean per search, broadcast to every row of that search
    #(https://stackoverflow.com/a/30949063)
    for column in compared:
        df['avg_' + column] = df.groupby('srch_id')[column].transform('mean')

    #second pass: compare the value of each row with the average of its search
    for column in compared:
        value, average = df[column], df['avg_' + column]
        if column == 'price_usd':
            #sign flipped on purpose: cheaper is better!
            df['avg_price_usd_diff'] = average - value
        else:
            df['avg_%s_diff'%column] = value - average

In [None]:
#condense the competitor columns of the training set (drops comp#_rate and comp#_rate_percent_diff)
add_comp_columns(df)
#add the per-search averages and the deviation of every row from them
add_averages_and_diffs(df)

# K-FOLD CROSS-VALIDATION

##### Split training set into training and test set for evaluating performance

The provided test set does not contain booking_bool, so we hold out part of the training set to estimate performance.

In [None]:
#number of distinct srch_id values in the training set
n_searches = len(pd.unique(df["srch_id"]))
print("TOTAL SEARCHES IN DATASET: %d"%n_searches)

In [None]:
#perform 10-fold cross-validation over a subset of the data
splits = 10

#columns that are never used as features: the datetime columns (the model
#cannot train on them), the columns that are missing from the provided test
#set, random_bool, and the target itself
excluded_columns = ("date_time", "book_start", "book_end", "position",
                    "click_bool", "gross_bookings_usd", "random_bool",
                    "booking_bool")

#feature names, built once so training and test folds use the same columns in the same order
feature_names = [column for column in df.columns if column not in excluded_columns]

#get list of unique ids
ids = df["srch_id"].unique()

#shuffle the ids randomly, with a fixed seed so the folds (and scores) are reproducible
np.random.RandomState(1).shuffle(ids)

#and obtain a list of test_ids per split
ids = ids[0:int(len(ids)/splits)*splits] #drop some srch_ids to keep test set sizes equal
split_ids = np.split(ids,splits)

#rows that take part in the cross-validation at all (identical for every split)
in_subset = df["srch_id"].isin(ids)

for n,i in enumerate(split_ids):
    
    #obtain test_set from ids, and training_set from the other ids;
    #NaNs are filled with zero by assignment, because an in-place fillna on a
    #slice of df raises SettingWithCopyWarning
    in_fold = df["srch_id"].isin(i)
    test_set = df.loc[in_fold].fillna(0)
    training_set = df.loc[~in_fold & in_subset].fillna(0)
    
    #obtain feature values from training set
    features = training_set[feature_names].values
    
    #obtain target values from training set
    target = training_set["booking_bool"].values
    
    #initialize model
    classifier = RandomForestClassifier(n_estimators=50, 
                                        verbose=1,
                                        n_jobs=4,
                                        min_samples_split=10,
                                        random_state=1)
    #train model
    classifier.fit(features, target)
    
    #get feature values from the test set
    features = test_set[feature_names].values
    
    #make predictions: negated probability of a booking, so that an ascending
    #sort puts the most likely booking first
    predictions = classifier.predict_proba(features)[:,1]
    predictions = list(-1.0*predictions)
    recommendations = zip(test_set["srch_id"], test_set["prop_id"],\
                          test_set["booking_bool"], test_set["click_bool"],\
                          predictions)
    
    #generate the sorted rows (per search, best prediction first)
    rows = [(srch_id, prop_id,booking_bool,click_bool)
        for srch_id, prop_id,booking_bool,click_bool, rank_float
        in sorted(recommendations, key=itemgetter(0,4))]
    
    #write sorted rows to file
    with open("predict%d.csv"%n, "w") as outfile:
        writer = csv.writer(outfile, lineterminator="\n")
        writer.writerow(("SearchId", "PropertyId", "BookingBool","ClickBool"))
        writer.writerows(rows)


In [None]:
def relevance(row):
    """
    Returns the relevance score of a row:
    5 if it was booked, 1 if it was only clicked, 0 otherwise
    """
    if row["BookingBool"]:
        return 5
    return 1 if row["ClickBool"] else 0

def dcg_at_k(r, k):
    """
    Calculates the discounted cumulative gain as per Kaggle's definition
    
    r: relevance scores in ranked order (best ranked item first)
    k: number of top-ranked items to take into account
    
    Returns the DCG@k (0. when there are no items to score)
    
    Code adjusted from https://www.kaggle.com/wendykan/ndcg-example
    """
    #np.asfarray was removed in NumPy 2.0; asarray with dtype=float is equivalent
    r = np.asarray(r, dtype=float)[:k]
    if r.size:
        #gain 2^rel - 1, discounted by log2(rank + 1) with ranks starting at 1
        return np.sum((2 ** r -1)/ np.log2(np.arange(2, r.size + 2)))
    return 0.


def ndcg_at_k(r, k):
    """
    Calculates the normalized discounted cumulative gain as per
    Kaggle's definition: the DCG@k of the given ranking divided by the
    DCG@k of the same scores in descending order
    
    Returns the NDCG@k (0. when the ideal DCG is zero)
    
    Code adjusted from https://www.kaggle.com/wendykan/ndcg-example
    """
    #best achievable DCG: most relevant items ranked first
    ideal = dcg_at_k(sorted(r, reverse=True), k)
    return dcg_at_k(r, k) / ideal if ideal else 0.

#initialize list to store score per split
score = []

for n in range(splits):
    
    #read predict file for this split
    result = pd.read_csv('predict%d.csv'%n)
    
    #add relevance score
    result['rel'] = result.apply(relevance,axis=1)
    
    #use NDCG@38 as per Kaggle site, computed once per search;
    #a single groupby replaces a full scan of the frame for every srch_id and
    #keeps the rows of each search in file order (i.e. in ranked order)
    ndcg_per_search = result.groupby("SearchId", sort=False)["rel"].apply(
        lambda rel: ndcg_at_k(rel.values, 38))
    
    #append averaged score to final list
    score.append(ndcg_per_search.mean())
print(score)
print("AVERAGED SCORE: %s"%np.mean(score))

In [None]:
#pretty print scores per split
for n,i in enumerate(score):
    print("SPLIT %d: %.16f"%(n,i))
    
#print final score +/- 1.96 standard deviations of the per-split scores
#NOTE(review): this is the spread of the individual splits, not a 95% confidence
#interval of the mean (that would use the standard error, std/sqrt(splits)) - confirm which is intended
print("AVERAGED SCORE: %.16f +/- %.16f"%(np.mean(score),1.96*np.std(score)))

# GENERATING PREDICTION FILE

##### Train model

In [None]:
#replace every NaN in the training set by zero
df.fillna(0,inplace=True)

#obtain feature names: start from all columns and remove
# - date_time, book_start, book_end: model cannot train on DateTime
# - position, click_bool, gross_bookings_usd, booking_bool: not available in test set
# - random_bool: useless variable (without position)
feature_names = list(df.columns)
for dropped in ("date_time", "book_start", "book_end",
                "position", "click_bool", "gross_bookings_usd", "booking_bool",
                "random_bool"):
    feature_names.remove(dropped)

#obtain feature values and target values
features = df[feature_names].values
target = df["booking_bool"].values

#remove training set from memory as it is no longer needed
del df

#initialize RandomForestClassifier and train it
classifier = RandomForestClassifier(n_estimators=50,
                                    min_samples_split=10,
                                    random_state=1,
                                    n_jobs=4,
                                    verbose=2)
classifier.fit(features, target)

#remove features, target from memory as they are no longer needed
del features, target

##### Load test set and prepare

In [None]:
#load test set
test = pd.read_csv('../../test_set_VU_DM_2014.csv')

#apply the same feature engineering steps as for the training set, in the same order
for add_features in (convert_date_time, add_comp_columns, add_averages_and_diffs):
    add_features(test)

#fill all NaNs with zeros
test.fillna(0,inplace=True)

In [None]:
# obtain feature values: select exactly the columns the classifier was trained
# on (feature_names from the training cell), in the same order. Rebuilding the
# list from test.columns would silently feed misaligned features to the model
# if the column order of the two csv files differed; a missing column now
# raises a KeyError instead.
features = test[feature_names].values

# predict using trained model; the probability of a booking is negated so that
# an ascending sort puts the most likely booking first
predictions = classifier.predict_proba(features)[:,1]
predictions = list(-1.0*predictions)

# materialize the triples: a bare zip is a one-shot iterator in Python 3, so
# re-running the cell that sorts and writes them would produce an empty file
recommendations = list(zip(test["srch_id"], test["prop_id"], predictions))

#remove test set from memory
del test

In [None]:
# order per search id, then by recommendation value (lowest, i.e. best, first)
ranked = sorted(recommendations, key=lambda rec: (rec[0], rec[2]))

# the recommendation value itself is not part of the output
rows = [(srch_id, prop_id) for srch_id, prop_id, rank_float in ranked]

#write sorted prediction to file
with open("final_predict.csv", "w") as outfile:
    writer = csv.writer(outfile, lineterminator="\n")
    writer.writerow(("SearchId", "PropertyId"))
    writer.writerows(rows)

# Exploration

##### show first few rows

In [None]:
#the model training cell deleted df (after filling its NaNs with zeros), so a
#top-to-bottom run would fail here with a NameError; reload the raw training
#set so the exploration looks at the original values
df = pd.read_csv('../../training_set_VU_DM_2014.csv')
df.head(50)

##### show column names

In [None]:
#print the array of all column names
print(df.columns.values)

##### show column types

In [None]:
#dtype of every column
df.dtypes

##### show short description per column

In [None]:
#summary statistics per column
df.describe()

In [None]:
#number of distinct srch_id values
len(df["srch_id"].unique())

In [None]:
#largest number of rows (with a date_time) belonging to a single search
df.groupby("srch_id")["date_time"].count().max()

##### show correlations with the target booking_bool

In [None]:
#correlate numeric (and boolean) columns only: since pandas 2.0 corr() no longer
#drops non-numeric columns such as date_time silently, but raises an error
df.select_dtypes(include=["number", "bool"]).corr()["booking_bool"]

In [None]:
def plot_corr(df,size=10):
    '''Function plots a graphical correlation matrix for each pair of numeric columns in the dataframe.

    Input:
        df: pandas DataFrame
        size: vertical and horizontal size of the plot'''

    #correlate numeric (and boolean) columns only: since pandas 2.0 corr() raises
    #on non-numeric columns instead of dropping them silently
    corr = df.select_dtypes(include=["number", "bool"]).corr()
    fig, ax = plt.subplots(figsize=(size, size))
    #draw the matrix once and attach the colorbar to that image
    cax = ax.matshow(corr)
    fig.colorbar(cax)
    #label the rows with the column names
    plt.yticks(range(len(corr.columns)), corr.columns)
    

In [None]:
#draw the correlation matrix of df on a 25 by 25 figure
plot_corr(df,25)

##### comp#_inv has some weird values

In [None]:
#what are -1 values? Expedia has no availability, but competitor does? Why would Expedia then show the hotel?
#(Create new feature based on availablility?)
print(df["comp1_inv"].value_counts())
print("Number of NaNs: %d"%df["comp1_inv"].isnull().sum())

##### Not all searches lead to a booking, but all do have a clicked item!

In [None]:
#all searches, the searches with at least one booked row, and those with at least one clicked row
ids = set(df["srch_id"].unique())
booked_rows = df["booking_bool"]==1
clicked_rows = df["click_bool"]==1
id_book = set(df.loc[booked_rows,"srch_id"].unique())
id_click = set(df.loc[clicked_rows,"srch_id"].unique())

#report the sizes; the set differences are the searches without a booking / click
print("Number of unique searches: %d"%len(ids))
print("Number of unique searches resulting in booking: %d"%len(id_book))
print("Number of unique searches without booking: %d"%len(ids-id_book))
print("Number of unique searches with clicks: %d"%len(id_click))
print("Number of unique searches without clicks: %d"%len(ids-id_click))

##### Get N largest values from column

In [None]:
#the 9 rows with the highest price_usd, highest first. nlargest replaces the
#argsort construction, which used positions as index labels (only correct for
#a default index) and would list NaN prices first, as argsort sorts NaN last
df.nlargest(9, "price_usd")[["srch_id","price_usd"]]

In [None]:
#this search seems to have incorrect price_usd values, as all values have no cents?
df.loc[df["srch_id"]==78107]

##### show histogram of all columns

In [None]:
#request one histogram per column, stacked in a single column of subplots;
#the figure height grows with 10 per column and the trailing semicolon
#suppresses the textual output of the cell
df.hist(df.columns.values,figsize=(8,10*len(df.columns)),layout=(len(df.columns),1));