In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split

import pandas as pd
import random
import csv
import os
import math
import numpy as np
from tqdm import tqdm
from numba import jit
import matplotlib.pyplot as plt
import seaborn as sns

pd.options.mode.chained_assignment = None

In [3]:
# Load the three pre-sampled frames in one pass. The order of the paths
# matches the order of the names on the left-hand side.
# NOTE: unpickling executes code embedded in the file; only load pickles
# that were produced by this project.
traindf, testdf, nep_testdf = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        "./pickles/df_small_train.pkl",
        "./pickles/df_small_test.pkl",
        "./pickles/df_small_train_test_yk.pkl",
    )
)

# Alternative input: a very small dataset for quick testing
# traindf = pd.read_pickle("./pickles/df_verysmall_train_test_yk.pkl")

# Alternative input: the whole train/test dataset
# traindf = pd.read_csv("data/training_set_VU_DM.csv")
traindf.head()

Unnamed: 0,srch_id,date_time,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,...,comp6_rate_percent_diff,comp7_rate,comp7_inv,comp7_rate_percent_diff,comp8_rate,comp8_inv,comp8_rate_percent_diff,click_bool,gross_bookings_usd,booking_bool
1196703,79994,2013-01-03 13:23:33,29,132,,,219,2990,3,4.5,...,,,,,0.0,0.0,,0,,0
1196704,79994,2013-01-03 13:23:33,29,132,,,219,4256,4,4.5,...,,,,,0.0,0.0,,1,322.31,1
1196705,79994,2013-01-03 13:23:33,29,132,,,219,6427,4,4.0,...,,,,,0.0,0.0,28.0,0,,0
1196706,79994,2013-01-03 13:23:33,29,132,,,219,8289,4,4.5,...,,,,,0.0,0.0,,0,,0
1196707,79994,2013-01-03 13:23:33,29,132,,,219,13668,4,4.0,...,,,,,0.0,0.0,,0,,0


In [4]:
def competitors(df):
    """
    Add two 0/1 indicator columns derived from the eight competitor fields.

    competitor_bool    -- 1 if any of comp1_rate..comp8_rate equals 1, else 0.
    availability_bools -- 1 if any of comp1_inv..comp8_inv equals 1, else 0.

    Missing values never compare equal to 1, so a row without competitor
    data gets 0 in both columns.

    The columns are written onto ``df`` itself and the same frame is returned.
    """
    # comp1_rate..comp8_rate and comp1_inv..comp8_inv
    compare_cols = [f"comp{i}_rate" for i in range(1, 9)]
    availability_cols = [f"comp{i}_inv" for i in range(1, 9)]

    # TODO (carried over from the original implementation): decide whether
    # competitor_bool should additionally require the matching compN_inv to
    # be 1. For now the two indicators are computed independently.
    # Vectorised: compare whole column blocks instead of iterating rows.
    df["competitor_bool"] = df[compare_cols].eq(1).any(axis=1).astype("int64")
    df["availability_bools"] = (
        df[availability_cols].eq(1).any(axis=1).astype("int64")
    )

    return df

In [5]:
def visitor_history(df):
    """
    Add a 0/1 column ``visited_before`` to the dataframe.

    The value is 1 when both ``visitor_hist_starrating`` and
    ``visitor_hist_adr_usd`` are present for the row, and 0 when either of
    them is missing.

    The column is written onto ``df`` itself and the same frame is returned.
    """
    has_star_history = df["visitor_hist_starrating"].notna()
    has_price_history = df["visitor_hist_adr_usd"].notna()

    # Element-wise AND works positionally, so it is also correct when the
    # index contains repeated labels (a per-row label lookup is not).
    df["visited_before"] = (has_star_history & has_price_history).astype("int64")

    return df

In [6]:
def price_quality(df):
    """
    Add a ``price_quality`` column holding price_usd / prop_starrating.

    Rows where the price or the star rating is 0 (or where either value is
    missing) have no usable ratio; they receive the median of the ratios
    that could be computed.

    The column is written onto ``df`` itself and the same frame is returned.
    """
    price = df["price_usd"]
    stars = df["prop_starrating"]

    # A zero price or a zero star rating is treated as "no ratio". Missing
    # inputs pass this mask but already produce NaN in the division.
    usable = (price != 0) & (stars != 0)
    ratio = (price / stars).where(usable)

    # Assign the filled result back instead of calling fillna(inplace=True)
    # on a column selection, which does not reliably update the frame.
    df["price_quality"] = ratio.fillna(ratio.median())

    return df

In [7]:
def price_category(df):
    """
    Add a per-night price column and a 7-level price band column.

    price_correction -- price_usd divided by srch_length_of_stay; missing
                        results are replaced by the column median.
    PriceBand        -- quantile band (labels 1..7) of price_correction,
                        produced by ``pd.qcut`` with 7 bins.

    Missing values in ``price_usd`` itself are also replaced by its median,
    after the per-night price has been computed from the unfilled values.

    Note from the original author: a preprocessing quantile cut gave the
    band edges
    [(6.0889999999999995, 69.0] < (69.0, 90.0] < (90.0, 110.0]
    < (110.0, 136.0] < (136.0, 170.077] < (170.077, 239.0] < (239.0, 554655.0]]

    The columns are written onto ``df`` itself and the same frame is returned.
    """
    # Correct for number of nights and add as new column
    per_night = df["price_usd"] / df["srch_length_of_stay"]

    # Assign filled columns back rather than using fillna(inplace=True) on a
    # column selection, which does not reliably update the frame.
    df["price_correction"] = per_night.fillna(per_night.median())
    df["price_usd"] = df["price_usd"].fillna(df["price_usd"].median())

    # Make a new column of price categories
    df["PriceBand"] = pd.qcut(df["price_correction"], 7, labels=np.arange(1, 8))

    return df

In [8]:
def process_remaining_cols(df):
    """
    Fill missing values in the remaining feature columns.

    promotion_flag  -- missing values replaced by the column mean.
    prop_brand_bool -- missing values replaced by the column median.
    random_bool     -- missing values replaced by the column median.

    The columns are overwritten on ``df`` itself and the same frame is
    returned.
    """
    # NOTE(review): promotion_flag is filled with the mean while the other
    # two flags use the median; confirm that the difference is intended.
    # Filled columns are assigned back rather than using fillna(inplace=True)
    # on a column selection, which does not reliably update the frame.
    df["promotion_flag"] = df["promotion_flag"].fillna(df["promotion_flag"].mean())

    for column in ("prop_brand_bool", "random_bool"):
        df[column] = df[column].fillna(df[column].median())

    return df

In [9]:
def add_score(df):
    """
    Return 1 when the given record was booked or clicked, otherwise 0.

    ``df`` is indexed by the keys "booking_bool" and "click_bool"; each
    lookup is compared against 1.
    """
    return 1 if df["booking_bool"] == 1 or df["click_bool"] == 1 else 0


def preprocessing(traindf, dollarprice=True):
    """
    Run all feature-engineering steps and return the model-ready columns.

    Parameters
    ----------
    traindf : pandas.DataFrame
        Raw frame. The helper steps add their columns to this frame in
        place, so the caller's object is modified as well.
    dollarprice : bool, optional
        Currently unused; kept so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        An independent copy containing only the selected feature columns,
        so later assignments to it do not act on a slice of ``traindf``.
    """
    traindf = competitors(traindf)
    traindf = visitor_history(traindf)
    traindf = price_quality(traindf)
    traindf = price_category(traindf)
    traindf = process_remaining_cols(traindf)

    # Columns handed to the model / kept for evaluation
    relevant_columns = ["prop_id", "srch_id", "position", "price_quality",
                        "competitor_bool", "availability_bools", "visited_before",
                        "click_bool", "booking_bool", "PriceBand",
                        "promotion_flag", "prop_brand_bool", "random_bool"]

    # Explicit copy: downstream cells assign new columns and dtypes to the result.
    df = traindf[relevant_columns].copy()

    return df

In [11]:
# numba's jit cannot work with pandas Series; the code would have to be
# rewritten on top of NumPy arrays before it could be applied.
# @jit(nopython=True)
df = preprocessing(nep_testdf)
df.head()

Unnamed: 0,prop_id,srch_id,position,price_quality,competitor_bool,availability_bools,visited_before,click_bool,booking_bool,PriceBand,promotion_flag,prop_brand_bool,random_bool
3303361,1230,221792,7,96.333333,1,1,0,0,0,6,0,0,0
3303362,1546,221792,31,72.5,1,1,0,0,0,4,0,0,0
3303363,3572,221792,32,69.5,0,1,0,0,0,4,0,0,0
3303364,6623,221792,21,76.333333,0,0,0,0,0,6,1,1,0
3303365,8586,221792,34,73.0,0,0,0,0,0,6,0,0,0


In [12]:
# Store the ratio as 32-bit floats, then show the resulting column types
df["price_quality"] = df["price_quality"].astype(np.float32)
print(df.dtypes)
print()

# Report per-column null counts only when at least one value is missing
if not df.isnull().to_numpy().any():
    print("\x1b[31mNo missing values!! :D \'\x1b[0m")
else:
    print("\x1b[31mMissing values: \'\x1b[0m")
    print(df.isnull().sum())

prop_id                  int64
srch_id                  int64
position                 int64
price_quality          float32
competitor_bool          int64
availability_bools       int64
visited_before           int64
click_bool               int64
booking_bool             int64
PriceBand             category
promotion_flag           int64
prop_brand_bool          int64
random_bool              int64
dtype: object

[31mNo missing values!! :D '[0m


## Totale dataset zonder missende waardes, hier kan RandomForestClassifier op gerund worden

In [13]:
# Persist the cleaned frame once. An existing file is never overwritten, so
# delete the pickle by hand after changing the preprocessing, otherwise the
# file on disk stays stale.
filename = "./pickles/df_small_clean_test.pkl"
if not os.path.exists(filename):
    df.to_pickle(filename)

# Summary statistics of the numeric columns
df.describe()

Unnamed: 0,prop_id,srch_id,position,price_quality,competitor_bool,availability_bools,visited_before,click_bool,booking_bool,promotion_flag,prop_brand_bool,random_bool
count,50003.0,50003.0,50003.0,50003.0,50003.0,50003.0,50003.0,50003.0,50003.0,50003.0,50003.0,50003.0
mean,69962.446393,168180.720197,16.966842,98.83709,0.151071,0.086195,0.054057,0.044477,0.027778,0.212607,0.628762,0.294122
std,40552.68154,94663.088682,10.462782,1722.586914,0.358122,0.280654,0.226132,0.206155,0.164339,0.409156,0.483141,0.455652
min,1.0,391.0,1.0,2.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,35140.0,88955.0,8.0,29.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,69489.0,169827.0,16.0,38.666668,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
75%,105139.5,251018.0,26.0,52.1495,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
max,140816.0,332726.0,39.0,102439.5,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [11]:
# Features are every column except prop_id; prop_id itself is the target.
# NOTE(review): the features still contain click_bool, booking_bool and
# position - confirm that this choice of target and features is intended.
X = df.drop(columns="prop_id").copy()
y = df["prop_id"]

# 80/20 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

In [12]:
# Fit a random forest on the training split and predict the held-out split.
# random_state is pinned so that repeated runs build the same forest.
rfc = RandomForestClassifier(n_jobs=1, random_state=1)
model = rfc.fit(X_train, y_train)  # fit() returns the fitted estimator itself
predictions = model.predict(X_test)



In [13]:
# Attach the model output to the held-out rows
X_test["predictions"] = predictions
# Not displayed: only the last expression of a cell is rendered
X_test.head(10)

# One row per search id, holding the list of 'position' values of that search.
# NOTE(review): this collects 'position', not 'predictions' - confirm intent.
pred_groups = X_test.groupby('srch_id').agg({'position':lambda x: list(x)})
pred_groups

Unnamed: 0_level_0,position
srch_id,Unnamed: 1_level_1
65,"[15, 14, 29, 30, 37, 3, 36, 8]"
2050,"[24, 9, 19]"
2625,"[4, 26, 16, 1, 7]"
3927,"[18, 19]"
7707,"[13, 36, 28, 27, 25, 24, 20, 7]"
...,...
322659,"[27, 12, 13, 9, 26, 7, 8]"
327160,"[18, 6, 4]"
329266,"[16, 14, 21, 20, 1, 13]"
330548,"[13, 24, 14, 20]"


In [14]:
# Same grouping as for the test split, but over the full feature frame X:
# one list of 'position' values per srch_id.
rank_pred_groups   = X.groupby('srch_id').agg({'position':lambda x: list(x)})

# Get unique srchids to group by
unique_srchid = X.srch_id.unique()