In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split

import pandas as pd
import random
import csv
import os
import math
import numpy as np
from tqdm import tqdm
from numba import jit
import matplotlib.pyplot as plt
import seaborn as sns

pd.options.mode.chained_assignment = None

from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif

  import pandas.util.testing as tm


In [2]:
traindf = pd.read_pickle("./pickles/df_small_train.pkl")
# testdf = pd.read_pickle("./pickles/df_small_test.pkl")
# nep_testdf = pd.read_pickle("./pickles/df_small_train_test_yk.pkl")

# run for a very small dataset for testing
# traindf = pd.read_pickle("./pickles/df_verysmall_train_test_yk.pkl")


# run if you want to use the whole train/test dataset
# traindf = pd.read_csv("data/test_set_VU_DM.csv")
traindf.head()

# filename = "./pickles/clean_test_set_VU_DM.pkl"
filename = "./pickles/df_small_clean.pkl"

Unnamed: 0,srch_id,date_time,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,...,comp6_rate_percent_diff,comp7_rate,comp7_inv,comp7_rate_percent_diff,comp8_rate,comp8_inv,comp8_rate_percent_diff,click_bool,gross_bookings_usd,booking_bool
1196703,79994,2013-01-03 13:23:33,29,132,,,219,2990,3,4.5,...,,,,,0.0,0.0,,0,,0
1196704,79994,2013-01-03 13:23:33,29,132,,,219,4256,4,4.5,...,,,,,0.0,0.0,,1,322.31,1
1196705,79994,2013-01-03 13:23:33,29,132,,,219,6427,4,4.0,...,,,,,0.0,0.0,28.0,0,,0
1196706,79994,2013-01-03 13:23:33,29,132,,,219,8289,4,4.5,...,,,,,0.0,0.0,,0,,0
1196707,79994,2013-01-03 13:23:33,29,132,,,219,13668,4,4.0,...,,,,,0.0,0.0,,0,,0


In [3]:
def competitors(df):
    """
    Make a new column in the dataframe (competitor_bool) for when there 
    exists a competitor and there are available rooms.
    1 is True, 0 is False.
    """
    competitor_bools = []
    availability_bools = []
    
    # Columns for existing competitors
    compare_cols = ["comp1_rate", "comp2_rate", "comp3_rate", "comp4_rate", "comp5_rate",
                "comp6_rate", "comp7_rate", "comp8_rate"]
    
    # Columns for room availability
    availability_cols = ["comp1_inv", "comp2_inv", "comp3_inv", "comp4_inv", "comp5_inv", 
                        "comp6_inv", "comp7_inv", "comp8_inv"]

    for index, row in df.iterrows():
        comp_bool = 0
        available = 0

        # Check for a combination of competitor price and room-availability. 
        for competitor, availability in zip(compare_cols, availability_cols):
            if row[competitor] == 1: # and row[availability] == 1: TODO!!!
                comp_bool = 1
            if row[availability] == 1:
                available = 1
        
        competitor_bools.append(comp_bool)
        availability_bools.append(available)
        
    # Append list of bools to new column
    df["competitor_bool"] = competitor_bools
    df["availability_bools"] = availability_bools
    
    return df

In [4]:
def visitor_history(df):
    """
    Add column that tells us whether someone has visited a hotel before.
    Column name = total_visited; 1 is True, 0 is False.
    """
    
    # Get none-missing values
    hist_starrating = df.visitor_hist_starrating.isna()
    hist_adr = df.visitor_hist_adr_usd.isna()

    # Dit kan waarschijnlijk veel mooier en sneller maar het werkt...
    total_visited = []
    for index, row in df.iterrows():
        if hist_starrating[index] or hist_adr[index]:
            visited = 0
        else:
            visited = 1
        total_visited.append(visited)

    df["visited_before"] = total_visited
    
    return df

In [5]:
def price_quality(df):
    """
    Add a column of ratio price/quality to the DataFrame.
    """
    price_quality = []
    
    # Check for missing values
    for index, row in df.iterrows():
        price, quality =  row["price_usd"],  row["prop_starrating"]
        if price and quality:
            ratio = price / quality
        else:
            ratio = None
        price_quality.append(ratio)
        
    df["price_quality"] = price_quality
    
    # Replace missing values with median
    df["price_quality"].fillna((df["price_quality"].median()), inplace=True)
    
    return df 

In [6]:
def price_category(df):
    """
    Add a column of categories of price_usd and a column
    that corrected price for number of nights.
    Preprocessing of quantile cut showed that categories are:
    [(6.0889999999999995, 69.0] < (69.0, 90.0] < (90.0, 110.0] 
    < (110.0, 136.0] < (136.0, 170.077] < (170.077, 239.0] < (239.0, 554655.0]]
    
    """
    
    # Correct for number of nights ad add as new column
    df["price_correction"] = df["price_usd"] / df["srch_length_of_stay"]
    
    # Replace missing values
    df["price_correction"].fillna((df["price_correction"].median()), inplace=True)
    df["price_usd"].fillna((df["price_usd"].median()), inplace=True)
    
    # Make a new column of price categories
    df['PriceBand'] = pd.qcut(df["price_correction"], 7, labels=np.arange(1,8))
    
    return df

In [7]:
def process_remaining_cols(df):
    """
    Add some remaining (and interesting columns) to the dataframe.
    """
    
    # Replace missing values with median
    df["prop_brand_bool"].fillna((df["prop_brand_bool"].median()), inplace=True)
    df["random_bool"].fillna((df["random_bool"].median()), inplace=True)
    
    # Boolians
    df["prop_location_score1"].fillna((df["prop_location_score1"].mean()), inplace=True)
    df["prop_location_score2"].fillna((df["prop_location_score2"].mean()), inplace=True)
        
     # Replace missing values with mean
    df["promotion_flag"].fillna((df["promotion_flag"].mean()), inplace=True)
    
    return df

In [21]:
def add_score(df):
    """
    Add a score 
    """
    
    # every hotel that is clicked on gets an importance score of 1
    df["importance"] = df["click_bool"]
    
    # every hotel that is booked gets an importance score of 5 
    df["importance"][df["booking_bool"] == 1] = 5
        
    return df


def preprocessing(traindf, dollarprice=True):
    traindf = competitors(traindf)
    print("cleaned competitors")
    
    traindf = visitor_history(traindf)
    print("cleaned visitor history")
    
    traindf = price_quality(traindf)
    print("cleaned price quality")
    
    traindf = price_category(traindf)
    print("cleaned price category")
    
    traindf = process_remaining_cols(traindf)
    print("did remaining columns")
    
    
    # Add relevant columns
    df = traindf[["prop_id", "srch_id", "position", "price_quality", 
                  "competitor_bool", "availability_bools", "visited_before", 
                  "click_bool", "booking_bool", "PriceBand", 
                  "promotion_flag", "prop_brand_bool", "random_bool",
                  "prop_location_score1", "prop_location_score2"]]
        
    df = add_score(df)
    df.drop(columns=["click_bool", "booking_bool"])
    
    return df

In [22]:
# Oh jit kan niet met pandas Series werken, moet omgeschreven worden naar np arrays
# @jit(nopython=True)
df = preprocessing(traindf)
# df.head(5)
df.describe()

cleaned competitors
cleaned visitor history
cleaned price quality
cleaned price category
did remaining columns


Unnamed: 0,prop_id,srch_id,position,price_quality,competitor_bool,availability_bools,visited_before,click_bool,booking_bool,promotion_flag,prop_brand_bool,random_bool,prop_location_score1,prop_location_score2,importance
count,49335.0,49335.0,49335.0,49335.0,49335.0,49335.0,49335.0,49335.0,49335.0,49335.0,49335.0,49335.0,49335.0,49335.0,49335.0
mean,70233.165501,164188.521577,16.837235,91.824185,0.162684,0.095368,0.041776,0.044532,0.027668,0.21125,0.634053,0.297902,2.843792,0.126832,0.155204
std,40668.580445,94670.44902,10.429717,1517.174629,0.369081,0.293726,0.200078,0.206277,0.164021,0.408199,0.4817,0.457341,1.526112,0.138302,0.827339
min,1.0,124.0,1.0,2.03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,35122.0,85522.0,8.0,30.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.79,0.0283,0.0
50%,70065.0,163166.0,16.0,39.65,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2.77,0.1134,0.0
75%,105183.0,242490.0,26.0,53.5,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,3.97,0.13435,0.0
max,140816.0,332740.0,39.0,110931.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,6.98,1.0,5.0


In [23]:
display(df.describe())

Unnamed: 0,prop_id,srch_id,position,price_quality,competitor_bool,availability_bools,visited_before,click_bool,booking_bool,promotion_flag,prop_brand_bool,random_bool,prop_location_score1,prop_location_score2,importance
count,49335.0,49335.0,49335.0,49335.0,49335.0,49335.0,49335.0,49335.0,49335.0,49335.0,49335.0,49335.0,49335.0,49335.0,49335.0
mean,70233.165501,164188.521577,16.837235,91.824185,0.162684,0.095368,0.041776,0.044532,0.027668,0.21125,0.634053,0.297902,2.843792,0.126832,0.155204
std,40668.580445,94670.44902,10.429717,1517.174629,0.369081,0.293726,0.200078,0.206277,0.164021,0.408199,0.4817,0.457341,1.526112,0.138302,0.827339
min,1.0,124.0,1.0,2.03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,35122.0,85522.0,8.0,30.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.79,0.0283,0.0
50%,70065.0,163166.0,16.0,39.65,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2.77,0.1134,0.0
75%,105183.0,242490.0,26.0,53.5,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,3.97,0.13435,0.0
max,140816.0,332740.0,39.0,110931.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,6.98,1.0,5.0


In [24]:
# df["price_quality"] = df.price_quality.astype(np.float32)
print(df.dtypes)
print()

if df.isnull().sum().sum() != 0:    
    print("\x1b[31mMissing values: \'\x1b[0m")
    print(df.isnull().sum())
else:
    print("\x1b[31mNo missing values!! :D \'\x1b[0m")

prop_id                    int64
srch_id                    int64
position                   int64
price_quality            float64
competitor_bool            int64
availability_bools         int64
visited_before             int64
click_bool                 int64
booking_bool               int64
PriceBand               category
promotion_flag             int64
prop_brand_bool            int64
random_bool                int64
prop_location_score1     float64
prop_location_score2     float64
importance                 int64
dtype: object

[31mNo missing values!! :D '[0m


## Totale dataset zonder missende waardes, hier kan RandomForestClassifier op gerund worden

In [26]:
filename = filename
if not os.path.exists(filename):
    df.to_pickle(filename)

# df.describe()