In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split

import pandas as pd
import random
import csv
import os
import math
import numpy as np
from tqdm import tqdm

from numba import jit
import matplotlib.pyplot as plt
import seaborn as sns

pd.options.mode.chained_assignment = None

from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif

  import pandas.util.testing as tm


# Kies testing_set is False of True!!

In [None]:
# Set the run mode here: True processes the test set, False the training set.
testing_set = True

In [None]:
def choose_data(testing_set):
    """
    Load the price-corrected dataframe and choose where the cleaned data is saved.

    testing_set: truthy selects the test set, falsy selects the training set.

    Returns a tuple (dataframe read from the HDF file, output path for the
    cleaned dataframe).
    """

    # Plain truthiness instead of `is True`: an identity check silently sends
    # values such as numpy.bool_(True) or 1 to the training branch.
    if testing_set:
        source_path = "./data/corrected_price_testset.hdf"
        filename = "./data/test_clean.hdf"
    else:
        source_path = "./data/corrected_price.hdf"
        filename = "./data/traindf_clean.hdf"

    traindf = pd.read_hdf(source_path)

    return traindf, filename
        
# Load the dataframe for the selected mode and remember the output path for the cleaned result.
traindf, filename = choose_data(testing_set)

In [6]:
def downsampling(df):
    """
    Balance the training set on the importance score.

    Rows with importance != 0 (clicked or booked) and rows with importance == 0
    are both randomly reduced to the size of the smaller group, so the result
    holds the same number of rows from each group.

    Sampling uses numpy's global random state; call np.random.seed beforehand
    for a reproducible sample.

    Returns a new dataframe with the importance == 0 sample first, followed by
    the importance != 0 sample.
    """
    is_relevant = df["importance"] != 0
    relevant = df[is_relevant]
    irrelevant = df[~is_relevant]

    # Sampling each group at its own full size would only shuffle the rows;
    # balancing needs both groups cut down to the smaller group's size.
    n_per_class = min(len(relevant), len(irrelevant))

    # Positional sampling: works with duplicate index labels and empty groups.
    relevant_sample = relevant.iloc[np.random.permutation(len(relevant))[:n_per_class]]
    irrelevant_sample = irrelevant.iloc[np.random.permutation(len(irrelevant))[:n_per_class]]

    return pd.concat([irrelevant_sample, relevant_sample], axis=0)

In [7]:
def competitors(df):
    """
    Add two 0/1 indicator columns summarising the eight competitor fields.

    competitor_lower:     1 when any of comp1_rate .. comp8_rate equals 1, else 0.
    competitor_available: 1 when any of comp1_inv .. comp8_inv equals 1, else 0.

    Missing competitor values count as "not 1". The dataframe is modified in
    place and also returned.
    """
    # NOTE(review): the column names assume that compN_rate == 1 means a lower
    # competitor price and compN_inv == 1 means availability -- confirm this
    # against the dataset description.
    rate_columns = ["comp{}_rate".format(i) for i in range(1, 9)]
    inv_columns = ["comp{}_inv".format(i) for i in range(1, 9)]

    # One vectorised row-wise test replaces sixteen chained-indexing
    # assignments, which are unreliable and do nothing under Copy-on-Write.
    df["competitor_lower"] = df[rate_columns].eq(1).any(axis=1).astype("int64")
    df["competitor_available"] = df[inv_columns].eq(1).any(axis=1).astype("int64")

    return df

In [8]:
def visitor_history(df):
    """
    Add a column that tells us whether the visitor has a purchase history.

    Column name = visited_before; 1 when visitor_hist_starrating or
    visitor_hist_adr_usd is filled in, 0 when both are missing.

    The dataframe is modified in place and also returned.
    """
    has_history = df["visitor_hist_starrating"].notna() | df["visitor_hist_adr_usd"].notna()

    # Direct column assignment instead of chained indexing (df[col][mask] = 1),
    # which pandas does not guarantee to write back into the frame.
    df["visited_before"] = has_history.astype("int64")

    return df

In [9]:
def price_quality(df):
    """
    Add a column of ratio price/quality (price_usd / prop_starrating) to the DataFrame.

    A star rating of 0 is replaced by 0.0001 in the prop_starrating column
    itself before dividing. Rows where the price or the star rating is missing
    receive the median of the computed ratios.

    The dataframe is modified in place and also returned.
    """
    # NOTE(review): replacing 0 stars by 0.0001 avoids division by zero but
    # yields extremely large ratios for unrated properties -- TODO confirm
    # that this is wanted.
    stars = df["prop_starrating"].replace(0, 0.0001)
    df["prop_starrating"] = stars

    price = df["price_usd"]

    # Parenthesised on purpose: `a & b & stars != 0` parses as
    # `(a & b & stars) != 0` because & binds tighter than !=.
    usable = price.notna() & stars.notna() & (stars != 0)

    # Float column from the start (no object-dtype None placeholder).
    ratio = (price / stars).where(usable)

    # Replace missing values with median
    df["price_quality"] = ratio.fillna(ratio.median())

    return df



In [10]:
def price_category(df, std_threshold=50):
    """
    Add per-property price statistics and a price corrected for the number of nights.

    New columns (all grouped by prop_id):
      avg_price_propid / std_avg_price_propid             mean and standard deviation of price_usd
      amount_hotels                                       number of rows of the property
      price_correction                                    price_usd, divided by srch_length_of_stay for
                                                          properties whose price std exceeds std_threshold
      avg_price_propid_after / std_avg_price_propid_after mean and std of price_correction

    df:            dataframe with prop_id, price_usd and srch_length_of_stay.
    std_threshold: properties with a price standard deviation above this value
                   get their prices divided by the length of stay (default 50).

    Returns a new dataframe; the input is left untouched.
    """
    out = df.copy()

    prop = out["prop_id"]
    price = out["price_usd"]

    # Group-wise transforms replace a Python loop that rebuilt a boolean mask
    # over the whole frame several times per property.
    price_by_prop = price.groupby(prop)
    avg_before = price_by_prop.transform("mean")
    std_before = price_by_prop.transform("std")
    rows_per_prop = prop.map(prop.value_counts()).astype("float64")

    # A property seen only once has std NaN, and NaN > threshold is False,
    # so such properties are never corrected.
    volatile = std_before > std_threshold

    corrected = price
    if volatile.any():
        # If std is high, correct for number of nights
        corrected = price.where(~volatile, price / out["srch_length_of_stay"])

    corrected_by_prop = corrected.groupby(prop)

    # Assigned in this order so the new columns appear in a fixed sequence.
    out["avg_price_propid"] = avg_before
    out["std_avg_price_propid"] = std_before
    out["amount_hotels"] = rows_per_prop
    out["avg_price_propid_after"] = corrected_by_prop.transform("mean")
    out["std_avg_price_propid_after"] = corrected_by_prop.transform("std")
    out["price_correction"] = corrected

    print("Made all extra columns")

    amount = prop[volatile].nunique()
    print("Amount of properties with std > {}: ".format(std_threshold), amount)

    return out

In [11]:
def process_remaining_cols(df):
    """
    Fill the missing values of the remaining feature columns.

    prop_brand_bool and random_bool get their column median; prop_location_score1,
    prop_location_score2 and promotion_flag get 0.

    The dataframe is modified in place and also returned.
    """
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # a column selection, which is not guaranteed to update the frame.

    # Replace missing values with median
    for column in ("prop_brand_bool", "random_bool"):
        df[column] = df[column].fillna(df[column].median())

    # Replace missing values with 0
    for column in ("prop_location_score1", "prop_location_score2", "promotion_flag"):
        df[column] = df[column].fillna(0)

    return df

In [12]:
def add_score(df):
    """
    Add an importance score based on click_bool and booking_bool.

    importance equals click_bool, except that every booked row
    (booking_bool == 1) gets an importance score of 5.

    The dataframe is modified in place and also returned.
    """
    booked = df["booking_bool"] == 1

    # Build the whole column in one assignment; the chained form
    # df["importance"][mask] = 5 is not guaranteed to write into the frame.
    df["importance"] = df["click_bool"].where(~booked, 5)

    return df

### Aparte functies aanroepen voor de kolommen die je erbij wilt

In [14]:
# Optional feature steps: uncomment a step (and its columns in the list below)
# to add those columns to the cleaned dataframe.
# traindf = competitors(traindf)
# print("cleaned competitors")

# traindf = visitor_history(traindf)
# print("cleaned visitor history")

traindf = price_quality(traindf)
print("cleaned price quality")

# traindf = price_category(traindf)
# print("cleaned price category")

traindf = process_remaining_cols(traindf)
print("did remaining columns")


# Columns that are only selected for the training data.
label_columns = ["position", "click_bool", "booking_bool"]

# Single source of truth for the selected columns; the order here is the
# column order of the saved dataframe.
train_columns = [
    "prop_id",
    "srch_id",
    "position",
    "price_quality",
    #"competitor_lower",
    #"competitor_available",
    #"visited_before",
    "click_bool",
    "booking_bool",
    "price_usd",
    "promotion_flag",
    "prop_brand_bool",
    "random_bool",
    "prop_location_score1",
    "prop_location_score2",
    "avg_price_propid",
    "std_avg_price_propid",
    "amount_hotels",
    "avg_price_propid_after",
    "std_avg_price_propid_after",
    "price_correction",
]

# Falsy flag -> training data, the same direction choose_data takes for
# anything that is not the test set.
if not testing_set:

    # Explicit copy: add_score writes a new column into this frame.
    df = traindf[train_columns].copy()

    df = add_score(df)
    print("added score")

    # Balance data to 50% importance score or 1 or 5 and 0
    df = downsampling(df)
    print("Downsampled data")

else:

    # df without click_bool, booking_bool and position
    test_columns = [column for column in train_columns if column not in label_columns]
    df = traindf[test_columns].copy()

cleaned price quality
did remaining columns


In [15]:
# Show the first rows and the summary statistics of the selected columns.
display(df.head(), df.describe())

Unnamed: 0,prop_id,srch_id,price_quality,price_usd,promotion_flag,prop_brand_bool,random_bool,prop_location_score1,prop_location_score2,avg_price_propid,std_avg_price_propid,amount_hotels,avg_price_propid_after,std_avg_price_propid_after,price_correction
0,3180,1,39.666667,119.0,0,1,0,2.94,0.0691,125.520757,11.71016,185.0,125.520757,11.71016,119.0
1,5543,1,39.333333,118.0,0,1,0,2.64,0.0843,117.532579,20.483788,252.0,117.532579,20.483788,118.0
2,14142,1,24.5,49.0,0,1,0,2.71,0.0556,51.8866,4.464016,150.0,51.8866,4.464016,49.0
3,22393,1,47.666667,143.0,0,1,0,2.4,0.0561,133.987619,27.965448,147.0,133.987619,27.965448,143.0
4,24194,1,26.333333,79.0,0,1,0,2.94,0.209,86.308224,14.807339,214.0,86.308224,14.807339,79.0


Unnamed: 0,prop_id,srch_id,price_quality,price_usd,promotion_flag,prop_brand_bool,random_bool,prop_location_score1,prop_location_score2,avg_price_propid,std_avg_price_propid,amount_hotels,avg_price_propid_after,std_avg_price_propid_after,price_correction
count,4959183.0,4959183.0,4959183.0,4959183.0,4959183.0,4959183.0,4959183.0,4959183.0,4959183.0,4959183.0,4959183.0,4959183.0,4959183.0,4959183.0,4959183.0
mean,70081.11,166646.0,52373.94,229.3575,0.2159033,0.6339091,0.2969189,2.879367,0.1018317,229.3575,863.0156,212.7442,172.069,488.3592,172.069
std,40613.63,96149.92,3133332.0,12463.69,0.4114476,0.4817348,0.4569005,1.533165,0.1509622,1690.832,12318.28,317.3549,859.2206,6221.937,6299.941
min,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.07,0.0,1.0,0.07,0.0,0.0
25%,35021.0,83311.5,29.76333,85.0,0.0,0.0,0.0,1.79,0.0014,95.15172,18.1783,49.0,89.824,18.17132,75.0
50%,69607.0,167095.0,40.0,122.26,0.0,1.0,0.0,2.77,0.0356,133.38,37.84726,111.0,123.87,37.66439,110.0
75%,105179.0,249966.0,56.75,185.0,0.0,1.0,1.0,4.04,0.1379,197.7817,80.76247,246.0,171.5808,76.04886,162.0
max,140821.0,332787.0,4541855000.0,9661340.0,1.0,1.0,1.0,6.98,1.0,239854.7,1122171.0,2376.0,119822.5,561107.7,4830670.0


In [16]:
# df["price_quality"] = df.price_quality.astype(np.float32)
print(df.dtypes)
print()

# Count the missing values per column once and reuse the result.
missing_per_column = df.isnull().sum()

# \x1b[31m ... \x1b[0m wraps the message in ANSI red; the stray \' that
# printed a dangling apostrophe after each message is removed.
if missing_per_column.sum() != 0:
    print("\x1b[31mMissing values: \x1b[0m")
    print(missing_per_column)
else:
    print("\x1b[31mNo missing values!! :D \x1b[0m")

prop_id                         int64
srch_id                         int64
price_quality                 float64
price_usd                     float64
promotion_flag                  int64
prop_brand_bool                 int64
random_bool                     int64
prop_location_score1          float64
prop_location_score2          float64
avg_price_propid              float64
std_avg_price_propid          float64
amount_hotels                 float64
avg_price_propid_after        float64
std_avg_price_propid_after    float64
price_correction              float64
dtype: object

[31mNo missing values!! :D '[0m


## Totale dataset zonder missende waardes

In [17]:
# Write the cleaned dataframe to `filename` under key "df" on every run.
# The guard that saved only when the file did not exist yet is disabled:
# if not os.path.exists(filename):
#     df.to_hdf(filename, key="df", format="table")
df.to_hdf(filename, key="df", format="table")

In [18]:
# Read the saved file back to check that the write worked.
reread = pd.read_hdf(filename)

In [19]:
# Summary statistics of the re-read data, for comparison with the describe() output of df above.
display(reread.describe())

Unnamed: 0,prop_id,srch_id,price_quality,price_usd,promotion_flag,prop_brand_bool,random_bool,prop_location_score1,prop_location_score2,avg_price_propid,std_avg_price_propid,amount_hotels,avg_price_propid_after,std_avg_price_propid_after,price_correction
count,4959183.0,4959183.0,4959183.0,4959183.0,4959183.0,4959183.0,4959183.0,4959183.0,4959183.0,4959183.0,4959183.0,4959183.0,4959183.0,4959183.0,4959183.0
mean,70081.11,166646.0,52373.94,229.3575,0.2159033,0.6339091,0.2969189,2.879367,0.1018317,229.3575,863.0156,212.7442,172.069,488.3592,172.069
std,40613.63,96149.92,3133332.0,12463.69,0.4114476,0.4817348,0.4569005,1.533165,0.1509622,1690.832,12318.28,317.3549,859.2206,6221.937,6299.941
min,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.07,0.0,1.0,0.07,0.0,0.0
25%,35021.0,83311.5,29.76333,85.0,0.0,0.0,0.0,1.79,0.0014,95.15172,18.1783,49.0,89.824,18.17132,75.0
50%,69607.0,167095.0,40.0,122.26,0.0,1.0,0.0,2.77,0.0356,133.38,37.84726,111.0,123.87,37.66439,110.0
75%,105179.0,249966.0,56.75,185.0,0.0,1.0,1.0,4.04,0.1379,197.7817,80.76247,246.0,171.5808,76.04886,162.0
max,140821.0,332787.0,4541855000.0,9661340.0,1.0,1.0,1.0,6.98,1.0,239854.7,1122171.0,2376.0,119822.5,561107.7,4830670.0
