In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split

import pandas as pd
import random
import csv
import os
import math
import numpy as np
from tqdm import tqdm

from numba import jit
import matplotlib.pyplot as plt
import seaborn as sns

pd.options.mode.chained_assignment = None

from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif

  import pandas.util.testing as tm


# Kies testing_set is False of True!!

In [2]:
# HIER!!!!!!
testing_set = False

In [3]:
def choose_data(testing_set):
    
    if testing_set is True:
        #total = pd.read_csv("./data/training_set_VU_DM.csv")
        traindf = pd.read_hdf("./data/corrected_price_testset.hdf")
        filename = "./data/test_clean.hdf"
        
    else:
        #total = pd.read_csv("./data/test_set_VU_DM.csv")
        traindf = pd.read_hdf("./data/corrected_price.hdf")
        filename = "./data/traindf_clean.hdf"
        
    return traindf, filename
        
traindf, filename = choose_data(testing_set)

In [4]:
def downsampling(df):
    """
    Balance classes in trainingset, based on click_bool (not booking_bool)
    """
    # Get 50% of data with importance of 5 or 1
    clicks = df[df.importance != 0].index
    randoms = np.random.choice(clicks, len(df.loc[df.importance != 0]) , replace=False)
    click_sample = df.loc[randoms]

    
    # Other 50% of the data
    not_click = df[df.importance == 0].index
    random_indices = np.random.choice(not_click, len(df.loc[df.importance == 0]), replace=False)
    not_click_sample = df.loc[random_indices]

    df_new = pd.concat([not_click_sample, click_sample], axis=0)
    
    return df_new

In [5]:
def competitors(df):
    """
    Make a new column in the dataframe (competitor_bool) for when there 
    exists a competitor and there are available rooms.
    1 is True, 0 is False.
    """

    # we say that there is no competitor with a lower price
    df["competitor_lower"] = 0 #competitor_bools
    
    # comp1rate = 1 if price is lower
    df["competitor_lower"][df["comp1_rate"] == 1] = 1
    df["competitor_lower"][df["comp2_rate"] == 1] = 1
    df["competitor_lower"][df["comp3_rate"] == 1] = 1
    df["competitor_lower"][df["comp4_rate"] == 1] = 1
    df["competitor_lower"][df["comp5_rate"] == 1] = 1
    df["competitor_lower"][df["comp6_rate"] == 1] = 1
    df["competitor_lower"][df["comp7_rate"] == 1] = 1
    df["competitor_lower"][df["comp8_rate"] == 1] = 1
    
    # we say at first there is no competitor hotel available
    df["competitor_available"] = 0
    
    # availability bool = 1 if there if the competitor and expedia are available
    df["competitor_available"][df["comp1_inv"] == 1] = 1
    df["competitor_available"][df["comp2_inv"] == 1] = 1
    df["competitor_available"][df["comp3_inv"] == 1] = 1
    df["competitor_available"][df["comp4_inv"] == 1] = 1
    df["competitor_available"][df["comp5_inv"] == 1] = 1
    df["competitor_available"][df["comp6_inv"] == 1] = 1
    df["competitor_available"][df["comp7_inv"] == 1] = 1
    df["competitor_available"][df["comp8_inv"] == 1] = 1

    return df

In [6]:
def visitor_history(df):
    """
    Add column that tells us whether someone has visited a hotel before.
    Column name = total_visited; 1 is True, 0 is False.
    """
    
    # most visitors haven't visited a hotel yet
    df["visited_before"] = 0
    
    # where there is a history field filled in, visited_before is turned into 21
    df["visited_before"][df["visitor_hist_starrating"].notna() | df["visitor_hist_adr_usd"].notna()] = 1
    
    return df

In [7]:
def price_quality(df):
    """
    Add a column of ratio price/quality to the DataFrame.
    """
    
    df["price_quality"] = None
    
    df["prop_starrating"].replace(0, 0.0001,inplace=True) # TODO
    
    df["price_quality"][df["price_usd"].notna() & df["prop_starrating"].notna() & df["prop_starrating"] != 0] = df["price_usd"] / df["prop_starrating"]

    # Replace missing values with median
    df["price_quality"].fillna((df["price_quality"].median()), inplace=True)
    
    return df 



In [8]:
def price_category(df):
    """
    Add a column of categories of price_usd and a column
    that corrected price for number of nights.
    Preprocessing of quantile cut showed that categories are:
    [(6.0889999999999995, 69.0] < (69.0, 90.0] < (90.0, 110.0] 
    < (110.0, 136.0] < (136.0, 170.077] < (170.077, 239.0] < (239.0, 554655.0]]
    
    """
    
    columnames = list(df.columns)
    columnames.extend(["avg_price_propid", "std_avg_price_propid", "amount_hotels", 
                       "avg_price_propid_after", "std_avg_price_propid_after"])
    
    df = df.reindex(columns=columnames)
    
    # copy prices (for now, at the end we will just update the price i suppose)
    df["price_correction"] = df["price_usd"]
    
    print("Made all extra columns")
    display(df[["avg_price_propid", "std_avg_price_propid", "amount_hotels", 
                       "avg_price_propid_after", "std_avg_price_propid_after", "price_correction"]])
    
    amount = 0
    
    propids = list(df.prop_id.unique())
    
    for prop_id in tqdm(propids, desc="Processing propids:"): 
        # calculate average and standard deviation
        std = df["price_usd"][df["prop_id"] == prop_id].std()
        avg = df["price_usd"][df["prop_id"] == prop_id].mean() 
        
        # count how many times this hotel appears in the dataframe
        df["amount_hotels"][df["prop_id"] == prop_id] = len(df[df["prop_id"] == prop_id])
        
        # put average and standard deviation in dataframe
        df["avg_price_propid"][df["prop_id"] == prop_id] = avg
        df["std_avg_price_propid"][df["prop_id"] == prop_id] = std
        
        # If std is high, correct for number of nights
        if std > 50:
            amount += 1
            df["price_correction"][df["prop_id"] == prop_id] = df["price_usd"][df["prop_id"] == prop_id] / df["srch_length_of_stay"][df["prop_id"] == prop_id]
        
        # for now separate columns so we can compare
        df["avg_price_propid_after"][df["prop_id"] == prop_id] = df["price_correction"][df["prop_id"] == prop_id].mean()
        df["std_avg_price_propid_after"][df["prop_id"] == prop_id] = df["price_correction"][df["prop_id"] == prop_id].std()
            
    print("Amount of properties with std > 50: ", amount)
    
    # Ik kreeg hier een error sorry
#     print("Correcting ", len(df[df["std_avg_price_propid"] > 50]), " property prices")
    
    return df

In [9]:
def process_remaining_cols(df):
    """
    Add some remaining (and interesting columns) to the dataframe.
    """
    
    # Replace missing values with median
    df["prop_brand_bool"].fillna((df["prop_brand_bool"].median()), inplace=True)
    df["random_bool"].fillna((df["random_bool"].median()), inplace=True)
    
    # Boolians
#     df["prop_location_score1"].fillna((df["prop_location_score1"].mean()), inplace=True)
    df["prop_location_score1"].fillna(-1, inplace=True)
    df["prop_location_score2"].fillna(-1, inplace=True)
        
    # Replace missing values with mean
#     df["promotion_flag"].fillna((df["promotion_flag"].mean()), inplace=True)
    df["promotion_flag"].fillna(-1, inplace=True)
    
    return df

In [10]:
def add_score(df):
    """
    Add an importance score based on click_bool and booking_bool
    """
    
    # every hotel that is clicked on gets an importance score of 1
    df["importance"] = df["click_bool"]
    
    # every hotel that is booked gets an importance score of 5 
    df["importance"][df["booking_bool"] == 1] = 5
        
    return df

In [11]:
def price_rank(df):
    """
    Add the rank for every prop_id within each srch_id
    """
    df["price_rank"] = df.groupby("srch_id")["price_usd"].rank()
    
    return df

In [12]:
def locationscore_rank(df):
    """
    Add the rank for every location score2 within each srch_id
    """
    df["locationscore2_rank"] = df.groupby("srch_id")["prop_location_score2"].rank()
    df["locationscore1_rank"] = df.groupby("srch_id")["prop_location_score1"].rank()
    
    return df


### Aparte functies aanroepen voor de kolommen die je erbij wilt

In [13]:
# traindf = competitors(traindf)
# print("cleaned competitors")

# traindf = visitor_history(traindf)
# print("cleaned visitor history")

traindf = price_quality(traindf)
print("cleaned price quality")

traindf = price_rank(traindf)
print("added a rank of price per search_id")

traindf = process_remaining_cols(traindf)
print("did remaining columns")

traindf = locationscore_rank(traindf)
print("added a rank of location score search_id")


# Add relevant columns    
if testing_set is False:
    
    df = traindf[["prop_id", 
                  "srch_id", 
                  "position", 
                  "price_quality", 
                  #"competitor_lower", 
                  #"competitor_available", 
                  #"visited_before", 
                  "click_bool", 
                  "booking_bool",  
                  "price_usd",
                  "promotion_flag", 
                  #"prop_brand_bool", 
                  #"random_bool",
                  "prop_location_score1", 
                  "prop_location_score2",
                  "avg_price_propid", 
                  "std_avg_price_propid", 
                  "amount_hotels", 
                  "avg_price_propid_after", 
                  #"std_avg_price_propid_after",
                  "price_rank",
                  "price_correction",
                  "locationscore2_rank",
                  "locationscore1_rank"]]
    
    df = add_score(df)
    print("added score")
    
    # Balance data to 50% importance score or 1 or 5 and 0 
    df = downsampling(df)
    print("Downsampled data")
    
else:
    
    # df without click_bool, booking_bool and position
    df = traindf[["prop_id", 
                  "srch_id", 
                  "price_quality", 
                  #"competitor_lower", 
                  #"competitor_available", 
                  #"visited_before", 
                  "price_usd",
                  "promotion_flag", 
                  #"prop_brand_bool", 
                  #"random_bool",
                  "prop_location_score1", 
                  "prop_location_score2",
                  "avg_price_propid", 
                  "std_avg_price_propid", 
                  "amount_hotels", 
                  "avg_price_propid_after", 
                  #"std_avg_price_propid_after",
                  "price_rank",
                  "price_correction",
                  "locationscore2_rank",
                  "locationscore1_rank"
                 ]]

cleaned price quality
added a rank of price per search_id
did remaining columns
added a rank of location score search_id
added score
Downsampled data


In [14]:
display(df.head())
display(df.describe())

Unnamed: 0,prop_id,srch_id,position,price_quality,click_bool,booking_bool,price_usd,promotion_flag,prop_location_score1,prop_location_score2,avg_price_propid,std_avg_price_propid,amount_hotels,avg_price_propid_after,price_rank,price_correction,locationscore2_rank,importance
115902,20122,7746,8,140.2475,0,0,560.99,0,5.05,0.4681,459.442183,168.18268,197.0,188.468906,14.0,280.495,11.0,0
132046,69427,8832,14,31.695,0,0,126.78,0,5.09,0.0651,176.590803,86.029934,137.0,176.590803,9.0,126.78,20.0,0
3220154,14002,216249,32,18.485,0,0,36.97,0,3.09,-1.0,69.104621,29.906671,145.0,69.104621,4.0,36.97,14.5,0
1174683,70233,78503,10,67.9925,0,0,271.97,0,1.95,-1.0,251.536364,176.271926,11.0,63.508639,16.0,19.426429,13.0,0
4382078,84306,294025,25,54.666667,0,0,164.0,0,3.89,0.0818,131.629308,42.636522,318.0,131.629308,18.0,164.0,20.0,0


Unnamed: 0,prop_id,srch_id,position,price_quality,click_bool,booking_bool,price_usd,promotion_flag,prop_location_score1,prop_location_score2,avg_price_propid,std_avg_price_propid,amount_hotels,avg_price_propid_after,price_rank,price_correction,locationscore2_rank,importance
count,4958347.0,4958347.0,4958347.0,4958347.0,4958347.0,4958347.0,4958347.0,4958347.0,4958347.0,4958347.0,4958347.0,4958347.0,4958347.0,4958347.0,4958347.0,4958347.0,4958347.0,4958347.0
mean,70079.18,166366.6,16.85624,57518.21,0.04474858,0.02791051,254.2096,0.2156198,2.872589,-0.1181882,254.2096,1303.301,213.0141,189.4903,14.58187,189.4903,14.58187,0.1563906
std,40609.92,96112.23,10.42566,7122742.0,0.2067514,0.1647165,16001.24,0.4112517,1.531011,0.4889088,2895.059,15683.1,314.9667,2625.44,9.00852,8544.575,8.622716,0.8307484
min,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,3.375,0.0,1.0,3.375,1.0,0.0,1.0,0.0
25%,35010.0,82936.0,8.0,29.75667,0.0,0.0,85.0,0.0,1.79,0.0014,95.08475,18.22662,49.0,90.27157,7.0,75.23,7.5,0.0
50%,69638.0,166507.0,16.0,40.0,0.0,0.0,122.0,0.0,2.77,0.0355,133.0982,37.08703,111.0,124.0667,14.0,110.0,13.5,0.0
75%,105168.0,249724.0,26.0,56.66667,0.0,0.0,184.96,0.0,4.04,0.1373,200.0695,78.49519,250.0,172.765,22.0,162.0,21.0,0.0
max,140821.0,332785.0,40.0,9443491000.0,1.0,1.0,19726330.0,1.0,6.98,1.0,1173038.0,3102435.0,2357.0,1172941.0,38.0,9381309.0,38.0,5.0


In [15]:
# df["price_quality"] = df.price_quality.astype(np.float32)
print(df.dtypes)
print()

if df.isnull().sum().sum() != 0:    
    print("\x1b[31mMissing values: \'\x1b[0m")
    print(df.isnull().sum())
else:
    print("\x1b[31mNo missing values!! :D \'\x1b[0m")

prop_id                     int64
srch_id                     int64
position                    int64
price_quality             float64
click_bool                  int64
booking_bool                int64
price_usd                 float64
promotion_flag              int64
prop_location_score1      float64
prop_location_score2      float64
avg_price_propid          float64
std_avg_price_propid      float64
amount_hotels             float64
avg_price_propid_after    float64
price_rank                float64
price_correction          float64
locationscore2_rank       float64
importance                  int64
dtype: object

[31mNo missing values!! :D '[0m


## Totale dataset zonder missende waardes

In [16]:
# save the dataframe if it does not exist yet
# if not os.path.exists(filename):
#     df.to_hdf(filename, key="df", format="table")
df.to_hdf(filename, key="df", format="table")

In [17]:
# test if it worked
reread = pd.read_hdf(filename)

In [18]:
display(reread.describe())

Unnamed: 0,prop_id,srch_id,position,price_quality,click_bool,booking_bool,price_usd,promotion_flag,prop_location_score1,prop_location_score2,avg_price_propid,std_avg_price_propid,amount_hotels,avg_price_propid_after,price_rank,price_correction,locationscore2_rank,importance
count,4958347.0,4958347.0,4958347.0,4958347.0,4958347.0,4958347.0,4958347.0,4958347.0,4958347.0,4958347.0,4958347.0,4958347.0,4958347.0,4958347.0,4958347.0,4958347.0,4958347.0,4958347.0
mean,70079.18,166366.6,16.85624,57518.21,0.04474858,0.02791051,254.2096,0.2156198,2.872589,-0.1181882,254.2096,1303.301,213.0141,189.4903,14.58187,189.4903,14.58187,0.1563906
std,40609.92,96112.23,10.42566,7122742.0,0.2067514,0.1647165,16001.24,0.4112517,1.531011,0.4889088,2895.059,15683.1,314.9667,2625.44,9.00852,8544.575,8.622716,0.8307484
min,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,3.375,0.0,1.0,3.375,1.0,0.0,1.0,0.0
25%,35010.0,82936.0,8.0,29.75667,0.0,0.0,85.0,0.0,1.79,0.0014,95.08475,18.22662,49.0,90.27157,7.0,75.23,7.5,0.0
50%,69638.0,166507.0,16.0,40.0,0.0,0.0,122.0,0.0,2.77,0.0355,133.0982,37.08703,111.0,124.0667,14.0,110.0,13.5,0.0
75%,105168.0,249724.0,26.0,56.66667,0.0,0.0,184.96,0.0,4.04,0.1373,200.0695,78.49519,250.0,172.765,22.0,162.0,21.0,0.0
max,140821.0,332785.0,40.0,9443491000.0,1.0,1.0,19726330.0,1.0,6.98,1.0,1173038.0,3102435.0,2357.0,1172941.0,38.0,9381309.0,38.0,5.0
