In [None]:
# TODO: price_quality currently ends up containing infs; replace them with NaN?

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split

import pandas as pd
import random
import csv
import os
import math
import numpy as np

from numba import jit
import matplotlib.pyplot as plt
import seaborn as sns

pd.options.mode.chained_assignment = None

from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif

In [None]:
# Read the dataframe to clean (a csv here; hdf would work as well).
# The test set is currently selected; swap the commented line to process the training set.
# NOTE: the variable is called `traindf` even when it holds the test set.
# traindf = pd.read_csv("./data/training_set_VU_DM.csv")
traindf = pd.read_csv("./data/test_set_VU_DM.csv")

# Output path for the cleaned frame. Per the original note, hdf is used because it can
# store the entire dataset, while pickle files can't.
# filename = "./data/traindf_clean.hdf"
filename = "./data/test_clean.hdf"

# When True, the importance column is not added: the branch that needs
# click_bool/booking_bool is skipped further down.
testing_set = True

In [None]:
def downsampling(df):
    """
    Balance the training set on the importance score.

    Rows are split into two groups: importance != 0 (clicked or booked) and
    importance == 0. Both groups are randomly sampled, without replacement,
    down to the size of the smaller group, so each makes up 50% of the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an ``importance`` column.

    Returns
    -------
    pandas.DataFrame
        New frame with the sampled zero-importance rows first, followed by the
        sampled non-zero rows. Original index labels are kept.
    """
    has_interaction = df["importance"] != 0
    interacted = df[has_interaction]
    ignored = df[~has_interaction]

    # The previous version drew len(group) rows from each group, which merely
    # shuffled the data; the larger group has to be cut to the smaller size.
    n_keep = min(len(interacted), len(ignored))

    # With no random_state given, pandas draws from NumPy's global random
    # state, so np.random.seed() makes the sample reproducible.
    ignored_sample = ignored.sample(n=n_keep, replace=False)
    interacted_sample = interacted.sample(n=n_keep, replace=False)

    return pd.concat([ignored_sample, interacted_sample], axis=0)

In [None]:
def competitors(df):
    """
    Add two competitor indicator columns to the DataFrame (modified in place).

    * ``competitor_lower``     -- 1 if any of comp1_rate .. comp8_rate equals 1,
      otherwise 0.
    * ``competitor_available`` -- 1 if any of comp1_inv .. comp8_inv equals 1,
      otherwise 0.

    Missing values in the comp* columns count as "not equal to 1".

    NOTE(review): the original comments read ``comp*_rate == 1`` as "price is
    lower" and ``comp*_inv == 1`` as "competitor and Expedia are available";
    confirm the sign convention against the dataset description.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns comp1_rate..comp8_rate and
        comp1_inv..comp8_inv.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the two new int64 columns.
    """
    rate_cols = ["comp%d_rate" % i for i in range(1, 9)]
    inv_cols = ["comp%d_inv" % i for i in range(1, 9)]

    # One vectorized "any column equals 1" per flag replaces sixteen chained
    # assignments (df[col][mask] = 1), which pandas does not guarantee to write
    # back to df and which are no-ops under Copy-on-Write.
    df["competitor_lower"] = df[rate_cols].eq(1).any(axis=1).astype("int64")
    df["competitor_available"] = df[inv_cols].eq(1).any(axis=1).astype("int64")

    return df

In [None]:
def visitor_history(df):
    """
    Add a ``visited_before`` column flagging visitors with purchase history.

    ``visited_before`` is 1 when ``visitor_hist_starrating`` or
    ``visitor_hist_adr_usd`` is filled in for the row, and 0 otherwise.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``visitor_hist_starrating`` and
        ``visitor_hist_adr_usd``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the new int64 column.
    """
    has_history = df["visitor_hist_starrating"].notna() | df["visitor_hist_adr_usd"].notna()

    # Assign the whole column at once instead of a chained df[col][mask] = 1,
    # which is not guaranteed to write back to df.
    df["visited_before"] = has_history.astype("int64")

    return df

In [None]:
def price_quality(df):
    """
    Add a ``price_quality`` column: price_usd divided by prop_starrating.

    Rows where the ratio cannot be computed (missing price, missing star
    rating, or a star rating of 0) receive the median of the computable
    ratios. A star rating of 0 is treated as missing instead of being replaced
    by a tiny number, so the column holds neither infs nor artificially huge
    ratios, and ``prop_starrating`` itself is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``price_usd`` and ``prop_starrating``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the new float64 column.
    """
    # Turn zero ratings into NaN so the division yields NaN rather than inf.
    # The earlier mask `a & b & stars != 0` was parsed as `(a & b & stars) != 0`
    # because & binds tighter than !=, so it never filtered zero ratings.
    stars = df["prop_starrating"].where(df["prop_starrating"] != 0)

    # NaN in either operand propagates, so no separate notna() mask is needed;
    # float64 keeps the column numeric (it used to start out as object/None).
    ratio = (df["price_usd"] / stars).astype("float64")

    # Replace missing ratios with the median of the valid ones
    df["price_quality"] = ratio.fillna(ratio.median())

    return df



In [None]:
def price_category(df):
    """
    Add a per-night price column and a price-band column.

    * ``price_correction`` -- price_usd divided by srch_length_of_stay;
      missing or non-finite results are replaced by the column median.
    * ``price_usd``        -- missing values are replaced by the column median.
    * ``PriceBand``        -- ``price_correction`` cut into 7 quantile bins
      labelled 1..7. The result of ``pd.qcut`` with labels is a categorical
      column, not a plain numeric one.

    According to the original author's note, an earlier quantile cut showed
    these categories:
    [(6.0889999999999995, 69.0] < (69.0, 90.0] < (90.0, 110.0]
    < (110.0, 136.0] < (136.0, 170.077] < (170.077, 239.0] < (239.0, 554655.0]]

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``price_usd`` and ``srch_length_of_stay``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the columns described above.

    Raises
    ------
    ValueError
        Propagated from ``pd.qcut`` when the quantile edges are not unique.
    """
    # Correct for the number of nights; a zero-night stay would give +/-inf,
    # which is treated as missing so it cannot distort the median or the bins.
    per_night = df["price_usd"] / df["srch_length_of_stay"]
    per_night = per_night.replace([np.inf, -np.inf], np.nan)

    # Replace missing values. Assigning the result back is reliable, whereas
    # df[col].fillna(..., inplace=True) may act on a temporary copy.
    df["price_correction"] = per_night.fillna(per_night.median())
    df["price_usd"] = df["price_usd"].fillna(df["price_usd"].median())

    # Make a new column of price categories (labels 1..7)
    df["PriceBand"] = pd.qcut(df["price_correction"], 7, labels=np.arange(1, 8))

    return df

In [None]:
def process_remaining_cols(df):
    """
    Fill the missing values of the remaining feature columns.

    * ``prop_brand_bool`` and ``random_bool`` are filled with their median.
    * ``prop_location_score1``, ``prop_location_score2`` and
      ``promotion_flag`` are filled with 0 (not with the column mean).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the five columns listed above.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the columns filled.
    """
    # Results are assigned back to df: df[col].fillna(..., inplace=True) may
    # operate on a temporary copy and leave df unchanged.

    # Boolean flags: replace missing values with the median
    for col in ("prop_brand_bool", "random_bool"):
        df[col] = df[col].fillna(df[col].median())

    # Location scores and promotion flag: replace missing values with 0
    for col in ("prop_location_score1", "prop_location_score2", "promotion_flag"):
        df[col] = df[col].fillna(0)

    return df

In [None]:
def add_score(df):
    """
    Add an ``importance`` score based on click_bool and booking_bool.

    The score equals ``click_bool`` (so 1 for a clicked hotel, 0 otherwise),
    except for rows where ``booking_bool`` is 1, which get a score of 5.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``click_bool`` and ``booking_bool``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the new ``importance`` column.
    """
    booked = df["booking_bool"] == 1

    # Build the column in one step; the former chained assignment
    # df["importance"][mask] = 5 is not guaranteed to write back to df.
    df["importance"] = df["click_bool"].mask(booked, 5)

    return df

In [None]:
def prop_id_score(df):
    """
    Add ``prop_id_price_mean``: the mean of price_quality per prop_id.

    Every row receives the average ``price_quality`` of all rows that share
    its ``prop_id``. The frame is modified in place and returned.
    """

    ratio_by_property = df["price_quality"].groupby(df["prop_id"])
    df["prop_id_price_mean"] = ratio_by_property.transform("mean")

    return df

### Call the separate functions for the columns you want to add

In [None]:
# Run the feature-engineering steps in order. prop_id_score reads the
# price_quality column, so it has to come after price_quality.
traindf = competitors(traindf)
print("cleaned competitors")

traindf = visitor_history(traindf)
print("cleaned visitor history")

traindf = price_quality(traindf)
print("cleaned price quality")

traindf = price_category(traindf)
print("cleaned price category")

traindf = process_remaining_cols(traindf)
print("did remaining columns")

traindf = prop_id_score(traindf)
print("added prop_id_score")

# Columns kept in the cleaned output, in output order. A single list is used
# so the training and test selections cannot drift apart.
OUTPUT_COLUMNS = [
    "prop_id",
    "srch_id",
    "position",
    "price_quality",
    "competitor_lower",
    "competitor_available",
    "visited_before",
    "click_bool",
    "booking_bool",
    "PriceBand",
    "price_usd",
    "promotion_flag",
    "prop_brand_bool",
    "random_bool",
    "prop_location_score1",
    "prop_location_score2",
    "prop_id_price_mean",
]

# Columns that are only selected for the training set
TRAIN_ONLY_COLUMNS = {"position", "click_bool", "booking_bool"}

if not testing_set:

    # .copy() gives add_score an independent frame to write the importance
    # column into, rather than a slice of traindf.
    df = traindf[OUTPUT_COLUMNS].copy()

    df = add_score(df)
    print("added score")

    # Balance data to 50% importance score or 1 or 5 and 0
    df = downsampling(df)
    print("Downsampled data")

else:

    # df without click_bool, booking_bool and position
    df = traindf[[col for col in OUTPUT_COLUMNS if col not in TRAIN_ONLY_COLUMNS]].copy()

In [None]:
# Show the first rows followed by the summary statistics of the cleaned frame
display(df.head(), df.describe())

In [None]:
# (disabled) cast price_quality to float32:
# df["price_quality"] = df.price_quality.astype(np.float32)
print(df.dtypes)
print()

# Count the missing values per column once and reuse the result below
missing_per_column = df.isnull().sum()

# The messages used to end in a stray apostrophe (from \') before the colour
# reset code; it has been removed.
if missing_per_column.sum() != 0:
    print("\x1b[31mMissing values: \x1b[0m")
    print(missing_per_column)
else:
    print("\x1b[31mNo missing values!! :D \x1b[0m")

## Complete dataset without missing values

In [None]:
# Save the cleaned dataframe to `filename` in HDF table format under key "df".
# The "only if the file does not exist yet" guard is commented out, so the
# write is performed on every run.
# if not os.path.exists(filename):
#     df.to_hdf(filename, key="df", format="table")
df.to_hdf(filename, key="df", format="table")

In [None]:
# Sanity check: read the file that was just written back from disk
reread = pd.read_hdf(filename)

In [None]:
# Summary statistics of the frame read back from disk
display(reread.describe())