In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split

import pandas as pd
import random
import csv
import os
import math
import numpy as np
from tqdm import tqdm

from numba import jit
import matplotlib.pyplot as plt
import seaborn as sns

pd.options.mode.chained_assignment = None

from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif

# Choose whether testing_set is False or True in the next cell!

In [2]:
# SET THIS FLAG HERE: True processes the test-set file, False the training-set file.
testing_set = False

In [3]:
# Load only the frame selected by testing_set; the other name (traindf or
# testdf) is not defined by this cell.
if testing_set:
    testdf = pd.read_hdf("./data/corrected_price_testset_rollback_and_avg.hdf")
else:
    traindf = pd.read_hdf("./data/corrected_price_rollback_and_avg.hdf")

In [4]:
def estimate_pos(traindf, testdf):
    """
    Add an ``estimated_position`` column to both frames.

    For every (srch_destination_id, prop_id) pair the mean ``position`` is
    taken over the rows of ``traindf`` where ``random_bool == 0``; the feature
    is the reciprocal of that mean. It is left-merged onto both frames, so
    pairs without any such row get NaN.

    Parameters:
        traindf: frame with srch_destination_id, prop_id, position and
            random_bool columns.
        testdf: frame with srch_destination_id and prop_id columns.

    Returns:
        (testdf, traindf) -- note the order -- as new merged frames; the
        inputs are not modified.
    """
    merge_keys = ["srch_destination_id", "prop_id"]

    # Only rows with random_bool == 0 contribute. The previous version built
    # this filter and then grouped the unfiltered frame, discarding it.
    non_random = traindf.loc[traindf["random_bool"] == 0]

    estimated = (
        non_random.groupby(merge_keys, as_index=False)["position"]
        .mean()
        .rename(columns={"position": "estimated_position"})
    )

    # reciprocal of the mean position, so larger values mean nearer the top
    estimated["estimated_position"] = 1 / estimated["estimated_position"]

    testdf = testdf.merge(estimated, how="left", on=merge_keys)
    traindf = traindf.merge(estimated, how="left", on=merge_keys)

    return testdf, traindf

# testdf, traindf = estimate_pos(traindf, testdf)

In [6]:
# Choose the output file for the processed frame. In test mode the test frame
# is also bound to the name `traindf`, which is the name the cells below use.
if testing_set is True:
    filename = "./data/test_clean.hdf"
    traindf = testdf
else:
    filename = "./data/traindf_clean.hdf"

In [7]:
def downsampling(df, random_state=None):
    """
    Resample the training set by its ``importance`` score (0, 1 or 5).

    - importance == 5 rows are oversampled to five times their number,
      with replacement;
    - importance == 1 rows are sampled with replacement to half of that
      amount (rounded up);
    - importance == 0 rows are sampled without replacement to the same
      half amount.

    Rows are selected by index label, so ``df`` should have a unique index.
    numpy raises ValueError when there are fewer importance == 0 rows than
    requested, or when a group that must be sampled from is empty.

    Parameters:
        df: frame with an ``importance`` column.
        random_state: optional seed for a dedicated ``np.random.RandomState``.
            With the default None the global numpy random state is used, as
            before, so results depend on ``np.random.seed``.

    Returns:
        New frame: non-click sample, click sample, booking sample, in that
        order.
    """

    # both the numpy module and a RandomState instance expose .choice
    sampler = np.random if random_state is None else np.random.RandomState(random_state)

    total = len(df)
    print(total)

    bookings = df[df.importance == 5].index
    amount_bookings = len(bookings) * 5
    booking_indices = sampler.choice(bookings, amount_bookings, replace=True)
    booking_sample = df.loc[booking_indices]

    print("Amount of bookings: ", len(bookings), "booking sample: ", len(booking_sample))

    # clicks and non-clicks each get half the size of the booking sample
    half_amount = math.ceil(amount_bookings / 2)

    clicks = df[df.importance == 1].index
    click_indices = sampler.choice(clicks, half_amount, replace=True)
    click_sample = df.loc[click_indices]

    print("Amount of clickes: ", len(clicks), "click sample: ", len(click_sample))

    # non-clicked, non-booked rows are drawn without replacement
    not_click = df[df.importance == 0].index
    not_click_indices = sampler.choice(not_click, half_amount, replace=False)
    not_click_sample = df.loc[not_click_indices]

    print(len(not_click_sample))

    df_new = pd.concat([not_click_sample, click_sample, booking_sample], axis=0)

    print("Total df now: ", len(df_new))

    return df_new


# test = add_score(traindf)
# test = downsampling(test)

In [8]:
def competitors(df):
    """
    Collapse the 24 competitor columns into one flag.

    Adds ``better_available_competitor`` (int64): 1 when, for at least one of
    the eight competitors, both ``compN_inv == 1`` and ``compN_rate == 1``;
    otherwise 0. Missing values compare unequal to 1 and so never set the flag.

    The flag column is added to ``df`` itself. The returned frame is a new
    frame without the ``compN_rate``, ``compN_inv`` and
    ``compN_rate_percent_diff`` columns.

    Raises:
        KeyError: if any of the 24 competitor columns is missing.
    """
    n_competitors = 8

    # NOTE(review): the published Expedia data description appears to define
    # compN_rate == 1 as "Expedia is cheaper" and compN_inv == 1 as "the
    # competitor has no availability", which is the opposite of what the
    # column name suggests. Logic left unchanged; confirm against the data
    # dictionary.
    flagged = np.zeros(len(df), dtype=bool)
    for number in range(1, n_competitors + 1):
        inv_column = "comp{}_inv".format(number)
        rate_column = "comp{}_rate".format(number)
        flagged |= ((df[inv_column] == 1) & (df[rate_column] == 1)).values

    # One direct column assignment instead of eight chained `df[col][mask] = 1`
    # writes, which do nothing when pandas Copy-on-Write is active.
    df["better_available_competitor"] = flagged.astype("int64")

    competitor_columns = [
        "comp{}_{}".format(number, suffix)
        for number in range(1, n_competitors + 1)
        for suffix in ("rate", "inv", "rate_percent_diff")
    ]

    return df.drop(columns=competitor_columns)

In [9]:
def visitor_history(df):
    """
    Add ``visited_before`` (int64): 1 when ``visitor_hist_starrating`` or
    ``visitor_hist_adr_usd`` is filled in for the row, otherwise 0.

    The column is added to ``df`` in place and the same frame is returned.
    """
    has_history = (
        df["visitor_hist_starrating"].notna() | df["visitor_hist_adr_usd"].notna()
    )

    # direct column assignment; the former chained `df[col][mask] = 1` write
    # does nothing when pandas Copy-on-Write is active
    df["visited_before"] = has_history.astype("int64")

    return df

In [10]:
def price_quality(df):
    """
    Add two price ratios to the DataFrame.

    - ``price_quality`` = price_correction / prop_starrating
    - ``price_review``  = price_correction / prop_review_score

    A ratio is NaN when the price or the denominator is missing, or when the
    denominator is 0. Also prints the number of rows whose
    ``prop_review_score`` is missing.

    The columns are added to ``df`` in place and the same frame is returned.
    """
    print(len(df[df["prop_review_score"].isna()]))

    price = df["price_correction"]
    stars = df["prop_starrating"]
    reviews = df["prop_review_score"]

    # The comparisons need their own parentheses: `a & b & s != 0` parses as
    # `(a & b & s) != 0`, which bitwise-ANDs an integer rating with a boolean
    # and so drops every even star rating.
    valid_stars = price.notna() & stars.notna() & (stars != 0)
    valid_reviews = price.notna() & reviews.notna() & (reviews != 0)

    # .where() keeps the ratio on valid rows and writes NaN elsewhere,
    # including the inf produced by dividing by 0
    df["price_quality"] = (price / stars).where(valid_stars)
    df["price_review"] = (price / reviews).where(valid_reviews)

    return df


In [11]:
def process_remaining_cols(df):
    """
    Fill missing values in the remaining columns.

    - ``prop_brand_bool`` and ``random_bool``: filled with the column median.
    - ``prop_location_score1``, ``prop_location_score2``, ``promotion_flag``
      and ``prop_starrating``: filled with the sentinel -1.

    The columns are replaced on ``df`` in place and the same frame is returned.
    """
    median_filled = ("prop_brand_bool", "random_bool")
    sentinel_filled = (
        "prop_location_score1",
        "prop_location_score2",
        "promotion_flag",
        "prop_starrating",
    )

    # Assign the filled column back: `df[col].fillna(..., inplace=True)` acts
    # on a temporary selection and does nothing when pandas Copy-on-Write is
    # active.
    for column in median_filled:
        df[column] = df[column].fillna(df[column].median())

    for column in sentinel_filled:
        df[column] = df[column].fillna(-1)

    return df

In [12]:
def add_score(df):
    """
    Add an ``importance`` score based on click_bool and booking_bool.

    The score equals ``click_bool`` (1 for a clicked row, 0 otherwise), except
    that every row with ``booking_bool == 1`` gets 5.

    The column is added to ``df`` in place and the same frame is returned.
    """
    # Keep click_bool where the row is not booked and write 5 where it is.
    # One direct assignment replaces the chained `df[col][mask] = 5` write,
    # which does nothing when pandas Copy-on-Write is active.
    df["importance"] = df["click_bool"].where(df["booking_bool"] != 1, 5)

    return df

In [13]:
def price_rank(df):
    """
    Rank ``price_correction`` within each ``srch_id`` and store the result as
    ``price_rank`` on ``df`` (modified in place and returned).
    """
    prices_per_search = df.groupby("srch_id")["price_correction"]
    df["price_rank"] = prices_per_search.rank()

    return df

In [14]:
def locationscore_rank(df):
    """
    Add three location features to ``df`` (modified in place and returned):

    - ``total_loc_score``: prop_location_score2 + prop_location_score1
    - ``locationscore2_rank``: rank of prop_location_score2 within its srch_id
    - ``locationscore1_rank``: rank of prop_location_score1 within its srch_id
    """
    searches = df.groupby("srch_id")
    rank_of_score2 = searches["prop_location_score2"].rank()
    rank_of_score1 = searches["prop_location_score1"].rank()

    # assign in the original order so the new columns keep the same positions
    df["total_loc_score"] = df["prop_location_score2"] + df["prop_location_score1"]
    df["locationscore2_rank"] = rank_of_score2
    df["locationscore1_rank"] = rank_of_score1

    return df

# NOTE(review): looks like a leftover scratch call. `test` is not used in the
# visible cells, and locationscore_rank() adds its columns to `traindf` in
# place before the pipeline cell computes them again.
test = locationscore_rank(traindf)

In [15]:
def starrating(df):
    """
    Rank ``prop_starrating`` within each ``srch_id`` and store the result as
    ``starrating_rank`` on ``df`` (modified in place and returned).

    A per-property mean star rating was once planned here but is disabled.
    """
    stars_per_search = df.groupby("srch_id")["prop_starrating"]
    df["starrating_rank"] = stars_per_search.rank()

    # disabled: df["mean_rating_propid"] = df.groupby("prop_id")["prop_starrating"].transform('mean')

    return df

In [16]:
def price_diff(df):
    """
    Add three price-difference columns to ``df`` (modified in place and
    returned). Each is ``price_correction`` minus a reference column:

    - ``price_diff_hist``:   minus prop_log_historical_price
    - ``diff_price_srchid``: minus avg_price_propid_after
    - ``diff_price_propid``: minus avg_price_srchid
    """
    # NOTE(review): the srchid/propid output names look swapped relative to
    # the reference columns they use -- confirm before relying on the names.
    # NOTE(review): the name prop_log_historical_price suggests a logarithm;
    # if so, price_diff_hist subtracts a log value from a raw price -- confirm.
    reference_for = (
        ("price_diff_hist", "prop_log_historical_price"),
        ("diff_price_srchid", "avg_price_propid_after"),
        ("diff_price_propid", "avg_price_srchid"),
    )

    for new_column, reference_column in reference_for:
        df[new_column] = df["price_correction"] - df[reference_column]

    return df


### Call the separate functions for the columns you want to add

In [17]:
# Feature pipeline: each step takes the frame produced by the previous one.
# competitors() returns a new frame without the comp* columns; the other steps
# add their columns to the frame they receive and return that same frame.
processed = competitors(traindf)
print("cleaned competitors")

processed = visitor_history(processed)
print("cleaned visitor history")

# price_quality() also prints the number of rows with a missing review score
processed = price_quality(processed)
print("cleaned price quality")

processed = price_rank(processed)
print("added a rank of price per search_id")

# Missing-value filling is currently disabled.
# traindf = process_remaining_cols(traindf)
# print("did remaining columns")

processed = locationscore_rank(processed)
print("added a rank of location score search_id")

# NOTE: starrating() only adds the rank; the per-property mean mentioned in
# the message below is commented out inside the function.
processed = starrating(processed)
print("added starrating and mean of prop_ids")

processed = price_diff(processed)
print("added history of price info")

print(processed.columns)

cleaned competitors
cleaned visitor history
7364
cleaned price quality
added a rank of price per search_id
added a rank of location score search_id
added starrating and mean of prop_ids
added history of price info
Index(['srch_id', 'date_time', 'site_id', 'visitor_location_country_id',
       'visitor_hist_starrating', 'visitor_hist_adr_usd', 'prop_country_id',
       'prop_id', 'prop_starrating', 'prop_review_score', 'prop_brand_bool',
       'prop_location_score1', 'prop_location_score2',
       'prop_log_historical_price', 'position', 'price_usd', 'promotion_flag',
       'srch_destination_id', 'srch_length_of_stay', 'srch_booking_window',
       'srch_adults_count', 'srch_children_count', 'srch_room_count',
       'srch_saturday_night_bool', 'srch_query_affinity_score',
       'orig_destination_distance', 'random_bool', 'click_bool',
       'gross_bookings_usd', 'booking_bool', 'avg_price_propid',
       'std_avg_price_propid', 'amount_hotels', 'avg_price_propid_after',
       'std

In [18]:
# Columns kept in the output, in output order.
relevant_columns = [
    # identifiers
    'srch_id', 'prop_id',
    # property attributes
    'prop_starrating', 'prop_review_score',
    'prop_location_score1', 'prop_location_score2',
    # price statistics that are not created by the functions in this notebook
    'std_avg_price_propid', 'std_avg_price_propid_after',
    'amount_hotels', 'avg_price_propid_after',
    'price_correction', 'avg_price_srchid',
    # features added by the functions in this notebook
    'better_available_competitor', 'visited_before',
    'price_quality', 'price_review',
    'price_rank',
    'total_loc_score', 'locationscore2_rank', 'locationscore1_rank',
    'starrating_rank',
    'price_diff_hist', 'diff_price_srchid', 'diff_price_propid',
]

if testing_set is False:
    # training data: also keep the label columns that add_score() reads
    relevant_columns.extend(["click_bool", "booking_bool", "position"])

    df = processed[relevant_columns]

    df = add_score(df)
    print("added score")

    # Resample by importance score: bookings (5) are oversampled five-fold,
    # clicks (1) and non-clicks (0) each get half that many rows.
    df = downsampling(df)
    print("Downsampled data")

else:
    # test data: no click_bool, booking_bool or position columns
    df = processed[relevant_columns]

added score
4958347
Amount of bookings:  138390 booking sample:  691950
Amount of clickes:  83489 click sample:  345975
345975
Total df now:  1383900
Downsampled data


In [19]:
# Preview the first rows and the summary statistics of the selected frame.
display(df.head())
display(df.describe())

Unnamed: 0,srch_id,prop_id,prop_starrating,prop_review_score,prop_location_score1,prop_location_score2,std_avg_price_propid,std_avg_price_propid_after,amount_hotels,avg_price_propid_after,...,locationscore2_rank,locationscore1_rank,starrating_rank,price_diff_hist,diff_price_srchid,diff_price_propid,click_bool,booking_bool,position,importance
4890640,328278,22095,3,4.5,1.1,,14.586387,14.586387,40.0,93.3075,...,,3.5,19.0,105.48,16.6925,28.24,0,0,9,0
2433421,163437,75052,4,0.0,4.2,,19.334744,18.423059,22.0,32.599038,...,,19.0,10.0,41.475,8.875962,-7.143636,0,0,7,0
4746108,318447,37140,5,4.5,4.91,0.0323,23403.429778,23403.429778,194.0,2048.416495,...,10.0,29.0,23.0,234.45,-1808.066495,125.807931,0,0,8,0
3755593,252267,100588,3,0.0,5.19,0.1571,20.668376,20.668376,14.0,117.631429,...,1.0,10.0,4.5,137.12,24.368571,-96.711538,0,0,9,0
4915866,329956,72013,4,4.5,1.61,,84.825027,84.825027,62.0,114.255806,...,,17.5,16.0,80.37,-29.055806,-10.1544,0,0,12,0


Unnamed: 0,srch_id,prop_id,prop_starrating,prop_review_score,prop_location_score1,prop_location_score2,std_avg_price_propid,std_avg_price_propid_after,amount_hotels,avg_price_propid_after,...,locationscore2_rank,locationscore1_rank,starrating_rank,price_diff_hist,diff_price_srchid,diff_price_propid,click_bool,booking_bool,position,importance
count,1383900.0,1383900.0,1383900.0,1382351.0,1383900.0,1182625.0,1383900.0,1383900.0,1383900.0,1383900.0,...,1182625.0,1383900.0,1383900.0,1383900.0,1383900.0,1383900.0,1383900.0,1383900.0,1383900.0,1383900.0
mean,166441.3,70106.76,3.288982,3.864549,2.880221,0.1699898,1369.423,695.4482,214.3328,189.7459,...,13.36397,13.81741,14.39709,192.6358,7.22629,-1.994727,0.75,0.5,10.88999,2.75
std,96087.63,40690.07,0.9928849,0.9326225,1.507768,0.1821452,14833.01,7991.582,319.0915,2591.383,...,9.022381,8.875497,8.178356,8789.603,8793.539,2998.704,0.4330129,0.5,9.826543,2.277609
min,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.500305,...,1.0,1.0,1.0,-6.203333,-654525.0,-1004519.0,0.0,0.0,1.0,0.0
25%,83109.0,34832.0,3.0,3.5,1.79,0.0353,19.45177,18.92382,47.0,82.97179,...,5.0,6.0,7.0,64.18,-34.29923,-41.6875,0.75,0.0,3.0,0.75
50%,166642.0,69594.0,3.0,4.0,2.77,0.1067,38.78596,36.44367,110.0,117.0,...,12.0,12.5,14.0,97.73,-9.043258,-13.4375,1.0,0.5,8.0,3.0
75%,249745.0,105212.0,4.0,4.5,4.03,0.2404,79.58297,68.20672,253.0,165.3273,...,20.5,21.0,21.0,144.34,7.3025,15.17647,1.0,1.0,17.0,5.0
max,332785.0,140821.0,5.0,5.0,6.98,1.0,1801694.0,1213058.0,2357.0,654632.9,...,37.0,37.5,37.0,3272890.0,2805152.0,899711.1,1.0,1.0,40.0,5.0


In [20]:
# df["price_quality"] = df.price_quality.astype(np.float32)
# Report column dtypes, row count and the number of missing values per column.
print(df.dtypes)
print()

print(len(df))

nulls_per_column = df.isnull().sum()

if nulls_per_column.sum() == 0:
    print("\x1b[31mNo missing values!! :D \'\x1b[0m")
else:
    print("\x1b[31mMissing values: \'\x1b[0m")
    print(nulls_per_column)

srch_id                          int64
prop_id                          int64
prop_starrating                  int64
prop_review_score              float64
prop_location_score1           float64
prop_location_score2           float64
std_avg_price_propid           float64
std_avg_price_propid_after     float64
amount_hotels                  float64
avg_price_propid_after         float64
price_correction               float64
avg_price_srchid               float64
better_available_competitor      int64
visited_before                   int64
price_quality                  float64
price_review                   float64
price_rank                     float64
total_loc_score                float64
locationscore2_rank            float64
locationscore1_rank            float64
starrating_rank                float64
price_diff_hist                float64
diff_price_srchid              float64
diff_price_propid              float64
click_bool                       int64
booking_bool             

## Complete dataset without missing values

In [21]:
# Save the frame under key "df" in `filename` on every run; the
# only-if-the-file-is-missing guard below is commented out.
# if not os.path.exists(filename):
#     df.to_hdf(filename, key="df", format="table")
df.to_hdf(filename, key="df")

In [22]:
# Check that the saved file can be read back. The result is only loaded here;
# it is not compared with `df`.
reread = pd.read_hdf(filename)