In [1]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

import pandas as pd
import random
import csv
import os
import math
import numpy as np
from tqdm import tqdm

In [2]:
traindf = pd.read_pickle("./pickles/df_small_train.pkl")
testdf = pd.read_pickle("./pickles/df_small_test.pkl")

# run if you want to use the whole train/test dataset
# traindf = pd.read_csv("data/training_set_VU_DM.csv")

In [3]:
def price_quality_solution(df, csv_name):
    
    csv_name = "./results/" + csv_name
    
    # get list of hotels per search id
    results_per_search = df.groupby('srch_id').size()
    
    column_values = df["srch_id"].values
    ids =  pd.unique(column_values) 
    
    if not os.path.exists("./results"):
        os.mkdir("./results")
        
    with open(csv_name, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["srch_id", "prop_id"])

    total_ids = len(ids)    
        
    for i in tqdm(range(len(ids))):
        ideetje = ids[i]
        
        # get all rows that belong with this search
        df_temp = df[df["srch_id"] == ideetje]
        
        # get list of property ids
        prop_ids = df_temp["prop_id"].values
        
        # rank the property ids based on price/quality
        prop_ids_sorted = [x for _, x in sorted(zip(df_temp["pricequality"], prop_ids))]
                
        # write csv
        with open(csv_name, 'a', newline='') as file:
            writer = csv.writer(file)
            
            for prop_id in prop_ids_sorted:
                writer.writerow([ideetje, prop_id])

In [4]:

def random_solution(df, csv_name):
    
    csv_name = "./results/" + csv_name
    
    # get list of hotels per search id
    results_per_search = df.groupby('srch_id').size()
    
    column_values = df["srch_id"].values
    ids =  pd.unique(column_values) 
    
    if not os.path.exists("./results"):
        os.mkdir("./results")
        
    with open(csv_name, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["srch_id", "prop_id"])

    for i, ideetje in enumerate(ids):
        df_temp = df[df["srch_id"] == ideetje]
        prop_ids = df_temp["prop_id"].values
        
        # shuffle hotels
        random.shuffle(prop_ids)
    
        # write csv
        with open(csv_name, 'a', newline='') as file:
            writer = csv.writer(file)
            
            for prop_id in prop_ids:
                writer.writerow([ideetje, prop_id])

In [5]:
def price_quality(df):

    df['pricequality'] = df['price_usd'] / df['prop_starrating']
    
    return df

In [6]:
def competitors(df):
    """
    Make a new column in the dataframe (competitor_bool) for when there 
    exists a competitor, 1 is True, 0 is False.
    """
    competitor_bools = []
    compare_cols = ["comp1_rate", "comp2_rate", "comp3_rate", "comp4_rate", "comp5_rate",
                "comp6_rate", "comp7_rate", "comp8_rate"]

    for index, row in df.iterrows():
        comp_bool = 0

        # compare_cols_total exists of a column of data from comp_rate and 
        # comp_inv. So, a combination of competitor price and room-availability. 
        for competitor in compare_cols:
            if row[competitor] == 1:
                comp_bool = 1
        competitor_bools.append(comp_bool)
    
    # Append list of bools to new column
    df["competitor_bool"] = competitor_bools
    
    return df

In [7]:
def visitor_history(df):
    """
    Add column that tells us whether someone has visited a hotel before.
    Column name = total_visited; 1 is True, 0 is False.
    """
    
    # Get none-missing values
    hist_starrating = df.visitor_hist_starrating.isna()
    hist_adr = df.visitor_hist_adr_usd.isna()

    # Dit kan waarschijnlijk veel mooier en sneller maar het werkt...
    total_visited = []
    for index, row in df.iterrows():
        if hist_starrating[index] or hist_adr[index]:
            visited = 0
        else:
            visited = 1
        total_visited.append(visited)

    df["total_visited"] = total_visited
    
    return df

In [8]:
from tqdm import tqdm

def price_quality_solution(df, csv_name):
    
    csv_name = "./results/" + csv_name
    
    # get list of hotels per search id
    results_per_search = df.groupby('srch_id').size()
    
    column_values = df["srch_id"].values
    ids =  pd.unique(column_values) 
    
    if not os.path.exists("./results"):
        os.mkdir("./results")
        
    with open(csv_name, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["srch_id", "prop_id"])

    total_ids = len(ids)    
        
    for i in tqdm(range(len(ids))):
        ideetje = ids[i]
        
        # get all rows that belong with this search
        df_temp = df[df["srch_id"] == ideetje]
        
        # get list of property ids
        prop_ids = df_temp["prop_id"].values
        
        # rank the property ids based on price/quality
        prop_ids_sorted = [x for _, x in sorted(zip(df_temp["pricequality"], prop_ids))]
                
        # write csv
        with open(csv_name, 'a', newline='') as file:
            writer = csv.writer(file)
            
            for prop_id in prop_ids_sorted:
                writer.writerow([ideetje, prop_id])

In [9]:
traindf = price_quality(traindf)
traindf = competitors(traindf)
traindf = visitor_history(traindf)n

In [10]:
def change_time(df):
    
    if 'date_time' in df.columns:
        # Change time information to year and month columns
        df["date_time"] = pd.to_datetime(df["date_time"])
        df["year"] = df["date_time"].dt.year
        df["month"] = df["date_time"].dt.month

        #Delete column date-time
        df = df.drop('date_time', 1)
        
    return df

In [11]:
# Without price_quality because of floating error (TODO)
df = traindf[["prop_id", "srch_id", "position", "competitor_bool", "total_visited", "click_bool", "booking_bool"]]


In [12]:
df['price'] = traindf['price_usd'].fillna((traindf['price_usd'].median()))
# df['pricequality'] = df['pricequality'].fillna(0)
# df['pricequality'] = df['pricequality'].astype(int)
# df['pricequality'] = df['pricequality'].fillna(0).astype(np.int64)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [13]:
def add_score(df):
    """
    Add a score 
    """
    if df["booking_bool"] == 1 or df["click_bool"] == 1:
        score = 1
    else:
        score = 0
    return score

df['score'] = df.apply(add_score , axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


In [14]:
y = df["prop_id"]
X = df.copy()

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [16]:
regr = RandomForestRegressor(n_jobs=1)
rfr = regr.fit(X_train, y_train)
prediction = rfr.predict(X_test)
print(r2_score(prediction, y_test))



0.9999999999953617


In [17]:
prediction = pd.DataFrame(data=prediction, columns=["prop_id"])
searchid = pd.DataFrame(data=traindf["srch_id"], columns=["srch_id"])
X = X.reset_index()

In [18]:
# write csv
with open("random_forest.csv", 'a', newline='') as file:
    writer = csv.writer(file)

    for index in range(0,len(prediction)):
        writer.writerow([X.loc[index ,"srch_id"], int(prediction.loc[index ,"prop_id"])])