In [2]:
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

import pandas as pd
import random
import csv
import os
import math
import numpy as np
from tqdm import tqdm
from numba import jit

In [15]:
traindf = pd.read_pickle("./pickles/df_small_train.pkl")
traindf = pd.read_pickle("./pickles/df_verysmall_train_test_yk.pkl")
testdf = pd.read_pickle("./pickles/df_small_test.pkl")

# run if you want to use the whole train/test dataset
# traindf = pd.read_csv("data/training_set_VU_DM.csv")
# traindf = traindf.sample(1000)
# testdf = traindf.sample(200)

In [4]:
def competitors(df):
    """
    Make a new column in the dataframe (competitor_bool) for when there 
    exists a competitor, 1 is True, 0 is False.
    """
    competitor_bools = []
    compare_cols = ["comp1_rate", "comp2_rate", "comp3_rate", "comp4_rate", "comp5_rate",
                "comp6_rate", "comp7_rate", "comp8_rate"]

    for index, row in df.iterrows():
        comp_bool = 0

        # compare_cols_total exists of a column of data from comp_rate and 
        # comp_inv. So, a combination of competitor price and room-availability. 
        for competitor in compare_cols:
            if row[competitor] == 1:
                comp_bool = 1
        competitor_bools.append(comp_bool)
    
    # Append list of bools to new column
    df["competitor_bool"] = competitor_bools
    
    return df

In [5]:
def visitor_history(df):
    """
    Add column that tells us whether someone has visited a hotel before.
    Column name = total_visited; 1 is True, 0 is False.
    """
    
    # Get none-missing values
    hist_starrating = df.visitor_hist_starrating.isna()
    hist_adr = df.visitor_hist_adr_usd.isna()

    # Dit kan waarschijnlijk veel mooier en sneller maar het werkt...
    total_visited = []
    for index, row in df.iterrows():
        if hist_starrating[index] or hist_adr[index]:
            visited = 0
        else:
            visited = 1
        total_visited.append(visited)

    df["total_visited"] = total_visited
    
    return df

In [6]:
def add_score(df):
    """
    Add a score 
    """
    if df["booking_bool"] == 1 or df["click_bool"] == 1:
        score = 1
    else:
        score = 0
    return score


def preprocessing(traindf):
    traindf = competitors(traindf)
    traindf = visitor_history(traindf)
    
    # Without price_quality because of floating error (TODO)
    df = traindf[["prop_id", "srch_id", "position", "competitor_bool", "total_visited", "click_bool", "booking_bool"]]
    
    df['score'] = df.apply(add_score , axis=1)
    
    return df

In [16]:
# May take a while
df = preprocessing(traindf)
df.head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,prop_id,srch_id,position,competitor_bool,total_visited,click_bool,booking_bool,score
1249064,4500,83595,13,0,0,0,0,0
1249065,14603,83595,27,0,0,0,0,0
1249066,19960,83595,19,0,0,0,0,0
1249067,24606,83595,28,0,0,0,0,0
1249068,27481,83595,10,0,0,0,0,0
1249069,30331,83595,30,0,0,0,0,0
1249070,38791,83595,20,0,0,0,0,0
1249071,38879,83595,33,0,0,0,0,0
1249072,41220,83595,24,0,0,0,0,0
1249073,48318,83595,9,0,0,0,0,0


In [18]:
y = df["prop_id"]
X = df.copy()
# X = df.drop("prop_id", axis=1).copy()

# TODO: test en trainingset maken
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# X_train = df.sample(1000)
# y_train = df.prop_id
# X_test = df.sample(200)
# y_test = df.prop_id

In [19]:
# prediction = rfr.predict(X_test)
# prediction_proba = rfr.predict_proba(X_test)
# print(r2_score(prediction, y_test))
# propid_pred_groups = X_train.groupby('srch_id').agg({'position':lambda x: list(x)})
# propid_pred_groups

# Fit model on whole dataset
rfc = RandomForestClassifier(n_jobs=1)
model = rfc.fit(X_train, y_train)
predictions = model.predict(X_test)



In [20]:
X_test["predictions"] = predictions
X_test.head(10)

pred_groups = X_test.groupby('srch_id').agg({'position':lambda x: list(x)})
pred_groups

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0_level_0,position
srch_id,Unnamed: 1_level_1
65,"[15, 14, 29, 30, 37, 3, 36, 8]"
2050,"[24, 9, 19]"
2625,"[4, 26, 16, 1, 7]"
3927,"[18, 19]"
7707,"[13, 36, 28, 27, 25, 24, 20, 7]"
...,...
322659,"[27, 12, 13, 9, 26, 7, 8]"
327160,"[18, 6, 4]"
329266,"[16, 14, 21, 20, 1, 13]"
330548,"[13, 24, 14, 20]"


In [83]:
def dcg_from_ranking(y_true, ranking):
    """Discounted cumulative gain (DCG) at rank k
    Parameters
    ----------
    y_true : array-like, shape = [n_samples]
        Ground truth (true relevance labels).
    ranking : array-like, shape = [k]
        Document indices, i.e.,
            ranking[0] is the index of top-ranked document,
            ranking[1] is the index of second-ranked document,
            ...
    k : int
        Rank.
    Returns
    -------
    DCG @k : float
    """
    y_true = np.asarray(y_true)
    ranking = np.asarray(ranking)
    rel = y_true[ranking]
    gains = 2 ** rel - 1
    discounts = np.log2(np.arange(len(ranking)) + 2)
    return np.sum(gains / discounts)


def ndcg_from_ranking(y_true, ranking):
    """Normalized discounted cumulative gain (NDCG) at rank k
    Parameters
    ----------
    y_true : array-like, shape = [n_samples]
        Ground truth (true relevance labels).
    ranking : array-like, shape = [k]
        Document indices, i.e.,
            ranking[0] is the index of top-ranked document,
            ranking[1] is the index of second-ranked document,
            ...
    k : int
        Rank.
    Returns
    -------
    NDCG @k : float
    """
    k = len(ranking)
    best_ranking = np.argsort(y_true)[::-1]
    best = dcg_from_ranking(y_true, best_ranking[:k])
    return dcg_from_ranking(y_true, ranking) / best

# def dcg_score(y_true, y_score, order):
#     """Discounted cumulative gain (DCG) at rank K.
#     """
    
#     order = np.argsort(order)[::-1]
#     if len(y_true) <= 20:
#         k = len(y_true)
#     else:
#         k = 20
#     y_true = np.take(y_true, order[:k])

#     gain = 2 ** y_true - 1

#     discounts = np.log2(np.arange(len(y_true)) + 2)
#     return np.sum(gain / discounts)

def dcg_score(y_true, y_score, k=10, gains="exponential"):
    """Discounted cumulative gain (DCG) at rank k
    Parameters
    ----------
    y_true : array-like, shape = [n_samples]
        Ground truth (true relevance labels).
    y_score : array-like, shape = [n_samples]
        Predicted scores.
    k : int
        Rank.
    gains : str
        Whether gains should be "exponential" (default) or "linear".
    Returns
    -------
    DCG @k : float
    """
    order = np.argsort(y_score)[::-1]
    y_true = np.take(y_true, order[:k])

    if gains == "exponential":
        gains = 2 ** y_true - 1
    elif gains == "linear":
        gains = y_true
    else:
        raise ValueError("Invalid gains option.")

    # highest rank is 1 so +2 instead of +1
    discounts = np.log2(np.arange(len(y_true)) + 2)
    
    return np.sum(gains / discounts)


def ndcg_score(y_true, y_score, k=10, gains="linear"):
    """Normalized discounted cumulative gain (NDCG) at rank k
    Parameters
    ----------
    y_true : array-like, shape = [n_samples]
        Ground truth (true relevance labels).
    y_score : array-like, shape = [n_samples]
        Predicted scores.
    k : int
        Rank.
    gains : str
        Whether gains should be "exponential" (default) or "linear".
    Returns
    -------
    NDCG @k : float
    """
    best = dcg_score(y_true, y_true, k, gains)
    actual = dcg_score(y_true, y_score, k, gains)
    return actual / best


In [26]:
rank_pred_groups   = X.groupby('srch_id').agg({'position':lambda x: list(x)})

# Get unique srchids to group by
unique_srchid = X.srch_id.unique()
# rfc = RandomForestClassifier(n_jobs=1)

In [113]:
# Oh jit kan niet met pandas Series werken, moet omgeschreven worden naar np arrays
# @jit(nopython=True)

def calc_score_predictions(df):    
    scores = []
    k = 20
    
    # Iterate over every group of search_ids
    for index in range(1, len(unique_srchid)):

        # Filter slices on unique srch_id
        slicer = df.loc[df['srch_id'] == unique_srchid[index]]
    
        order_true = list(slicer.position)
        y_true = list(slicer.prop_id)
        y_score = list(slicer.predictions)

        # Compare predictions with actual position
        # scores = ndcg_score(y_true, y_score)
        
        order = np.argsort(order_true)[::-1]
        y_true_temp = np.take(y_true, order[:k])
        print(order, "list1")
        gain = 2 ** y_true_temp - 1
        discounts = np.log2(np.arange(len(y_true_temp)) + 2)
        print(np.sum(discounts), gain)
        
        order = np.argsort(order_true)[::-1]
        y_true = np.take(y_score, order[:k])
        print(order, "list2")
        gain = 2 ** y_true - 1
        discounts = np.log2(np.arange(len(y_true)) + 2)
        print(np.sum(discounts), gain)

        return np.mean(scores)


scores  = calc_score_predictions(X_test)

print("score: ", round(scores * 100, 2))

[2 8 3 0 5 1 6 7 4] list1
21.791061114716953 [-1 -1 -1 -1 -1 -1 -1 -1 -1]
[2 8 3 0 5 1 6 7 4] list2
21.791061114716953 [-1 -1 -1 -1 -1 -1 -1 -1 -1]
score:  nan


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [12]:
propid_pred_groups = X.groupby('srch_id').agg({'prop_id':lambda x: list(x)})
rank_pred_groups   = X.groupby('srch_id').agg({'position':lambda x: list(x)})

logger = lambda x: math.log(x + 1, 2)
X['log_rank'] = X.groupby(by = 'srch_id')['position'].rank(ascending = False).map(logger)

# sum( (2 ** points - 1) / log2(rank_in_results + 1) )

In [13]:
propid_pred_groups.head(10)

# scores = []
# for search_id in range(1, len(propid_pred_groups)):
#     y_true = list(propid_pred_groups[:search_id]["prediction_prop_id"])[0]
#     rank = list(rank_pred_groups[:search_id]["position"])[0]
#     print(y_true, rank)
#     score = ndcg_from_ranking(y_true, rank)
#     scores.append(score)

# print(scores)

Unnamed: 0_level_0,prop_id
srch_id,Unnamed: 1_level_1
124,"[3709, 11719, 12082, 13297, 15174, 17345, 2146..."
218,"[19663, 32400, 73307, 108073, 132936]"
439,"[9095, 9826, 12860, 16708, 18660, 20110, 23639..."
597,"[1988, 10087, 95874, 97465, 102644, 135952]"
870,"[12720, 21234, 23504, 31806, 35718, 42391, 424..."
1237,"[11396, 14133, 22765, 23121, 37255, 46859, 543..."
1350,"[4375, 8469, 17709, 20751, 35653, 43705, 44794..."
1384,"[1172, 3054, 11243, 13109, 22967, 31977, 36891..."
1545,"[16371, 16699, 18906, 19070, 19804, 27448, 290..."
1635,"[6370, 6413, 7778, 10585, 10841, 11755, 14882,..."


In [14]:
rank_pred_groups.head(10)

Unnamed: 0_level_0,position
srch_id,Unnamed: 1_level_1
124,"[33, 35, 36, 2, 10, 30, 3, 29, 12, 26, 15, 20,..."
218,"[1, 6, 4, 3, 2]"
439,"[26, 24, 25, 21, 27, 3, 12, 14, 6, 13, 1, 28, ..."
597,"[2, 6, 7, 1, 4, 3]"
870,"[24, 9, 14, 18, 27, 7, 29, 22, 15, 19, 8, 1, 2..."
1237,"[4, 13, 26, 14, 21, 15, 10, 16, 1, 24, 19, 27,..."
1350,"[13, 9, 21, 19, 14, 6, 26, 7, 12, 30, 22, 24, ..."
1384,"[29, 6, 9, 10, 24, 8, 13, 19, 7, 18, 4, 25, 1,..."
1545,"[33, 9, 6, 13, 30, 31, 4, 8, 24, 20, 1, 27, 15..."
1635,"[21, 12, 31, 17, 18, 5, 14, 2, 30, 9, 28, 20, ..."
