In [None]:
import pandas as pd
import xgboost as xgb
from sklearn.datasets import load_svmlight_file
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV 
import numpy as np

from sklearn.feature_selection import SelectFromModel
import matplotlib.pylab as plt
import seaborn as sns
sns.set()

import lightgbm

In [None]:
# Load the cleaned training frame and discard its estimated_importance column in one step
traindf = pd.read_hdf("../data/traindf_clean.hdf").drop(columns=["estimated_importance"])

In [None]:
def downsampling(df, random_state=None):
    """
    Balance the classes of the training set on the ``importance`` label.

    Every row with importance 5 and every row with importance 1 is kept, and
    importance-0 rows are drawn without replacement until there are as many of
    them as importance-1 rows (or all of them, if fewer are available).
    Rows with any other importance value are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``importance`` column.
    random_state : int, numpy.random.RandomState or None, optional
        Seed / generator for the sampling. ``None`` (default) draws from
        numpy's global random state.

    Returns
    -------
    pandas.DataFrame
        Importance-5 rows, then importance-1 rows, then the sampled
        importance-0 rows; each class is in random order.
    """
    # Share one generator across the three draws so a seed gives a reproducible result
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    rows_5 = df[df.importance == 5]
    rows_1 = df[df.importance == 1]
    rows_0 = df[df.importance == 0]

    # Keep all rows of the two positive classes, only shuffling them
    sample_5 = rows_5.sample(frac=1, random_state=rng)
    sample_1 = rows_1.sample(frac=1, random_state=rng)
    print(len(sample_1))

    # Match the importance-0 class to the importance-1 class; cap at what is
    # available instead of failing when there are too few importance-0 rows
    n_zero = min(len(rows_1), len(rows_0))
    sample_0 = rows_0.sample(n=n_zero, random_state=rng)
    print(len(sample_0))

    df_new = pd.concat([sample_5, sample_1, sample_0], axis=0)

    # Report class shares relative to the sampled frame (not the original one)
    total = len(df_new)
    zero_share = len(sample_0) / total if total else 0.0
    nonzero_share = (total - len(sample_0)) / total if total else 0.0
    print("Percentage of not click impressions: ", zero_share)
    print("Percentage of click impression: ", nonzero_share)

    return df_new

# Replace the full training frame with its class-balanced subsample.
# NOTE: this rebinds traindf, so re-running the cell operates on the already
# downsampled frame; reload traindf first to start from the full data again.
traindf = downsampling(traindf)

In [None]:
# Show which columns are available
print(traindf.columns.tolist())

# Bring the rows of every search back together (srch_id ascending)
traindf = traindf.sort_values("srch_id")
# traindf = traindf.drop(["price_correction"], axis=1)

# traindf = traindf[["estimated_importance", "estimated_position", "price_usd_norm_by_prop_id", "prop_starrating_norm_by_srch_id", 
#                    "position", "importance", "click_bool", "booking_bool", "srch_id" ,"price_review", "prop_id"]]



In [None]:
# Target is the importance label; the feature frame starts out as the full frame
# (label columns are removed after the split)
y_train = traindf["importance"]
x_train = traindf
positions = y_train.to_frame().set_index(traindf.srch_id)

# Hold out the last 20% of the rows. shuffle=False keeps the srch_id ordering,
# so only a search that straddles the cut can end up in both parts.
x_train, x_test, y_train, y_test = train_test_split(
    x_train, y_train, test_size=0.2, random_state=42, shuffle=False, stratify=None
)

# Keep a copy that still contains the label columns, for scoring later on
x_traincopy = x_train.copy()

x_train = x_train.drop(["position", "importance", "click_bool", "booking_bool"], axis=1)
x_test = x_test.drop(["position", "importance", "click_bool", "booking_bool"], axis=1)

x_train.shape, y_train.shape, x_test.shape



In [None]:
# Hyper-parameters passed to the XGBoost ranker
params = {
    'objective': 'rank:ndcg',
    'learning_rate': 0.12,
    'max_depth': 6,
    'n_estimators': 1000,
}

# Rows per srch_id (one entry per query group) for the train and the test part
query_lengths = x_train.groupby('srch_id').size().to_numpy()
query_lengths_y = x_test.groupby('srch_id').size().to_numpy()

In [None]:
# dtrain = xgb.DMatrix(x_train, label=y_train)
# dtrain.set_group(query_lengths)

# dtest = xgb.DMatrix(x_test, label=y_test)
# dtest.set_group(query_lengths_y)

# params = {"max_depth":100, 'objective': 'rank:pairwise', 'eval_metric': 'ndcg@5-'}

# evalist= [(dtest, 'eval'), (dtrain, 'train')]
# num_round = 50
# bst = xgb.train(params, dtrain, num_round, evalist, maximize=True)

In [None]:
# Make fake column of positions to test score in ndcg scoring function
# x_traincopy["predicted_rank"] = x_train.groupby(['srch_id']).cumcount()+1
# x_train["position_temp"] = x_train.groupby(['srch_id']).cumcount()+1

In [None]:

# predictions_sorted = []
# prop_ids_sorted = []
# x_train["y_label"] = y_train


# print(x_train.columns)
# # x_train = x_train.drop(["position", "pred"], axis=1)

# # Sort predictions for each group SEPARATELY
# for srchid, group in x_train.groupby('srch_id'):

#     dtrain_temp = xgb.DMatrix(group.drop(["position_temp", "y_label"], axis=1), label=y_train)
#     # Predictions for one search_id
#     pred = bst.predict(dtrain_temp)
    
#     # Sort all columns (position and prop_id) from this group based on predictions
#     predictions = [x for _,x in sorted(zip(pred, group.position_temp), reverse=True)] # TODO: shouldn't this be reverse=False? Why does this work?
#     predictions_sorted.append(predictions)
    
#     prop_id = [x for _,x in sorted(zip(pred, group.prop_id), reverse=True)]
#     prop_ids_sorted.append(prop_id)
    
# # Flatten lists
# predictions_sorted = [item for sublist in predictions_sorted for item in sublist]
# prop_ids_sorted = [item for sublist in prop_ids_sorted for item in sublist]

# x_traincopy["prop_id"] = prop_ids_sorted
# x_traincopy["predicted_rank"] = predictions_sorted
# final_df = x_traincopy

# print(round(dataframe_ndcg_score(x_traincopy),6))

# fig = xgb.plot_importance(booster=model)
# # fig.figure.savefig("../plots/competitors.pdf", bbox_inches='tight')
# plt.show()

In [None]:
# Train the ranker on the feature columns only (the id columns are not features).
# The group sizes are passed by keyword: recent XGBoost releases declare every
# argument after y in XGBRanker.fit as keyword-only, while older releases
# accept `group=` as well.
model = xgb.sklearn.XGBRanker(**params)
model.fit(
    x_train.drop(["prop_id", "srch_id"], axis=1),
    y_train,
    group=query_lengths,
    verbose=True,
)

In [None]:
# Make fake column of positions to test score in ndcg scoring function
x_train["position_temp"] = x_train.groupby(['srch_id']).cumcount()+1

In [None]:
# Rank the rows of every training search by predicted score, best first.
# predict() receives no group information, so rows are scored independently:
# score the whole frame once instead of once per srch_id.
feature_frame = x_train.drop(columns=["position_temp", "prop_id", "srch_id"])
scores = np.asarray(model.predict(feature_frame))

search_ids = x_train["srch_id"].to_numpy()
slots = x_train["position_temp"].to_numpy()
prop_ids = x_train["prop_id"].to_numpy()

# A single ordering drives both output lists, so they always describe the same
# rows: srch_id ascending (the order groupby iterates in), then score
# descending (higher score = ranked earlier), ties broken by position_temp
# descending. Sorting the two lists separately broke ties by different keys
# and could pair a position with the wrong prop_id.
# np.lexsort treats the LAST key as the primary one.
ranked_rows = np.lexsort((-slots, -scores, search_ids))

# Flat lists, one entry per row, grouped per search in ranked order
predictions_sorted = slots[ranked_rows].tolist()
prop_ids_sorted = prop_ids[ranked_rows].tolist()


In [None]:
# Attach the ranked output to the copy of the training rows that still holds
# the label columns.
# NOTE(review): both lists are in ranked order within each search, whereas the
# remaining columns (position, click_bool, booking_bool, ...) keep their
# original row order; confirm this row-wise mix is what the scoring expects.
x_traincopy["prop_id"] = prop_ids_sorted
x_traincopy["predicted_rank"] = predictions_sorted
# Alias, not a copy: final_df and x_traincopy are the same object
final_df = x_traincopy
final_df.head()

In [None]:
# Disabled experiment: overwrite predicted_rank with position_temp.
# NOTE: final_df is x_traincopy, which was copied before position_temp was
# added to x_train, so re-enabling the line below as-is would raise a KeyError.
# final_df["predicted_rank"] = final_df["position_temp"]
final_df.head()

In [None]:
# final_df.to_pickle("../pickles/XGBoost_test9mei.pkl")

In [None]:
def indiv_search_ndcg_score(true_rank, predicted_rank, booked, clicked):
    """
    Score the ranking of a single search.

    Every row i that was booked (gain 5) or, failing that, clicked (gain 1)
    adds ``gain / log2(i + 2)`` to the reference score. The same amount is
    added to the achieved score only when ``true_rank[i] == predicted_rank[i]``.
    The result is achieved / reference.

    The discount is ``log2(i + 2)`` for the 0-based row index i in every
    branch, so a correctly placed row always earns its full reference value
    and each later row is discounted more than the one before it.

    Parameters
    ----------
    true_rank, predicted_rank, booked, clicked : sequence
        Index-aligned, one entry per row of the search; ``len(true_rank)``
        rows are read from each.

    Returns
    -------
    float
        Value in [0, 1]; 0 when the search has no booked or clicked row.
    """
    # reference score: every relevant row counted at its own index
    idcg = 0.0

    # achieved score: only the relevant rows whose rank was predicted exactly
    dcg = 0.0

    # iterate over every result of the search
    for i in range(len(true_rank)):
        if booked[i] == 1:
            # booked hotels are very important (5)
            gain = 5
        elif clicked[i] == 1:
            # clicked hotels are slightly important (1)
            gain = 1
        else:
            continue

        # i is 0-based, so the first row gets log2(2) == 1 (no discount)
        discounted_gain = gain / np.log2(i + 2)
        idcg += discounted_gain

        # if predicted well, increment score
        if true_rank[i] == predicted_rank[i]:
            dcg += discounted_gain

    # no relevant rows: nothing to normalise by (dcg is 0 here as well)
    if idcg == 0:
        return dcg
    return dcg / idcg

# score function for every search (thus iterate over groupby object)
def dataframe_ndcg_score(solution):
    """
    Average the per-search score over all searches in ``solution``.

    ``solution`` is grouped by ``srch_id``; for each group the ``position``
    (true rank), ``predicted_rank``, ``booking_bool`` and ``click_bool``
    columns are handed to ``indiv_search_ndcg_score`` as arrays. Returns the
    mean of the per-search scores, ignoring NaN values.
    """
    # column order matches the positional arguments of indiv_search_ndcg_score
    score_columns = ('position', 'predicted_rank', 'booking_bool', 'click_bool')

    per_search_scores = [
        indiv_search_ndcg_score(*(np.asarray(rows[column]) for column in score_columns))
        for _, rows in solution.groupby('srch_id')
    ]

    return np.nanmean(per_search_scores)

In [None]:
# Mean per-search score, rounded to 6 decimals. x_traincopy is the training
# part of the split, so this is a score on data the model was fitted on.
print(round(dataframe_ndcg_score(x_traincopy),6))

In [None]:
# Plot the fitted model's feature importances.
# Note: despite its name, `fig` is used as an Axes below (the figure is reached
# via `fig.figure`); the name is kept because the save cell relies on it.
fig = xgb.plot_importance(booster=model)
# fig.figure.savefig("../plots/competitors.pdf", bbox_inches='tight')
plt.show()

In [None]:
# Save the feature-importance plot as a PDF with a tight bounding box.
# NOTE(review): the file name spells "imortance"; left unchanged here in case
# something else expects this exact path.
fig.figure.savefig("../plots/final_model_feature_imortance.pdf", bbox_inches='tight')