In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
from importlib import reload
import seaborn as sns

import xgboost as xgb
from xgboost import XGBRanker

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

# Metrics
from sklearn.metrics import precision_score, recall_score, accuracy_score, average_precision_score

In [34]:
df_train = pd.read_hdf("../data/traindf_clean.hdf")
df_test = pd.read_hdf("../data/test_clean.hdf")

df_train = df_train.drop(columns=["prop_id_score", "importance", "click_bool", "booking_bool"])

# Toch niet met priceband want categorie
df_test = df_test.drop("PriceBand", axis=1)
df_train = df_train.drop("PriceBand", axis=1)

raw_train_data = df_train.copy()
raw_test_data = df_test.copy()

In [35]:
def split_data(df):
    X = df.drop('position', axis=1).copy()
    y = df['position'].copy()
    return train_test_split(X, y, test_size=0.2, random_state=1)
X_train, X_test, y_train, y_test = split_data(df_train)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((3966677, 11), (991670, 11), (3966677,), (991670,))

In [36]:
def train_xgb_model(model, X_train, y_train, X_test, y_test, group, eval_group):
    model = model.fit(X_train, y_train, group, eval_set=[(X_train, y_train), (X_test, y_test)], eval_group=eval_group)
    return model

In [37]:
group = X_train.groupby('srch_id').size().values

eval_group = [X_train.groupby('srch_id').size().values, X_test.groupby('srch_id').size().values]

xgb_model = train_xgb_model(XGBRanker(objective='rank:pairwise', eval_metric='ndcg@50', n_estimators=150, 
                                      learning_rate=0.1, subsample=0.9, random_state=1), 
                            X_train, y_train, X_test, y_test, group=group, eval_group=eval_group)


In [38]:
y_train_pred = xgb_model.predict(X_train)
res_train = list(xgb_model.evals_result['eval_0'].values())[0]
ndcg_train = round(sum(res_train)/len(res_train), 2)
print('*'*20 + 'TRAIN' + '*'*20)
print('train avg ndcg@50: %.2f' % ndcg_train)
print('*'*20 + 'TEST' + '*'*20)
y_test_pred = xgb_model.predict(X_test)
res_test = list(xgb_model.evals_result['eval_1'].values())[0]
ndcg_test = round(sum(res_test)/len(res_test), 2)
print('test avg ndcg@50: %.2f' % ndcg_test)

********************TRAIN********************
train avg ndcg@50: 0.84
********************TEST********************
test avg ndcg@50: 0.87


In [39]:
import pickle
with open('y_test_predictions6mei.pkl','wb') as f:
    pickle.dump(y_test_pred, f)

In [40]:
final_df = df_test.reset_index().copy() 
final_df['rank'] = xgb_model.predict(df_test)

In [43]:
final_df[['srch_id', 'prop_id', 'rank']].head()

Unnamed: 0,srch_id,prop_id,rank
0,1,3180,0.612655
1,1,5543,0.424152
2,1,14142,0.535589
3,1,22393,0.439443
4,1,24194,0.120507


In [45]:
submission_df = final_df[['srch_id', 'prop_id', 'rank']].sort_values(by=['srch_id', 'rank'], ascending=[False, False])\
                .drop('rank', axis=1).set_index('srch_id')

In [46]:
submission_df.to_csv('submission.csv')

In [54]:
predictions_sorted = []

# Sort predictions for each group SEPERATELY
for srchid, group in final_df.groupby('srch_id'):

        
    predictions = [x for _,x in sorted(zip(group["rank"], group.prop_id))] # TODO: Reverse = False toch? Hoezo werkt dit..
    predictions_sorted.append(predictions)

# Flatten list
predictions_sorted = [item for sublist in predictions_sorted for item in sublist]

In [55]:
final_df["prop_ids"] = predictions_sorted
final_df.head()

Unnamed: 0,index,prop_id,srch_id,price_quality,competitor_lower,competitor_available,visited_before,promotion_flag,prop_brand_bool,random_bool,prop_location_score1,prop_location_score2,rank,prop_ids
0,0,3180,1,39.666667,0,0,0,0,1,0,2.94,0.0691,0.612655,99484
1,1,5543,1,39.333333,0,0,0,0,1,0,2.64,0.0843,0.424152,54937
2,2,14142,1,24.5,1,0,0,0,1,0,2.71,0.0556,0.535589,61934
3,3,22393,1,47.666667,0,0,0,0,1,0,2.4,0.0561,0.439443,50162
4,4,24194,1,26.333333,0,0,0,0,1,0,2.94,0.209,0.120507,28181


In [56]:
submissiondf = final_df[["srch_id", "prop_id"]]


Unnamed: 0,srch_id,prop_id
0,1,3180
1,1,5543
2,1,14142
3,1,22393
4,1,24194


In [57]:
submission_df.to_csv('submission7mei.csv')