In [1]:
import pandas as pd
import xgboost as xgb
from sklearn.datasets import load_svmlight_file
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV 

In [2]:
traindf = pd.read_hdf("../data/traindf_clean.hdf")
testdf = pd.read_hdf("../data/test_clean.hdf")

In [3]:
# Toch niet met priceband want categorie
testdf = testdf.drop("PriceBand", axis=1)
traindf = traindf.drop("PriceBand", axis=1)

In [4]:
print(list(traindf.columns))
print(list(testdf.columns))

traindf = traindf.sort_values(by=['srch_id'])

['prop_id', 'srch_id', 'position', 'price_quality', 'competitor_lower', 'competitor_available', 'visited_before', 'click_bool', 'booking_bool', 'price_usd', 'promotion_flag', 'prop_brand_bool', 'random_bool', 'prop_location_score1', 'prop_location_score2', 'prop_id_price_mean', 'importance']
['prop_id', 'srch_id', 'price_quality', 'competitor_lower', 'competitor_available', 'visited_before', 'promotion_flag', 'prop_brand_bool', 'random_bool', 'prop_location_score1', 'prop_location_score2']


In [5]:
# split label and other variables
x_train, y_train = traindf, traindf["importance"]
positions = pd.DataFrame(y_train).set_index(traindf.srch_id)

# 1 procent van totale data
x_train, x_test, y_train, y_test = train_test_split(x_train, y_train, test_size=0.2, random_state=42, shuffle=False, stratify = None)
x_traincopy = x_train.copy()
x_train = x_train.drop(columns=["position", "importance", "competitor_available",
                               "competitor_lower", "visited_before"])

# x_test, y_test = testdf.drop(columns=["position", "click_bool", "booking_bool"]), testdf["position"]
x_test = testdf

x_train.shape, y_train.shape, x_test.shape

((355006, 12), (355006,), (4959183, 11))

In [6]:
params = {'objective': 'rank:pairwise', 'learning_rate': 0.1,
          'gamma': 1.0, 'min_child_weight': 0.1,
          'max_depth': 6,  'n_estimators': 500}

# groups equal to length of queries
query_lengths = x_train.groupby('srch_id').size().values
    

In [7]:
model = xgb.sklearn.XGBRanker(**params)
model.fit(x_train, y_train, query_lengths, verbose=True)

XGBRanker(base_score=0.5, booster=None, colsample_bylevel=1, colsample_bynode=1,
          colsample_bytree=1, gamma=1.0, gpu_id=-1, importance_type='gain',
          interaction_constraints=None, learning_rate=0.1, max_delta_step=0,
          max_depth=6, min_child_weight=0.1, missing=nan,
          monotone_constraints=None, n_estimators=500, n_jobs=0,
          num_parallel_tree=1, objective='rank:pairwise', random_state=0,
          reg_alpha=0, reg_lambda=1, scale_pos_weight=None, subsample=1,
          tree_method=None, validate_parameters=False, verbosity=None)

In [None]:
# Make fake column of positions to test score in ndcg scoring function
x_train["position_temp"] = x_train.groupby(['srch_id']).cumcount()+1

In [46]:
# x_train["position"] = y_train
predictions_sorted = []
prop_ids_sorted = []

# x_train = x_train.drop(["position", "pred"], axis=1)

# Sort predictions for each group SEPERATELY
for srchid, group in x_train.groupby('srch_id'):
    
    # Predictions for one search_id
    pred = model.predict(group.drop("position_temp", axis=1))
    
    # Sort all columns (position and prop_id) from this group based on predictions
    predictions = [x for _,x in sorted(zip(pred, group.position_temp), reverse=True)] # TODO: Reverse = False toch? Hoezo werkt dit..
    predictions_sorted.append(predictions)
    
    prop_id = [x for _,x in sorted(zip(pred, group.prop_id), reverse=True)]
    prop_ids_sorted.append(prop_id)
    
# Flatten lists
predictions_sorted = [item for sublist in predictions_sorted for item in sublist]
prop_ids_sorted = [item for sublist in prop_ids_sorted for item in sublist]


In [48]:
x_traincopy["prop_id"] = prop_ids_sorted
x_traincopy["predicted_rank"] = predictions_sorted
final_df = x_traincopy
final_df.head()

Unnamed: 0,prop_id,srch_id,position,price_quality,competitor_lower,competitor_available,visited_before,click_bool,booking_bool,price_usd,promotion_flag,prop_brand_bool,random_bool,prop_location_score1,prop_location_score2,prop_id_price_mean,importance,pred,predicted_rank
23,68914,1,12,64.03,1,0,0,0,0,128.06,0,1,1,2.71,0.0465,39.469167,0,-8.781317,2
2209657,107872,148396,20,38.285,0,0,0,0,0,153.14,0,0,0,4.84,0.0,46.419412,0,10.445211,1
2209666,139893,148396,32,58.384,1,0,0,0,0,291.92,0,1,0,4.62,0.1326,67.414444,0,-8.781317,2
2209709,116696,148399,34,49.806667,0,0,0,0,0,149.42,0,1,0,2.56,0.0144,46.710303,0,3.174961,1
2209734,104251,148401,15,38.0,0,0,0,0,0,76.0,0,1,0,0.0,0.0,38.0,0,9.907165,1


In [None]:
final_df["predicted_rank"] = final_df["position_temp"]
final_df.head()

In [24]:
# pred = model.predict(x_train.drop("position", axis=1))
# x_traincopy["pred"] = pred
# x_temp = x_traincopy.sort_values(by=['pred'], ascending=False)["position"]

# predictions_sorted = []
# x_train["position"] = y_train

# # Sort predictions for each group SEPERATELY
# for srchid, group in x_train.groupby('srch_id'):
    
#     # Predictions for one search_id
#     pred = model.predict(group.drop("position", axis=1))
# #     print(pred, group)
# #     group['predicted_rank'] = pred
# #     group = group.sort_values('predicted_rank')
        
#     predictions = [x for _,x in sorted(zip(pred, group.position), reverse=True)] # TODO: Reverse = False toch? Hoezo werkt dit..
#     predictions_sorted.append(predictions)
    
# predictions_sorted = [item for sublist in predictions_sorted for item in sublist]




943709     110228
1508702     24294
2298369     59107
2520570     88820
394821     135806
1681663     23016
3462287     62215
3023470    111449
1284776     90809
1436242     42175
Name: prop_id, dtype: object

In [21]:
x_traincopy["predicted_rank"] = x_temp

In [49]:
x_traincopy.to_pickle("../pickles/XGBoost_solution.pkl")

## Test set only

In [None]:
# pred = model.predict(x_test)

# x_test["predicted_rank"] = pred
# predictions_sorted = []

# # Sort predictions for each group SEPERATELY
# for srchid, group in x_test.groupby('srch_id'):
    
#     predictions = [x for _,x in sorted(zip(group.predicted_rank, group.prop_id))]
#     predictions_sorted.append(predictions)
    
# predictions_sorted = [item for sublist in predictions_sorted for item in sublist]

In [None]:
# final_df = x_test[["srch_id", "prop_id"]]
# final_df.to_csv("submission5mei.csv", columns=["srch_id", "prop_id"], index=False)

In [None]:
# # Test if it worked
# test = pd.read_csv("submission5mei.csv")
# test.head()

In [None]:
# importances = pd.DataFrame(x_test.columns, model.feature_importances_)
# importances.head(10)

In [None]:
x_traincopy.head(5)