In [1]:
import pandas as pd
import xgboost as xgb
from sklearn.datasets import load_svmlight_file
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV 

In [2]:
traindf = pd.read_hdf("../data/traindf_clean.hdf")

In [3]:
print(list(traindf.columns))

traindf = traindf.sort_values(by=['srch_id'])

['prop_id', 'srch_id', 'position', 'price_quality', 'click_bool', 'booking_bool', 'price_usd', 'promotion_flag', 'prop_location_score1', 'prop_location_score2', 'avg_price_propid', 'std_avg_price_propid', 'amount_hotels', 'avg_price_propid_after', 'price_rank', 'price_correction', 'importance']


In [None]:
# split label and other variables
x_train, y_train = traindf, traindf["importance"]
positions = pd.DataFrame(y_train).set_index(traindf.srch_id)

# 1 procent van totale data
x_train, x_test, y_train, y_test = train_test_split(x_train, y_train, test_size=0.98, random_state=42, shuffle=False, stratify = None)
x_traincopy = x_train.copy()
x_train = x_train.drop(columns=["position", "importance", "click_bool", "booking_bool"])
x_test = x_test.drop(columns=["position", "importance", "click_bool", "booking_bool"])

x_train.shape, y_train.shape, x_test.shape

In [5]:
params = {'objective': 'rank:pairwise', 'learning_rate': 0.15,
          'gamma': 0.9, 'min_child_weight': 1.0,
          'max_depth': 6,  'n_estimators': 400}


# groups equal to length of queries
query_lengths = x_train.groupby('srch_id').size().values
    

In [6]:
model = xgb.sklearn.XGBRanker(**params)
model.fit(x_train, y_train, query_lengths, verbose=True)

XGBRanker(base_score=0.5, booster=None, colsample_bylevel=1, colsample_bynode=1,
          colsample_bytree=1, gamma=0.9, gpu_id=-1, importance_type='gain',
          interaction_constraints=None, learning_rate=0.3, max_delta_step=0,
          max_depth=7, min_child_weight=0.7, missing=nan,
          monotone_constraints=None, n_estimators=500, n_jobs=0,
          num_parallel_tree=1, objective='rank:pairwise', random_state=0,
          reg_alpha=0, reg_lambda=1, scale_pos_weight=None, subsample=1,
          tree_method=None, validate_parameters=False, verbosity=None)

In [7]:
# Make fake column of positions to test score in ndcg scoring function
x_train["position_temp"] = x_train.groupby(['srch_id']).cumcount()+1

In [8]:
# x_train["position"] = y_train
predictions_sorted = []
prop_ids_sorted = []

# x_train = x_train.drop(["position", "pred"], axis=1)

# Sort predictions for each group SEPERATELY
for srchid, group in x_train.groupby('srch_id'):
    
    # Predictions for one search_id
    pred = model.predict(group.drop("position_temp", axis=1))
    
    # Sort all columns (position and prop_id) from this group based on predictions
    predictions = [x for _,x in sorted(zip(pred, group.position_temp), reverse=True)] # TODO: Reverse = False toch? Hoezo werkt dit..
    predictions_sorted.append(predictions)
    
    prop_id = [x for _,x in sorted(zip(pred, group.prop_id), reverse=True)]
    prop_ids_sorted.append(prop_id)
    
# Flatten lists
predictions_sorted = [item for sublist in predictions_sorted for item in sublist]
prop_ids_sorted = [item for sublist in prop_ids_sorted for item in sublist]


In [9]:
x_traincopy["prop_id"] = prop_ids_sorted
x_traincopy["predicted_rank"] = predictions_sorted
final_df = x_traincopy
final_df.head()

Unnamed: 0,prop_id,srch_id,position,price_quality,click_bool,booking_bool,price_usd,promotion_flag,prop_location_score1,prop_location_score2,avg_price_propid,std_avg_price_propid,amount_hotels,avg_price_propid_after,price_rank,price_correction,importance,predicted_rank
9,68914,1,10,70.1725,0,0,280.69,0,2.83,0.1028,180.062267,393.023003,675.0,140.930095,27.0,280.69,0,18
19,59526,1,6,47.86,0,0,191.44,0,2.08,0.015,163.645434,374.466721,633.0,126.070117,24.0,191.44,0,21
22,88218,1,32,57.56,0,0,115.12,0,2.83,0.0145,109.046717,364.516025,530.0,85.131512,7.5,115.12,0,22
27,89073,1,29,63.38,0,0,190.14,0,2.3,0.0032,179.193532,39.934266,385.0,179.193532,22.5,190.14,0,2
4,95307,1,4,35.895,0,0,143.58,0,2.64,0.1241,137.648135,432.975724,665.0,103.923122,16.0,143.58,0,15


In [10]:
# final_df["predicted_rank"] = final_df["position_temp"]
final_df.head()

Unnamed: 0,prop_id,srch_id,position,price_quality,click_bool,booking_bool,price_usd,promotion_flag,prop_location_score1,prop_location_score2,avg_price_propid,std_avg_price_propid,amount_hotels,avg_price_propid_after,price_rank,price_correction,importance,predicted_rank
9,68914,1,10,70.1725,0,0,280.69,0,2.83,0.1028,180.062267,393.023003,675.0,140.930095,27.0,280.69,0,18
19,59526,1,6,47.86,0,0,191.44,0,2.08,0.015,163.645434,374.466721,633.0,126.070117,24.0,191.44,0,21
22,88218,1,32,57.56,0,0,115.12,0,2.83,0.0145,109.046717,364.516025,530.0,85.131512,7.5,115.12,0,22
27,89073,1,29,63.38,0,0,190.14,0,2.3,0.0032,179.193532,39.934266,385.0,179.193532,22.5,190.14,0,2
4,95307,1,4,35.895,0,0,143.58,0,2.64,0.1241,137.648135,432.975724,665.0,103.923122,16.0,143.58,0,15


In [11]:
# x_traincopy["predicted_rank"] = x_temp

In [12]:
final_df.to_pickle("../pickles/XGBoost_test9mei.pkl")

In [13]:
importances = pd.DataFrame(x_test.columns, model.feature_importances_)
importances.head(15)

Unnamed: 0,0
0.051867,prop_id
0.04784,srch_id
0.086438,price_quality
0.059818,price_usd
0.143632,promotion_flag
0.074507,prop_location_score1
0.127152,prop_location_score2
0.073584,avg_price_propid
0.06384,std_avg_price_propid
0.063688,amount_hotels


In [14]:
from sklearn.feature_selection import SelectFromModel
import matplotlib.pylab as plt
import seaborn as sns
sns.set()

xgb.plot_importance(booster=model )
plt.show()

  import pandas.util.testing as tm


<Figure size 640x480 with 1 Axes>