In [None]:
import pandas as pd
import xgboost as xgb
from sklearn.datasets import load_svmlight_file
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV 

In [None]:
# Read the pre-cleaned training and test tables from their HDF5 files.
traindf, testdf = map(
    pd.read_hdf,
    ("../data/traindf_clean.hdf", "../data/test_clean.hdf"),
)

In [None]:
# Show the column names of both tables, one list per line, for a visual comparison.
print(list(traindf.columns), list(testdf.columns), sep="\n")

In [None]:
# Order the training rows by search id so that rows of one search sit next to
# each other; the per-search group sizes computed further down rely on this.
traindf = traindf.sort_values("srch_id")

In [None]:
# Separate the target ("importance") from the full training table.
features, labels = traindf, traindf["importance"]

# 80/20 split by row position (shuffle=False keeps the existing row order).
# The hold-out part is kept under its own names (x_val / y_val) instead of
# being bound to x_test and then silently overwritten by the real test set.
# NOTE(review): the cut is made on a row count, so it can fall in the middle
# of a search (srch_id); that search is then shared by both parts.
x_train, x_val, y_train, y_val = train_test_split(features, labels,
                                                  test_size=0.2,
                                                  random_state=42,
                                                  shuffle=False,
                                                  stratify=None)

# Outcome columns that must not be used as features: the target itself plus
# the columns the original note groups with it.
label_columns = ["position", "importance", "booking_bool", "click_bool"]
x_train = x_train.drop(columns=label_columns)
x_val = x_val.drop(columns=label_columns)

# Backward-compatible alias: y_test has always held the hold-out labels.
# It does NOT correspond to the rows of x_test below.
y_test = y_val

# x_test is the complete, unlabeled test set with all of its columns.
x_test = testdf

In [None]:
# Sanity check of shapes and columns before training.
# x_train is probably much smaller than x_test because of downsampling
# (50% of importance 5 and 1, 50% importance 0).
print(x_train.shape, y_train.shape, x_test.shape)

train_columns = list(x_train.columns)
test_columns = list(x_test.columns)
print(train_columns)
print(test_columns)

# The model is fitted on x_train and later asked to score x_test, so both
# frames must expose exactly the same feature columns in the same order.
# Fail here rather than after a long training run.
assert train_columns == test_columns, (
    "x_train and x_test columns differ: "
    f"{sorted(set(train_columns) ^ set(test_columns))}"
)

In [None]:
# TODO: parameter tuning!
params = {'objective': 'rank:pairwise', 'learning_rate': 0.1,
          'gamma': 1.0, 'min_child_weight': 0.1,
          'max_depth': 6,  'n_estimators': 500}

# One group per search (srch_id): the ranker receives the number of rows of
# each search and applies these sizes to consecutive rows of x_train.
search_ids = x_train['srch_id']

# The sizes only describe the data correctly when every search occupies one
# contiguous run of rows, so verify that instead of assuming it.
run_count = int((search_ids != search_ids.shift()).sum())
if run_count != search_ids.nunique():
    raise ValueError(
        "x_train rows of the same srch_id are not contiguous; "
        "sort x_train by srch_id before computing group sizes"
    )

# sort=False keeps the groups in order of first appearance, i.e. in row order,
# so the sizes line up with the rows even if the ids are not ascending.
query_lengths = x_train.groupby('srch_id', sort=False).size().to_numpy()

In [None]:
# Train the ranker with the settings from `params`.
model = xgb.XGBRanker(**params)

# Pass the per-search group sizes by keyword: newer XGBoost releases no longer
# accept `group` as a third positional argument (or warn when it is used so).
model.fit(x_train, y_train, group=query_lengths, verbose=True)

In [None]:
# Make a fake position column so the score can be checked with the NDCG scoring function.
# This is not needed for the test set, because there is nothing to check against in the test set.
# x_train["position_temp"] = x_train.groupby(['srch_id']).cumcount()+1

In [None]:
# Rank the properties of every search in the test set.
#
# The model is asked for scores once for the whole frame instead of once per
# search; the ranking SEPARATELY per search is then done by sorting.
scores = model.predict(x_test)

ranked = (
    x_test[["srch_id", "prop_id"]]
    .assign(score=scores)
    # Searches in ascending id order (the order a groupby would visit them);
    # inside a search the highest score first, ties broken by larger prop_id.
    .sort_values(["srch_id", "score", "prop_id"],
                 ascending=[True, False, False])
)

# Flat lists, one entry per test row, in ranked order.
# predictions_sorted used to stay empty because nothing was ever appended to it.
predictions_sorted = ranked["score"].tolist()
prop_ids_sorted = ranked["prop_id"].tolist()


In [None]:
# prop_ids_sorted is laid out search by search in ascending srch_id order.
# A stable sort on srch_id puts the rows into exactly that layout, so every
# ranked prop_id lands on a row of its own search even when the test table is
# not already ordered by srch_id.
# sort_values/assign build a new frame, so the original test table (and its
# prop_id feature column) is left untouched.
x_test = (
    x_test
    .sort_values("srch_id", kind="mergesort")
    .assign(prop_id=prop_ids_sorted)
)

## Test set only: convert to csv for submission

In [None]:
# Write the ranked (search id, property id) pairs to the submission file.
filename = "submission9mei.csv"
submission_cols = ["srch_id", "prop_id"]
final_df = x_test.loc[:, submission_cols]
final_df.to_csv(filename, index=False, columns=submission_cols)

In [None]:
# Sanity check: read the submission file back in and preview its first rows.
test = pd.read_csv(filename)
test.head(n=5)

In [None]:
# importances = pd.DataFrame(x_test.columns, model.feature_importances_)
# importances.head(10)