In [1]:
import numpy as np
from collections import defaultdict
from xgboost import XGBRegressor
from sklearn.datasets import load_svmlight_file

In [3]:
# Read the training set (SVMlight format); query_id=True also returns the
# per-row query identifiers alongside the features and labels.
X, y, query_ids = load_svmlight_file(
    'l2r/train.txt',
    query_id=True,
)

In [4]:
# Map each query id to the list of row indices (into X / y) of its documents,
# in the order the rows appear in the training file.
docs = defaultdict(list)
for row_idx in range(len(query_ids)):
    docs[query_ids[row_idx]].append(row_idx)
    


In [5]:

# For every query, precompute the pairwise label-order matrix:
# entry [i, j] is +1 / 0 / -1 when document i is labelled higher than /
# equal to / lower than document j (indices are positions within the query).
y_arr = {}
for qid, rows in docs.items():
    labels = y[rows]
    pairwise_diff = labels[:, np.newaxis] - labels
    y_arr[qid] = np.sign(pairwise_diff).astype(int)

In [6]:

def objective(y_true, y_pred):
    """Pairwise logistic ranking objective for XGBoost's scikit-learn API.

    For every query, each ordered pair (i, j) of its documents with
    S_ij = sign(label_i - label_j) != 0 contributes the loss
    log(1 + exp(-S_ij * (s_i - s_j))), where s is the current prediction.
    The function returns the first and second derivatives of the per-row
    sum of these pair losses with respect to s_i.

    Relies on two module-level objects built from the training data:
    ``docs`` (query id -> row indices) and ``y_arr`` (query id -> matrix of
    S_ij values).

    Parameters
    ----------
    y_true : array-like
        Only its length is used; the pairwise targets come from ``y_arr``.
    y_pred : numpy.ndarray
        Current scores, one per training row.

    Returns
    -------
    (gradient, hessian) : tuple of numpy.ndarray
        Per-row first and second derivatives. Rows whose hessian is
        numerically zero (e.g. every pair in the query is tied) get a
        hessian of 1.0.
    """
    n_rows = y_true.shape[0]
    gradient = np.zeros(n_rows)
    hessian = np.zeros(n_rows)
    for query_id, doc_rows in docs.items():
        cur_docs = np.asarray(doc_rows)
        scores = y_pred[cur_docs]
        # diff[i, j] = s_i - s_j, clipped so that np.exp below cannot overflow.
        diff = np.clip(scores.reshape(-1, 1) - scores, -50, 50)
        y_cur = y_arr[query_id]
        dC_ds = 1.0 / (1 + np.exp(y_cur * diff))
        gradient[cur_docs] = -np.sum(y_cur * dC_ds, axis=1)
        # Only pairs with a label preference carry curvature. Tied pairs and
        # the diagonal have y_cur == 0 and add nothing to the gradient, so
        # they must not add their constant 0.25 to the hessian either.
        has_preference = y_cur != 0
        hessian[cur_docs] = np.sum(has_preference * dC_ds * (1 - dC_ds), axis=1)
    # Guard against zero curvature so the booster never sees a zero hessian.
    hessian[np.isclose(hessian, 0.0)] = 1.0
    return gradient, hessian

In [7]:
%%time
# Booster settings; 'objective' is the custom pairwise callable defined in
# this notebook rather than one of XGBoost's built-in objective names.
params = {
    'objective': objective,
    'max_depth': 8,
    'n_estimators': 3800,
    'n_jobs': 2,
    'subsample': 0.755,
    'random_state': 8,
}
model1 = XGBRegressor(tree_method='gpu_hist', **params)
# objective() uses the y passed here only for its length; the pairwise
# targets it optimises come from the precomputed y_arr.
model1.fit(X, y)

CPU times: user 1h 22min 8s, sys: 3min 42s, total: 1h 25min 50s
Wall time: 1h 25min 51s


In [8]:
# Load the test set and score every document with the trained model.
# NOTE(review): the training file is read from 'l2r/train.txt' while this path
# starts at 'drive/My Drive/' — confirm both resolve from the same working
# directory on a fresh kernel.
X_test, y_test, query_ids_test = load_svmlight_file(
    'drive/My Drive/l2r/test.txt',
    query_id=True,
)
y_pred = model1.predict(X_test)

# Group test row indices by query id, keeping file order within each query.
test = defaultdict(list)
for row_idx in range(len(query_ids_test)):
    test[query_ids_test[row_idx]].append(row_idx)

In [9]:
# Write the submission: for each test query, list its documents from the
# highest to the lowest predicted score. Document ids are the 1-based row
# positions in the test file.
with open("submission.csv", 'w') as write_file:
    print("QueryId,DocumentId", file=write_file)
    for query_id, docs_t in test.items():
        y_pred_i = y_pred[docs_t]
        ids = np.argsort(y_pred_i)[::-1]
        # Deliberately NOT named `docs`: that module-level name is the
        # training query -> rows mapping read by objective(); rebinding it
        # here would break any later re-run of training.
        ranked_docs = np.array(docs_t)[ids]
        for doc_id in ranked_docs:
            print(f"{query_id},{doc_id+1}", file=write_file)