In [46]:
from catboost import CatBoostRanker, Pool
import numpy as np
import os
import pandas as pd

from sklearn.metrics import ndcg_score
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler

In [2]:
df = pd.read_csv('data/intern_task.csv')
df = df.sort_values(by='query_id').reset_index(drop=True)

In [3]:
corr = df.corr()
# corr.head()

In [4]:
# sns.heatmap(corr)

In [5]:
columns = np.full((corr.shape[0],), True, dtype=bool)
for i in range(corr.shape[0]):
    for j in range(i+1, corr.shape[0]):
        if corr.iloc[i,j] >= 0.9:
            if columns[j]:
                columns[j] = False

In [6]:
selected_columns = df.columns[columns]
selected_columns.shape

(81,)

In [7]:
df = df[selected_columns]

In [8]:
features = df.columns.values[2:]

In [9]:
scaler = StandardScaler()
df[features] = scaler.fit_transform(df[features])

In [10]:
df = df.groupby(['query_id', 'rank']).mean().reset_index()

In [11]:
sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
features_fitted = sel.fit_transform(df[features]).T

In [12]:
feats = df[features].to_numpy().T
print(feats.shape, features_fitted.shape)

(79, 7515) (70, 7515)


In [13]:
# for i in range(len(feats)):
#   for j in range(len(features_fitted)):
#     if (feats[i] == features_fitted[j]).sum() == 7515:
#       print(i, j)
#       break

In [14]:
drop_ind = [9, 10, 28, 45, 46, 51, 60, 62, 70]

In [15]:
cols = df.columns.values[2:]
drop_cols = [cols[i] for i in drop_ind]
drop_cols

['feature_11',
 'feature_12',
 'feature_41',
 'feature_64',
 'feature_65',
 'feature_72',
 'feature_100',
 'feature_105',
 'feature_128']

In [16]:
df = df.drop(drop_cols, axis=1)

In [17]:
ids = df.query_id.unique()
train_size = round(len(ids) * 0.8)
last_train = ids[train_size]

print(len(ids), last_train)

2000 24010


In [18]:
# df[df.query_id == last_train]

In [19]:
test_index = 6010
train_df, test_df = df[:test_index],  df[test_index:]

In [44]:
train_df.to_csv('data/train.csv')
test_df.to_csv('data/test.csv')

In [20]:
X_train = train_df.drop(['rank', 'query_id'], axis=1).values
y_train = train_df['rank'].values
queries_train = train_df['query_id'].values

X_test = test_df.drop(['rank', 'query_id'], axis=1).values
y_test = test_df['rank'].values
queries_test = test_df['query_id'].values

In [21]:
y_train = y_train.astype('float64')
y_test = y_test.astype('float64')

In [22]:
max_relevance = np.max(y_train)
y_train /= max_relevance
y_test /= max_relevance

In [26]:
def create_weights(queries):
    query_set = np.unique(queries)
    query_weights = np.random.uniform(size=query_set.shape[0])
    weights = np.zeros(shape=queries.shape)

    for i, query_id in enumerate(query_set):
        weights[queries == query_id] = query_weights[i]

    return weights

In [27]:
train_with_weights = Pool(
    data=X_train,
    label=y_train,
    group_weight=create_weights(queries_train),
    group_id=queries_train
)

test_with_weights = Pool(
    data=X_test,
    label=y_test,
    group_weight=create_weights(queries_test),
    group_id=queries_test
)

In [31]:
parameters = {
    'iterations': 2000,
    'custom_metric': ['NDCG:top=5', 'PFound', 'AverageGain:top=5'],
    'verbose': False,
    'random_seed': 0,
    'metric_period': 50,
    #'task_type': 'GPU',
    'train_dir': 'ranking_model',
    'loss_function': 'YetiRankPairwise'
}

In [32]:
model = CatBoostRanker(**parameters)
model.fit(train_with_weights, eval_set=test_with_weights)

<catboost.core.CatBoostRanker at 0x78dcd24b4370>

In [36]:
y_pred = model.predict(X_test)
ndcg_score([y_test], [y_pred], k=5)

0.9152099486815958

In [37]:
features = df.columns.values[2:]

pred = model.predict(df[features])
ndcg_score([df['rank']], [pred], k=5)

0.9999999999999999

In [38]:
ids = df['query_id']
real_rank = df['rank']

In [39]:
ans = pd.DataFrame({"id": ids, "rank_pred": pred*max_relevance, "rank_real": real_rank})
ans = ans.sort_values(['id', 'rank_pred']).reset_index(drop=True)
ans['rank_pred'] = ans.groupby(['id']).cumcount()

In [41]:
ans.to_csv('data/result.csv')

In [90]:
print("NDCG@5: ", ndcg_score([ans['rank_real']], [ans['rank_pred']], k=5))

NDCG@5:  0.9576359832635982


In [70]:
from metrics import precision_at_k, recall_at_k

In [79]:
pres = []
recall = []

for i in range(int(max_relevance)):
    an = ans[ans.rank_real == i][['rank_pred', 'rank_real']] == i
    pres.append(precision_at_k(df=an, k=5, y_test='rank_real', y_pred='rank_pred'))
    recall.append(recall_at_k(df=an, k=5, y_test='rank_real', y_pred='rank_pred'))

In [89]:
print("Precision@5: ", sum(pres)/len(pres))
print("Recall@5: ", sum(recall)/len(recall))

Precision@5:  1.0
Recall@5:  0.00326844768361688
