# Другой подход: порядковая классификация ансамблем бинарных классификаторов CatBoost

In [1]:
import pandas as pd
import numpy as np

# Read the train and test splits. The absolute paths look like a Google Colab
# Drive mount, so this cell presumably needs Drive mounted first (TODO confirm).
df_train = pd.read_csv(filepath_or_buffer="/content/drive/MyDrive/intern_task_train.csv")
df_test = pd.read_csv(filepath_or_buffer="/content/drive/MyDrive/intern_task_test.csv")

In [2]:
from sklearn.metrics import ndcg_score
from sklearn.metrics import accuracy_score


def NDCG_atK_score(test_Data, pred_Data, k, logits=False):
  """Return the mean NDCG@k over the queries found in ``test_Data``.

  Args:
    test_Data: DataFrame with a ``query_id`` column and the ground-truth
      ``rank`` column.
    pred_Data: DataFrame with a ``query_id`` column and the predicted
      ``rank`` column. Within each query its rows must be in the same order
      as in ``test_Data``, because values are paired positionally.
    k: Cut-off forwarded to ``ndcg_score`` (it used to be ignored in favour
      of a hard-coded 5).
    logits: Despite the name this is a debug switch: when true, the per-query
      lists of true and predicted ranks are printed.

  Returns:
    The mean of the per-query NDCG@k values, or ``nan`` when no query has
    more than one row.
  """
  def ranks_by_query(frame):
    # One groupby pass instead of one boolean mask per query id; row order
    # inside each group is the frame's original row order.
    return {
        query_id: ranks.astype(int).tolist()
        for query_id, ranks in frame.groupby('query_id', sort=False)['rank']
    }

  true_by_query = ranks_by_query(test_Data)
  pred_by_query = ranks_by_query(pred_Data)

  query_ids = test_Data['query_id'].unique()
  y_true_sessions = [true_by_query.get(query_id, []) for query_id in query_ids]
  y_pred_sessions = [pred_by_query.get(query_id, []) for query_id in query_ids]
  if logits:
    print(y_true_sessions)
    print(y_pred_sessions)

  # Queries with fewer than two rows are skipped, as in the original code
  # (presumably because NDCG is not meaningful for a single document).
  ndcg_scores = [
      ndcg_score([true_ranks], [pred_ranks], k=k)
      for true_ranks, pred_ranks in zip(y_true_sessions, y_pred_sessions)
      if len(true_ranks) > 1
  ]
  if not ndcg_scores:
    # np.mean([]) would also give nan, but with a RuntimeWarning.
    return np.nan
  return np.mean(ndcg_scores)

In [3]:
!pip install catboost

Successfully installed catboost-1.2.5


In [4]:
# Add one binary target per threshold: column clfT is 1 where rank > T, else 0.
for target_column, threshold in (('clf0', 0), ('clf1', 1), ('clf2', 2), ('clf3', 3)):
  df_train[target_column] = df_train['rank'].gt(threshold).astype(int)

In [5]:
# Columns cast to int in both the train and the test frame.
cat_features = ['feature_95', 'feature_96', 'feature_97', 'feature_98', 'feature_99', 'feature_1', 'feature_3', 'feature_28']
for frame in (df_train, df_test):
  for feature in cat_features:
    frame[feature] = frame[feature].astype(int)

In [6]:
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier

class Ensemble():
  """Rank classifier assembled from four binary CatBoost models.

  Model ``clf<t>`` (t = 0..3) is fitted on the binary target ``Y['clf<t>']``.
  ``predict`` turns the four positive-class probabilities into one score per
  rank 0..4 and returns the index of the highest score for every row.

  Args:
    cat_features: Column names handed to CatBoost as categorical features.
      Defaults to the eight columns that were previously hard-coded.
    task_type: CatBoost ``task_type``. Defaults to "GPU" as before; pass
      "CPU" to train without a GPU.
  """

  DEFAULT_CAT_FEATURES = ('feature_95', 'feature_96', 'feature_97', 'feature_98', 'feature_99', 'feature_1', 'feature_3', 'feature_28')

  def __init__(self, cat_features=None, task_type="GPU"):
    if cat_features is None:
      cat_features = self.DEFAULT_CAT_FEATURES
    # Copy into a list so the caller's sequence is never shared or mutated.
    self.cat_features = list(cat_features)
    self.task_type = task_type
    self.clf0 = CatBoostClassifier(cat_features=self.cat_features, task_type=task_type)
    self.clf1 = CatBoostClassifier(cat_features=self.cat_features, task_type=task_type)
    self.clf2 = CatBoostClassifier(cat_features=self.cat_features, task_type=task_type)
    self.clf3 = CatBoostClassifier(cat_features=self.cat_features, task_type=task_type)

  def fit(self, X, Y):
    """Fit every member model on ``X`` against its own target column.

    Args:
      X: Feature table passed unchanged to each model's ``fit``.
      Y: Mapping or DataFrame providing the targets under the keys
        ``'clf0'``, ``'clf1'``, ``'clf2'`` and ``'clf3'``.

    Returns:
      ``self``, so calls can be chained.
    """
    for name in ('clf0', 'clf1', 'clf2', 'clf3'):
      getattr(self, name).fit(X, Y[name])
    return self

  def predict(self, X):
    """Predict a rank in 0..4 for every row of ``X``.

    The raw ``predict_proba`` outputs are kept on the instance as
    ``y_pred_0`` .. ``y_pred_3`` and then combined by ``transform``.
    """
    self.y_pred_0 = self.clf0.predict_proba(X)
    self.y_pred_1 = self.clf1.predict_proba(X)
    self.y_pred_2 = self.clf2.predict_proba(X)
    self.y_pred_3 = self.clf3.predict_proba(X)

    return self.transform()

  def transform(self):
    """Combine the stored probabilities into one rank per row.

    Requires ``y_pred_0`` .. ``y_pred_3`` to be set (``predict`` does this).
    Column 1 of each array is used as the positive-class probability and
    column 0 of ``y_pred_0`` as the score for rank 0.

    Returns:
      Integer array with the arg-max rank of every row.
    """
    below_0 = np.asarray(self.y_pred_0, dtype=float)[:, 0]
    above_0 = np.asarray(self.y_pred_0, dtype=float)[:, 1]
    above_1 = np.asarray(self.y_pred_1, dtype=float)[:, 1]
    above_2 = np.asarray(self.y_pred_2, dtype=float)[:, 1]
    above_3 = np.asarray(self.y_pred_3, dtype=float)[:, 1]

    # Score for rank r is P(rank > r-1) - P(rank > r). The four models are
    # trained independently, so nothing keeps these differences non-negative;
    # the arg-max is taken regardless, exactly as in the row-by-row version.
    rank_scores = np.column_stack([
        below_0,
        above_0 - above_1,
        above_1 - above_2,
        above_2 - above_3,
        above_3,
    ])
    return np.argmax(rank_scores, axis=1)

In [None]:
# Fit on the feature columns only: the label, the query id and the derived
# binary targets are dropped from X; the targets are read from df_train itself.
clf = Ensemble()
clf.fit(
    X=df_train.drop(['rank', 'query_id', 'clf0', 'clf1', 'clf2', 'clf3'], axis=1),
    Y=df_train,
)

In [8]:
# Predict ranks for the test rows, put them in a copy of the test frame in
# place of the true ranks, and score that copy against the original frame.
outputs = clf.predict(df_test.drop(['query_id', 'rank'], axis=1))
df_tmp = df_test.assign(rank=outputs)
print("NDCG@5 score : ", NDCG_atK_score(df_test, df_tmp, k=5))

NDCG@5 score :  0.4516808924180702
