In [None]:
# Mount Google Drive inside the Colab runtime; the CSV files read further down
# live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive/')

In [None]:
!pip install catboost

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import ndcg_score
from random import randint
from catboost import CatBoostClassifier, CatBoostRanker
from catboost import Pool
from catboost import CatBoost, Pool
from sklearn.preprocessing import StandardScaler

In [None]:
# Turn on Colab's custom widget manager.
# Presumably required for the interactive CatBoost chart requested with
# plot=True in the grid-search cell — TODO confirm.
from google.colab import output
output.enable_custom_widget_manager()

In [None]:
import pandas as pd


# Read the train and test tables from the mounted Drive, then preview the first
# rows of the training table as the cell output.
train = pd.read_csv('/content/drive/MyDrive/vk_train_df.csv')
test = pd.read_csv('/content/drive/MyDrive/vk_test_df.csv')
train.head()

# EDA

In [None]:
# Keep a single copy of every fully duplicated training row.
train = train.drop_duplicates()

In [None]:
# Columns that hold at most one distinct value in the test table (NaN counts as
# a value) are removed from both tables.
_distinct_per_column = test.nunique(dropna=False)
unary_columns = _distinct_per_column[_distinct_per_column <= 1].index.tolist()
train = train.drop(columns=unary_columns)
test = test.drop(columns=unary_columns)

In [None]:
# Remove redundant features: a column is flagged when its absolute correlation
# with any column that precedes it in the table exceeds 0.8. Only the strict
# upper triangle of the matrix is inspected, so the earlier column of each
# correlated pair is the one that survives.
corr_matrix = train.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# The group id and the label take part in the correlation matrix too. They must
# never be dropped: the split, pooling and scoring cells below all read them.
protected_columns = {'search_id', 'target'}
high_correlation = [
    column
    for column in upper.columns
    if column not in protected_columns and (upper[column] > 0.8).any()
]

train = train.drop(columns=high_correlation)
test = test.drop(columns=high_correlation)

In [None]:
# Hand-picked features that the author treats as leaky; they are removed from
# both the training and the test table.
leaky_features = [
    'feature_1', 'feature_2', 'feature_7',
    'feature_8', 'feature_9', 'feature_13',
    'feature_14', 'feature_17', 'feature_23',
    'feature_47', 'feature_61', 'feature_62',
]
train = train.drop(columns=leaky_features)
test = test.drop(columns=leaky_features)

In [None]:
# Columns with at most 10 distinct values are treated as categorical.
# The trailing [:-1] discards the last match — presumably the `target` column,
# which appears to sit last in the table — TODO confirm.
# This list has to exist BEFORE the scaling step, because the scaling step
# skips the categorical columns; defining it afterwards only worked when the
# name was left over from an earlier run of the kernel.
cat_features = [i for i in train if len(train[i].unique()) <= 10][:-1]

cols = train.columns
# Positions of the columns to standardise: everything except the first column,
# the last column and the categorical ones (presumably `search_id` is first and
# `target` is last — TODO confirm).
inds = [i for i in range(1, len(cols) - 1) if cols[i] not in cat_features]

# Fit the scaler on train only and reuse its statistics for test.
scaler = StandardScaler()
train = train.values
test = test.values
train[:, inds] = scaler.fit_transform(train[:, inds])
test[:, inds] = scaler.transform(test[:, inds])
# Rebuild the frames; test is given the training column labels, so both tables
# are expected to share the same column layout at this point.
train = pd.DataFrame(train, columns=cols)
test = pd.DataFrame(test, columns=cols)

# The round trip through NumPy loses the integer dtypes; restore them for the
# categorical features, the group id and the label.
for i in cat_features + ['search_id', 'target']:
    train[i] = train[i].astype(int)
    test[i] = test[i].astype(int)

# CatBoost

In [None]:
# Rows whose search_id is below the 0.9 quantile of the search_id column are
# used for fitting; rows at or above it are held out for evaluation (roughly a
# 90/10 split by rows).
cutoff_id = train['search_id'].quantile(0.9)
_fit_rows = train[train['search_id'] < cutoff_id]
_eval_rows = train[train['search_id'] >= cutoff_id]

X_train, y_train = _fit_rows.drop(columns='target'), _fit_rows['target']
X_eval, y_eval = _eval_rows.drop(columns='target'), _eval_rows['target']

In [None]:
# Per-row weights: rows labelled 1 weigh 0.75, every other row weighs 0.25.
train_weights = np.where(y_train == 1, 0.75, 0.25).tolist()
eval_weights = np.where(y_eval == 1, 0.75, 0.25).tolist()

In [None]:
def _ranking_pool(features, labels, weights):
    """Wrap one data split in a CatBoost Pool grouped by its `search_id` column.

    NOTE(review): `search_id` is passed both as a regular feature column (it is
    part of `features`) and as the group id — confirm that this is intended.
    """
    return Pool(
        data=features,
        label=labels,
        cat_features=cat_features,
        group_id=features['search_id'],
        weight=weights,
    )


train_pool = _ranking_pool(X_train, y_train, train_weights)
eval_pool = _ranking_pool(X_eval, y_eval, eval_weights)

In [None]:
# Base CatBoost settings; fit_model and the grid-search cell both unpack this
# dict into CatBoostRanker.
parameters = dict(
    iterations=1000,
    custom_metric=['NDCG'],
    verbose=True,
    depth=4,          # author's note: 4 (presumably an earlier/alternative value)
    l2_leaf_reg=6,    # author's note: 3 (presumably an earlier/alternative value)
    random_seed=0,
)

In [None]:
def fit_model(loss_function, train_pool=train_pool, test_pool=eval_pool):
    """Train a CatBoostRanker with the shared `parameters` and the given loss.

    Args:
        loss_function: CatBoost loss name; it is also used as the training
            directory name (`train_dir`).
        train_pool: Pool to fit on. The default is bound to the module-level
            `train_pool` at the moment this function is defined.
        test_pool: Pool passed as `eval_set`. The default is bound to the
            module-level `eval_pool` at definition time.

    Returns:
        The fitted CatBoostRanker.

    Side effects:
        Updates the module-level `parameters` dict in place, so the
        `loss_function` and `train_dir` keys stay set for any later code that
        unpacks `parameters`.
    """
    parameters.update(loss_function=loss_function, train_dir=loss_function)
    ranker = CatBoostRanker(**parameters)
    # use_best_model is left at its default here.
    ranker.fit(train_pool, eval_set=test_pool, verbose=1)
    return ranker


model = fit_model('RMSE')

In [None]:
# Predict on the test table (label column removed) and report NDCG.
# NOTE(review): every test row is handed to ndcg_score as one single query, so
# the value is not averaged per search_id — confirm this is the intended metric.
preds = model.predict(test.drop('target', axis=1))
ndcg_score(test['target'].to_numpy().reshape(1, -1), np.asarray(preds).reshape(1, -1))

Best NDCG score so far: 0.6445960968089074

In [None]:
# Experiment log kept as a bare string: presumably the score on the first line
# followed by the preprocessing choices that produced it — TODO confirm.
'''
0.6445960968089074
scaler
drop high_corr
cat_features <= 10 unique
drop leaky_features1
weights
'''

In [None]:
# List the features whose importance, computed on the training pool, is below
# 0.001. Note that this rebinds `leaky_features` (a list of names earlier in
# the notebook) to a pandas Series.
df = model.get_feature_importance(data=train_pool, prettified=True)
leaky_features = df.loc[df['Importances'] < 0.001, 'Feature Id']
leaky_features

In [None]:
# Search learning rate, depth and L2 regularisation with CatBoost's built-in
# grid search on the training pool; the cell output is the best parameter set.
# `parameters` is unpacked as it currently stands, including any
# loss_function / train_dir keys written into it by an earlier fit_model call.
model = CatBoostRanker(**parameters)
grid = dict(
    learning_rate=[0.03, 0.01, 0.05],
    depth=[4, 6, 8],
    l2_leaf_reg=[1, 3, 5],
)

grid_search_result = model.grid_search(
    grid,
    train_pool,
    cv=3,
    shuffle=False,
    plot=True,
)
grid_search_result['params']

In [None]:
# Experiment log kept as a bare string: presumably an earlier run's score and
# the settings that produced it — TODO confirm.
'''0.629 - CatBoostRanker,
'iterations': 1000,
'depth': 4,
'l2_leaf_reg':1.5,
'random_seed': 0,
with cats_features - unique <= 10
don't drop leaky_features
'''