In [46]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import f1_score, make_scorer
from sklearn.preprocessing import StandardScaler

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV, GroupKFold, cross_val_score

from tqdm import tqdm

import nltk
from nltk.stem.snowball import SnowballStemmer 
from nltk.corpus import stopwords 

import re

import warnings
warnings.filterwarnings('ignore')

import os

Функция для обработки заголовков и урлов:

In [72]:
def process(s):
    ru = re.findall(r'[а-я]\w+', s.lower())
    en = re.findall(r'[a-z]\w+', s.lower())
    words_ru = [SnowballStemmer('russian').stem(w) for w in ru if not w in stopwords.words('russian')]
    words_en = [SnowballStemmer('english').stem(w) for w in en if not w in stopwords.words('english')]
    return words_ru + words_en

Считываем урлы:

In [73]:
files = os.listdir(path='content')

In [85]:
urls = []
for file in tqdm(files):
    with open('content/' + file, encoding='utf-8') as f:
        url = process(f.readline())
        urls.append(url)
len(urls)

100%|███████████████████████████████████████████████████████████████████████████| 28026/28026 [01:12<00:00, 385.47it/s]


28026

In [None]:
urls[0]

Обработка заголовков из docs_titles (здесь я ничего не менял):

In [86]:
doc_to_title = {}
with open('docs_titles.tsv', encoding='utf-8') as f:
    for num_line, line in enumerate(f):
        if num_line == 0:
            continue
        data = line.strip().split('\t', 1)
        doc_id = int(data[0])
        if len(data) == 1:
            title = ''
        else:
            title = data[1]
        doc_to_title[doc_id] = title
print (len(doc_to_title))

28026


Обработка train_groups (здесь я обрабатываю заголовки и добавляю в traingroups_titledata в том числе и урл):

In [87]:
train_data = pd.read_csv('train_groups.csv')
traingroups_titledata = {}
for i in tqdm(range(len(train_data))):
    new_doc = train_data.iloc[i]
    doc_group = new_doc['group_id']
    doc_id = new_doc['doc_id']
    target = new_doc['target']
    title = process(doc_to_title[doc_id])
    url = urls[doc_id - 1]
    if doc_group not in traingroups_titledata:
        traingroups_titledata[doc_group] = []
    traingroups_titledata[doc_group].append((doc_id, title, url, target))

100%|███████████████████████████████████████████████████████████████████████████| 11690/11690 [00:30<00:00, 381.50it/s]


Составляем X_train и y_train (отличие от baseline только в том, что ищем еще и пересечения среди урлов):

In [88]:
y_train = []
X_train = []
groups_train = []
for new_group in tqdm(traingroups_titledata):
    docs = traingroups_titledata[new_group]
    for k, (doc_id, title, url, target_id) in enumerate(docs):
        y_train.append(target_id)
        groups_train.append(new_group)
        all_dist_1 = []
        all_dist_2 = []
        words_1 = set(title)
        words_2 = set(url)
        for j in range(0, len(docs)):
            if k == j:
                continue
            doc_id_j, title_j, url_j, target_j = docs[j]
            words_1_j = set(title_j)
            words_2_j = set(url_j)
            all_dist_1.append(len(words_1.intersection(words_1_j)))
            all_dist_2.append(len(words_2.intersection(words_2_j)))
        X_train.append(sorted(all_dist_1, reverse=True)[0:15] + sorted(all_dist_2, reverse=True)[0:15])
X_train = np.array(X_train)
y_train = np.array(y_train)
groups_train = np.array(groups_train)
print (X_train.shape, y_train.shape, groups_train.shape)

100%|████████████████████████████████████████████████████████████████████████████████| 129/129 [00:02<00:00, 60.31it/s]

(11690, 30) (11690,) (11690,)





Делаем скейлинг:

In [89]:
X_train_scaled = StandardScaler().fit_transform(X_train)
X_train_scaled

array([[ 1.11976314,  1.00918296,  1.22284645, ...,  1.3950957 ,
         1.42423049,  1.44683807],
       [ 2.09637605,  2.69709055,  2.42583181, ..., -0.28968413,
        -0.25412177, -0.22297504],
       [ 0.14315023,  0.4465471 ,  0.62135377, ..., -1.97446396,
        -1.93247403, -1.89278815],
       ...,
       [-1.32176913, -1.2413605 , -1.18312427, ...,  1.3950957 ,
         1.42423049,  1.44683807],
       [ 1.11976314, -1.2413605 , -1.18312427, ...,  1.3950957 ,
         1.42423049,  1.44683807],
       [-0.83346267, -0.67872463, -0.58163159, ..., -0.28968413,
        -0.25412177, -0.22297504]])

Аналогичным образом обрабатываем test_groups, составляем X_test, делаем скейлинг:

In [98]:
test_data = pd.read_csv('test_groups.csv')
testgroups_titledata = {}
for i in tqdm(range(len(test_data))):
    new_doc = test_data.iloc[i]
    doc_group = new_doc['group_id']
    doc_id = new_doc['doc_id']
    title = process(doc_to_title[doc_id])
    url = urls[doc_id - 1]
    if doc_group not in testgroups_titledata:
        testgroups_titledata[doc_group] = []
    testgroups_titledata[doc_group].append((doc_id, title, url))

100%|████████████████████████████████████████████████████████████████████████████| 16627/16627 [03:54<00:00, 70.83it/s]


In [91]:
X_test = []
groups_test = []
for new_group in tqdm(testgroups_titledata):
    docs = testgroups_titledata[new_group]
    for k, (doc_id, title, url) in enumerate(docs):
        groups_test.append(new_group)
        all_dist_1 = []
        all_dist_2 = []
        words_1 = set(title)
        words_2 = set(url)
        for j in range(0, len(docs)):
            if k == j:
                continue
            doc_id_j, title_j, url_j = docs[j]
            words_1_j = set(title_j)
            words_2_j = set(url_j)
            all_dist_1.append(len(words_1.intersection(words_1_j)))
            all_dist_2.append(len(words_1.intersection(words_2_j)))
        X_test.append(sorted(all_dist_1, reverse=True)[0:15] + sorted(all_dist_2, reverse=True)[0:15])
X_test = np.array(X_test)
groups_test = np.array(groups_test)
print (X_test.shape, groups_test.shape)

100%|████████████████████████████████████████████████████████████████████████████████| 180/180 [00:02<00:00, 61.02it/s]

(16627, 30) (16627,)





In [92]:
X_test_scaled = StandardScaler().fit_transform(X_test)
X_test_scaled

array([[ 1.15544873e-01,  4.06924852e-01,  5.62329961e-01, ...,
        -3.36259040e-01, -3.35481123e-01, -3.34926109e-01],
       [ 5.69184683e-01,  4.06924852e-01,  2.42456178e-03, ...,
         2.95836622e+00,  2.96904769e+00,  2.97393688e+00],
       [ 5.69184683e-01,  4.06924852e-01,  5.62329961e-01, ...,
         2.95836622e+00,  2.96904769e+00,  2.97393688e+00],
       ...,
       [ 1.15544873e-01,  4.06924852e-01,  5.62329961e-01, ...,
        -3.36259040e-01, -3.35481123e-01, -3.34926109e-01],
       [ 1.02282449e+00,  1.46402032e+00,  1.68214076e+00, ...,
         2.95836622e+00,  2.96904769e+00,  2.97393688e+00],
       [ 3.74466335e+00,  4.63530671e+00,  5.04157315e+00, ...,
         2.95836622e+00,  2.96904769e+00,  2.97393688e+00]])

Подбираем параметры для LightGBM:

In [26]:
params = {'learning_rate': [0.5, 0.2, 0.1, 0.02, 0.05, 0.01, 0.001, 0.0001],
              'n_estimators' : [50, 75, 100, 150, 200, 500, 1000],
              'max_depth': [5, 6, 7, 8, 10]}

model = GridSearchCV(estimator=LGBMClassifier(objective = 'binary', metric = 'f1'), cv=GroupKFold(n_splits=3), n_jobs=-1, param_grid=params)

In [27]:
model.fit(X_train_scaled, y_train, groups_train)

GridSearchCV(cv=GroupKFold(n_splits=3),
             estimator=LGBMClassifier(metric='f1', objective='binary'),
             n_jobs=-1,
             param_grid={'learning_rate': [0.5, 0.2, 0.1, 0.02, 0.05, 0.01,
                                           0.001, 0.0001],
                         'max_depth': [5, 6, 7, 8, 10],
                         'n_estimators': [50, 75, 100, 150, 200, 500, 1000]})

Лучшие параметры:

In [28]:
model.best_params_

{'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 50}

In [29]:
model.best_estimator_

LGBMClassifier(max_depth=5, metric='f1', n_estimators=50, objective='binary')

In [30]:
clf = model.best_estimator_

Проводим кросс-валидацию:

In [31]:
cross_val_score(clf, X_train_scaled, y_train, groups=groups_train, scoring=make_scorer(f1_score)).mean()

0.6588201751432985

In [32]:
clf.fit(X_train_scaled, y_train)

LGBMClassifier(max_depth=5, metric='f1', n_estimators=50, objective='binary')

Подбираем параметры для XGBoost:

In [36]:
params_1 = {'learning_rate': [0.5, 0.2, 0.1, 0.02, 0.05, 0.01, 0.001, 0.0001],
              'n_estimators' : [50, 75, 100, 150, 200, 500, 1000],
              'max_depth': [5, 6, 7, 8, 10]}

model_1 = GridSearchCV(estimator=XGBClassifier(eval_metric='logloss'), cv=GroupKFold(n_splits=3), n_jobs=-1, param_grid=params_1)

In [37]:
model_1.fit(X_train_scaled, y_train, groups_train)

GridSearchCV(cv=GroupKFold(n_splits=3),
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None,
                                     eval_metric='logloss', gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monoto...one,
                                     n_estimators=100, n_jobs=None,
                                     num_parallel_tree=None, random_state=None,
                                     reg_alpha=None, reg_lambda=None,
                                     scale_pos_weight=None, su

Лучшие параметры:

In [38]:
model_1.best_params_

{'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 200}

In [39]:
model_1.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='logloss',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.01, max_delta_step=0,
              max_depth=5, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=200, n_jobs=8,
              num_parallel_tree=1, random_state=0, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=1, subsample=1, tree_method='exact',
              validate_parameters=1, verbosity=None)

In [40]:
clf_1 = model_1.best_estimator_
cross_val_score(clf_1, X_train_scaled, y_train, groups=groups_train, scoring=make_scorer(f1_score)).mean()

0.6625461666191896

In [41]:
clf_1.fit(X_train_scaled, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='logloss',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.01, max_delta_step=0,
              max_depth=5, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=200, n_jobs=8,
              num_parallel_tree=1, random_state=0, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=1, subsample=1, tree_method='exact',
              validate_parameters=1, verbosity=None)

Подбираем параметры для CatBoost:

In [47]:
params_2 = {'learning_rate': [0.5, 0.2, 0.1, 0.02, 0.05, 0.01, 0.001, 0.0001],
              'n_estimators' : [50, 75, 100, 150, 200, 500, 1000],
              'depth': [5, 6, 7, 8, 10]}

model_2 = GridSearchCV(estimator=CatBoostClassifier(), cv=GroupKFold(n_splits=3), n_jobs=-1, param_grid=params_2) 

Я закомментировал некоторые строки, потому что они давали слишком длинный вывод.

In [56]:
#model_2.fit(X_train_scaled, y_train, groups_train)

Лучшие параметры:

In [49]:
model_2.best_params_

{'depth': 6, 'learning_rate': 0.1, 'n_estimators': 100}

In [50]:
model_2.best_estimator_

<catboost.core.CatBoostClassifier at 0x2b0cc485e20>

In [57]:
# clf_2 = model_2.best_estimator_
# cross_val_score(clf_2, X_train_scaled, y_train, groups=groups_train, scoring=make_scorer(f1_score)).mean()

In [58]:
#clf_2.fit(X_train_scaled, y_train)

Предсказываем результат соответствующего алгоритма (clf - LGBM, clf_1 - XGB, clf_2 - CatBoost):

In [94]:
pred = clf.predict(X_test_scaled)
#pred = clf_1.predict(X_test_scaled)
#pred = clf_2.predict(X_test_scaled)

In [95]:
pred

array([1, 0, 1, ..., 1, 1, 0], dtype=int64)

Записыааем в файл:

In [96]:
test_data['target'] = pd.Series(np.asarray(pred), dtype=int)
with open('res.csv', 'w') as f:
    f.write(test_data.to_csv(columns=('pair_id', 'target'), index=False))

На кросс-валидации XGBoost дает результат лучше, чем LightGBM, однако на kaggle LightGBM дает лучший скор.