In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import pairwise_distances, f1_score, make_scorer
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn.model_selection import GroupKFold, cross_val_score, GridSearchCV, KFold
from sklearn.metrics.pairwise import cosine_distances
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.neighbors import NearestNeighbors
import warnings
warnings.filterwarnings('ignore')
import nltk
import pymorphy2
from tqdm import tqdm
import string
from string import punctuation
from bs4 import BeautifulSoup
import codecs
import re
import string
from nltk.corpus import stopwords

# –ü–æ–ø—ã—Ç–∫–∞ ‚Ññ1

In [2]:
doc_to_title = {}
with open('docs_titles.tsv', encoding='utf-8') as f:
    for num_line, line in enumerate(f):
        if num_line == 0:
            continue
        data = line.strip().split('\t', 1)
        doc_id = int(data[0])
        if len(data) == 1:
            title = ''
        else:
            title = data[1]
        doc_to_title[doc_id] = title
print (len(doc_to_title))

28026


In [3]:
train_data = pd.read_csv('train_groups.csv')
traingroups_titledata = {}
for i in range(len(train_data)):
    new_doc = train_data.iloc[i]
    doc_group = new_doc['group_id']
    doc_id = new_doc['doc_id']
    target = new_doc['target']
    title = doc_to_title[doc_id]
    if doc_group not in traingroups_titledata:
        traingroups_titledata[doc_group] = []
    traingroups_titledata[doc_group].append((doc_id, title, target))

In [4]:
doc_to_title[0] = ''

In [5]:
doc_to_title_tokenized = dict()
for key in doc_to_title:
    sentence_token = doc_to_title[key]
    sentence_token = sentence_token.lower().split()
    doc_to_title_tokenized[key] = sentence_token

In [6]:
stopword_set_eng = set(nltk.corpus.stopwords.words('english'))
stopword_set_rus = set(nltk.corpus.stopwords.words('russian'))
punctuation1 = punctuation
punct_tokens =set()
useless = set(['–≤','–Ω–∞', '‚Äî', '–∏','—Å', '–∏–ª–∏' '¬ª', '–∏–∑', '–ø–æ', '—Å–æ', '–∂–µ', '—Ç–∞', '-', '|', '//', '‚Ä¢', '>', '/'])
for symb in punctuation1: 
    punct_tokens.add(symb)

In [7]:
def my_is_digit(string):
    if string.isdigit():
        return True
    else:
        try:
            float(string)
            return True
        except ValueError:
            return False

In [9]:
for key in tqdm(doc_to_title_tokenized):
    hold = doc_to_title_tokenized[key]
    doc_to_title_tokenized[key] = []
    for word in hold:
        if word not in (stopword_set_eng and stopword_set_rus and punct_tokens and useless):
            doc_to_title_tokenized[key].append(word)
    if doc_to_title_tokenized[key] == []:
        doc_to_title_tokenized[key].append('')

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 28027/28027 [00:00<00:00, 225417.71it/s]


In [11]:
lemmatizer = pymorphy2.MorphAnalyzer()
inword_punct = set([',', ':', '"', '¬ª', '¬´', '‚Äù', '‚Äú', ')', '(', '[',']', '|', '*'])
for key in tqdm(doc_to_title_tokenized):
    hold = doc_to_title_tokenized[key]
    doc_to_title_tokenized[key] = []
    for word in hold:
        tmp = ''
        for symb in word:
            if symb not in inword_punct:
                tmp = tmp+symb
        word = tmp
        if my_is_digit(word) == False and word != '':
            word = lemmatizer.parse(word)[0].normal_form
            doc_to_title_tokenized[key].append(word)
    if doc_to_title_tokenized[key] == []:
        doc_to_title_tokenized[key].append('')   

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 28027/28027 [00:42<00:00, 666.41it/s]


In [12]:
eng_alph = list(string.ascii_lowercase)
a = ord('–∞')
rus_alph = ''.join([chr(i) for i in range(a,a+6)] + [chr(a+33)] + [chr(i) for i in range(a+6,a+32)])
rus_alphabet=[]
for elem in rus_alph:
    rus_alphabet.append(elem)
total = eng_alph + rus_alphabet
for i in "1234567890":
    total.append(i)
total = set(total)
bad_stuff = dict()
for key in tqdm(doc_to_title_tokenized):
    hold = doc_to_title_tokenized[key]
    for word in hold:
        if len(word)>0:
            if word[-1] not in total:
                if word[-1] not in bad_stuff:
                    bad_stuff[word[-1]] = 1
                else:
                    bad_stuff[word[-1]] += 1

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 28027/28027 [00:00<00:00, 344854.80it/s]


{'.': 7312,
 '‚ï¨': 1,
 '?': 3461,
 '‚Ä¶': 42,
 ';': 44,
 '‚Ññ': 149,
 '‚Äï': 7,
 '!': 974,
 'üîç': 4,
 '>': 70,
 '/': 40,
 '‚ãÜ': 3,
 '¬ß': 9,
 '%': 23,
 '-': 113,
 '‚Äì': 462,
 'üëç': 2,
 '—ñ': 15,
 '¬∞': 5,
 '#': 8,
 '‚Äí': 1,
 'üö©': 9,
 "'": 37,
 '~': 234,
 '‚òÜ': 2,
 '‚Äô': 1,
 'Ïùò': 1,
 '‚Üí': 6,
 '\\': 11,
 '¬Æ': 10,
 '&': 53,
 '¬©': 4,
 '‚úø': 2,
 '+': 127,
 '‚úî': 8,
 '\x97': 4,
 '‚Ä∫': 7,
 '‚Äî': 6,
 '‚ò∫': 2,
 '‚ô•': 6,
 '‚ôÇ': 1,
 '≈º': 1,
 '¬±': 1,
 '‚ñ∫': 4,
 '“£': 1,
 '_': 11,
 '\u200d': 1,
 '—ï': 4,
 '‚ù§': 3,
 '=': 18,
 '<': 3,
 'üìå': 12,
 '^': 1,
 '—ò': 1,
 '◊î': 2,
 '◊ò': 1,
 '◊ô': 3,
 '◊™': 2,
 '}': 9,
 '\xad': 1,
 '@': 4,
 '‚ñ∏': 1,
 '‚ú¶': 3,
 '‚á®': 1,
 'üí™': 1,
 'üìç': 1,
 'üíï': 1,
 '€∂': 2,
 '—ó': 1,
 '¬¨': 2,
 '‚Ñ¢': 4,
 'üíµ': 1,
 '‚ô´': 4,
 '√ü': 1,
 'üìπ': 1,
 ' ñ': 1,
 '‚úà': 1,
 '¬∑': 3,
 'üè°': 1,
 'Ô∏è': 1,
 '\ufeff': 1,
 'üë∂': 2,
 '‚ñ≤': 2,
 '√™': 1,
 '√∞': 2,
 '√±': 11,
 '¬æ': 5,
 '√¢': 1,
 '¬µ': 3,
 '¬∏': 3,
 'ƒÉ': 1,
 '‚ñà': 1,
 '‚ñ

In [13]:
lemmatizer = pymorphy2.MorphAnalyzer()
for key in tqdm(doc_to_title_tokenized):
    hold = doc_to_title_tokenized[key]
    doc_to_title_tokenized[key] = []
    for word in hold:
        if len(word) > 0:
            if word[-1] in bad_stuff:
                word = word[:len(word)-1]
        if my_is_digit(word) == False and word != '':
            word = lemmatizer.parse(word)[0].normal_form
            doc_to_title_tokenized[key].append(word)
    if doc_to_title_tokenized[key] == []:
        doc_to_title_tokenized[key].append('')

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 28027/28027 [00:38<00:00, 725.12it/s]


In [15]:
vectorizer = TfidfVectorizer()
prep_vec = []
for key in range(0, len(doc_to_title_tokenized)):
    temp = ' '.join(doc_to_title_tokenized[key])
    prep_vec.append(temp)
data_vec = vectorizer.fit_transform(prep_vec)
data_vec.shape

(28027, 27920)

–î–ª—è –∫–∞–∂–¥–æ–≥–æ –¥–æ–∫–∞ –≤ –≥—Ä—É–ø–ø–µ - 15 –∫–æ—Å–∏–Ω—É—Å–Ω—ã—Ö —Ä–∞—Å—Å—Ç–æ—è–Ω–∏–π –¥–æ –±–ª–∏–∂–∞–π—à–∏—Ö –≤ –∫–∞—á–µ—Å—Ç–≤–µ –ø—Ä–∏–∑–Ω–∞–∫–æ–≤

In [16]:
def prepare_cosine(filename, arg = 1):
    df = pd.read_csv(filename)
    X1 = np.zeros((df.shape[0], 15), dtype=float)
    if arg == 1:
        y1 = np.zeros(df.shape[0], dtype=float)
    used = 0
    for num in df.group_id.unique():
        temp = df[df.group_id == num]
        temp_docs = temp.doc_id
        dist_mat = pairwise_distances(data_vec[temp.doc_id], metric='cosine')
        for i in range(0, dist_mat.shape[0]):
            cur_dist = np.sort(dist_mat[i])[1:16]
            X1[used + i] = cur_dist
        if arg == 1:
            y1[used:temp.shape[0] + used] = temp['target']
        used += temp.shape[0]
    if arg == 1:
        return X1, y1
    else:
        return X1

In [17]:
X_train, y_train = prepare_cosine('train_groups.csv')

In [18]:
X_test = prepare_cosine('test_groups.csv', arg= 0)

In [19]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [20]:
df = pd.read_csv('train_groups.csv')
train_data.group_id.unique()

array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
        27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
        40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
        53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
        66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
        79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,
        92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103, 104,
       105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117,
       118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129],
      dtype=int64)

In [21]:
X_train.shape

(11690, 15)

In [22]:
params_choose = {'learning_rate': [0.01, 0.02, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5],
                  'max_depth': [1, 2, 3, 4, 5, 6],
                  'n_estimators': [10, 15, 20, 25, 30 ,35, 40, 50, 70, 100],
                  'seed': [0],
                  'verbosity': [0]}
my_model = XGBClassifier()
search_res = GridSearchCV(my_model, params_choose, scoring=make_scorer(f1_score), cv=GroupKFold(n_splits=5))
search_res.fit(X_train, y_train, groups=train_data['group_id'].values)
best_model = search_res.best_estimator_
search_res.best_params_

{'learning_rate': 0.02,
 'max_depth': 1,
 'n_estimators': 30,
 'seed': 0,
 'verbosity': 0}

In [23]:
best_model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.02, max_delta_step=0, max_depth=1,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=30, n_jobs=8, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=0,
              subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=0)

In [24]:
cross_val_score(best_model, X_train, y_train, groups=train_data['group_id'].values,
                scoring=make_scorer(f1_score)).mean()

0.7306687792980197

In [25]:
best_model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.02, max_delta_step=0, max_depth=1,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=30, n_jobs=8, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=0,
              subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=0)

In [26]:
test_gr = pd.read_csv('test_groups.csv')
y_pred = best_model.predict(X_test)
y_pred = y_pred.astype(int)
result = pd.DataFrame({'pair_id': np.asarray(test_gr['pair_id']), 'target': y_pred})
result = result.set_index(['pair_id'])
result.to_csv('TryOne.csv')

# –ü–æ–ø—ã—Ç–∫–∞ ‚Ññ2

In [27]:
doc_to_title = {}
with open('docs_titles.tsv', encoding='utf-8') as f:
    for num_line, line in enumerate(f):
        if num_line == 0:
            continue
        data = line.strip().split('\t', 1)
        doc_id = int(data[0])
        if len(data) == 1:
            title = ''
        else:
            title = data[1]
        doc_to_title[doc_id] = title
print (len(doc_to_title))

28026


In [29]:
train_data = pd.read_csv('train_groups.csv')
traingroups_titledata = {}
for i in range(len(train_data)):
    new_doc = train_data.iloc[i]
    doc_group = new_doc['group_id']
    doc_id = new_doc['doc_id']
    target = new_doc['target']
    title = doc_to_title[doc_id]
    if doc_group not in traingroups_titledata:
        traingroups_titledata[doc_group] = []
    traingroups_titledata[doc_group].append((doc_id, title, target))

In [30]:
train_data = pd.read_csv('train_groups.csv')
traingroups_titledata = {}
for i in range(len(train_data)):
    new_doc = train_data.iloc[i]
    doc_group = new_doc['group_id']
    doc_id = new_doc['doc_id']
    target = new_doc['target']
    title = doc_to_title[doc_id]
    if doc_group not in traingroups_titledata:
        traingroups_titledata[doc_group] = []
    traingroups_titledata[doc_group].append((doc_id, title, target))

In [31]:
doc_to_title[0] = ''
doc_to_title_tokenized = dict()
for key in doc_to_title:
    sentence_token = doc_to_title[key]
    sentence_token = sentence_token.lower().split()
    doc_to_title_tokenized[key] = sentence_token

In [33]:
stopword_set_eng = set(nltk.corpus.stopwords.words('english'))
stopword_set_rus = set(nltk.corpus.stopwords.words('russian'))
punctuation1 = punctuation
punct_tokens =set()
useless = set(['–≤','–Ω–∞', '‚Äî', '–∏','—Å', '–∏–ª–∏' '¬ª', '–∏–∑', '–ø–æ', '—Å–æ', '–∂–µ', '—Ç–∞', '-', '|', '//', '‚Ä¢', '>', '/'])
for symb in punctuation1: 
    punct_tokens.add(symb)

In [34]:
def my_is_digit(string):
    if string.isdigit():
        return True
    else:
        try:
            float(string)
            return True
        except ValueError:
            return False

In [36]:
for key in tqdm(doc_to_title_tokenized):
    hold = doc_to_title_tokenized[key]
    doc_to_title_tokenized[key] = []
    for word in hold:
        if word not in (stopword_set_eng and stopword_set_rus and punct_tokens and useless):
            doc_to_title_tokenized[key].append(word)
    if doc_to_title_tokenized[key] == []:
        doc_to_title_tokenized[key].append('')

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 28027/28027 [00:00<00:00, 268775.40it/s]


In [38]:
lemmatizer = pymorphy2.MorphAnalyzer()
inword_punct = set([',', ':', '"', '¬ª', '¬´', '‚Äù', '‚Äú', ')', '(', '[',']', '|', '*'])
for key in tqdm(doc_to_title_tokenized):
    hold = doc_to_title_tokenized[key]
    doc_to_title_tokenized[key] = []
    for word in hold:
        tmp = ''
        for symb in word:
            if symb not in inword_punct:
                tmp = tmp+symb
        word = tmp
        if my_is_digit(word) == False and word != '':
            word = lemmatizer.parse(word)[0].normal_form
            doc_to_title_tokenized[key].append(word)
    if doc_to_title_tokenized[key] == []:
        doc_to_title_tokenized[key].append('')   

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 28027/28027 [00:41<00:00, 681.20it/s]


In [40]:
eng_alph = list(string.ascii_lowercase)
a = ord('–∞')
rus_alph = ''.join([chr(i) for i in range(a,a+6)] + [chr(a+33)] + [chr(i) for i in range(a+6,a+32)])
rus_alphabet=[]
for elem in rus_alph:
    rus_alphabet.append(elem)
total = eng_alph + rus_alphabet
for i in "1234567890":
    total.append(i)
total = set(total)
bad_stuff = dict()
for key in tqdm(doc_to_title_tokenized):
    hold = doc_to_title_tokenized[key]
    for word in hold:
        if len(word)>0:
            if word[-1] not in total:
                if word[-1] not in bad_stuff:
                    bad_stuff[word[-1]] = 1
                else:
                    bad_stuff[word[-1]] += 1
bad_stuff

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 28027/28027 [00:00<00:00, 312319.03it/s]


{'.': 7312,
 '‚ï¨': 1,
 '?': 3461,
 '‚Ä¶': 42,
 ';': 44,
 '‚Ññ': 149,
 '‚Äï': 7,
 '!': 974,
 'üîç': 4,
 '>': 70,
 '/': 40,
 '‚ãÜ': 3,
 '¬ß': 9,
 '%': 23,
 '-': 113,
 '‚Äì': 462,
 'üëç': 2,
 '—ñ': 15,
 '¬∞': 5,
 '#': 8,
 '‚Äí': 1,
 'üö©': 9,
 "'": 37,
 '~': 234,
 '‚òÜ': 2,
 '‚Äô': 1,
 'Ïùò': 1,
 '‚Üí': 6,
 '\\': 11,
 '¬Æ': 10,
 '&': 53,
 '¬©': 4,
 '‚úø': 2,
 '+': 127,
 '‚úî': 8,
 '\x97': 4,
 '‚Ä∫': 7,
 '‚Äî': 6,
 '‚ò∫': 2,
 '‚ô•': 6,
 '‚ôÇ': 1,
 '≈º': 1,
 '¬±': 1,
 '‚ñ∫': 4,
 '“£': 1,
 '_': 11,
 '\u200d': 1,
 '—ï': 4,
 '‚ù§': 3,
 '=': 18,
 '<': 3,
 'üìå': 12,
 '^': 1,
 '—ò': 1,
 '◊î': 2,
 '◊ò': 1,
 '◊ô': 3,
 '◊™': 2,
 '}': 9,
 '\xad': 1,
 '@': 4,
 '‚ñ∏': 1,
 '‚ú¶': 3,
 '‚á®': 1,
 'üí™': 1,
 'üìç': 1,
 'üíï': 1,
 '€∂': 2,
 '—ó': 1,
 '¬¨': 2,
 '‚Ñ¢': 4,
 'üíµ': 1,
 '‚ô´': 4,
 '√ü': 1,
 'üìπ': 1,
 ' ñ': 1,
 '‚úà': 1,
 '¬∑': 3,
 'üè°': 1,
 'Ô∏è': 1,
 '\ufeff': 1,
 'üë∂': 2,
 '‚ñ≤': 2,
 '√™': 1,
 '√∞': 2,
 '√±': 11,
 '¬æ': 5,
 '√¢': 1,
 '¬µ': 3,
 '¬∏': 3,
 'ƒÉ': 1,
 '‚ñà': 1,
 '‚ñ

In [41]:
lemmatizer = pymorphy2.MorphAnalyzer()
for key in tqdm(doc_to_title_tokenized):
    hold = doc_to_title_tokenized[key]
    doc_to_title_tokenized[key] = []
    for word in hold:
        if len(word) > 0:
            if word[-1] in bad_stuff:
                word = word[:len(word)-1]
        if my_is_digit(word) == False and word != '':
            word = lemmatizer.parse(word)[0].normal_form
            doc_to_title_tokenized[key].append(word)
    if doc_to_title_tokenized[key] == []:
        doc_to_title_tokenized[key].append('')   

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 28027/28027 [00:38<00:00, 719.06it/s]


–ò—Ç–æ–≥–æ–≤—ã–π –ø–∞—Ä—Å–∏–Ω–≥ –∑–∞–≥–æ–ª–æ–≤–∫–æ–≤ –ø–æ–ª—É—á–µ–Ω

–í—Å—Ç–∞–≤–ª—è–µ–º –∑–∞–≥–æ–ª–æ–≤–∫–∏ –≤ –¥–∞—Ç–∞—Ñ—Ä–µ–π–º

In [43]:
train_data['title'] = pd.Series(dtype='object')
for i, row in train_data.iterrows():
    train_data['title'][i] = doc_to_title_tokenized[row['doc_id']]

In [44]:
test_data = pd.read_csv('test_groups.csv')
test_data['title'] = pd.Series(dtype='object')
for i, row in test_data.iterrows():
    test_data['title'][i] = doc_to_title_tokenized[row['doc_id']]
test_data.to_csv('test_preprocessed.csv', index=False)
test_data

Unnamed: 0,pair_id,group_id,doc_id,title
0,11691,130,6710,"[–∫–∞–∫, –ø—Ä–æ–ø–∏—Å–∞—Ç—å, –∞–¥–º–∏–Ω–∫, –∫—Å, —Å–µ–±—è, –∏–ª–∏, –¥—Ä—É–≥, ..."
1,11692,130,4030,"[—Å–∫–∞—á–∞—Ç—å, sgl-rp, –¥–æ—Ä–∞–±–æ—Ç–∫–∞, —Å–ª–∏–≤, –º–æ–¥–∞, mysql..."
2,11693,130,5561,"[–∫–∞–∫, –ø—Ä–æ–ø–∏—Å–∞—Ç—å, –∞–¥–º–∏–Ω–∫, –∫—Å, counter-strike, –∫..."
3,11694,130,4055,"[–∫–∞–∫, –ø—Ä–æ–ø–∏—Å–∞—Ç—å, –ø—Ä–æ—Å—Ç–æ–π, –∞–¥–º–∏–Ω–∫, –∫—Å]"
4,11695,130,4247,"[–ø–æ–¥–±–æ—Ä, –∞–¥–º–∏–Ω–æ–≤—ã–π, –¥–ª—è, —Å–µ—Ä–≤–µ—Ä, –∫–æ–¥_4, –∞—Ä—Ö–∏–≤,..."
...,...,...,...,...
16622,28313,309,16637,"[–æ—Ç–≤–µ—Ç—ã@mail.ru, –ø–æ–ª–µ–∑–Ω–æ, –ª–∏, –∫—É—à–∞—Ç—å, —Ç–≤–æ—Ä–æ–≥, ..."
16623,28314,309,16759,"[—Ç–≤–æ—Ä–æ–≥, –ø–æ–ª–µ–∑–Ω—ã–π, —Å–≤–æ–π—Å—Ç–≤–æ, –ª–µ—á–µ–Ω–∏–µ, —Ç–≤–æ—Ä–æ–≥, ..."
16624,28315,309,15358,"[—Ç–≤–æ—Ä–æ–≥, –ø–æ–ª–µ–∑–Ω—ã–π, –æ–ø–∞—Å–Ω—ã–π, —Å–≤–æ–π—Å—Ç–≤–æ, —Ç–≤–æ—Ä–æ–≥]"
16625,28316,309,17287,"[–æ—Ç–≤–µ—Ç—ã@mail.ru, —á–µ–º, –ø–æ–ª–µ–∑–Ω—ã–π, —Ç–≤–æ—Ä–æ–≥]"


In [45]:
train_data

Unnamed: 0,pair_id,group_id,doc_id,target,title
0,1,1,15731,0,"[–≤–∞–∑, –∑–∞–º–µ–Ω–∞, –ø–æ–¥—à–∏–ø–Ω–∏–∫, —Å—Ç—É–ø–∏—Ü–∞, –Ω–∏–≤–∞]"
1,2,1,14829,0,"[–≤–∞–∑, –æ–ø—Ç–æ–º, —Å–æ—á–∏, —Å—Ä–∞–≤–Ω–∏—Ç—å, —Ü–µ–Ω–∞, –∫—É–ø–∏—Ç—å, –ø–æ—Ç..."
2,3,1,15764,0,"[–∫—É–ø–∏—Ç—å, —Å—Ç—É–ø–∏—Ü–∞, –ª–∞–¥–∞, –∫–∞–ª–∏–Ω–∞2, —Ç—Ä–∞–Ω—Å–º–∏—Å—Å–∏—è, ..."
3,4,1,17669,0,[–∫–ª–∞—Å—Å–∏–∫–∞]
4,5,1,14852,0,"[—Å—Ç—É–ø–∏—Ü–∞, –Ω–∏–≤–∞, –∑–∞–º–µ–Ω–∞, –ø–æ–¥—à–∏–ø–Ω–∏–∫, —Å–≤–æ–π, —Ä—É–∫–∞]"
...,...,...,...,...,...
11685,11686,129,26672,0,"[‚ù§‚òÖ‚úø‚òÖ–∞–ø—Ä–µ–ª—ë–Ω–∫–∞, -6‚ù§‚òÖ]"
11686,11687,129,25838,0,[g√§stebuch]
11687,11688,129,25703,0,"[jizolofej, archive]"
11688,11689,129,27885,0,"[–∫–∞–∫, –∑–≤–∞—Ç—å, –ø–∞—Ä–µ–Ω—å, –¥–∏–∞–Ω–∞, —à—É—Ä—ã–≥–∏–Ω, –ø—É—Å—Ç—å, –≥–æ..."


–ü—Ä–µ–¥–æ–±—Ä–∞–±–æ—Ç–∫–∞ –¥–∞–Ω–Ω—ã—Ö –¥–ª—è train —á–∞—Å—Ç–∏

In [46]:
df = train_data.copy()
x = np.zeros(df.shape[0])
for i in range(df.shape[0]):
    x[i] = len(df.title[i]) # –ü–æ–¥—Å—á—ë—Ç –∫–æ–ª–∏—á–µ—Å—Ç–≤–∞ —Å–ª–æ–≤ –î–õ–Ø –ö–ê–ñ–î–û–ì–û –ó–ê–ì–û–õ–û–í–ö–ê
df['len_word'] = x
df['mean_len'] = np.zeros(len(df.index))
for i in df['group_id'].unique():#–ü–û–î–°–ß–Å–¢ –°–†–ï–î–ù–ï–ì–û –ö–û–õ–ò–ß–ï–°–¢–í–ê –°–õ–û–í –î–õ–Ø –ö–ê–ñ–î–û–ô –ì–†–£–ü–ü–´
        df.loc[df['group_id'] == i, 'mean_len'] = df.loc[df['group_id'] == i, 'len_word'].mean()

–ü–µ—Ä–µ–≤–æ–¥ —Å–ø–∏—Å–∫–∞ —Å–æ —Å–ª–æ–≤–∞–º–∏ –≤ —Å—Ç–æ–ª–±–µ—Ü –∏–∑ —Å—Ç—Ä–æ–∫, –Ω–∞–∑–≤–∞–Ω–∏–µ title2

In [47]:
df['title2'] = 'x'
for i in range(df.shape[0]):
    tmp = ' '.join(df.title[i])
    df.title2[i] = tmp
df

Unnamed: 0,pair_id,group_id,doc_id,target,title,len_word,mean_len,title2
0,1,1,15731,0,"[–≤–∞–∑, –∑–∞–º–µ–Ω–∞, –ø–æ–¥—à–∏–ø–Ω–∏–∫, —Å—Ç—É–ø–∏—Ü–∞, –Ω–∏–≤–∞]",5.0,7.343137,–≤–∞–∑ –∑–∞–º–µ–Ω–∞ –ø–æ–¥—à–∏–ø–Ω–∏–∫ —Å—Ç—É–ø–∏—Ü–∞ –Ω–∏–≤–∞
1,2,1,14829,0,"[–≤–∞–∑, –æ–ø—Ç–æ–º, —Å–æ—á–∏, —Å—Ä–∞–≤–Ω–∏—Ç—å, —Ü–µ–Ω–∞, –∫—É–ø–∏—Ç—å, –ø–æ—Ç...",9.0,7.343137,–≤–∞–∑ –æ–ø—Ç–æ–º —Å–æ—á–∏ —Å—Ä–∞–≤–Ω–∏—Ç—å —Ü–µ–Ω–∞ –∫—É–ø–∏—Ç—å –ø–æ—Ç—Ä–µ–±–∏—Ç–µ–ª...
2,3,1,15764,0,"[–∫—É–ø–∏—Ç—å, —Å—Ç—É–ø–∏—Ü–∞, –ª–∞–¥–∞, –∫–∞–ª–∏–Ω–∞2, —Ç—Ä–∞–Ω—Å–º–∏—Å—Å–∏—è, ...",10.0,7.343137,–∫—É–ø–∏—Ç—å —Å—Ç—É–ø–∏—Ü–∞ –ª–∞–¥–∞ –∫–∞–ª–∏–Ω–∞2 —Ç—Ä–∞–Ω—Å–º–∏—Å—Å–∏—è –ø–µ—Ä–µ—Ö–æ...
3,4,1,17669,0,[–∫–ª–∞—Å—Å–∏–∫–∞],1.0,7.343137,–∫–ª–∞—Å—Å–∏–∫–∞
4,5,1,14852,0,"[—Å—Ç—É–ø–∏—Ü–∞, –Ω–∏–≤–∞, –∑–∞–º–µ–Ω–∞, –ø–æ–¥—à–∏–ø–Ω–∏–∫, —Å–≤–æ–π, —Ä—É–∫–∞]",6.0,7.343137,—Å—Ç—É–ø–∏—Ü–∞ –Ω–∏–≤–∞ –∑–∞–º–µ–Ω–∞ –ø–æ–¥—à–∏–ø–Ω–∏–∫ —Å–≤–æ–π —Ä—É–∫–∞
...,...,...,...,...,...,...,...,...
11685,11686,129,26672,0,"[‚ù§‚òÖ‚úø‚òÖ–∞–ø—Ä–µ–ª—ë–Ω–∫–∞, -6‚ù§‚òÖ]",2.0,5.835165,‚ù§‚òÖ‚úø‚òÖ–∞–ø—Ä–µ–ª—ë–Ω–∫–∞ -6‚ù§‚òÖ
11686,11687,129,25838,0,[g√§stebuch],1.0,5.835165,g√§stebuch
11687,11688,129,25703,0,"[jizolofej, archive]",2.0,5.835165,jizolofej archive
11688,11689,129,27885,0,"[–∫–∞–∫, –∑–≤–∞—Ç—å, –ø–∞—Ä–µ–Ω—å, –¥–∏–∞–Ω–∞, —à—É—Ä—ã–≥–∏–Ω, –ø—É—Å—Ç—å, –≥–æ...",9.0,5.835165,–∫–∞–∫ –∑–≤–∞—Ç—å –ø–∞—Ä–µ–Ω—å –¥–∏–∞–Ω–∞ —à—É—Ä—ã–≥–∏–Ω –ø—É—Å—Ç—å –≥–æ–≤–æ—Ä–∏—Ç—å ...


In [48]:
#knn = NearestNeighbors(metric='cosine')# –ó–∞–¥–∞—ë–º knn
df['dist_mean'] = np.zeros(len(df.index))# –∫–æ–ª–æ–Ω–∫–∞ –¥–ª—è —Å—Ä–µ–¥–Ω–µ–≥–æ –∫–æ—Å–∏–Ω—É—Å–Ω—ã—Ö —Ä–∞—Å—Å—Ç–æ—è–Ω–∏–π
df['dist_var'] = np.zeros(len(df.index))#–ö–æ–ª–æ–Ω–∫–∞ –¥–ª—è –¥–∏—Å–ø–µ—Ä—Å–∏–∏ –∫–æ—Å–∏–Ω—É—Å–Ω—ã—Ö —Ä–∞—Å—Å—Ç–æ—è–Ω–∏–π
for i in df['group_id'].unique():
    vect = TfidfVectorizer()
    knn = NearestNeighbors(metric='cosine')
    X = vect.fit_transform(df.loc[df['group_id'] == i, 'title2'])# TfIDF –≤–µ–∫—Ç–æ—Ä–∞–π–∑–µ—Ä –¥–ª—è –∫–∞–∂–¥–æ–π –≥—Ä—É–ø–ø—ã –ø–æ –∑–∞–≥–æ–ª–æ–≤–∫–∞–º
    knn.fit(X) # fit Knn –Ω–∞ –∫–∞–∂–¥–æ–π –≥—Ä—É–ø–ø–µ
    distances = knn.kneighbors(n_neighbors=15)[0] # –ø—Ä–µ–¥–∏–∫—Ç–∏–º –ø–µ—Ä–≤—ã–µ 15 –≤–µ—Å–æ–≤ –ø–æ Knn 
    for j in range(15):
        df.loc[df['group_id'] == i, f'tif_{j}'] = distances[:, j] # –ó–∞–ø–∏—Å—ã–≤–∞–µ–º —ç—Ç–∏ —Ä–∞—Å—Å—Ç–æ—è–Ω–∏—è
    # —Ö–∞—Ä–∞–∫—Ç–µ—Ä–∏—Å—Ç–∏–∫–∏ –ø–æ–ø–∞—Ä–Ω—ã—Ö —Ä–∞—Å—Å—Ç–æ—è–Ω–∏–π
    cd = cosine_distances(X)# –†–∞—Å—Å—á–∏—Ç—ã–≤–∞–µ–º –≤—Å–µ –∫–æ—Å–∏–Ω—É—Å–Ω—ã–µ –ø–æ–ø–∞—Ä–Ω—ã–µ —Ä–∞—Å—Å—Ç–æ—è–Ω–∏—è –¥–ª—è –≥—Ä—É–ø–ø—ã
    df.loc[df['group_id'] == i, 'dist_mean'] = cd.mean()#—Å—Ä–µ–¥–Ω–µ–µ
    df.loc[df['group_id'] == i, 'dist_var'] = cd.var()#–¥–∏—Å–ø–µ—Ä—Å–∏—è

In [49]:
df

Unnamed: 0,pair_id,group_id,doc_id,target,title,len_word,mean_len,title2,dist_mean,dist_var,...,tif_5,tif_6,tif_7,tif_8,tif_9,tif_10,tif_11,tif_12,tif_13,tif_14
0,1,1,15731,0,"[–≤–∞–∑, –∑–∞–º–µ–Ω–∞, –ø–æ–¥—à–∏–ø–Ω–∏–∫, —Å—Ç—É–ø–∏—Ü–∞, –Ω–∏–≤–∞]",5.0,7.343137,–≤–∞–∑ –∑–∞–º–µ–Ω–∞ –ø–æ–¥—à–∏–ø–Ω–∏–∫ —Å—Ç—É–ø–∏—Ü–∞ –Ω–∏–≤–∞,0.950332,0.017279,...,0.542737,0.618136,0.618136,0.618136,0.618136,0.634562,0.663602,0.679844,0.693884,0.705194
1,2,1,14829,0,"[–≤–∞–∑, –æ–ø—Ç–æ–º, —Å–æ—á–∏, —Å—Ä–∞–≤–Ω–∏—Ç—å, —Ü–µ–Ω–∞, –∫—É–ø–∏—Ç—å, –ø–æ—Ç...",9.0,7.343137,–≤–∞–∑ –æ–ø—Ç–æ–º —Å–æ—á–∏ —Å—Ä–∞–≤–Ω–∏—Ç—å —Ü–µ–Ω–∞ –∫—É–ø–∏—Ç—å –ø–æ—Ç—Ä–µ–±–∏—Ç–µ–ª...,0.950332,0.017279,...,0.623437,0.627031,0.633158,0.642637,0.804322,0.804322,0.804322,0.804322,0.877425,0.892640
2,3,1,15764,0,"[–∫—É–ø–∏—Ç—å, —Å—Ç—É–ø–∏—Ü–∞, –ª–∞–¥–∞, –∫–∞–ª–∏–Ω–∞2, —Ç—Ä–∞–Ω—Å–º–∏—Å—Å–∏—è, ...",10.0,7.343137,–∫—É–ø–∏—Ç—å —Å—Ç—É–ø–∏—Ü–∞ –ª–∞–¥–∞ –∫–∞–ª–∏–Ω–∞2 —Ç—Ä–∞–Ω—Å–º–∏—Å—Å–∏—è –ø–µ—Ä–µ—Ö–æ...,0.950332,0.017279,...,0.807100,0.811677,0.812596,0.829409,0.832100,0.832603,0.861944,0.863614,0.870043,0.878626
3,4,1,17669,0,[–∫–ª–∞—Å—Å–∏–∫–∞],1.0,7.343137,–∫–ª–∞—Å—Å–∏–∫–∞,0.950332,0.017279,...,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
4,5,1,14852,0,"[—Å—Ç—É–ø–∏—Ü–∞, –Ω–∏–≤–∞, –∑–∞–º–µ–Ω–∞, –ø–æ–¥—à–∏–ø–Ω–∏–∫, —Å–≤–æ–π, —Ä—É–∫–∞]",6.0,7.343137,—Å—Ç—É–ø–∏—Ü–∞ –Ω–∏–≤–∞ –∑–∞–º–µ–Ω–∞ –ø–æ–¥—à–∏–ø–Ω–∏–∫ —Å–≤–æ–π —Ä—É–∫–∞,0.950332,0.017279,...,0.483746,0.654517,0.656743,0.667192,0.734024,0.766981,0.777200,0.785432,0.786615,0.806900
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11685,11686,129,26672,0,"[‚ù§‚òÖ‚úø‚òÖ–∞–ø—Ä–µ–ª—ë–Ω–∫–∞, -6‚ù§‚òÖ]",2.0,5.835165,‚ù§‚òÖ‚úø‚òÖ–∞–ø—Ä–µ–ª—ë–Ω–∫–∞ -6‚ù§‚òÖ,0.970944,0.018614,...,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
11686,11687,129,25838,0,[g√§stebuch],1.0,5.835165,g√§stebuch,0.970944,0.018614,...,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
11687,11688,129,25703,0,"[jizolofej, archive]",2.0,5.835165,jizolofej archive,0.970944,0.018614,...,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
11688,11689,129,27885,0,"[–∫–∞–∫, –∑–≤–∞—Ç—å, –ø–∞—Ä–µ–Ω—å, –¥–∏–∞–Ω–∞, —à—É—Ä—ã–≥–∏–Ω, –ø—É—Å—Ç—å, –≥–æ...",9.0,5.835165,–∫–∞–∫ –∑–≤–∞—Ç—å –ø–∞—Ä–µ–Ω—å –¥–∏–∞–Ω–∞ —à—É—Ä—ã–≥–∏–Ω –ø—É—Å—Ç—å –≥–æ–≤–æ—Ä–∏—Ç—å ...,0.970944,0.018614,...,0.940714,0.941054,0.946595,0.948784,0.952123,0.953655,0.963233,0.969483,0.970115,0.974327


In [50]:
trainframe = df.copy()
trainframe

Unnamed: 0,pair_id,group_id,doc_id,target,title,len_word,mean_len,title2,dist_mean,dist_var,...,tif_5,tif_6,tif_7,tif_8,tif_9,tif_10,tif_11,tif_12,tif_13,tif_14
0,1,1,15731,0,"[–≤–∞–∑, –∑–∞–º–µ–Ω–∞, –ø–æ–¥—à–∏–ø–Ω–∏–∫, —Å—Ç—É–ø–∏—Ü–∞, –Ω–∏–≤–∞]",5.0,7.343137,–≤–∞–∑ –∑–∞–º–µ–Ω–∞ –ø–æ–¥—à–∏–ø–Ω–∏–∫ —Å—Ç—É–ø–∏—Ü–∞ –Ω–∏–≤–∞,0.950332,0.017279,...,0.542737,0.618136,0.618136,0.618136,0.618136,0.634562,0.663602,0.679844,0.693884,0.705194
1,2,1,14829,0,"[–≤–∞–∑, –æ–ø—Ç–æ–º, —Å–æ—á–∏, —Å—Ä–∞–≤–Ω–∏—Ç—å, —Ü–µ–Ω–∞, –∫—É–ø–∏—Ç—å, –ø–æ—Ç...",9.0,7.343137,–≤–∞–∑ –æ–ø—Ç–æ–º —Å–æ—á–∏ —Å—Ä–∞–≤–Ω–∏—Ç—å —Ü–µ–Ω–∞ –∫—É–ø–∏—Ç—å –ø–æ—Ç—Ä–µ–±–∏—Ç–µ–ª...,0.950332,0.017279,...,0.623437,0.627031,0.633158,0.642637,0.804322,0.804322,0.804322,0.804322,0.877425,0.892640
2,3,1,15764,0,"[–∫—É–ø–∏—Ç—å, —Å—Ç—É–ø–∏—Ü–∞, –ª–∞–¥–∞, –∫–∞–ª–∏–Ω–∞2, —Ç—Ä–∞–Ω—Å–º–∏—Å—Å–∏—è, ...",10.0,7.343137,–∫—É–ø–∏—Ç—å —Å—Ç—É–ø–∏—Ü–∞ –ª–∞–¥–∞ –∫–∞–ª–∏–Ω–∞2 —Ç—Ä–∞–Ω—Å–º–∏—Å—Å–∏—è –ø–µ—Ä–µ—Ö–æ...,0.950332,0.017279,...,0.807100,0.811677,0.812596,0.829409,0.832100,0.832603,0.861944,0.863614,0.870043,0.878626
3,4,1,17669,0,[–∫–ª–∞—Å—Å–∏–∫–∞],1.0,7.343137,–∫–ª–∞—Å—Å–∏–∫–∞,0.950332,0.017279,...,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
4,5,1,14852,0,"[—Å—Ç—É–ø–∏—Ü–∞, –Ω–∏–≤–∞, –∑–∞–º–µ–Ω–∞, –ø–æ–¥—à–∏–ø–Ω–∏–∫, —Å–≤–æ–π, —Ä—É–∫–∞]",6.0,7.343137,—Å—Ç—É–ø–∏—Ü–∞ –Ω–∏–≤–∞ –∑–∞–º–µ–Ω–∞ –ø–æ–¥—à–∏–ø–Ω–∏–∫ —Å–≤–æ–π —Ä—É–∫–∞,0.950332,0.017279,...,0.483746,0.654517,0.656743,0.667192,0.734024,0.766981,0.777200,0.785432,0.786615,0.806900
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11685,11686,129,26672,0,"[‚ù§‚òÖ‚úø‚òÖ–∞–ø—Ä–µ–ª—ë–Ω–∫–∞, -6‚ù§‚òÖ]",2.0,5.835165,‚ù§‚òÖ‚úø‚òÖ–∞–ø—Ä–µ–ª—ë–Ω–∫–∞ -6‚ù§‚òÖ,0.970944,0.018614,...,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
11686,11687,129,25838,0,[g√§stebuch],1.0,5.835165,g√§stebuch,0.970944,0.018614,...,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
11687,11688,129,25703,0,"[jizolofej, archive]",2.0,5.835165,jizolofej archive,0.970944,0.018614,...,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
11688,11689,129,27885,0,"[–∫–∞–∫, –∑–≤–∞—Ç—å, –ø–∞—Ä–µ–Ω—å, –¥–∏–∞–Ω–∞, —à—É—Ä—ã–≥–∏–Ω, –ø—É—Å—Ç—å, –≥–æ...",9.0,5.835165,–∫–∞–∫ –∑–≤–∞—Ç—å –ø–∞—Ä–µ–Ω—å –¥–∏–∞–Ω–∞ —à—É—Ä—ã–≥–∏–Ω –ø—É—Å—Ç—å –≥–æ–≤–æ—Ä–∏—Ç—å ...,0.970944,0.018614,...,0.940714,0.941054,0.946595,0.948784,0.952123,0.953655,0.963233,0.969483,0.970115,0.974327


–¥—Ä–æ–ø–∞–µ–º –∫–æ–ª–æ–Ω–∫–∏ —Å —Ç–µ–∫—Å—Ç–∞–º–∏

In [51]:
trainframe = trainframe.drop(['title', 'title2'], axis = 1 )
trainframe

Unnamed: 0,pair_id,group_id,doc_id,target,len_word,mean_len,dist_mean,dist_var,tif_0,tif_1,...,tif_5,tif_6,tif_7,tif_8,tif_9,tif_10,tif_11,tif_12,tif_13,tif_14
0,1,1,15731,0,5.0,7.343137,0.950332,0.017279,0.000000,0.101384,...,0.542737,0.618136,0.618136,0.618136,0.618136,0.634562,0.663602,0.679844,0.693884,0.705194
1,2,1,14829,0,9.0,7.343137,0.950332,0.017279,0.397848,0.453717,...,0.623437,0.627031,0.633158,0.642637,0.804322,0.804322,0.804322,0.804322,0.877425,0.892640
2,3,1,15764,0,10.0,7.343137,0.950332,0.017279,0.659347,0.693884,...,0.807100,0.811677,0.812596,0.829409,0.832100,0.832603,0.861944,0.863614,0.870043,0.878626
3,4,1,17669,0,1.0,7.343137,0.950332,0.017279,1.000000,1.000000,...,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
4,5,1,14852,0,6.0,7.343137,0.950332,0.017279,0.378304,0.378304,...,0.483746,0.654517,0.656743,0.667192,0.734024,0.766981,0.777200,0.785432,0.786615,0.806900
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11685,11686,129,26672,0,2.0,5.835165,0.970944,0.018614,1.000000,1.000000,...,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
11686,11687,129,25838,0,1.0,5.835165,0.970944,0.018614,1.000000,1.000000,...,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
11687,11688,129,25703,0,2.0,5.835165,0.970944,0.018614,1.000000,1.000000,...,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
11688,11689,129,27885,0,9.0,5.835165,0.970944,0.018614,0.162961,0.924722,...,0.940714,0.941054,0.946595,0.948784,0.952123,0.953655,0.963233,0.969483,0.970115,0.974327


–£–¥–∞–ª—è–µ–º –Ω–µ–∏—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–Ω—ã–µ –∫–æ–ª–æ–Ω–∫–∏

In [52]:
unused_columns =['pair_id', 'doc_id','target', 'group_id'] 
trainframe = trainframe.drop(unused_columns, axis=1)

In [53]:
trainframe

Unnamed: 0,len_word,mean_len,dist_mean,dist_var,tif_0,tif_1,tif_2,tif_3,tif_4,tif_5,tif_6,tif_7,tif_8,tif_9,tif_10,tif_11,tif_12,tif_13,tif_14
0,5.0,7.343137,0.950332,0.017279,0.000000,0.101384,0.222929,0.378304,0.528382,0.542737,0.618136,0.618136,0.618136,0.618136,0.634562,0.663602,0.679844,0.693884,0.705194
1,9.0,7.343137,0.950332,0.017279,0.397848,0.453717,0.506597,0.603974,0.606002,0.623437,0.627031,0.633158,0.642637,0.804322,0.804322,0.804322,0.804322,0.877425,0.892640
2,10.0,7.343137,0.950332,0.017279,0.659347,0.693884,0.693884,0.721518,0.777200,0.807100,0.811677,0.812596,0.829409,0.832100,0.832603,0.861944,0.863614,0.870043,0.878626
3,1.0,7.343137,0.950332,0.017279,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
4,6.0,7.343137,0.950332,0.017279,0.378304,0.378304,0.434426,0.445434,0.464068,0.483746,0.654517,0.656743,0.667192,0.734024,0.766981,0.777200,0.785432,0.786615,0.806900
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11685,2.0,5.835165,0.970944,0.018614,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
11686,1.0,5.835165,0.970944,0.018614,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
11687,2.0,5.835165,0.970944,0.018614,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
11688,9.0,5.835165,0.970944,0.018614,0.162961,0.924722,0.924722,0.934495,0.937645,0.940714,0.941054,0.946595,0.948784,0.952123,0.953655,0.963233,0.969483,0.970115,0.974327


–ó–∞–¥–∞—ë–º —Å–∫–µ–π–ª–µ—Ä –¥–ª—è –∫–∞–∂–¥–æ–π –∫–æ–ª–æ–Ω–∫–∏

In [54]:
def col_scale(col_name):
    scaler = StandardScaler()
    trainframe[col_name] = scaler.fit_transform(trainframe[col_name].values.reshape(-1, 1))

—Å–∫–µ–π–ª 4—Ä—ë—Ö –≥–ª–∞–≤–Ω—ã—Ö –ø—Ä–∏–∑–Ω–∞–∫–æ–≤

In [55]:
bad_cols = ['mean_len', 'dist_mean', 'dist_var', 'len_word']
for name in bad_cols:
    col_scale(name)

In [56]:
trainframe

Unnamed: 0,len_word,mean_len,dist_mean,dist_var,tif_0,tif_1,tif_2,tif_3,tif_4,tif_5,tif_6,tif_7,tif_8,tif_9,tif_10,tif_11,tif_12,tif_13,tif_14
0,-0.393496,0.052130,0.264604,-0.233910,0.000000,0.101384,0.222929,0.378304,0.528382,0.542737,0.618136,0.618136,0.618136,0.618136,0.634562,0.663602,0.679844,0.693884,0.705194
1,0.305004,0.052130,0.264604,-0.233910,0.397848,0.453717,0.506597,0.603974,0.606002,0.623437,0.627031,0.633158,0.642637,0.804322,0.804322,0.804322,0.804322,0.877425,0.892640
2,0.479629,0.052130,0.264604,-0.233910,0.659347,0.693884,0.693884,0.721518,0.777200,0.807100,0.811677,0.812596,0.829409,0.832100,0.832603,0.861944,0.863614,0.870043,0.878626
3,-1.091996,0.052130,0.264604,-0.233910,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
4,-0.218871,0.052130,0.264604,-0.233910,0.378304,0.378304,0.434426,0.445434,0.464068,0.483746,0.654517,0.656743,0.667192,0.734024,0.766981,0.777200,0.785432,0.786615,0.806900
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11685,-0.917371,-0.823665,0.646891,-0.089957,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
11686,-1.091996,-0.823665,0.646891,-0.089957,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
11687,-0.917371,-0.823665,0.646891,-0.089957,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
11688,0.305004,-0.823665,0.646891,-0.089957,0.162961,0.924722,0.924722,0.934495,0.937645,0.940714,0.941054,0.946595,0.948784,0.952123,0.953655,0.963233,0.969483,0.970115,0.974327


—Ç–∞—Ä–≥–µ—Ç –¥–ª—è –æ–±—É—á–µ–Ω–∏—è

In [57]:
y_train = df['target']
y_train = np.array(y_train)
y_train

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [58]:
X_train = trainframe.to_numpy()
X_train.shape

(11690, 19)

–î–∞–ª–µ–µ –∞–Ω–∞–ª–æ–≥–∏—á–Ω–∞—è –æ–±—Ä–∞–±–æ—Ç–∫–∞ –Ω–æ –¥–ª—è —Ç–µ—Å—Ç–æ–≤—ã—Ö –¥–∞–Ω–Ω—ã—Ö –±–µ–∑ —Ä–∞–∑–º–µ—Ç–∫–∏

In [59]:
df2 = test_data.copy()
x = np.zeros(df2.shape[0])
for i in range(df2.shape[0]):
    x[i] = len(df2.title[i])
df2['len_word'] = x
df2['mean_len'] = np.zeros(len(df2.index))
for i in df2['group_id'].unique():
        df2.loc[df2['group_id'] == i, 'mean_len'] = df2.loc[df2['group_id'] == i, 'len_word'].mean()

In [60]:
df2['title2'] = 'x'
for i in range(df2.shape[0]):
    tmp = ' '.join(df2.title[i])
    df2.title2[i] = tmp
df2

Unnamed: 0,pair_id,group_id,doc_id,title,len_word,mean_len,title2
0,11691,130,6710,"[–∫–∞–∫, –ø—Ä–æ–ø–∏—Å–∞—Ç—å, –∞–¥–º–∏–Ω–∫, –∫—Å, —Å–µ–±—è, –∏–ª–∏, –¥—Ä—É–≥, ...",8.0,8.836735,–∫–∞–∫ –ø—Ä–æ–ø–∏—Å–∞—Ç—å –∞–¥–º–∏–Ω–∫ –∫—Å —Å–µ–±—è –∏–ª–∏ –¥—Ä—É–≥ youtube
1,11692,130,4030,"[—Å–∫–∞—á–∞—Ç—å, sgl-rp, –¥–æ—Ä–∞–±–æ—Ç–∫–∞, —Å–ª–∏–≤, –º–æ–¥–∞, mysql...",22.0,8.836735,—Å–∫–∞—á–∞—Ç—å sgl-rp –¥–æ—Ä–∞–±–æ—Ç–∫–∞ —Å–ª–∏–≤ –º–æ–¥–∞ mysql rp ro...
2,11693,130,5561,"[–∫–∞–∫, –ø—Ä–æ–ø–∏—Å–∞—Ç—å, –∞–¥–º–∏–Ω–∫, –∫—Å, counter-strike, –∫...",12.0,8.836735,–∫–∞–∫ –ø—Ä–æ–ø–∏—Å–∞—Ç—å –∞–¥–º–∏–Ω–∫ –∫—Å counter-strike –∫–∞—Ç–∞–ª–æ–≥...
3,11694,130,4055,"[–∫–∞–∫, –ø—Ä–æ–ø–∏—Å–∞—Ç—å, –ø—Ä–æ—Å—Ç–æ–π, –∞–¥–º–∏–Ω–∫, –∫—Å]",5.0,8.836735,–∫–∞–∫ –ø—Ä–æ–ø–∏—Å–∞—Ç—å –ø—Ä–æ—Å—Ç–æ–π –∞–¥–º–∏–Ω–∫ –∫—Å
4,11695,130,4247,"[–ø–æ–¥–±–æ—Ä, –∞–¥–º–∏–Ω–æ–≤—ã–π, –¥–ª—è, —Å–µ—Ä–≤–µ—Ä, –∫–æ–¥_4, –∞—Ä—Ö–∏–≤,...",8.0,8.836735,–ø–æ–¥–±–æ—Ä –∞–¥–º–∏–Ω–æ–≤—ã–π –¥–ª—è —Å–µ—Ä–≤–µ—Ä –∫–æ–¥_4 –∞—Ä—Ö–∏–≤ —Ñ–æ—Ä—É–º ...
...,...,...,...,...,...,...,...
16622,28313,309,16637,"[–æ—Ç–≤–µ—Ç—ã@mail.ru, –ø–æ–ª–µ–∑–Ω–æ, –ª–∏, –∫—É—à–∞—Ç—å, —Ç–≤–æ—Ä–æ–≥, ...",11.0,6.616438,–æ—Ç–≤–µ—Ç—ã@mail.ru –ø–æ–ª–µ–∑–Ω–æ –ª–∏ –∫—É—à–∞—Ç—å —Ç–≤–æ—Ä–æ–≥ —É—Ç—Ä–∞–º?...
16623,28314,309,16759,"[—Ç–≤–æ—Ä–æ–≥, –ø–æ–ª–µ–∑–Ω—ã–π, —Å–≤–æ–π—Å—Ç–≤–æ, –ª–µ—á–µ–Ω–∏–µ, —Ç–≤–æ—Ä–æ–≥, ...",8.0,6.616438,—Ç–≤–æ—Ä–æ–≥ –ø–æ–ª–µ–∑–Ω—ã–π —Å–≤–æ–π—Å—Ç–≤–æ –ª–µ—á–µ–Ω–∏–µ —Ç–≤–æ—Ä–æ–≥ –∂–µ–Ω—Å–∫–∏...
16624,28315,309,15358,"[—Ç–≤–æ—Ä–æ–≥, –ø–æ–ª–µ–∑–Ω—ã–π, –æ–ø–∞—Å–Ω—ã–π, —Å–≤–æ–π—Å—Ç–≤–æ, —Ç–≤–æ—Ä–æ–≥]",5.0,6.616438,—Ç–≤–æ—Ä–æ–≥ –ø–æ–ª–µ–∑–Ω—ã–π –æ–ø–∞—Å–Ω—ã–π —Å–≤–æ–π—Å—Ç–≤–æ —Ç–≤–æ—Ä–æ–≥
16625,28316,309,17287,"[–æ—Ç–≤–µ—Ç—ã@mail.ru, —á–µ–º, –ø–æ–ª–µ–∑–Ω—ã–π, —Ç–≤–æ—Ä–æ–≥]",4.0,6.616438,–æ—Ç–≤–µ—Ç—ã@mail.ru —á–µ–º –ø–æ–ª–µ–∑–Ω—ã–π —Ç–≤–æ—Ä–æ–≥


In [61]:
df2['dist_mean'] = np.zeros(len(df2.index))
df2['dist_var'] = np.zeros(len(df2.index))
for i in df2['group_id'].unique():
    vect = TfidfVectorizer()
    knn = NearestNeighbors(metric='cosine')
    X = vect.fit_transform(df2.loc[df2['group_id'] == i, 'title2'])
    knn.fit(X)
    distances = knn.kneighbors(n_neighbors=15)[0]
    for j in range(15):
        df2.loc[df2['group_id'] == i, f'tif_{j}'] = distances[:, j]
    # —Ö–∞—Ä–∞–∫—Ç–µ—Ä–∏—Å—Ç–∏–∫–∏ –ø–æ–ø–∞—Ä–Ω—ã—Ö —Ä–∞—Å—Å—Ç–æ—è–Ω–∏–π
    cd = cosine_distances(X)
    df2.loc[df2['group_id'] == i, 'dist_mean'] = cd.mean()
    df2.loc[df2['group_id'] == i, 'dist_var'] = cd.var()

In [62]:
testframe = df2.copy()
testframe

Unnamed: 0,pair_id,group_id,doc_id,title,len_word,mean_len,title2,dist_mean,dist_var,tif_0,...,tif_5,tif_6,tif_7,tif_8,tif_9,tif_10,tif_11,tif_12,tif_13,tif_14
0,11691,130,6710,"[–∫–∞–∫, –ø—Ä–æ–ø–∏—Å–∞—Ç—å, –∞–¥–º–∏–Ω–∫, –∫—Å, —Å–µ–±—è, –∏–ª–∏, –¥—Ä—É–≥, ...",8.0,8.836735,–∫–∞–∫ –ø—Ä–æ–ø–∏—Å–∞—Ç—å –∞–¥–º–∏–Ω–∫ –∫—Å —Å–µ–±—è –∏–ª–∏ –¥—Ä—É–≥ youtube,0.963487,0.0137,0.488396,...,0.737696,0.744670,0.748952,0.757451,0.772466,0.793199,0.806924,0.810170,0.813987,0.817902
1,11692,130,4030,"[—Å–∫–∞—á–∞—Ç—å, sgl-rp, –¥–æ—Ä–∞–±–æ—Ç–∫–∞, —Å–ª–∏–≤, –º–æ–¥–∞, mysql...",22.0,8.836735,—Å–∫–∞—á–∞—Ç—å sgl-rp –¥–æ—Ä–∞–±–æ—Ç–∫–∞ —Å–ª–∏–≤ –º–æ–¥–∞ mysql rp ro...,0.963487,0.0137,0.553483,...,0.859346,0.860108,0.895505,0.905825,0.912780,0.915054,0.916071,0.923691,0.928608,0.939283
2,11693,130,5561,"[–∫–∞–∫, –ø—Ä–æ–ø–∏—Å–∞—Ç—å, –∞–¥–º–∏–Ω–∫, –∫—Å, counter-strike, –∫...",12.0,8.836735,–∫–∞–∫ –ø—Ä–æ–ø–∏—Å–∞—Ç—å –∞–¥–º–∏–Ω–∫ –∫—Å counter-strike –∫–∞—Ç–∞–ª–æ–≥...,0.963487,0.0137,0.728770,...,0.864635,0.866905,0.871411,0.874796,0.879371,0.897640,0.901543,0.904048,0.910867,0.919621
3,11694,130,4055,"[–∫–∞–∫, –ø—Ä–æ–ø–∏—Å–∞—Ç—å, –ø—Ä–æ—Å—Ç–æ–π, –∞–¥–º–∏–Ω–∫, –∫—Å]",5.0,8.836735,–∫–∞–∫ –ø—Ä–æ–ø–∏—Å–∞—Ç—å –ø—Ä–æ—Å—Ç–æ–π –∞–¥–º–∏–Ω–∫ –∫—Å,0.963487,0.0137,0.278831,...,0.658097,0.679263,0.727836,0.763005,0.772054,0.791061,0.801990,0.804397,0.806867,0.838929
4,11695,130,4247,"[–ø–æ–¥–±–æ—Ä, –∞–¥–º–∏–Ω–æ–≤—ã–π, –¥–ª—è, —Å–µ—Ä–≤–µ—Ä, –∫–æ–¥_4, –∞—Ä—Ö–∏–≤,...",8.0,8.836735,–ø–æ–¥–±–æ—Ä –∞–¥–º–∏–Ω–æ–≤—ã–π –¥–ª—è —Å–µ—Ä–≤–µ—Ä –∫–æ–¥_4 –∞—Ä—Ö–∏–≤ —Ñ–æ—Ä—É–º ...,0.963487,0.0137,0.648926,...,0.872293,0.888566,0.892499,0.900860,0.902905,0.906545,0.907433,0.912780,0.917142,0.918093
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16622,28313,309,16637,"[–æ—Ç–≤–µ—Ç—ã@mail.ru, –ø–æ–ª–µ–∑–Ω–æ, –ª–∏, –∫—É—à–∞—Ç—å, —Ç–≤–æ—Ä–æ–≥, ...",11.0,6.616438,–æ—Ç–≤–µ—Ç—ã@mail.ru –ø–æ–ª–µ–∑–Ω–æ –ª–∏ –∫—É—à–∞—Ç—å —Ç–≤–æ—Ä–æ–≥ —É—Ç—Ä–∞–º?...,0.860556,0.0330,0.641822,...,0.823292,0.857726,0.903480,0.907608,0.914344,0.918670,0.924839,0.924839,0.929196,0.933880
16623,28314,309,16759,"[—Ç–≤–æ—Ä–æ–≥, –ø–æ–ª–µ–∑–Ω—ã–π, —Å–≤–æ–π—Å—Ç–≤–æ, –ª–µ—á–µ–Ω–∏–µ, —Ç–≤–æ—Ä–æ–≥, ...",8.0,6.616438,—Ç–≤–æ—Ä–æ–≥ –ø–æ–ª–µ–∑–Ω—ã–π —Å–≤–æ–π—Å—Ç–≤–æ –ª–µ—á–µ–Ω–∏–µ —Ç–≤–æ—Ä–æ–≥ –∂–µ–Ω—Å–∫–∏...,0.860556,0.0330,0.662673,...,0.796252,0.813043,0.833242,0.833242,0.833242,0.833242,0.842371,0.850235,0.861256,0.861256
16624,28315,309,15358,"[—Ç–≤–æ—Ä–æ–≥, –ø–æ–ª–µ–∑–Ω—ã–π, –æ–ø–∞—Å–Ω—ã–π, —Å–≤–æ–π—Å—Ç–≤–æ, —Ç–≤–æ—Ä–æ–≥]",5.0,6.616438,—Ç–≤–æ—Ä–æ–≥ –ø–æ–ª–µ–∑–Ω—ã–π –æ–ø–∞—Å–Ω—ã–π —Å–≤–æ–π—Å—Ç–≤–æ —Ç–≤–æ—Ä–æ–≥,0.860556,0.0330,0.383708,...,0.712014,0.746516,0.746516,0.746516,0.746516,0.746516,0.751318,0.775371,0.802053,0.809065
16625,28316,309,17287,"[–æ—Ç–≤–µ—Ç—ã@mail.ru, —á–µ–º, –ø–æ–ª–µ–∑–Ω—ã–π, —Ç–≤–æ—Ä–æ–≥]",4.0,6.616438,–æ—Ç–≤–µ—Ç—ã@mail.ru —á–µ–º –ø–æ–ª–µ–∑–Ω—ã–π —Ç–≤–æ—Ä–æ–≥,0.860556,0.0330,0.000000,...,0.512905,0.512905,0.512905,0.512905,0.618597,0.664118,0.680657,0.697564,0.720950,0.742675


In [63]:
testframe = testframe.drop(['title', 'title2'], axis = 1 )
testframe

Unnamed: 0,pair_id,group_id,doc_id,len_word,mean_len,dist_mean,dist_var,tif_0,tif_1,tif_2,...,tif_5,tif_6,tif_7,tif_8,tif_9,tif_10,tif_11,tif_12,tif_13,tif_14
0,11691,130,6710,8.0,8.836735,0.963487,0.0137,0.488396,0.631047,0.631047,...,0.737696,0.744670,0.748952,0.757451,0.772466,0.793199,0.806924,0.810170,0.813987,0.817902
1,11692,130,4030,22.0,8.836735,0.963487,0.0137,0.553483,0.737290,0.763007,...,0.859346,0.860108,0.895505,0.905825,0.912780,0.915054,0.916071,0.923691,0.928608,0.939283
2,11693,130,5561,12.0,8.836735,0.963487,0.0137,0.728770,0.804397,0.804397,...,0.864635,0.866905,0.871411,0.874796,0.879371,0.897640,0.901543,0.904048,0.910867,0.919621
3,11694,130,4055,5.0,8.836735,0.963487,0.0137,0.278831,0.479916,0.631047,...,0.658097,0.679263,0.727836,0.763005,0.772054,0.791061,0.801990,0.804397,0.806867,0.838929
4,11695,130,4247,8.0,8.836735,0.963487,0.0137,0.648926,0.746544,0.806007,...,0.872293,0.888566,0.892499,0.900860,0.902905,0.906545,0.907433,0.912780,0.917142,0.918093
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16622,28313,309,16637,11.0,6.616438,0.860556,0.0330,0.641822,0.697564,0.697564,...,0.823292,0.857726,0.903480,0.907608,0.914344,0.918670,0.924839,0.924839,0.929196,0.933880
16623,28314,309,16759,8.0,6.616438,0.860556,0.0330,0.662673,0.693106,0.699206,...,0.796252,0.813043,0.833242,0.833242,0.833242,0.833242,0.842371,0.850235,0.861256,0.861256
16624,28315,309,15358,5.0,6.616438,0.860556,0.0330,0.383708,0.439310,0.450453,...,0.712014,0.746516,0.746516,0.746516,0.746516,0.746516,0.751318,0.775371,0.802053,0.809065
16625,28316,309,17287,4.0,6.616438,0.860556,0.0330,0.000000,0.000000,0.147178,...,0.512905,0.512905,0.512905,0.512905,0.618597,0.664118,0.680657,0.697564,0.720950,0.742675


In [64]:
def col_scale_t(col_name):
    scaler = StandardScaler()
    testframe[col_name] = scaler.fit_transform(testframe[col_name].values.reshape(-1, 1))

In [65]:
bad_cols = ['mean_len', 'dist_mean', 'dist_var', 'len_word']
for name in bad_cols:
    col_scale_t(name)

In [66]:
testframe = testframe.drop(['pair_id', 'doc_id', 'group_id'], axis=1)

In [67]:
testframe

Unnamed: 0,len_word,mean_len,dist_mean,dist_var,tif_0,tif_1,tif_2,tif_3,tif_4,tif_5,tif_6,tif_7,tif_8,tif_9,tif_10,tif_11,tif_12,tif_13,tif_14
0,0.114303,0.915674,0.448817,-0.554158,0.488396,0.631047,0.631047,0.705281,0.735933,0.737696,0.744670,0.748952,0.757451,0.772466,0.793199,0.806924,0.810170,0.813987,0.817902
1,2.521335,0.915674,0.448817,-0.554158,0.553483,0.737290,0.763007,0.810576,0.845661,0.859346,0.860108,0.895505,0.905825,0.912780,0.915054,0.916071,0.923691,0.928608,0.939283
2,0.802027,0.915674,0.448817,-0.554158,0.728770,0.804397,0.804397,0.814610,0.861238,0.864635,0.866905,0.871411,0.874796,0.879371,0.897640,0.901543,0.904048,0.910867,0.919621
3,-0.401489,0.915674,0.448817,-0.554158,0.278831,0.479916,0.631047,0.640080,0.646117,0.658097,0.679263,0.727836,0.763005,0.772054,0.791061,0.801990,0.804397,0.806867,0.838929
4,0.114303,0.915674,0.448817,-0.554158,0.648926,0.746544,0.806007,0.845567,0.862533,0.872293,0.888566,0.892499,0.900860,0.902905,0.906545,0.907433,0.912780,0.917142,0.918093
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16622,0.630096,-0.438299,-1.355978,1.363017,0.641822,0.697564,0.697564,0.697564,0.736090,0.823292,0.857726,0.903480,0.907608,0.914344,0.918670,0.924839,0.924839,0.929196,0.933880
16623,0.114303,-0.438299,-1.355978,1.363017,0.662673,0.693106,0.699206,0.757018,0.775371,0.796252,0.813043,0.833242,0.833242,0.833242,0.833242,0.842371,0.850235,0.861256,0.861256
16624,-0.401489,-0.438299,-1.355978,1.363017,0.383708,0.439310,0.450453,0.627756,0.658432,0.712014,0.746516,0.746516,0.746516,0.746516,0.746516,0.751318,0.775371,0.802053,0.809065
16625,-0.573420,-0.438299,-1.355978,1.363017,0.000000,0.000000,0.147178,0.371631,0.506649,0.512905,0.512905,0.512905,0.512905,0.618597,0.664118,0.680657,0.697564,0.720950,0.742675


In [68]:
X_test = testframe.to_numpy()

–∏—Ç–æ–≥–æ–≤—ã–µ —Ä–∞–∑–º–µ—Ä–Ω–æ—Å—Ç–∏

In [69]:
X_test.shape

(16627, 19)

In [70]:
X_train.shape

(11690, 19)

In [71]:
y_train.shape

(11690,)

In [72]:
best_params = dict()
best_params['lr'] = -1
best_params['n_est'] = -1
best_params['max_depth'] = -1
best_score = 0

In [73]:
Folds = KFold(n_splits = 5)
lr = [0.01, 0.05, 0.1, 0.25, 0.5, 0.7, 0.8, 1]
n_est = [5, 10, 15, 20, 25]
max_depth = [1, 2, 3, 4, 5, 6, 7]
score = []
res = 0
for i in range(len(lr)):
    for j in range(len(n_est)):
        for k in range(len(max_depth)):
            for train_id , test_id in Folds.split(X_train):
                X_tr = X_train[train_id]
                X_tst  = X_train[test_id]
                y_tr = y_train[train_id]
                y_tst = y_train[test_id]
                my_clf = XGBClassifier(learning_rate = lr[i],n_estimators = n_est[j] , max_depth = max_depth[k])
                my_clf.fit(X_tr, y_tr)
                res += f1_score(y_pred=my_clf.predict(X_tst), y_true=y_tst)
            res = res/5
            score.append(res)
            print(res)
            if res > best_score:
                best_score = res
                best_params['lr'] = lr[i]
                best_params['n_est'] = n_est[j]
                best_params['max_depth'] = max_depth[k]
            res = 0

0.6974793249071958
0.6974793249071958
0.724439330732684
0.701683010221693
0.7110768840376511
0.6985019984902909
0.6925218114324727
0.6974793249071958
0.6974793249071958
0.7248579507288979
0.7009698310609491
0.7100240126880293
0.7050715407865116
0.6973184772012622
0.6974793249071958
0.6974793249071958
0.7251522095916568
0.7055117513593434
0.7134284318927901
0.706288151535339
0.7047339275522683
0.693150382332179
0.6941923312329513
0.7168429831931387
0.706543491390448
0.716211560844511
0.7071985528163893
0.7036753121443233
0.6825181258477423
0.6939461797352913
0.710618438220082
0.7131537962592887
0.7179889233193787
0.7080338868537
0.7065391144002029
0.6752925741701115
0.6941923312329513
0.7109270270858818
0.7164091773088981
0.7160427694721361
0.710592710187967
0.7048527436826689
0.6736899672412957
0.6800913651318179
0.7112616540465189
0.7121179289557216
0.7189842824233086
0.7197429111570458
0.7128201891578673
0.6688891057814015
0.6818665167265928
0.7154593451080491
0.7154138436694393
0.71

In [74]:
best_score

0.7251522095916568

In [75]:
best_params

{'lr': 0.01, 'n_est': 15, 'max_depth': 3}

In [76]:
best_model = XGBClassifier(learning_rate = 0.01, n_estimators = 15 , max_depth = 3)

In [77]:
best_model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.01, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=15, n_jobs=8, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [78]:
test_gr = pd.read_csv('test_groups.csv')
y_pred = best_model.predict(X_test)
y_pred = y_pred.astype(int)
result = pd.DataFrame({'pair_id': np.asarray(test_gr['pair_id']), 'target': y_pred})
result = result.set_index(['pair_id'])
result.to_csv('TryTwo.csv')

# –ë–ª–æ–∫ –ø—Ä–µ–¥–æ–±—Ä–∞–±–æ—Ç–∫–∏ —Å–∞–º–∏—Ö —Ç–µ–∫—Å—Ç–æ–≤

In [79]:
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from pymystem3 import Mystem
from string import punctuation
from tqdm.notebook import tqdm
from multiprocessing import Pool
import os
from html.parser import HTMLParser

In [98]:
#mystem = Mystem() 
#russian_stopwords = stopwords.words("russian")
#stop_tokens = []

–í—ã–∫–∞—á–∫–∞

Uncomment to parse

In [99]:
#contents = ''
#
#class Parser(HTMLParser):
#    def handle_data(self, data):
#        global contents
#        tag = self.get_starttag_text()
#        if tag is not None and tag.find('script') == tag.find('style') == -1:
#            contents += data.strip() + ' '
#
#parser = Parser()
#
#for dirname, _, filenames in os.walk(os.getcwd() + '/content'):
#    for filename in filenames:
#        file = os.path.join(dirname,filename)
#        with open(file, 'r') as f:
#            lines = f.readlines()
#            lines = [s.strip() for s in lines[1:] ]
#            text = '\n'.join(lines)
#            parser.feed(text)
#            newfile = file[:file.rfind('/')] + '/preprocessed/' + filename[:-4] + '_preprocessed.dat'
#            with open(newfile, 'w') as f1:
#                print(contents, file = f1)
#            parser.reset()
#            contents = ''

–û–±—Ä–∞–±–æ—Ç–∫–∞

In [100]:
#def match(text, alphabet=set('–∞–±–≤–≥–¥–µ—ë–∂–∑–∏–π–∫–ª–º–Ω–æ–ø—Ä—Å—Ç—É—Ñ—Ö—Ü—á—à—â—ä—ã—å—ç—é—èabcdefghijklmnopqrstuvwxyz0123456789')):
#    return not alphabet.isdisjoint(text)
#
#def get_text(text):
#    file_tokens = mystem.lemmatize(text.lower())
#    file_tokens = [token.strip() for token in file_tokens if token not in russian_stopwords
#              and token not in english_stopwords
#              and token.strip() not in punctuation
#              and match(token)
#              and len(token) > 1]
#    text = " ".join(file_tokens)
#    return text
#
#def get_file(file_name):
#    with open(os.path.join(os.getcwd() + '/preprocessed', file_name)) as in_file:
#        new_file = file_name[:file_name.find('_')]
#        with open(f'new_texts/{new_file}.txt', 'w') as out_file:
#            out_file.write(get_text(in_file.read()))

In [101]:
#files = sorted(os.listdir(os.getcwd()))
#with Pool(4) as pool:
#    pages = list(tqdm(pool.imap(get_file, files), total=len(files)))

–ì–µ–Ω–µ—Ä–∞—Ü–∏—è —Ñ–∏—á–µ–π

In [102]:
#def add_features_to_df(group_id):
#    one_group = df.loc[df['group_id']==group_id]
#    one_group['text'] = ''
#    for index, row in one_group.iterrows():
#        file_name = os.path.join(os.getcwd(), '/preprocessed' , 'new_texts.' f'{row["doc_id"]}.txt')
#        if os.path.isfile(file_name):
#            with open(file_name) as f:
#                one_group['text'][index] = f.read()
#    one_group['text_len_group'] = list(map(lambda x: len(x.split()), one_group['text']))
#    df.loc[df['group_id'] == group_id, 'text_len'] = one_group['text_len_group']
#    df.loc[df['group_id'] == group_id, 'mean_text_len'] = one_group['text_len_group'].mean()
#    df.loc[df['group_id'] == group_id, 'var_text_len'] = one_group['text_len_group'].var()

In [103]:
#train_data = pd.read_csv('train_groups.csv')
#cdf = train_data
#for i in tqdm(cdf['group_id'].unique()):
#    add_features_to_df(i)
#cdf.to_csv('mean_var_texts_train.csv', index=False)

In [104]:
#test_data = pd.read_csv('test_groups.csv')
#cdf = test_data
#for i in tqdm(cdf['group_id'].unique()):
#    add_features_to_df(i)
#cdf.to_csv('mean_var_texts_test.csv', index=False)

# –ü–æ–ø—ã—Ç–∫–∞ ‚Ññ3

In [105]:
train_data['title'] = pd.Series(dtype='object')
for i, row in train_data.iterrows():
    train_data['title'][i] = doc_to_title_tokenized[row['doc_id']]

In [106]:
test_data = pd.read_csv('test_groups.csv')
test_data['title'] = pd.Series(dtype='object')
for i, row in test_data.iterrows():
    test_data['title'][i] = doc_to_title_tokenized[row['doc_id']]
test_data.to_csv('test_preprocessed.csv', index=False)
test_data

Unnamed: 0,pair_id,group_id,doc_id,title
0,11691,130,6710,"[–∫–∞–∫, –ø—Ä–æ–ø–∏—Å–∞—Ç—å, –∞–¥–º–∏–Ω–∫, –∫—Å, —Å–µ–±—è, –∏–ª–∏, –¥—Ä—É–≥, ..."
1,11692,130,4030,"[—Å–∫–∞—á–∞—Ç—å, sgl-rp, –¥–æ—Ä–∞–±–æ—Ç–∫–∞, —Å–ª–∏–≤, –º–æ–¥–∞, mysql..."
2,11693,130,5561,"[–∫–∞–∫, –ø—Ä–æ–ø–∏—Å–∞—Ç—å, –∞–¥–º–∏–Ω–∫, –∫—Å, counter-strike, –∫..."
3,11694,130,4055,"[–∫–∞–∫, –ø—Ä–æ–ø–∏—Å–∞—Ç—å, –ø—Ä–æ—Å—Ç–æ–π, –∞–¥–º–∏–Ω–∫, –∫—Å]"
4,11695,130,4247,"[–ø–æ–¥–±–æ—Ä, –∞–¥–º–∏–Ω–æ–≤—ã–π, –¥–ª—è, —Å–µ—Ä–≤–µ—Ä, –∫–æ–¥_4, –∞—Ä—Ö–∏–≤,..."
...,...,...,...,...
16622,28313,309,16637,"[–æ—Ç–≤–µ—Ç—ã@mail.ru, –ø–æ–ª–µ–∑–Ω–æ, –ª–∏, –∫—É—à–∞—Ç—å, —Ç–≤–æ—Ä–æ–≥, ..."
16623,28314,309,16759,"[—Ç–≤–æ—Ä–æ–≥, –ø–æ–ª–µ–∑–Ω—ã–π, —Å–≤–æ–π—Å—Ç–≤–æ, –ª–µ—á–µ–Ω–∏–µ, —Ç–≤–æ—Ä–æ–≥, ..."
16624,28315,309,15358,"[—Ç–≤–æ—Ä–æ–≥, –ø–æ–ª–µ–∑–Ω—ã–π, –æ–ø–∞—Å–Ω—ã–π, —Å–≤–æ–π—Å—Ç–≤–æ, —Ç–≤–æ—Ä–æ–≥]"
16625,28316,309,17287,"[–æ—Ç–≤–µ—Ç—ã@mail.ru, —á–µ–º, –ø–æ–ª–µ–∑–Ω—ã–π, —Ç–≤–æ—Ä–æ–≥]"


In [107]:
df = train_data.copy()
x = np.zeros(df.shape[0])
for i in range(df.shape[0]):
    x[i] = len(df.title[i]) # –ü–æ–¥—Å—á—ë—Ç –∫–æ–ª–∏—á–µ—Å—Ç–≤–∞ —Å–ª–æ–≤ –î–õ–Ø –ö–ê–ñ–î–û–ì–û –ó–ê–ì–û–õ–û–í–ö–ê
df['len_word'] = x
df['mean_len'] = np.zeros(len(df.index))
for i in df['group_id'].unique():#–ü–û–î–°–ß–Å–¢ –°–†–ï–î–ù–ï–ì–û –ö–û–õ–ò–ß–ï–°–¢–í–ê –°–õ–û–í –î–õ–Ø –ö–ê–ñ–î–û–ô –ì–†–£–ü–ü–´
        df.loc[df['group_id'] == i, 'mean_len'] = df.loc[df['group_id'] == i, 'len_word'].mean()

In [108]:
df['title2'] = 'x'
for i in range(df.shape[0]):
    tmp = ' '.join(df.title[i])
    df.title2[i] = tmp

In [109]:
knn = NearestNeighbors(metric='cosine')# –ó–∞–¥–∞—ë–º knn
df['dist_mean'] = np.zeros(len(df.index))# –∫–æ–ª–æ–Ω–∫–∞ –¥–ª—è —Å—Ä–µ–¥–Ω–µ–≥–æ –∫–æ—Å–∏–Ω—É—Å–Ω—ã—Ö —Ä–∞—Å—Å—Ç–æ—è–Ω–∏–π
df['dist_var'] = np.zeros(len(df.index))#–ö–æ–ª–æ–Ω–∫–∞ –¥–ª—è –¥–∏—Å–ø–µ—Ä—Å–∏–∏ –∫–æ—Å–∏–Ω—É—Å–Ω—ã—Ö —Ä–∞—Å—Å—Ç–æ—è–Ω–∏–π
for i in df['group_id'].unique():
    X = TfidfVectorizer().fit_transform(df.loc[df['group_id'] == i, 'title2'])# TfIDF –≤–µ–∫—Ç–æ—Ä–∞–π–∑–µ—Ä –¥–ª—è –∫–∞–∂–¥–æ–π –≥—Ä—É–ø–ø—ã –ø–æ –∑–∞–≥–æ–ª–æ–≤–∫–∞–º
    knn.fit(X) # fit Knn –Ω–∞ –∫–∞–∂–¥–æ–π –≥—Ä—É–ø–ø–µ
    distances = knn.kneighbors(n_neighbors=15)[0] # –ø—Ä–µ–¥–∏–∫—Ç–∏–º –ø–µ—Ä–≤—ã–µ 15 –≤–µ—Å–æ–≤ –ø–æ Knn 
    #–ú–æ–∂–Ω–æ –ø–æ–ø—Ä–æ–±–æ–≤–∞—Ç—å –≤–∑—è—Ç—å –Ω–µ 15,–∞ 20 –∏–ª–∏ 25
    for j in range(15):
        df.loc[df['group_id'] == i, f'tif_{j}'] = distances[:, j] # –ó–∞–ø–∏—Å—ã–≤–∞–µ–º —ç—Ç–∏ —Ä–∞—Å—Å—Ç–æ—è–Ω–∏—è –≤ —Å—Ç–æ–ª–±–µ—Ü
    # —Ö–∞—Ä–∞–∫—Ç–µ—Ä–∏—Å—Ç–∏–∫–∏ –ø–æ–ø–∞—Ä–Ω—ã—Ö —Ä–∞—Å—Å—Ç–æ—è–Ω–∏–π
    cd = cosine_distances(X)# –†–∞—Å—Å—á–∏—Ç—ã–≤–∞–µ–º –≤—Å–µ –∫–æ—Å–∏–Ω—É—Å–Ω—ã–µ –ø–æ–ø–∞—Ä–Ω—ã–µ —Ä–∞—Å—Å—Ç–æ—è–Ω–∏—è –¥–ª—è –≥—Ä—É–ø–ø—ã
    df.loc[df['group_id'] == i, 'dist_mean'] = cd.mean()#—Å—Ä–µ–¥–Ω–µ–µ
    df.loc[df['group_id'] == i, 'dist_var'] = cd.var()#–¥–∏—Å–ø–µ—Ä—Å–∏—è

In [110]:
trainframe = df.copy()

In [111]:
text_features_train

Unnamed: 0,pair_id,group_id,doc_id,target
0,1,1,15731,0
1,2,1,14829,0
2,3,1,15764,0
3,4,1,17669,0
4,5,1,14852,0
...,...,...,...,...
11685,11686,129,26672,0
11686,11687,129,25838,0
11687,11688,129,25703,0
11688,11689,129,27885,0


In [112]:
text_features_train = pd.read_csv('mean_var_texts_train.csv')

trainframe['text_len'] = text_features_train['text_len']
trainframe['mean_text_len'] = text_features_train['mean_text_len']
trainframe['var_text_len'] = text_features_train['var_text_len']

In [113]:
def trainscale(col_name):
    scaler = StandardScaler()
    scaler.fit(trainframe[col_name].values.reshape(-1, 1))
    trainframe[col_name] = scaler.transform(trainframe[col_name].values.reshape(-1, 1))

In [114]:
trainscale('mean_len')
trainscale('dist_mean')
trainscale('dist_var')
trainscale('len_word')
trainscale('text_len')
trainscale('mean_text_len')
trainscale('var_text_len')

In [115]:
y_train = trainframe.target

In [116]:
trainframe = trainframe.drop(['pair_id', 'doc_id','target', 'group_id', 'title', 'title2'], axis=1)

In [117]:
y_train = np.array(y_train)
y_train.shape

(11690,)

In [118]:
X_train = trainframe.to_numpy()
X_train.shape

(11690, 22)

In [119]:
df2 = test_data.copy()
x = np.zeros(df2.shape[0])
for i in range(df2.shape[0]):
    x[i] = len(df2.title[i])
df2['len_word'] = x
df2['mean_len'] = np.zeros(len(df2.index))
for i in df2['group_id'].unique():
        df2.loc[df2['group_id'] == i, 'mean_len'] = df2.loc[df2['group_id'] == i, 'len_word'].mean()

In [120]:
df2['title2'] = 'x'
for i in range(df2.shape[0]):
    tmp = ' '.join(df2.title[i])
    df2.title2[i] = tmp

In [121]:
knn = NearestNeighbors(metric='cosine')
df2['dist_mean'] = np.zeros(len(df2.index))
df2['dist_var'] = np.zeros(len(df2.index))
for i in df2['group_id'].unique():
    X = TfidfVectorizer().fit_transform(df2.loc[df2['group_id'] == i, 'title2'])
    knn.fit(X)
    distances = knn.kneighbors(n_neighbors=15)[0]
    for j in range(15):
        df2.loc[df2['group_id'] == i, f'tif_{j}'] = distances[:, j]
    # —Ö–∞—Ä–∞–∫—Ç–µ—Ä–∏—Å—Ç–∏–∫–∏ –ø–æ–ø–∞—Ä–Ω—ã—Ö —Ä–∞—Å—Å—Ç–æ—è–Ω–∏–π
    cd = cosine_distances(X)
    df2.loc[df2['group_id'] == i, 'dist_mean'] = cd.mean()
    df2.loc[df2['group_id'] == i, 'dist_var'] = cd.var()

In [122]:
testframe = df2.copy()

In [123]:
testframe = testframe.drop(['title'], axis = 1 )
testframe = testframe.drop(['title2'], axis = 1 )

In [124]:
text_features_test = pd.read_csv('mean_var_texts_test.csv')

In [125]:
testframe['text_len'] = text_features_test['text_len']
testframe['mean_text_len'] = text_features_test['mean_text_len']
testframe['var_text_len'] = text_features_test['var_text_len']

In [126]:
def testscale(col_name):
    scaler = StandardScaler()
    scaler.fit(testframe[col_name].values.reshape(-1, 1))
    testframe[col_name] = scaler.transform(testframe[col_name].values.reshape(-1, 1))

In [127]:
testscale('mean_len')
testscale('dist_mean')
testscale('dist_var')
testscale('len_word')
testscale('text_len')
testscale('mean_text_len')
testscale('var_text_len')

In [128]:
testframe = testframe.drop(['pair_id', 'doc_id', 'group_id'], axis=1)

In [129]:
X_test = testframe.to_numpy()

In [130]:
X_test.shape

(16627, 22)

In [131]:
X_train.shape

(11690, 22)

In [132]:
Folds = KFold(n_splits = 5)
lr = [0.01,0.02, 0.05]
n_est = [35, 40, 45, 50, 55, 100]
max_depth = [1, 2, 3, 4, 5, 6, 7, 8]
score = []
res = 0
best_scores = 0
for i in tqdm(range(  len(lr)  )):
    for j in tqdm(range(  len(n_est)  )):
        for k in tqdm(range(  len(max_depth)  )):
            for train_id , test_id in Folds.split(X_train):
                X_tr = X_train[train_id]
                X_tst  = X_train[test_id]
                y_tr = y_train[train_id]
                y_tst = y_train[test_id]
                my_clf = XGBClassifier(learning_rate = lr[i],n_estimators = n_est[j] , max_depth = max_depth[k], eval_metric='mlogloss')
                my_clf.fit(X_tr, y_tr)
                res += f1_score(y_pred=my_clf.predict(X_tst), y_true=y_tst)
            res = res/5
            score.append(res)
            if res > best_scores:
                best_scores = res
                max_learning_rate = lr[i]
                max_n_estimators = n_est[j]
                max_max_depth = max_depth[k]
            res = 0

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=6.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=8.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=8.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=8.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=8.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=8.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=8.0), HTML(value='')))





HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=6.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=8.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=8.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=8.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=8.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=8.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=8.0), HTML(value='')))





HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=6.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=8.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=8.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=8.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=8.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=8.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=8.0), HTML(value='')))






In [133]:
print(max_learning_rate)
print(max_n_estimators)
print(max_max_depth)
print(max(score))

0.01
35
3
0.7389787111832353


In [134]:
best_model = my_clf = XGBClassifier(learning_rate = max_learning_rate,n_estimators = max_n_estimators , max_depth = max_max_depth)

In [135]:
best_model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.01, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=35, n_jobs=8, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [136]:
test_gr = pd.read_csv('test_groups.csv')
y_pred = best_model.predict(X_test)
y_pred = y_pred.astype(int)
result = pd.DataFrame({'pair_id': np.asarray(test_gr['pair_id']), 'target': y_pred})
result = result.set_index(['pair_id'])
result.to_csv('add_features_texts.csv')