# Setup

In [5]:
!pip install corus
!pip install navec
!python3 -m spacy download ru_core_news_sm
!wget https://github.com/yutkin/Lenta.Ru-News-Dataset/releases/download/v1.0/lenta-ru-news.csv.gz
!wget https://storage.yandexcloud.net/natasha-navec/packs/navec_news_v1_1B_250K_300d_100q.tar

Collecting corus
  Downloading corus-0.10.0-py3-none-any.whl.metadata (31 kB)
Downloading corus-0.10.0-py3-none-any.whl (83 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m83.7/83.7 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: corus
Successfully installed corus-0.10.0
Collecting navec
  Downloading navec-0.10.0-py3-none-any.whl.metadata (21 kB)
Downloading navec-0.10.0-py3-none-any.whl (23 kB)
Installing collected packages: navec
Successfully installed navec-0.10.0
Collecting ru-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/ru_core_news_sm-3.7.0/ru_core_news_sm-3.7.0-py3-none-any.whl (15.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.3/15.3 MB[0m [31m24.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting pymorphy3>=1.0.0 (from ru-core-news-sm==3.7.0)
  Downloading pymorphy3-2.0.3-py3-none-any.whl.metadata (1.9 kB)
Collecting dawg2-python>=0.8.0 (f

In [26]:
import pandas as pd
import numpy as np
import re
import string
import os
import itertools
from corus import load_lenta
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import gensim.models
import gensim.downloader as api
import spacy
from navec import Navec
import warnings
import urllib.request

warnings.simplefilter(action='ignore', category=FutureWarning)

cool_random_state = 59

# Data proc

Загрузка данных как в hw_1

In [7]:
lenta_path = '/kaggle/working/lenta-ru-news.csv.gz'
records = load_lenta(lenta_path)

records_dlist = [{'text': rec.text, 'topic': rec.topic} for rec in records if rec.topic != '']
df = pd.DataFrame(records_dlist)

topic_counts = df['topic'].value_counts()
rare_topics = topic_counts[topic_counts < 4].index
df_filtered = df[~df['topic'].isin(rare_topics)]

In [8]:
n_sampl = 1e5
# в этот раз с random_state
df_sampl, _ = train_test_split(df_filtered, train_size=int(n_sampl), stratify=df_filtered['topic'], random_state=cool_random_state)
print(df_sampl['topic'].value_counts())
df_sampl.head(2)

topic
Россия               21717
Мир                  18492
Экономика            10761
Спорт                 8716
Культура              7279
Бывший СССР           7225
Наука и техника       7189
Интернет и СМИ        6044
Из жизни              3736
Дом                   2940
Силовые структуры     2651
Ценности              1051
Бизнес                1001
Путешествия            867
69-я параллель         171
Крым                    90
Культпросвет            46
Легпром                 15
Библиотека               9
Name: count, dtype: int64


Unnamed: 0,text,topic
515198,Власти Центрального округа Москвы планируют оз...,Дом
84305,В австралийском штате Виктория мужчина лишился...,Из жизни


Для обработки текстов воспользуюсь предобученной моделью из spacy

In [9]:
nlp = spacy.load('ru_core_news_sm')

Отключаю ненужные компоненты для ускорения обработки, привожу к нижнему регистру, фильтрую числа (топики слабо связаны с числами), стоп-слова и пунктуацию. Использую леммы, так как по опыту прошлого дз морфемы не дают прироста по качеству

In [10]:
%%time

data = df_sampl['text'].to_list()
res = []
for i, doc in tqdm(enumerate(nlp.pipe(data, disable=['ner', 'tok2vec']))):
    tokens = [token.lemma_.lower() for token in doc if (token.lemma_ not in string.punctuation) and (not token.is_stop) and (not token.lemma_.isdigit())]
    res.append(tokens)
    

100000it [12:36, 132.20it/s]

CPU times: user 12min 40s, sys: 33.5 s, total: 13min 13s
Wall time: 12min 36s





In [11]:
topic2int = {topic:i for i,topic in enumerate(df_sampl['topic'].unique())}
int2topic = {i:topic for i,topic in enumerate(df_sampl['topic'].unique())}
df_sampl['label'] = df_sampl['topic'].map(topic2int)

In [12]:
train_frac, val_frac, test_frac = 0.6, 0.2, 0.2

X, y = res, df_sampl['label']
X_train, X_valtest, y_train, y_valtest = train_test_split(X, y, test_size=(val_frac+test_frac), random_state=cool_random_state, stratify=df_sampl['label'])
X_val, X_test, y_val, y_test = train_test_split(X_valtest, y_valtest, test_size=0.5, random_state=cool_random_state, stratify=y_valtest)

print(len(X_train), len(X_val), len(X_test))

60000 20000 20000


# W2V gensim

Размер вектора 128 - чуть больше дефолтного. Стратегию обучения выбрал CBOW, кажется для нашей задачи классификации это будет более полезно, т.к. он усредняет контекст.
Контекстное окно стандартное - 5, min_count 12 - чтобы отсеять побольше редких слов. Тестовый сплит для обучения word2vec не используется.

In [13]:
%%time

model = gensim.models.Word2Vec(
    sentences=(X_train+X_val),
    vector_size=128,
    window=5,
    min_count=12,
    sg=2, # CBOW
    negative=7, # If > 0, negative sampling will be used, if set to 0, no negative sampling is used.
    epochs=25, # Number of iterations (epochs) over the corpus
    seed=cool_random_state,
)

CPU times: user 1h 17min 55s, sys: 4.6 s, total: 1h 18min
Wall time: 26min 13s


In [14]:
print(model.wv.most_similar(positive=['проведение'], topn=3))
print(model.wv.most_similar(positive=['россия'], topn=3))
print(model.wv.most_similar(positive=['слава'], topn=3))

print(model.wv.doesnt_match(['оружие', 'ракеты', 'яблоко']))

[('проведения', 0.6930421590805054), ('проведении', 0.654165506362915), ('осуществление', 0.6398216485977173)]
[('украина', 0.72777259349823), ('белоруссия', 0.7250326871871948), ('франция', 0.7061828374862671)]
[('богу', 0.5643595457077026), ('борщ', 0.5201302170753479), ('господи', 0.5135094523429871)]
яблоко


# Navec, rusvectores

In [15]:
urllib.request.urlretrieve(
    "https://vectors.nlpl.eu/repository/20/220.zip",
    "ruwikiruscorpora_upos_cbow_300_10_2021.zip"
)

('ruwikiruscorpora_upos_cbow_300_10_2021.zip',
 <http.client.HTTPMessage at 0x7edc01108400>)

In [16]:
!mkdir /kaggle/working/rusvec_model
!unzip /kaggle/working/ruwikiruscorpora_upos_cbow_300_10_2021.zip -d /kaggle/working/rusvec_model

Archive:  /kaggle/working/ruwikiruscorpora_upos_cbow_300_10_2021.zip
  inflating: /kaggle/working/rusvec_model/meta.json  
  inflating: /kaggle/working/rusvec_model/model.bin  
  inflating: /kaggle/working/rusvec_model/model.txt  
  inflating: /kaggle/working/rusvec_model/README  


In [17]:
model_path = '/kaggle/working/rusvec_model/model.bin'
model_rusvectores = gensim.models.KeyedVectors.load_word2vec_format(model_path, binary=True)

In [18]:
model_rusvectores.most_similar(positive=['солнце_NOUN'], topn=6)

[('закат_NOUN', 0.7004451155662537),
 ('солнышко_ADV', 0.6614314317703247),
 ('небо_NOUN', 0.6606253385543823),
 ('солнце_PROPN', 0.6496840715408325),
 ('луный_NOUN', 0.6415917873382568),
 ('солнышко_NOUN', 0.6413697600364685)]

In [19]:
model_path = '/kaggle/working/navec_news_v1_1B_250K_300d_100q.tar'
navec = Navec.load(model_path)
#navec['гаврик'][:15]
navec['существо'][:15]

array([ 0.09148493, -0.14809133, -0.39783016,  0.5564699 ,  0.38129   ,
        0.12527302,  0.11904036, -0.11892173,  0.28735164, -0.35339865,
       -0.15817341,  0.4444368 , -0.01419532,  0.1782108 , -0.46234384],
      dtype=float32)

# Log reg

In [20]:
def vectorize_text(text, model, vector_size):
    vectors = [model.wv[word] for word in text if word in model.wv]
    if len(vectors) == 0:
        return np.zeros(vector_size)
    return np.mean(vectors, axis=0)


X_train_vectors = np.array([vectorize_text(text, model, 128) for text in X_train])
X_val_vectors = np.array([vectorize_text(text, model, 128) for text in X_val])
X_test_vectors = np.array([vectorize_text(text, model, 128) for text in X_test])

In [21]:
log_reg = LogisticRegression(max_iter=128, C=10, random_state=cool_random_state).fit(X_train_vectors, y_train)
val_preds = log_reg.predict(X_val_vectors)
print(classification_report(y_val, val_preds, zero_division=0, target_names=[topic for topic in topic2int.keys()]))

                   precision    recall  f1-score   support

              Дом       0.81      0.79      0.80       588
         Из жизни       0.64      0.58      0.61       747
            Спорт       0.96      0.96      0.96      1743
           Россия       0.75      0.80      0.78      4343
        Экономика       0.80      0.85      0.83      2152
              Мир       0.79      0.83      0.81      3699
         Культура       0.85      0.85      0.85      1456
      Бывший СССР       0.78      0.78      0.78      1445
   Интернет и СМИ       0.74      0.68      0.71      1209
  Наука и техника       0.81      0.82      0.82      1438
Силовые структуры       0.50      0.30      0.38       530
           Бизнес       0.41      0.17      0.24       200
         Ценности       0.82      0.77      0.79       210
      Путешествия       0.68      0.57      0.62       174
   69-я параллель       0.64      0.21      0.31        34
    Культпросвет        0.00      0.00      0.00       

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [22]:
vector_size = model_rusvectores.vector_size

def find_pos(word, model):
    pos_tags = ["_NOUN", "_VERB", "_ADJ", "_ADV", "_PROPN", "_PRON", "_NUM", "_DET", "_ADP", "_CCONJ", "_SCONJ", "_PART", "_INTJ"]
    
    for pos in pos_tags:
        word_with_pos = word + pos
        if word_with_pos in model.key_to_index:
            return word_with_pos
    return None

def vectorize_text_rv(text, model, vector_size):
    vectors = []
    for word in text:
        word_with_pos = find_pos(word, model)
        if word_with_pos:
            vectors.append(model[word_with_pos])

    if not vectors:
        return np.zeros(vector_size)
        
    return np.mean(vectors, axis=0)

X_train_vectors_rv = np.array([vectorize_text_rv(text, model_rusvectores, vector_size) for text in X_train])
X_val_vectors_rv = np.array([vectorize_text_rv(text, model_rusvectores, vector_size) for text in X_val])
X_test_vectors_rv = np.array([vectorize_text_rv(text, model_rusvectores, vector_size) for text in X_test])

In [23]:
log_reg_rv = LogisticRegression(max_iter=128, C=10, random_state=cool_random_state).fit(X_train_vectors_rv, y_train)
val_preds = log_reg_rv.predict(X_val_vectors_rv)
print(classification_report(y_val, val_preds, zero_division=0, target_names=[topic for topic in topic2int.keys()]))

                   precision    recall  f1-score   support

              Дом       0.74      0.72      0.73       588
         Из жизни       0.58      0.49      0.53       747
            Спорт       0.95      0.95      0.95      1743
           Россия       0.72      0.79      0.75      4343
        Экономика       0.75      0.83      0.79      2152
              Мир       0.77      0.79      0.78      3699
         Культура       0.82      0.83      0.83      1456
      Бывший СССР       0.74      0.73      0.73      1445
   Интернет и СМИ       0.68      0.62      0.65      1209
  Наука и техника       0.77      0.79      0.78      1438
Силовые структуры       0.41      0.19      0.26       530
           Бизнес       0.21      0.08      0.12       200
         Ценности       0.81      0.74      0.78       210
      Путешествия       0.57      0.43      0.49       174
   69-я параллель       0.33      0.15      0.20        34
    Культпросвет        0.00      0.00      0.00       

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [24]:
vector_size = len(navec['существо'])

def vectorize_text_nv(text, model, vector_size):
    vectors = [model[word] for word in text if word in model]
    
    if not vectors:
        return np.zeros(vector_size)
    return np.mean(vectors, axis=0)

X_train_vectors_nv = np.array([vectorize_text_nv(text, navec, vector_size) for text in X_train])
X_val_vectors_nv = np.array([vectorize_text_nv(text, navec, vector_size) for text in X_val])
X_test_vectors_nv = np.array([vectorize_text_nv(text, navec, vector_size) for text in X_test])

In [25]:
log_reg_nv = LogisticRegression(max_iter=128, C=10, random_state=cool_random_state).fit(X_train_vectors_nv, y_train)
val_preds = log_reg_nv.predict(X_val_vectors_nv)
print(classification_report(y_val, val_preds, zero_division=0, target_names=[topic for topic in topic2int.keys()]))

                   precision    recall  f1-score   support

              Дом       0.81      0.78      0.80       588
         Из жизни       0.63      0.59      0.60       747
            Спорт       0.96      0.96      0.96      1743
           Россия       0.76      0.79      0.78      4343
        Экономика       0.79      0.86      0.82      2152
              Мир       0.79      0.82      0.80      3699
         Культура       0.85      0.85      0.85      1456
      Бывший СССР       0.77      0.79      0.78      1445
   Интернет и СМИ       0.75      0.69      0.72      1209
  Наука и техника       0.80      0.82      0.81      1438
Силовые структуры       0.46      0.28      0.34       530
           Бизнес       0.42      0.16      0.23       200
         Ценности       0.81      0.76      0.78       210
      Путешествия       0.65      0.53      0.58       174
   69-я параллель       0.57      0.24      0.33        34
    Культпросвет        0.00      0.00      0.00       

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Предобученный navec показал себя совсем немного лучше, чем обученные word2vec cbow-эмбеддинги.

# Tf-idf weightening

Navec и word2vec показали себя почти одинаково хорошо, сделаю tf-idf для обоих

In [54]:
%%time

tfidf = TfidfVectorizer(analyzer=lambda x: x)
tfidf.fit(X_train)

CPU times: user 7.15 s, sys: 188 ms, total: 7.34 s
Wall time: 7.32 s


In [55]:
%%time

def vectorize_text_tfidf_w2v(text, model, vectorizer, vector_size):
    word_weights = vectorizer.transform([text]).toarray()[0]
    vectors = []
    weights = []

    for i, word in enumerate(text):
        if word in model.wv and word in vectorizer.vocabulary_:
            vectors.append(model.wv[word] * word_weights[vectorizer.vocabulary_[word]])
            weights.append(word_weights[vectorizer.vocabulary_[word]])

    if not vectors:
        return np.zeros(vector_size)
    return np.sum(vectors, axis=0) / np.sum(weights)

X_train_vectors_w2vtf = np.array([vectorize_text_tfidf_w2v(text, model, tfidf, 128) for text in X_train])
X_val_vectors_w2vtf = np.array([vectorize_text_tfidf_w2v(text, model, tfidf, 128) for text in X_val])
X_test_vectors_w2vtf = np.array([vectorize_text_tfidf_w2v(text, model, tfidf, 128) for text in X_test])

CPU times: user 6min 32s, sys: 52.5 ms, total: 6min 32s
Wall time: 6min 32s


In [56]:
log_reg = LogisticRegression(max_iter=128, C=10, random_state=cool_random_state).fit(X_train_vectors_w2vtf, y_train)
val_preds = log_reg.predict(X_val_vectors_w2vtf)
print(classification_report(y_val, val_preds, zero_division=0, target_names=[topic for topic in topic2int.keys()]))

                   precision    recall  f1-score   support

              Дом       0.79      0.77      0.78       588
         Из жизни       0.61      0.54      0.57       747
            Спорт       0.95      0.96      0.96      1743
           Россия       0.74      0.79      0.76      4343
        Экономика       0.80      0.85      0.82      2152
              Мир       0.78      0.82      0.80      3699
         Культура       0.83      0.83      0.83      1456
      Бывший СССР       0.76      0.75      0.75      1445
   Интернет и СМИ       0.71      0.67      0.69      1209
  Наука и техника       0.80      0.82      0.81      1438
Силовые структуры       0.44      0.22      0.30       530
           Бизнес       0.40      0.17      0.24       200
         Ценности       0.79      0.75      0.77       210
      Путешествия       0.62      0.50      0.55       174
   69-я параллель       0.56      0.15      0.23        34
    Культпросвет        0.00      0.00      0.00       

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


С tf-idf модель стала лучше распознавать один из редких топиков - Крым - f1 0.0 -> 0.29

In [57]:
%%time

vector_size = len(navec['существо'])

def vectorize_text_tfidf_navec(text, model, vectorizer, vector_size):
    word_weights = vectorizer.transform([text]).toarray()[0]
    vectors = []
    weights = []

    for i, word in enumerate(text):
        if word in model and word in vectorizer.vocabulary_:
            vectors.append(model[word] * word_weights[vectorizer.vocabulary_[word]])
            weights.append(word_weights[vectorizer.vocabulary_[word]])

    if not vectors:
        return np.zeros(vector_size)
    return np.sum(vectors, axis=0) / np.sum(weights)

X_train_vectors_navectf = np.array([vectorize_text_tfidf_navec(text, navec, tfidf, vector_size) for text in X_train])
X_val_vectors_navectf = np.array([vectorize_text_tfidf_navec(text, navec, tfidf, vector_size) for text in X_val])
X_test_vectors_navectf = np.array([vectorize_text_tfidf_navec(text, navec, tfidf, vector_size) for text in X_test])

CPU times: user 8min 28s, sys: 121 ms, total: 8min 28s
Wall time: 8min 28s


In [58]:
log_reg = LogisticRegression(max_iter=128, C=10, random_state=cool_random_state).fit(X_train_vectors_navectf, y_train)
val_preds = log_reg.predict(X_val_vectors_navectf)
print(classification_report(y_val, val_preds, zero_division=0, target_names=[topic for topic in topic2int.keys()]))

                   precision    recall  f1-score   support

              Дом       0.79      0.77      0.78       588
         Из жизни       0.61      0.57      0.59       747
            Спорт       0.95      0.96      0.96      1743
           Россия       0.74      0.78      0.76      4343
        Экономика       0.78      0.85      0.81      2152
              Мир       0.77      0.81      0.79      3699
         Культура       0.84      0.85      0.84      1456
      Бывший СССР       0.77      0.74      0.75      1445
   Интернет и СМИ       0.74      0.68      0.71      1209
  Наука и техника       0.78      0.80      0.79      1438
Силовые структуры       0.38      0.22      0.28       530
           Бизнес       0.33      0.12      0.17       200
         Ценности       0.77      0.70      0.73       210
      Путешествия       0.54      0.46      0.50       174
   69-я параллель       0.64      0.26      0.37        34
    Культпросвет        0.00      0.00      0.00       

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Navec c tf-idf помимо топика 'Крым' научился также частично классифицировать ещё один редкий топик - Легпром - f1 0.0 -> 0.5

# Test split

1) tf-idf word2vec -> test macro f1 0.53
2) rusvectores -> test macro f1 0.51
3) tf-idf navec -> test macro f1 0.53

In [61]:
log_reg_w2vtf = LogisticRegression(max_iter=128, C=10, random_state=cool_random_state).fit(X_train_vectors_w2vtf, y_train)
test_preds = log_reg_w2vtf.predict(X_test_vectors_w2vtf)
print(classification_report(y_test, test_preds, zero_division=0, target_names=[topic for topic in topic2int.keys()]))

                   precision    recall  f1-score   support

              Дом       0.83      0.72      0.77       588
         Из жизни       0.61      0.52      0.56       747
            Спорт       0.95      0.95      0.95      1743
           Россия       0.72      0.79      0.75      4344
        Экономика       0.80      0.84      0.82      2153
              Мир       0.76      0.82      0.79      3698
         Культура       0.82      0.84      0.83      1456
      Бывший СССР       0.75      0.72      0.73      1445
   Интернет и СМИ       0.71      0.66      0.68      1209
  Наука и техника       0.81      0.80      0.81      1438
Силовые структуры       0.47      0.21      0.29       530
           Бизнес       0.57      0.19      0.28       200
         Ценности       0.86      0.79      0.82       210
      Путешествия       0.66      0.40      0.50       173
   69-я параллель       0.42      0.15      0.22        34
    Культпросвет        0.00      0.00      0.00       

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [62]:
log_reg_rusve = LogisticRegression(max_iter=128, C=10, random_state=cool_random_state).fit(X_train_vectors_rv, y_train)
test_preds = log_reg_rusve.predict(X_test_vectors_rv)
print(classification_report(y_test, test_preds, zero_division=0, target_names=[topic for topic in topic2int.keys()]))

                   precision    recall  f1-score   support

              Дом       0.78      0.71      0.74       588
         Из жизни       0.58      0.46      0.51       747
            Спорт       0.95      0.94      0.94      1743
           Россия       0.70      0.77      0.73      4344
        Экономика       0.76      0.80      0.78      2153
              Мир       0.75      0.79      0.77      3698
         Культура       0.81      0.84      0.82      1456
      Бывший СССР       0.75      0.72      0.74      1445
   Интернет и СМИ       0.69      0.65      0.67      1209
  Наука и техника       0.78      0.79      0.79      1438
Силовые структуры       0.38      0.19      0.26       530
           Бизнес       0.38      0.13      0.19       200
         Ценности       0.81      0.77      0.79       210
      Путешествия       0.52      0.36      0.43       173
   69-я параллель       0.31      0.12      0.17        34
    Культпросвет        0.00      0.00      0.00       

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [63]:
log_reg_navectf = LogisticRegression(max_iter=128, C=10, random_state=cool_random_state).fit(X_train_vectors_navectf, y_train)
test_preds = log_reg_navectf.predict(X_test_vectors_navectf)
print(classification_report(y_test, test_preds, zero_division=0, target_names=[topic for topic in topic2int.keys()]))

                   precision    recall  f1-score   support

              Дом       0.81      0.76      0.79       588
         Из жизни       0.59      0.49      0.54       747
            Спорт       0.95      0.96      0.95      1743
           Россия       0.73      0.77      0.75      4344
        Экономика       0.78      0.83      0.81      2153
              Мир       0.76      0.81      0.78      3698
         Культура       0.82      0.85      0.83      1456
      Бывший СССР       0.74      0.72      0.73      1445
   Интернет и СМИ       0.71      0.67      0.69      1209
  Наука и техника       0.80      0.81      0.80      1438
Силовые структуры       0.48      0.26      0.34       530
           Бизнес       0.53      0.17      0.26       200
         Ценности       0.80      0.79      0.79       210
      Путешествия       0.58      0.42      0.48       173
   69-я параллель       0.33      0.12      0.17        34
    Культпросвет        0.00      0.00      0.00       

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


По итогу на тестовом сплите совсем чуть-чуть лучше оказался наш обученный gensim word2vec. Если бы я хотел улучшить метрику, в первую очередь попробовал бы обучить skip-gram w2v, потом увеличил бы размер эмбеддинга, сделал бы гридсерч параметра min_df для tf-idf, потому что сейчас словарь довольно большой

In [70]:
len(tfidf.transform(['ночь']).toarray()[0])

358114