## Обучение моделей

### Загружаем необходимые библиотеки и модули

In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import datetime
from wordcloud import WordCloud, STOPWORDS

import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

sns.set(style="darkgrid")
%matplotlib inline

In [2]:
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from catboost import CatBoostClassifier, Pool, cv
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV

import gensim.downloader
from gensim.models import Word2Vec, KeyedVectors
from natasha import Doc, Segmenter, MorphVocab, NewsEmbedding, NewsMorphTagger

# import mlxtend
# from mlxtend.evaluate import paired_ttest_kfold_cv

# from plotly.offline import iplot
# import cufflinks as cf
# cf.go_offline()
# cf.set_config_file(offline=False, world_readable=True)
# Natasha components are created once here and reused by text_prep for every document.
segmenter = Segmenter()
morph_vocab = MorphVocab()
emb = NewsEmbedding()
morph_tagger = NewsMorphTagger(emb)
# Single seed shared by row sampling, the train/test split, Word2Vec and CatBoost below.
random_state = 42

### Загружаем данные и разделяем выборку

In [3]:
# Load the news dataset with 'date' parsed as datetime, then discard the
# 'Unnamed: 0' column (presumably an index saved by an earlier to_csv call).
df = pd.read_csv('df_lenta_interfax.csv', parse_dates=['date'])
df = df.drop(columns=['Unnamed: 0'])

In [4]:
# Character length of the raw title and of the raw article body.
for _col in ('title', 'content'):
    df[f'len_{_col}'] = df[_col].str.len()

In [5]:
# Number of articles per topic (class distribution of the full dataset)
df.topic.value_counts()

Russia                169748
world                 129589
ekonomika              77595
sport                  35053
science_technology     29867
kultura                24028
traveling              22255
Name: topic, dtype: int64

Отберём по 20 000 случайных примеров каждой тематики, чтобы получить сбалансированную выборку для обучения

In [6]:
SAMPLES_PER_TOPIC = 20000  # upper bound on the number of articles kept for each topic

unique_topics = df['topic'].unique()

# For every topic: shuffle its rows with the shared seed and keep at most
# SAMPLES_PER_TOPIC of them. The per-topic frames are collected first and
# concatenated once; calling pd.concat inside the loop would re-copy all
# previously accumulated rows on each iteration.
topic_samples = [
    df[df['topic'] == topic]
    .sample(frac=1, random_state=random_state)
    .head(SAMPLES_PER_TOPIC)
    for topic in unique_topics
]
# pd.concat raises on an empty list, so fall back to an empty frame when df has no topics.
df_1 = pd.concat(topic_samples) if topic_samples else pd.DataFrame()

In [7]:
# Sanity check: number of rows per topic in the sampled frame
df_1.topic.value_counts()

kultura               20000
Russia                20000
world                 20000
science_technology    20000
sport                 20000
ekonomika             20000
traveling             20000
Name: topic, dtype: int64

## Препроцессинг текста

In [8]:
# NLTK's Russian stop-word list followed by additional hand-picked entries.
# The result stays a plain list; text_prep uses it for membership checks.
stop_words = [
    *stopwords.words('russian'),
    'что', 'это', 'так',
    'вот', 'быть', 'как',
    'в', '—', 'к', 'за', 'из', 'из-за',
    'на', 'ок', 'кстати',
    'который', 'мочь', 'весь',
    'еще', 'также', 'свой',
    'ещё', 'самый', 'ул', 'комментарий',
    'английский', 'язык',
]

In [9]:
def text_prep(text) -> str:
    """Lemmatize a text and drop noise tokens.

    The text is segmented and morphologically tagged with Natasha, every
    token is lemmatized, and only lemmas that are purely alphabetic, longer
    than two characters and not listed in ``stop_words`` are kept.

    Args:
        text: Raw text. Non-string values (e.g. NaN produced by pandas for a
            missing cell) and blank strings are treated as empty documents.

    Returns:
        The surviving lemmas joined by single spaces, or "" when the input is
        missing/blank or no lemma passes the filters.
    """
    # A missing CSV cell reaches this function as float NaN, which the
    # string-based segmenter cannot process; treat it as an empty document.
    if not isinstance(text, str) or not text.strip():
        return ""

    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    for token in doc.tokens:
        token.lemmatize(morph_vocab)

    # Set membership is O(1); scanning the stop-word list for every lemma is not.
    excluded = set(stop_words)
    kept = [
        token.lemma
        for token in doc.tokens
        if token.lemma.isalpha()
        and len(token.lemma) > 2
        and token.lemma not in excluded
    ]
    return " ".join(kept)

In [10]:
%%time
# Lemmatize titles and article bodies row by row into '<column>_clean'.
# NOTE(review): this runs on the full `df`, not on the balanced `df_1` sample
# built earlier — confirm that is intended.
for _source_col in ('title', 'content'):
    df[f'{_source_col}_clean'] = df[_source_col].apply(text_prep)

CPU times: total: 4h 56min 45s
Wall time: 7h 10min 38s


In [11]:
# Spot check: one random title next to its cleaned version
df[['title', 'title_clean']].sample(1).values

array([['Bek Air возобновит работу после авиакатастрофы в Казахстане не ранее 10 января',
        'bek air возобновить работа авиакатастрофа казахстан ранее январь']],
      dtype=object)

## Строим модель

In [12]:

# Documents as token lists (whitespace split of the cleaned body) and their topic labels.
tokenized_content = df['content_clean'].str.split()
topic_labels = df['topic'].values

# 70/30 hold-out split with the shared seed.
X_train, X_test, y_train, y_test = train_test_split(
    tokenized_content,
    topic_labels,
    test_size=0.3,
    random_state=random_state,
)

# Word2Vec is trained on the training documents only.
# NOTE(review): gensim documents that `seed` alone does not make training fully
# reproducible when several worker threads are used — confirm if exact
# reproducibility matters here.
w2v_settings = dict(vector_size=200, min_count=10, window=2, seed=random_state)
model = Word2Vec(sentences=X_train, **w2v_settings)

In [13]:
class TfidfEmbeddingVectorizer(object):
    """Represent each tokenised document as an IDF-weighted mean of word vectors.

    Every token that is present in the word2vec vocabulary contributes its
    vector multiplied by the token's IDF weight; the document vector is the
    mean of these weighted vectors. Documents without any known token map to
    a zero vector.

    Attributes:
        word2vec: Keyed vectors taken from ``model.wv``.
        dim: Vector size taken from ``model.vector_size``.
        word2weight: ``{token: idf}`` dict learned by ``fit`` (None before fit).
        max_idf_: Largest learned IDF, used as the weight of tokens that were
            not seen during ``fit`` (None before fit).
    """

    def __init__(self, model):
        """Store the vectors and dimensionality of a trained word2vec ``model``."""
        self.word2vec = model.wv
        self.word2weight = None
        self.max_idf_ = None
        self.dim = model.vector_size

    def fit(self, X, y=None):
        """Learn per-token IDF weights from tokenised documents.

        Args:
            X: Iterable of documents, each an iterable of tokens.
            y: Ignored; optional so the transformer can be fitted without labels.

        Returns:
            self
        """
        # Documents are already token lists, so the analyzer passes them through.
        tfidf = TfidfVectorizer(analyzer=lambda x: x)
        tfidf.fit(X)
        idf = tfidf.idf_
        # If a word was never seen it must be at least as infrequent as any
        # known word, so unseen words get the largest known IDF.
        self.max_idf_ = idf.max()
        # Plain dict + scalar default instead of a lambda-backed defaultdict:
        # the fitted state stays picklable and lookups never insert new keys.
        self.word2weight = {w: idf[i] for w, i in tfidf.vocabulary_.items()}
        return self

    def _embed(self, words):
        """Return the IDF-weighted mean vector of one tokenised document."""
        weighted = [
            self.word2vec.get_vector(w) * self.word2weight.get(w, self.max_idf_)
            for w in words
            if w in self.word2vec
        ]
        if not weighted:
            # No token of the document is in the embedding vocabulary.
            return np.zeros(self.dim)
        return np.mean(weighted, axis=0)

    def transform(self, X):
        """Vectorise tokenised documents.

        Args:
            X: Iterable of documents, each an iterable of tokens.

        Returns:
            Array of shape ``(n_documents, dim)``; ``(0, dim)`` for empty input.
        """
        doc_vectors = [self._embed(words) for words in X]
        if not doc_vectors:
            # Keep the result two-dimensional even when there are no documents.
            return np.empty((0, self.dim))
        return np.array(doc_vectors)

## Catboost

In [16]:
%%time
# Encode topic names as integer class ids; the encoder is fitted on the training labels.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# NOTE(review): X_train_tfidf is not referenced again in the visible cells — the
# pipeline below creates and fits its own TfidfEmbeddingVectorizer. Confirm this
# extra pass over X_train is still needed.
tfidf_vectorizer = TfidfEmbeddingVectorizer(model)
tfidf_vectorizer.fit(X_train, y_train)
X_train_tfidf = tfidf_vectorizer.transform(X_train)

# Base CatBoost settings. iterations, learning_rate and depth also appear in
# `grid` below, so the search presumably overrides these three values.
params = {'iterations': 100, 
          'learning_rate': 0.13, 
          'depth': 7, 
          'verbose': False}

# Vectorisation and classification in one pipeline, so the vectorizer is refitted
# together with the classifier; CatBoost is configured for GPU device '0'.
pipe = Pipeline([
    ('w2v', TfidfEmbeddingVectorizer(model)),
    ('clf', CatBoostClassifier(**params, random_state=random_state, task_type="GPU", devices='0'))
    ])

# Grid of values for GridSearchCV (3 x 3 x 3 combinations)
grid = {
    'clf__iterations': [100, 200, 300],
    'clf__learning_rate': [0.1, 0.01, 0.001],
    'clf__depth': [5, 7, 9],
}

# Set up GridSearchCV with 5-fold cross-validation
grid_search = GridSearchCV(pipe, grid, cv=5, n_jobs=None)

# Fit the pipeline for every grid point
grid_search.fit(X_train, y_train_encoded)

# Evaluate on the hold-out set. The report is keyed by encoded class ids;
# label_encoder.classes_ should map them back to topic names.
print(classification_report(y_test_encoded, grid_search.predict(X_test)))

              precision    recall  f1-score   support

           0       0.88      0.87      0.87     50876
           1       0.84      0.87      0.86     23453
           2       0.92      0.89      0.90      7066
           3       0.85      0.86      0.86      8922
           4       0.98      0.98      0.98     10528
           5       0.86      0.84      0.85      6701
           6       0.87      0.86      0.86     38895

    accuracy                           0.88    146441
   macro avg       0.89      0.88      0.88    146441
weighted avg       0.88      0.88      0.88    146441

CPU times: total: 16h 15min 50s
Wall time: 1d 6h 50min 26s


In [17]:
# Parameter combination selected by the grid search.
print(grid_search.best_params_)

{'clf__depth': 9, 'clf__iterations': 300, 'clf__learning_rate': 0.1}


In [21]:
import joblib

# Persist the fitted CatBoost step of the best pipeline found by the search.
# NOTE(review): only the 'clf' step is written; the 'w2v' vectorizer step and
# label_encoder are not saved here — confirm they are stored elsewhere if the
# model must be reused on raw text.
best_pipeline = grid_search.best_estimator_
joblib.dump(best_pipeline.named_steps['clf'], 'catboost_classifier.pkl')

['catboost_classifier.pkl']