# Вариант 1

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
from pymorphy3 import MorphAnalyzer
import gensim
from gensim.models import Word2Vec
import numpy as np

morph = MorphAnalyzer()
nltk.download('stopwords')
stop_words = set(stopwords.words('russian'))


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kuprik01/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## 1. Препроцессинг данных и обучение Word2Vec

In [3]:
def preprocess_text(text):
    # tokens = text.lower().split()
    tokens = text.split()
    tokens = [morph.parse(word)[0].normal_form for word in tokens if word not in stop_words]
    return tokens

data = pd.read_csv('../data/news.txt', sep='\t', names=['category', 'title', 'article'])
data['processed_text'] = data['article'].apply(preprocess_text)

In [4]:
model = Word2Vec(sentences=data['processed_text'], vector_size=100, window=5, min_count=2, workers=4)

## 2. Разделяем данные и считаем усредненные эмбеддинги 

In [6]:
def vectorize_text(tokens, model):
    vectors = [model.wv[word] for word in tokens if word in model.wv]
    if vectors:
        return sum(vectors) / len(vectors)
    else:
        return np.zeros(model.vector_size)

data['embedding'] = data['processed_text'].apply(lambda x: vectorize_text(x, model))

In [8]:
X = data['embedding'].tolist()
y = data['category']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## 3. Обучение SVM на усредненных данных 

In [14]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report

svm_classifier = SVC(kernel='linear')
svm_classifier.fit(X_train, y_train)

y_pred = svm_classifier.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    business       0.40      0.08      0.13        79
     culture       0.79      0.78      0.79       279
   economics       0.68      0.90      0.78       266
      forces       0.67      0.79      0.73       149
        life       0.73      0.76      0.74       288
       media       0.77      0.73      0.75       299
     science       0.81      0.79      0.80       288
       sport       0.95      0.95      0.95       276
       style       0.96      0.61      0.74        38
      travel       0.47      0.24      0.32        38

    accuracy                           0.77      2000
   macro avg       0.72      0.66      0.67      2000
weighted avg       0.76      0.77      0.76      2000



Итоговый accuracy для SVM на усредненных векторах получился 0.77

## Альтернативная аггрегация

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict

vectorizer = TfidfVectorizer()
vectorizer.fit(data['article'])

def weighted_vectorize(tokens, model, vectorizer):
    weights = defaultdict(lambda: 1)
    for word in tokens:
        if word in vectorizer.vocabulary_:
            weights[word] = vectorizer.idf_[vectorizer.vocabulary_[word]]
    vectors = [weights[word] * model.wv[word] for word in tokens if word in model.wv]
    if vectors:
        return sum(vectors) / len(vectors)
    else:
        return np.zeros(model.vector_size)

data['weighted_vector'] = data['processed_text'].apply(lambda x: weighted_vectorize(x, model, vectorizer))

X_weighted = data['weighted_vector'].tolist()
Xw_train, Xw_test, yw_train, yw_test = train_test_split(X_weighted, y, test_size=0.2, random_state=42)

svm_classifier.fit(Xw_train, yw_train)
yw_pred = svm_classifier.predict(Xw_test)
print(classification_report(yw_test, yw_pred))

              precision    recall  f1-score   support

    business       0.56      0.41      0.47        79
     culture       0.83      0.82      0.82       279
   economics       0.75      0.87      0.81       266
      forces       0.72      0.76      0.74       149
        life       0.76      0.77      0.77       288
       media       0.80      0.75      0.78       299
     science       0.78      0.81      0.80       288
       sport       0.95      0.96      0.95       276
       style       0.93      0.68      0.79        38
      travel       0.50      0.37      0.42        38

    accuracy                           0.79      2000
   macro avg       0.76      0.72      0.73      2000
weighted avg       0.79      0.79      0.79      2000



В данном случае я решил попробовать использовать взвешенное усреденение, через домножение на Idf, чтобы учитывать вес каждого слова относительно всей коллекции документов что позволило увеличить accuracy на 2%. 