In [89]:
import gzip
import re
from dataclasses import dataclass
from typing import Iterator, List

import gensim
import nltk
import numpy as np
from gensim.models import Word2Vec
from nltk.corpus import stopwords

@dataclass
class Text:
    """One record of the data file: three string fields taken from a tab-separated line.

    Instances are built positionally by ``read_texts``, so the field order
    here must match the column order in the file.
    """
    label: str  # first column; used later as the classification target
    title: str  # second column; presumably the headline -- not used in this notebook
    text: str   # third column; the document body that gets tokenized

# Read the data file
def read_texts(fn: str="data/news.txt.gz") -> Iterator[Text]:
    """Lazily yield one ``Text`` per non-empty line of a gzipped, tab-separated file.

    Args:
        fn: Path to a gzip-compressed UTF-8 file with three tab-separated
            columns per line: label, title, text.

    Yields:
        A ``Text`` built from the three columns of each line.

    Raises:
        TypeError: If a non-empty line has fewer than three columns.
    """
    with gzip.open(fn, "rt", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # A blank line (e.g. a trailing newline at EOF) carries no record.
                continue
            # maxsplit=2 keeps any tab characters inside the body as part of
            # the third field instead of producing extra columns.
            yield Text(*line.split("\t", 2))

# Russian stop words used by tokenize_text. On a fresh environment the NLTK
# "stopwords" corpus is not installed and the lookup raises LookupError, so
# fetch it once and retry instead of failing the whole notebook.
try:
    ru_stopwords = set(stopwords.words("russian"))
except LookupError:
    nltk.download("stopwords")
    ru_stopwords = set(stopwords.words("russian"))

# Split a text into words
def tokenize_text(text: str) -> List[str]:
    """Return the lowercase word tokens of ``text`` with Russian stop words removed.

    Tokens are maximal runs of ``\\w`` characters, so punctuation and
    whitespace never appear in the result.
    """
    lowered = text.lower()
    return [
        token
        for token in re.findall(r'\b\w+\b', lowered)
        if token not in ru_stopwords
    ]

# Text without punctuation (needed for gensim)
def normalize_text(text: str) -> str:
    """Return ``text`` as a single string of its tokens joined by single spaces."""
    tokens = tokenize_text(text)
    return ' '.join(tokens)

Загружаем тексты

In [90]:
# Materialize the generator so the records can be iterated more than once.
texts = list(read_texts())

Преобразуем тексты в списки слов

In [93]:
# One token list per document, in the same order as `texts`.
sentences = [
    tokenize_text(record.text)
    for record in texts
]

Обучаем Word2Vec

In [43]:
# Train word embeddings on the tokenized documents with gensim's default
# hyper-parameters.
w2v = Word2Vec(sentences)
# save_word2vec_format writes the plain-text format unless binary=True is
# given; the ".bin" file name promises the binary format, so request it
# explicitly. Load it back with load_word2vec_format(..., binary=True).
w2v.wv.save_word2vec_format('w2v_vectors.bin', binary=True)

Преобразуем документы в векторы, усредняя векторы входящих в них слов

In [69]:
# Represent every document as the mean of the Word2Vec vectors of its words.
vectorized_docs = []
for doc in sentences:
    # Only words that survived Word2Vec's vocabulary pruning have a vector;
    # test membership explicitly instead of swallowing every exception.
    vecs = [w2v.wv[word] for word in doc if word in w2v.wv]
    if vecs:
        vectorized_docs.append(np.mean(vecs, axis=0))
    else:
        # A document with no in-vocabulary words would otherwise divide by
        # zero. Use a zero vector so the list keeps exactly one row per
        # document and stays aligned with the labels.
        vectorized_docs.append(
            np.zeros(w2v.wv.vector_size, dtype=w2v.wv.vectors.dtype)
        )

Обучаем SVM и смотрим результаты

In [None]:
# Take the labels from the records already held in `texts` instead of
# decompressing and parsing the file a second time; this also guarantees
# the labels line up one-to-one with `sentences`, which was built from `texts`.
labels = [text.label for text in texts]

In [71]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the documents for evaluation; the fixed random_state makes
# the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    vectorized_docs,
    labels,
    test_size=0.3,
    random_state=123,
)

In [76]:
from sklearn.svm import SVC

# Support-vector classifier with scikit-learn's default settings,
# fitted on the training part of the split.
svc = SVC()
svc.fit(X=X_train, y=Y_train)

In [84]:
from sklearn.metrics import accuracy_score

# Share of held-out documents whose predicted label matches the true one.
Y_pred = svc.predict(X_test)
print(accuracy_score(y_true=Y_test, y_pred=Y_pred))

0.6966666666666667


Попробуем повторить эксперимент, используя Doc2Vec из gensim

In [114]:
# Token lists for Doc2Vec, one per document, in the same order as `texts`.
normalized_texts = [
    tokenize_text(record.text)
    for record in texts
]

In [115]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Wrap each token list in a TaggedDocument whose single tag is the document's
# position in the corpus (as a string), giving every document a unique id.
tagged_texts = [
    TaggedDocument(words=tokens, tags=[str(idx)])
    for idx, tokens in enumerate(normalized_texts)
]
# Train the Doc2Vec model
model = Doc2Vec(
    documents=tagged_texts,
    vector_size=100,
    min_count=5,
    epochs=10,
)



In [116]:
# Infer one vector per document from its tokens with the trained model.
# Note: this rebinds `vectorized_docs`, replacing the averaged Word2Vec vectors.
vectorized_docs = [
    model.infer_vector(tagged.words)
    for tagged in tagged_texts
]

In [117]:
# Take the labels from the records already held in `texts` rather than
# re-reading the file; the Doc2Vec vectors were derived from `texts`, so this
# keeps features and labels aligned.
labels = [text.label for text in texts]
# Same 70/30 split and seed as in the Word2Vec experiment, so both models are
# scored on the same held-out documents.
X_train, X_test, Y_train, Y_test = train_test_split(
    vectorized_docs,
    labels,
    test_size=0.3,
    random_state=123,
)

In [118]:
# Fresh default-settings SVM fitted on the Doc2Vec features.
svc = SVC()
svc.fit(X=X_train, y=Y_train)

In [119]:
# Accuracy of the Doc2Vec-based classifier on the held-out documents.
Y_pred = svc.predict(X_test)
print(accuracy_score(y_true=Y_test, y_pred=Y_pred))

0.798


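Видно, что качество выросло: точность увеличилась примерно с 0.70 до 0.80. Это вполне ожидаемо, так как Doc2Vec строит вектор сразу для всего документа, а не усредняет векторы отдельных слов.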