In [1]:
!pip install corus

Collecting corus
  Downloading corus-0.10.0-py3-none-any.whl.metadata (31 kB)
Downloading corus-0.10.0-py3-none-any.whl (83 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/83.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m83.7/83.7 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: corus
Successfully installed corus-0.10.0


In [3]:
import random
import numpy as np
import pandas as pd
import re
from corus import load_lenta
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import GridSearchCV
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
RANDOM_STATE = 42
random.seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)

url = 'https://github.com/yutkin/Lenta.Ru-News-Dataset/releases/download/v1.0/lenta-ru-news.csv.gz'
data = pd.read_csv(url, compression='gzip', usecols=['title', 'text', 'topic'])

sample_size = 100_000
data = data.sample(n=sample_size, random_state=RANDOM_STATE)

In [5]:
def preprocess_text(text):
    if not isinstance(text, str):
        return ''
    text = text.lower()
    text = re.sub(r'[^а-яa-z ]', '', text)
    words = text.split()
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('russian'))
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(words)

data['processed_text'] = (data['title'].fillna('') + ' ' + data['text'].fillna('')).apply(preprocess_text)

class_counts = data['topic'].value_counts()
data = data[data['topic'].isin(class_counts[class_counts >= 2].index)]

data['topic'] = data['topic'].astype('category').cat.codes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['topic'] = data['topic'].astype('category').cat.codes  # Преобразуем категориальный target в числа


In [6]:
train_texts, test_texts, train_labels, test_labels = train_test_split(
    data['processed_text'], data['topic'], test_size=0.2, stratify=data['topic'], random_state=RANDOM_STATE)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts, train_labels, test_size=0.25, stratify=train_labels, random_state=RANDOM_STATE)  # 60/20/20

In [7]:
dummy = DummyClassifier(strategy='most_frequent')
dummy.fit(train_texts, train_labels)
dummy_preds = dummy.predict(val_texts)
print(f'Dummy Accuracy: {accuracy_score(val_labels, dummy_preds):.4f}')

Dummy Accuracy: 0.2188


In [10]:
count_pipeline = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('classifier', LogisticRegression(random_state=RANDOM_STATE, max_iter=2000, solver='saga', penalty='l2'))
])
count_pipeline.fit(train_texts, train_labels)
count_preds = count_pipeline.predict(val_texts)
print(f'Accuracy: {accuracy_score(val_labels, count_preds):.4f}')

CountVectorizer Accuracy: 0.8071


In [11]:
tfidf_pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classifier', LogisticRegression(random_state=RANDOM_STATE, max_iter=2000, solver='saga', penalty='l2'))
])
tfidf_pipeline.fit(train_texts, train_labels)
tfidf_preds = tfidf_pipeline.predict(val_texts)
print(f'TFIDF Accuracy: {accuracy_score(val_labels, tfidf_preds):.4f}')

TFIDF Accuracy: 0.7979


In [12]:
param_grid = {
    'vectorizer__ngram_range': [(1, 1), (1, 2)],
    'classifier__C': [0.1, 1, 10]
}

gs = GridSearchCV(tfidf_pipeline, param_grid, cv=3, scoring='accuracy')
gs.fit(train_texts, train_labels)
print(f'Лучшие параметры: {gs.best_params_}')

Лучшие параметры: {'classifier__C': 10, 'vectorizer__ngram_range': (1, 1)}


In [13]:
best_model = gs.best_estimator_
test_preds = best_model.predict(test_texts)
print(f'Final Test Accuracy: {accuracy_score(test_labels, test_preds):.4f}')

Final Test Accuracy: 0.8138
