In [None]:
!pip install datasets



In [None]:
# Download the small Russian spaCy model; the medium and large variants are
# kept below as disabled alternatives.
!python -m spacy download ru_core_news_sm
#!python -m spacy download ru_core_news_md
#!python -m spacy download ru_core_news_lg

Collecting ru-core-news-sm==3.7.0
  Using cached https://github.com/explosion/spacy-models/releases/download/ru_core_news_sm-3.7.0/ru_core_news_sm-3.7.0-py3-none-any.whl (15.3 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('ru_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
from typing import List, Tuple

In [None]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
import spacy
from datasets import load_dataset

In [None]:
def load_sib200_ru() -> Tuple[Tuple[List[str], List[int]], Tuple[List[str], List[int]], Tuple[List[str], List[int]], List[str]]:
    """Load the Russian (``rus_Cyrl``) subset of the ``Davlan/sib200`` dataset.

    The ``train``, ``validation`` and ``test`` splits are loaded, the string
    categories are mapped to integer ids (index in the alphabetically sorted
    list of training categories), and the three splits are returned together
    with that category list.

    Returns:
        ``(X_train, y_train), (X_val, y_val), (X_test, y_test), categories``
        where every ``X_*`` is a list of texts, every ``y_*`` is a list of
        integer label ids and ``categories[label_id]`` is the category name.

    Raises:
        RuntimeError: if the validation or the test split contains a category
            that never occurs in the training split.
    """
    texts = {}
    labels = {}
    for split in ('train', 'validation', 'test'):
        dataset = load_dataset('Davlan/sib200', 'rus_Cyrl', split=split)
        # list() guarantees plain Python lists, matching the annotated return
        # type even if column access yields another sequence type
        # (NOTE(review): newer `datasets` releases may return a lazy column object).
        texts[split] = list(dataset['text'])
        labels[split] = list(dataset['category'])

    train_categories = set(labels['train'])
    # Every evaluation label must be known from training, otherwise it cannot be encoded.
    for split in ('validation', 'test'):
        unknown_categories = set(labels[split]) - train_categories
        if len(unknown_categories) > 0:
            err_msg = f'The categories {unknown_categories} are represented in the {split} set, but they are not represented in the training set.'
            raise RuntimeError(err_msg)

    categories = sorted(train_categories)
    # Dict lookup instead of list.index(): one pass to build, O(1) per label.
    category_to_id = {name: idx for idx, name in enumerate(categories)}
    encoded = {
        split: [category_to_id[it] for it in split_labels]
        for split, split_labels in labels.items()
    }
    return (
        (texts['train'], encoded['train']),
        (texts['validation'], encoded['validation']),
        (texts['test'], encoded['test']),
        categories,
    )

In [None]:
# Each *_data value is a (texts, label_ids) pair; classes_list[label_id] is the category name.
train_data, val_data, test_data, classes_list = load_sib200_ru()

In [None]:
!pip install stanza



In [None]:
# def normalize_text(s: str, nlp_pipeline: spacy.Language) -> str:
#     doc = nlp_pipeline(s)
#     lemmas = [('<NUM>' if token.like_num else token.lemma_.lower()) for token in filter(lambda it1: not it1.is_punct, doc)]
#     if len(lemmas) == 0:
#         return ''
#     return ' '.join(lemmas)

import stanza
stanza.download('ru')  # Download the Stanza models for the Russian language

# Default pipeline shared by all normalize_text() calls that do not pass `nlp`.
# It is created on first use, so defining the function stays cheap.
_DEFAULT_STANZA_PIPELINE = None


def _is_punctuation(token_text: str) -> bool:
    """Return True when every character of ``token_text`` is Unicode punctuation."""
    import unicodedata  # local import keeps this cell self-contained

    return bool(token_text) and all(
        unicodedata.category(ch).startswith('P') for ch in token_text
    )


def normalize_text(s: str, nlp=None) -> str:
    """Lemmatise ``s`` and return the lemmas joined by single spaces.

    Args:
        s: Raw input text.
        nlp: Callable that turns a string into a Stanza-style document, i.e. an
            object exposing ``.sentences`` whose items expose ``.words`` with
            ``.text`` and ``.lemma``. When omitted, a Russian
            ``stanza.Pipeline`` is created once and reused by later calls.

    Returns:
        The normalised text: tokens consisting only of digits become
        ``'<NUM>'``, punctuation-only tokens are dropped, every other token is
        replaced by its lemma. An input without any remaining token yields ``''``.
    """
    global _DEFAULT_STANZA_PIPELINE
    if nlp is None:
        if _DEFAULT_STANZA_PIPELINE is None:
            _DEFAULT_STANZA_PIPELINE = stanza.Pipeline('ru', use_gpu=True)
        nlp = _DEFAULT_STANZA_PIPELINE

    doc = nlp(s)
    lemmas = []
    for sentence in doc.sentences:
        for word in sentence.words:
            if word.text.isdigit():
                lemmas.append('<NUM>')
            elif not _is_punctuation(word.text):
                # Fall back to the surface form if the lemmatiser produced
                # nothing, so that ' '.join never receives None.
                lemmas.append(word.lemma or word.text)

    # ' '.join of an empty list is already '', no special case needed.
    return ' '.join(lemmas)

# Example usage of normalize_text (no pipeline is passed, so its default one is used)
text = "В 2024 году машины быстро ездят."
normalized_text = normalize_text(text)
print(normalized_text)

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json:   0%|   …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Downloading default packages for language: ru (Russian) ...
INFO:stanza:File exists: /root/stanza_resources/ru/default.zip
INFO:stanza:Finished downloading models and saved to /root/stanza_resources
INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json:   0%|   …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: ru (Russian):
| Processor | Package            |
----------------------------------
| tokenize  | syntagrus          |
| pos       | syntagrus_charlm   |
| lemma     | syntagrus_nocharlm |
| depparse  | syntagrus_charlm   |
| ner       | wikiner            |

INFO:stanza:Using device: cuda
INFO:stanza:Loading: tokenize
  checkpoint = torch.load(filename, lambda storage, loc: storage)
INFO:stanza:Loading: pos
  checkpoint = torch.load(filename, lambda storage, loc: storage)
  data = torch.load(self.filename, lambda storage, loc: storage)
  state = torch.load(filename, lambda storage, loc: storage)
INFO:stanza:Loading: lemma
  checkpoint = torch.load(filename, lambda storage, loc: storage)
INFO:stanza:Loading: depparse
  checkpoint = torch.load(filename, lambda storage, loc: storage)
INFO:stanza:Loading: ner
  checkpoint = torch.load(filename, lambda storage, loc: storage)


в <NUM> год машина быстро ездить


In [None]:
# Category names in label-id order: the position in this list is the integer label.
print(f'Categories: {classes_list}')

Categories: ['entertainment', 'geography', 'health', 'politics', 'science/technology', 'sports', 'travel']


In [None]:
# Number of training texts, then number of training labels (one per line).
print(*(len(part) for part in train_data), sep='\n')

701
701


In [None]:
# Number of validation texts, then number of validation labels (one per line).
print(*(len(part) for part in val_data), sep='\n')

99
99


In [None]:
# Number of test texts, then number of test labels (one per line).
print(*(len(part) for part in test_data), sep='\n')

204
204


In [None]:
# normalize_text() iterates over `doc.sentences` / `sentence.words`, which a
# Stanza document provides and a spaCy `Doc` does not (passing a spaCy model
# raises AttributeError). The shared pipeline therefore has to be a Stanza one.
nlp = stanza.Pipeline('ru', use_gpu=True)



In [None]:
# First training text, before normalisation.
print(train_data[0][0])

Турция с трёх сторон окружена морями: на западе — Эгейским, на севере — Чёрным и на юге — Средиземным.


In [None]:
# Same training text after lemmatisation with the shared `nlp` pipeline.
# NOTE(review): normalize_text reads `doc.sentences`, so `nlp` must return a Stanza-style document.
print(normalize_text(train_data[0][0], nlp))
#print(normalize_text(train_data[0][0]))

AttributeError: 'spacy.tokens.doc.Doc' object has no attribute 'sentences'

In [None]:
# First validation text, before normalisation.
print(val_data[0][0])

Если увеличить расстояние для бега с четверти до половины мили, скорость становится не так важна, тогда как выносливость превращается в абсолютную необходимость.


In [None]:
# Same validation text after lemmatisation with the shared `nlp` pipeline.
print(normalize_text(val_data[0][0], nlp))
#print(normalize_text(val_data[0][0]))

In [None]:
# First test text, before normalisation.
print(test_data[0][0])

Мутация вносит новую генетическую вариацию, в то время как отбор убирает её из набора проявляющихся вариаций.


In [None]:
# Same test text after lemmatisation with the shared `nlp` pipeline.
print(normalize_text(test_data[0][0], nlp))
#print(normalize_text(test_data[0][0]))

In [None]:
# Share of documents per class if all classes were equally frequent.
class_probability = 1.0 / len(classes_list)
# Upper bound on a term's document frequency, later passed to TfidfVectorizer(max_df=...):
# 1 minus 20% of the uniform class share.
max_df = 1.0 - 0.2 * class_probability
print(f'Maximal document frequency of term is {max_df}.')

Maximal document frequency of term is 0.9714285714285714.


In [None]:
!pip install xgboost

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.feature_selection import SelectKBest, chi2
from xgboost import XGBClassifier

In [None]:
# classifier = Pipeline(steps=[
#     ('vectorizer', TfidfVectorizer(token_pattern='\w+', max_df=max_df, min_df=1, ngram_range=(1, 1))),
#     ('cls', VotingClassifier(estimators=[
#         ('lr', LogisticRegression(solver='saga', max_iter=100, random_state=42, C=1000, penalty='l1')),
#         ('rf', RandomForestClassifier(random_state=42)),
#         ('svc', SVC(probability=True, random_state=42)),
#         ('adaboost', AdaBoostClassifier(random_state=42))
#     ], voting='soft'))
# ])

# Baseline pipeline: TF-IDF features followed by logistic regression.
# The token pattern is a raw string so that `\w` reaches the regex engine
# verbatim instead of being parsed as an (invalid) string escape sequence.
# `ngram_range`, `C` and `penalty` given here are only starting values; the
# grid search overrides them with the candidates from `param_grid`.
classifier = Pipeline(steps=[
    ('vectorizer', TfidfVectorizer(token_pattern=r'\w+', max_df=max_df, min_df=1, ngram_range=(1, 1))),
    ('cls', LogisticRegression(solver='saga', max_iter=100, random_state=52, C=1000, penalty='l1'))
])

# classifier = Pipeline(steps=[
#     ('vectorizer', TfidfVectorizer(token_pattern='\w+', max_df=max_df, min_df=1, ngram_range=(1, 1))),
#     ('cls', AdaBoostClassifier(estimator=DecisionTreeClassifier(), random_state=42))
# ])

# Hyper-parameter candidates for the grid search; keys use the
# `<pipeline step>__<parameter>` naming of the `classifier` pipeline.
param_grid = {
    # word n-grams: unigrams only, up to bigrams, up to trigrams
    'vectorizer__ngram_range': [(1, upper) for upper in (1, 2, 3)],
    # inverse regularisation strength: 10, 100, 1000, 10000
    'cls__C': [10 ** exponent for exponent in range(1, 5)],
    'cls__penalty': ['l1', 'l2'],
}

#Best parameters:
#{'cls__rf__max_depth': 10, 'cls__rf__n_estimators': 100, 'cls__svc__kernel': 'linear'}

# param_grid = {
    # 'vectorizer__ngram_range': [(1, 1), (1, 2), (1, 3)],
    # 'cls__lr__C': [1e-1, 1, 10, 100, 1000],
    # 'cls__lr__penalty': ['l1', 'l2'],
    # 'cls__rf__n_estimators': [100, 200, 500],
    # 'cls__rf__max_depth': [None, 10, 20, 50],
    # 'cls__rf__min_samples_split': [2, 5, 10],
    # 'cls__svc__C': [0.1, 1, 10],
    # 'cls__svc__kernel': ['linear', 'rbf'],
# }

# param_grid = {
    # 'vectorizer__ngram_range': [(1, 1)],
#     'cls__n_estimators': [50, 100, 200],  # Количество базовых моделей
#     'cls__learning_rate': [0.01, 0.1, 0.5, 1.0],  # Скорость обучения
#     'cls__estimator__max_depth': [1, 2, 3, 5],  # Глубина базового классификатора
#     'cls__estimator__min_samples_split': [2, 5, 10],  # Минимальное количество образцов для разбиения узла
# }

# classifier = Pipeline(steps=[
#     ('vectorizer', TfidfVectorizer(token_pattern='\w+', max_df=max_df, min_df=1, stop_words=stop_words)),
#     ('feature_selection', SelectKBest(chi2, k=20000)),  # Выбор лучших признаков
#     ('cls', XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'))
# ])

# classifier = Pipeline(steps=[
#     ('vectorizer', TfidfVectorizer(token_pattern=r'\w+', max_df=max_df, min_df=1, ngram_range=(1, 1))),
#     ('cls', XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42, device='cuda'))
# ])

# # Параметры для подбора
# param_grid = {
#     # 'vectorizer__ngram_range': [(1, 1)],
#     'cls__max_depth': [3, 5, 7],
#     'cls__learning_rate': [0.01, 0.1, 0.3],
#     'cls__n_estimators': [100, 200, 300]
# }

In [None]:
# Exhaustive search over `param_grid` for the `classifier` pipeline,
# scored by macro-averaged F1 with 3-fold cross-validation.
cv = GridSearchCV(
    estimator=classifier,
    param_grid=param_grid,
    scoring='f1_macro',
    cv=3,
    refit=True,  # refit the best configuration so `cv` itself can predict afterwards
    n_jobs=-1,  # use all available CPU cores
    verbose=True
)

In [None]:
# Bind `nlp` to a Russian Stanza pipeline; the cells below pass it to normalize_text().
nlp = stanza.Pipeline('ru', use_gpu=True)

INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json:   0%|   …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: ru (Russian):
| Processor | Package            |
----------------------------------
| tokenize  | syntagrus          |
| pos       | syntagrus_charlm   |
| lemma     | syntagrus_nocharlm |
| depparse  | syntagrus_charlm   |
| ner       | wikiner            |

INFO:stanza:Using device: cuda
INFO:stanza:Loading: tokenize
  checkpoint = torch.load(filename, lambda storage, loc: storage)
INFO:stanza:Loading: pos
  checkpoint = torch.load(filename, lambda storage, loc: storage)
  data = torch.load(self.filename, lambda storage, loc: storage)
  state = torch.load(filename, lambda storage, loc: storage)
INFO:stanza:Loading: lemma
  checkpoint = torch.load(filename, lambda storage, loc: storage)
INFO:stanza:Loading: depparse
  checkpoint = torch.load(filename, lambda storage, loc: storage)
INFO:stanza:Loading: ner
  checkpoint = torch.load(filename, lambda storage, loc: storage)


# TODO: try Optuna for the hyperparameter search instead of the exhaustive grid search

In [None]:
# Lemmatise every training text with `nlp`, then run the grid search on the
# normalised texts and their integer labels.
#cv.fit([normalize_text(it) for it in train_data[0]], train_data[1])
cv.fit([normalize_text(it, nlp) for it in train_data[0]], train_data[1])
#classifier.fit([normalize_text(it, nlp) for it in train_data[0]], train_data[1])

Fitting 3 folds for each of 10 candidates, totalling 30 fits




In [None]:
# Winning hyper-parameter combination found by the grid search.
print('Best parameters:', cv.best_params_, sep='\n')

Best parameters:
{'cls__C': 1000, 'cls__penalty': 'l2'}


In [None]:
# Mean cross-validated macro-F1 of the winning combination.
print('Best F1-macro:', cv.best_score_, sep='\n')

Best F1-macro:
0.6353569340415711


In [None]:
#print(f'Vocabulary size is {len(cv.best_estimator_.named_steps["vectorizer"].vocabulary_)}.')

In [None]:
# Per-class precision/recall/F1 on the validation split, using the model
# selected by the grid search and the same normalisation as for training.
#y_pred = classifier.predict([normalize_text(it, nlp) for it in val_data[0]])
y_pred = cv.predict([normalize_text(it, nlp) for it in val_data[0]])
print(classification_report(y_true=val_data[1], y_pred=y_pred, target_names=classes_list))

                    precision    recall  f1-score   support

     entertainment       0.83      0.56      0.67         9
         geography       0.67      0.75      0.71         8
            health       1.00      0.64      0.78        11
          politics       0.91      0.71      0.80        14
science/technology       0.61      0.80      0.69        25
            sports       0.82      0.75      0.78        12
            travel       0.59      0.65      0.62        20

          accuracy                           0.71        99
         macro avg       0.77      0.69      0.72        99
      weighted avg       0.74      0.71      0.71        99



In [None]:
# Per-class precision/recall/F1 on the held-out test split, using the model
# selected by the grid search and the same normalisation as for training.
#y_pred = classifier.predict([normalize_text(it, nlp) for it in test_data[0]])
y_pred = cv.predict([normalize_text(it, nlp) for it in test_data[0]])
print(classification_report(y_true=test_data[1], y_pred=y_pred, target_names=classes_list))

                    precision    recall  f1-score   support

     entertainment       0.78      0.37      0.50        19
         geography       0.64      0.53      0.58        17
            health       0.56      0.45      0.50        22
          politics       0.81      0.83      0.82        30
science/technology       0.62      0.78      0.69        51
            sports       0.78      0.72      0.75        25
            travel       0.55      0.60      0.57        40

          accuracy                           0.65       204
         macro avg       0.68      0.61      0.63       204
      weighted avg       0.66      0.65      0.65       204



                    precision    recall  f1-score   support

     entertainment       0.89      0.42      0.57        19
         geography       0.60      0.53      0.56        17
            health       0.50      0.45      0.48        22
          politics       0.78      0.83      0.81        30  
science/technology       0.68      0.78      0.73        51  
            sports       0.87      0.80      0.83        25  
            travel       0.61      0.70      0.65        40

          accuracy                           0.69       204
         macro avg       0.70      0.65      0.66       204
      weighted avg       0.70      0.69      0.68       204