# Лабораторная работа №2 "Классификация текста"

In [None]:
from catboost import CatBoostClassifier
from collections import Counter
from matplotlib import pyplot as plt
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from typing import Sequence
import fasttext
import nltk
import pandas as pd
import re


# Matches any character that is neither whitespace nor a Latin letter
# (case-insensitive); substituting '' keeps only letters and whitespace.
ONLY_WORDS = re.compile(r'[^\sa-z]', flags=re.IGNORECASE)
# Matches any non-digit character; substituting '' keeps only the digits.
ONLY_DIGITS = re.compile(r'[^\d]')
# Matches a run of one or more whitespace characters.
ALL_SPACE_SYMBOLS = re.compile(r'\s+')
# Single shared lemmatizer instance, reused by lemmatize().
LEMMATIZER = WordNetLemmatizer()


# Download the NLTK resources, in the same order as before.
for nltk_resource in ('punkt', 'wordnet', 'stopwords'):
    nltk.download(nltk_resource)


def sanitize_text(text: str) -> str:
    """Normalize raw text to lower-case Latin letters separated by single spaces.

    The text is lower-cased, every character matched by ``ONLY_WORDS``
    (anything that is not whitespace or a Latin letter) is removed, and each
    whitespace run is then replaced by one space. Leading and trailing
    whitespace is collapsed but not stripped.

    Args:
        text: The raw input string.

    Returns:
        The normalized string.
    """
    letters_and_spaces = ONLY_WORDS.sub('', text.lower())
    return ALL_SPACE_SYMBOLS.sub(' ', letters_and_spaces)


def save_cls_data(filename: str, feature: Sequence[str], label: Sequence[int]) -> None:
    """Write samples to a text file, one ``__label__<label> <text>`` line each.

    The file is always written as UTF-8 so that the result does not depend on
    the platform's default locale encoding.

    Args:
        filename: Path of the file to create; an existing file is overwritten.
        feature: The sample texts. Each text is expected to be a single line,
            because one output line is written per sample.
        label: The class labels, paired with ``feature`` by position. Pairing
            stops at the shorter of the two sequences.
    """
    with open(filename, 'w', encoding='utf-8') as file:
        # writelines() takes an iterable of lines; feed it one complete line
        # per sample instead of a single string (which it would iterate
        # character by character).
        file.writelines(
            f'__label__{each_label} {each_feature}\n'
            for each_feature, each_label in zip(feature, label)
        )


def lemmatize(sentence: str) -> str:
    """Lemmatize every space-separated token of a sentence.

    The sentence is split on single space characters, each token is passed to
    ``LEMMATIZER.lemmatize`` with its default arguments, and the results are
    joined back with single spaces.

    Args:
        sentence: The text to lemmatize.

    Returns:
        The sentence with each token replaced by its lemma.
    """
    tokens = sentence.split(' ')
    return ' '.join(map(LEMMATIZER.lemmatize, tokens))

## Подготовка данных

In [None]:
# Load the dataset into a DataFrame.
# NOTE(review): the path ends in .tar.gz; presumably pandas infers the
# compression from the extension -- confirm the installed pandas supports tar archives.
df = pd.read_csv('data/indian_fake_news.tar.gz')
# Bare expression: shown as the cell output when run in a notebook.
df

In [None]:
# Keep only rows that have text. The explicit .copy() makes the result an
# independent DataFrame, so the column assignments below do not write into a
# selection of the original frame (which raises pandas' SettingWithCopyWarning).
df = df[df['text'].notna()].copy()
df['text'] = df['text'].apply(sanitize_text)
# Replace the labels with integer codes; sort=True assigns the codes in the
# sorted order of the distinct label values.
# NOTE(review): the column is renamed to 'is_real' below, which assumes the
# "real" class gets code 1 in that sorted order -- confirm against the data.
df['label'] = pd.factorize(df['label'], sort=True)[0]
df.rename({'label': 'is_real'}, inplace=True, axis=1)
df.info()

In [None]:
# Count every character across all texts (the texts are concatenated into one string).
frequency_analysis = Counter(df['text'].str.cat())  # type: ignore

# Order the (character, count) pairs by ascending count.
dec_sorted = sorted(frequency_analysis.items(), key=lambda pair: pair[1])
keys = [str(symbol) for symbol, _ in dec_sorted]
values = [count for _, count in dec_sorted]

# Horizontal bar chart: one bar per character.
plt.barh(keys, values)
plt.title('Частотный анализ символов')
plt.xlabel('Частота')
plt.ylabel('Символ')
plt.show()

In [None]:
# Plot the class distribution twice: for all rows, then with duplicate rows removed.
for frame, title in (
    (df, 'Анализ значений классификации'),
    (df.drop_duplicates(), 'Анализ значений классификации (только уникальные)'),
):
    class_counts = frame['is_real'].value_counts()
    class_counts.plot(kind='barh', title=title, xlabel='Число вхождений', ylabel='Метка')
    plt.show()

In [None]:
# NOTE(review): despite its name, this value is passed as test_size below, so
# it is the share of rows that goes to the TEST set (0.2); consider renaming.
TRAIN_PROPORTION = 2e-1
# Seed passed as random_state so the split is reproducible.
RANDOM_SEED = 42

# Split the cleaned DataFrame into train and test parts.
train, test = train_test_split(df, test_size=TRAIN_PROPORTION, random_state=RANDOM_SEED)

In [None]:
# Files that receive the labelled samples written by save_cls_data().
TRAIN_FILENAME = 'train.txt'
TEST_FILENAME = 'test.txt'

# Separate the text column (features) from the 'is_real' column (targets).
X_train, y_train = train['text'], train['is_real']
X_test, y_test = test['text'], test['is_real']

# Write each split to its file.
for split_filename, split_texts, split_labels in (
    (TRAIN_FILENAME, X_train, y_train),
    (TEST_FILENAME, X_test, y_test),
):
    save_cls_data(split_filename, split_texts, split_labels)

## Fasttext

In [None]:
EPOCHS = 20
LR = 1.0
WORD_NGRAMS = 3

# Train on the file written by save_cls_data(); use the TRAIN_FILENAME constant
# rather than repeating the literal so the two cannot drift apart.
fasttext_model = fasttext.train_supervised(TRAIN_FILENAME, epoch=EPOCHS, lr=LR, wordNgrams=WORD_NGRAMS)
# predict() returns (labels, probabilities); take the first label string
# (written as '__label__<n>' in the training file) and keep only its digits.
y_pred = X_test.apply(lambda x: int(ONLY_DIGITS.sub('', fasttext_model.predict(x)[0][0])))  # type: ignore

# (true labels, predicted labels) pair consumed by the evaluation cells.
fasttext_out = (y_test.values, y_pred)

## CatBoost

In [None]:
# NOTE(review): the whole text is declared as a single categorical feature, so
# each distinct text is presumably treated as its own category; CatBoost also
# offers text_features=['text'] for free text -- confirm which one is intended.
catboost_model = CatBoostClassifier(cat_features=['text'], random_state=RANDOM_SEED, verbose=0)
catboost_model.fit(pd.DataFrame(X_train, columns=['text']), y_train)

y_pred = catboost_model.predict(pd.DataFrame(X_test, columns=['text']))
# Store the true labels as an array (y_test.values), matching fasttext_out and
# random_forest_out, so all three result pairs have the same shape of data.
catboost_out = (y_test.values, y_pred)

## Случайный лес

In [None]:
# Bag-of-words vectorizer over unigrams and bigrams, with NLTK's English stop-word list.
COUNT_VECTORIZER = CountVectorizer(ngram_range=(1, 2), stop_words=stopwords.words('english'))

# Lemmatize both splits first ...
X_train = X_train.apply(lemmatize)
X_test = X_test.apply(lemmatize)
# ... then fit the vocabulary on the training texts only and reuse it for the test texts.
# From here on X_train / X_test hold the vectorized matrices, not the text Series.
X_train = COUNT_VECTORIZER.fit_transform(X_train)
X_test = COUNT_VECTORIZER.transform(X_test)

In [None]:
N_ESTIMATORS = 42
MAX_DEPTH = 9

# Build the forest, then fit it on the vectorized training data.
random_forest_model = RandomForestClassifier(
    n_estimators=N_ESTIMATORS,
    max_depth=MAX_DEPTH,
    random_state=RANDOM_SEED,
)
random_forest_model.fit(X_train, y_train)
y_pred = random_forest_model.predict(X_test)

# (true labels, predicted labels) pair consumed by the evaluation cells.
random_forest_out = (y_test.values, y_pred)

## Тестирование моделей

In [None]:
# Draw one confusion matrix per model, titled with the model's name.
for model_name, (y_true, y_pred) in zip(
    ('Fasttext', 'CatBoost', 'Случайный лес'),
    (fasttext_out, catboost_out, random_forest_out),
):
    cm = confusion_matrix(y_true, y_pred)
    # Keep the display object under its own name: binding it to `plt` would
    # overwrite the matplotlib.pyplot module alias and break later plt.* calls.
    cm_display = ConfusionMatrixDisplay(confusion_matrix=cm).plot()
    cm_display.ax_.set_title(model_name)
plt.show()

In [None]:
# Print the per-class precision / recall / F1 report for each model in turn.
model_outputs = (fasttext_out, catboost_out, random_forest_out)
for y_true, y_pred in model_outputs:
    report = classification_report(y_true, y_pred)
    print(report)