In [1]:
import pandas as pd 
import pymorphy3
import nlpaug.augmenter.word as naw

from tqdm import tqdm
tqdm.pandas()

import nltk
nltk.download("stopwords")
nltk.download('punkt')
nltk.download('wordnet', quiet=True)

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re
from xgboost import XGBClassifier

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\makan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\makan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
test = pd.read_csv(r"Lab1/data/data_rus_Cyrl_test.tsv", sep='\t')

train  = pd.read_csv(r"Lab1/data/data_rus_Cyrl_train.tsv", sep='\t')

In [3]:
categories = train.category.unique()

In [4]:
train = train.drop(columns="index_id")

test = test.drop(columns="index_id")

In [5]:
stop_words = stopwords.words('russian') # to delete common words in the language (а, и, на...)

morph = pymorphy3.MorphAnalyzer() # to bring the word to its normal form (“люди -> человек”)

aug = naw.ContextualWordEmbsAug(model_path='cointegrated/rubert-tiny2') # to extend train data

In [6]:
aug_text = train.text.apply(lambda x: aug.augment(x))

In [7]:
train = pd.concat([train, pd.DataFrame({'text':aug_text, 'category':train.category})])

In [8]:
def preprocess(text):
  text = re.sub(r'[^\w\s]', ' ', str(text).lower()).strip() # to remove punctuation marks and
  # transfer everything to the lower region

  tokens = word_tokenize(text, preserve_line=True) #  to tokenize text

  tokens = [word for word in tokens if word not in stop_words] # remove stopwords

  tokens = [morph.parse(word)[0].normal_form for word in tokens] # bring to normal form
  
  return " ".join(tokens) 

In [9]:
test.text = test.text.apply(lambda x: preprocess(x))

train.text = train.text.apply(lambda x: preprocess(x))

In [10]:
train

Unnamed: 0,category,text
0,geography,турция три сторона окружить море запад эгейски...
1,science/technology,начало война основное передвигаться поверхност...
2,science/technology,мера знание греческий язык ухудшаться запад ок...
3,entertainment,зимой это другой красота шарм горный деревушка...
4,entertainment,этот отель останавливаться богатый известный ч...
...,...,...
696,science/technology,хотя внутри соединение свой контактный данные ...
697,science/technology,отвечать задать вопрос почему обучение социали...
698,travel,некоторый другой аэропорт представить портал о...
699,sports,спасибо настоящий защитник быстро перейти шахт...


In [11]:
bow_vectorizer = CountVectorizer() # Bag of words
bow_vectorizer.fit(train.text)

tf_idf_vectorizer = TfidfVectorizer() # TF-idf
tf_idf_vectorizer.fit(train.text)

In [20]:
log_reg = LogisticRegression()

xgb = XGBClassifier() # Models 

tree = DecisionTreeClassifier()

In [13]:
tf_idf_train = tf_idf_vectorizer.transform(train.text)
tf_idf_test = tf_idf_vectorizer.transform(test.text)

In [14]:
log_reg.fit(tf_idf_train, train.category)
# Logistic Regression for TF-idf data
y_pred = log_reg.predict(tf_idf_test)
print(classification_report(test.category, y_pred, target_names=categories))

                    precision    recall  f1-score   support

         geography       1.00      0.21      0.35        19
science/technology       0.86      0.35      0.50        17
     entertainment       0.86      0.27      0.41        22
          politics       0.81      0.73      0.77        30
            health       0.47      0.88      0.61        51
            travel       0.89      0.68      0.77        25
            sports       0.64      0.70      0.67        40

          accuracy                           0.63       204
         macro avg       0.79      0.55      0.58       204
      weighted avg       0.73      0.63      0.61       204



In [15]:
bow_train = bow_vectorizer.transform(train.text)

bow_test = bow_vectorizer.transform(test.text)

In [16]:
log_reg.fit(bow_train, train.category)
# Logistic Regression for Bag of words data
y_pred = log_reg.predict(bow_test)
print(classification_report(test.category, y_pred, target_names=categories))

                    precision    recall  f1-score   support

         geography       0.71      0.26      0.38        19
science/technology       0.85      0.65      0.73        17
     entertainment       0.73      0.50      0.59        22
          politics       0.79      0.77      0.78        30
            health       0.57      0.86      0.69        51
            travel       0.91      0.80      0.85        25
            sports       0.66      0.68      0.67        40

          accuracy                           0.69       204
         macro avg       0.75      0.64      0.67       204
      weighted avg       0.72      0.69      0.68       204



In [17]:
tree.fit(tf_idf_train, train.category)
# Decision Tree for TF-idf data
y_pred = tree.predict(tf_idf_test)
print(classification_report(test.category, y_pred, target_names=categories))

                    precision    recall  f1-score   support

         geography       0.47      0.37      0.41        19
science/technology       0.50      0.29      0.37        17
     entertainment       0.61      0.50      0.55        22
          politics       0.90      0.60      0.72        30
            health       0.46      0.67      0.54        51
            travel       0.71      0.68      0.69        25
            sports       0.56      0.60      0.58        40

          accuracy                           0.57       204
         macro avg       0.60      0.53      0.55       204
      weighted avg       0.59      0.57      0.57       204



In [21]:
encoder = LabelEncoder()
train_enc =  encoder.fit_transform(train.category)
test_enc = encoder.transform(test.category)

xgb.fit(tf_idf_train, train_enc)

In [22]:
y_pred_enc = xgb.predict(tf_idf_test)
# XGBClassifier for TF-idf data
y_pred = encoder.inverse_transform(y_pred_enc)
print(classification_report(test.category, y_pred, target_names=categories))

                    precision    recall  f1-score   support

         geography       0.70      0.37      0.48        19
science/technology       0.53      0.47      0.50        17
     entertainment       0.57      0.36      0.44        22
          politics       0.72      0.60      0.65        30
            health       0.53      0.86      0.66        51
            travel       0.80      0.64      0.71        25
            sports       0.68      0.62      0.65        40

          accuracy                           0.62       204
         macro avg       0.65      0.56      0.59       204
      weighted avg       0.64      0.62      0.61       204



In [30]:
pca = PCA(n_components=701)

pca_train = pca.fit_transform(tf_idf_train.toarray())
pca_test = pca.transform(tf_idf_test.toarray())

In [31]:
log_reg.fit(pca_train, train.category)
# Logistic Regression for PCA data
y_pred = log_reg.predict(pca_test)
print(classification_report(test.category, y_pred, target_names=categories))

                    precision    recall  f1-score   support

         geography       1.00      0.16      0.27        19
science/technology       0.86      0.35      0.50        17
     entertainment       0.86      0.27      0.41        22
          politics       0.81      0.73      0.77        30
            health       0.47      0.86      0.61        51
            travel       0.89      0.68      0.77        25
            sports       0.62      0.72      0.67        40

          accuracy                           0.62       204
         macro avg       0.79      0.54      0.57       204
      weighted avg       0.72      0.62      0.60       204



In [32]:
tree.fit(pca_train, train.category)
# Decision Tree for PCA data
y_pred = tree.predict(pca_test)
print(classification_report(test.category, y_pred, target_names=categories))

                    precision    recall  f1-score   support

         geography       0.08      0.05      0.06        19
science/technology       0.38      0.35      0.36        17
     entertainment       0.13      0.09      0.11        22
          politics       0.44      0.47      0.45        30
            health       0.45      0.61      0.52        51
            travel       0.42      0.32      0.36        25
            sports       0.28      0.28      0.28        40

          accuracy                           0.36       204
         macro avg       0.31      0.31      0.31       204
      weighted avg       0.33      0.36      0.34       204



In [29]:
encoder = LabelEncoder()
train_enc =  encoder.fit_transform(train.category)
test_enc = encoder.transform(test.category)

xgb.fit(pca_train, train_enc)
# XGBClassifier for PCA data
y_pred_enc = xgb.predict(pca_test)
y_pred = encoder.inverse_transform(y_pred_enc)
print(classification_report(test.category, y_pred, target_names=categories))

                    precision    recall  f1-score   support

         geography       0.60      0.16      0.25        19
science/technology       0.73      0.47      0.57        17
     entertainment       0.55      0.27      0.36        22
          politics       0.71      0.80      0.75        30
            health       0.48      0.80      0.60        51
            travel       0.76      0.64      0.70        25
            sports       0.61      0.55      0.58        40

          accuracy                           0.59       204
         macro avg       0.63      0.53      0.54       204
      weighted avg       0.61      0.59      0.57       204

