Як текстову модель використати TF-IDF. Виконати класифікацію за допомогою алгоритмів логістичної регресії та градієнтного бустингу, порівняти їхню точність. Спробувати покращити моделі за допомогою GridSearchCV.

## Завантаження файлу у датафрейм

In [1]:
import pandas as pd

# The first CSV column is read as the index and then replaced by a fresh
# 0..n-1 index, leaving only the data columns in the frame.
comments_df = pd.read_csv('science.csv', index_col=0)
comments_df = comments_df.reset_index(drop=True)
comments_df

Unnamed: 0,Comment,Topic
0,A few things. You might have negative- frequen...,Biology
1,Is it so hard to believe that there exist part...,Physics
2,There are bees,Biology
3,I'm a medication technician. And that's alot o...,Biology
4,Cesium is such a pretty metal.,Chemistry
...,...,...
8690,I make similar observations over the last week...,Biology
8691,You would know.,Biology
8692,Also use the correct number of sig figs,Chemistry
8693,"What about the ethical delimmas, groundbreaki...",Biology


## Видалення дублікатів

In [2]:
# Report how many rows repeat an earlier row, then show the number of
# missing values in each column.
print(f'Duplicates count: {comments_df.duplicated().sum()}')
comments_df.isna().sum()

Duplicates count: 723


Comment    0
Topic      0
dtype: int64

In [3]:
# Keep the first occurrence of every repeated row and renumber the index
# from zero in the same call.
comments_df = comments_df.drop_duplicates(ignore_index=True)
comments_df.head()

Unnamed: 0,Comment,Topic
0,A few things. You might have negative- frequen...,Biology
1,Is it so hard to believe that there exist part...,Physics
2,There are bees,Biology
3,I'm a medication technician. And that's alot o...,Biology
4,Cesium is such a pretty metal.,Chemistry


## Пошук порожніх документів

In [4]:
# Rows whose comment is empty once surrounding whitespace is stripped.
comments_df.loc[comments_df['Comment'].str.strip() == '']

Unnamed: 0,Comment,Topic


## Попередня обробка документів

In [5]:
import re
from nltk.tokenize import WordPunctTokenizer
from nltk.corpus import stopwords

# Stored as a set so the per-token `not in stop_words` test in
# preprocess_document is a hash lookup instead of a scan over the whole list.
stop_words = set(stopwords.words('english'))
tokenizer = WordPunctTokenizer()


def preprocess_document(doc):
    """Normalise one raw comment into a space-separated string of tokens.

    Every character that is neither an ASCII letter nor whitespace is
    removed, the text is lower-cased and stripped, split with the
    module-level ``tokenizer``, and tokens found in ``stop_words`` are
    dropped.

    Args:
        doc: Raw comment text.

    Returns:
        The remaining tokens joined by single spaces; an empty string when
        no token survives.
    """
    # The flags used to be passed positionally, where re.sub expects `count`:
    # re.I | re.A == 258, so only the first 258 matches were removed and no
    # flag was ever applied. The character class already lists both letter
    # cases, so the call needs no flags at all.
    doc = re.sub(r'[^a-zA-Z\s]', '', doc)
    doc = doc.lower()
    doc = doc.strip()
    tokens = tokenizer.tokenize(doc)
    filtered_tokens = [token for token in tokens if token not in stop_words]
    doc = ' '.join(filtered_tokens)
    return doc

In [6]:
# Run preprocess_document over every raw comment and keep the result in a
# new column next to the original text.
comments_df['CleanedComment'] = comments_df['Comment'].apply(preprocess_document)
comments_df.head()

Unnamed: 0,Comment,Topic,CleanedComment
0,A few things. You might have negative- frequen...,Biology,things might negative frequency dependent sele...
1,Is it so hard to believe that there exist part...,Physics,hard believe exist particulars cant detect any...
2,There are bees,Biology,bees
3,I'm a medication technician. And that's alot o...,Biology,im medication technician thats alot drugs live...
4,Cesium is such a pretty metal.,Chemistry,cesium pretty metal


## Розділення на навчальні та тестові набори

In [7]:
from sklearn.model_selection import train_test_split

# 80/20 split of the cleaned comments, stratified by topic and seeded so the
# same partition is produced on every run.
train_corpus, test_corpus, train_label_names, test_label_names = train_test_split(
    comments_df['CleanedComment'],
    comments_df['Topic'],
    test_size=0.2,
    stratify=comments_df['Topic'],
    random_state=1234,
)

## Вилучення ознак з використанням моделі TF-IDF

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the vectorizer on the training corpus only, then reuse the fitted
# vectorizer to transform the test corpus.
tfidf_vectorizer = TfidfVectorizer()
tfidf_train_features = tfidf_vectorizer.fit_transform(train_corpus)
tfidf_test_features = tfidf_vectorizer.transform(test_corpus)

## Класифікація за допомогою логістичної регресії

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Sort the labels so the report rows appear in the same order on every run;
# the iteration order of a set of strings changes with hash randomisation.
unique_classes = sorted(set(test_label_names))
logistic_regression = LogisticRegression()
logistic_regression.fit(tfidf_train_features, train_label_names)
predicted_labels = logistic_regression.predict(tfidf_test_features)
print(classification_report(test_label_names, predicted_labels, labels=unique_classes))

              precision    recall  f1-score   support

   Chemistry       0.66      0.67      0.67       575
     Biology       0.64      0.79      0.70       611
     Physics       0.79      0.50      0.61       409

    accuracy                           0.67      1595
   macro avg       0.70      0.65      0.66      1595
weighted avg       0.69      0.67      0.67      1595


## Класифікація за допомогою градієнтного бустингу

In [10]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report

# Seed the booster (same seed as the train/test split) so repeated runs of
# this cell give the same model and the same report.
gbc = GradientBoostingClassifier(random_state=1234)
gbc.fit(tfidf_train_features, train_label_names)
predicted_labels = gbc.predict(tfidf_test_features)
print(classification_report(test_label_names, predicted_labels, labels=unique_classes))

              precision    recall  f1-score   support

   Chemistry       0.72      0.38      0.50       575
     Biology       0.49      0.89      0.63       611
     Physics       0.83      0.38      0.52       409

    accuracy                           0.57      1595
   macro avg       0.68      0.55      0.55      1595
weighted avg       0.66      0.57      0.56      1595


## Покращення моделей за допомогою GridSearchCV

In [11]:
from sklearn.model_selection import GridSearchCV


def tune_model(model, param_grid):
    """Grid-search ``model`` over ``param_grid`` and print a short report.

    The search is fitted on the module-level TF-IDF training features and
    labels, and the fitted search is then scored on the module-level test
    features and labels.

    Args:
        model: Estimator to tune.
        param_grid: Parameter grid handed to ``GridSearchCV``.

    Returns:
        The ``best_estimator_`` of the fitted search.
    """
    model_cv = GridSearchCV(model, param_grid=param_grid, n_jobs=-1)
    model_cv.fit(tfidf_train_features, train_label_names)
    print(f"Best parameters: {model_cv.best_params_}")
    # best_score_ is the mean cross-validated score of the best candidate,
    # not the accuracy on the training set, so it is labelled as such.
    print(f"Mean CV accuracy: {model_cv.best_score_}")
    print(f"Test accuracy: {model_cv.score(tfidf_test_features, test_label_names)}")
    return model_cv.best_estimator_

In [12]:
# Tune the C parameter of a fresh, unfitted logistic regression over a
# range spanning four orders of magnitude.
logistic_regression = LogisticRegression()
param_grid = {'C': [0.001, 0.01, 0.1, 0.5, 1, 5, 10]}
logistic_regression_cv = tune_model(logistic_regression, param_grid)

Best parameters: {'C': 1}
Train accuracy: 0.667551662671338
Test accuracy: 0.671473354231975


In [13]:
import numpy as np

# Candidate ensemble sizes: 100, 120, 140, 160 and 180 trees.
param_grid = {'n_estimators': np.arange(100, 200, 20)}
# Seed the booster (same seed as the train/test split) so the grid search
# compares candidates on identical fits and is repeatable between runs.
gbc = GradientBoostingClassifier(random_state=1234)
gbc_cv = tune_model(gbc, param_grid)

Best parameters: {'n_estimators': 180}
Train accuracy: 0.607179298051509
Test accuracy: 0.6012539184952979
