In [5]:
# @title Загрузить набор данных
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

# Load dataset
data = pd.read_csv('/content/data/train.csv')

# Extracting features and labels
texts = data['review_text'].tolist()
labels = data['class_index'].tolist()

# Splitting the dataset into training and testing sets
train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)


In [7]:
# @title  Преобразовать в вектор объектов
# Method 1: Using CountVectorizer
count_vectorizer = CountVectorizer()
X_train_count = count_vectorizer.fit_transform(train_texts)
X_test_count = count_vectorizer.transform(test_texts)

# Method 2: Using TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(train_texts)
X_test_tfidf = tfidf_vectorizer.transform(test_texts)

In [8]:
# @title Определить классификатор
# Define classifiers
knn_classifier = KNeighborsClassifier()
logistic_classifier = LogisticRegression()

In [9]:
# @title Обучение и тестирование с помощью CountVectorizer

print("Training and testing with CountVectorizer:")
knn_classifier.fit(X_train_count, train_labels)
knn_pred_count = knn_classifier.predict(X_test_count)
print("KNN Accuracy (CountVectorizer):", accuracy_score(test_labels, knn_pred_count))
print("KNN Classification Report (CountVectorizer):\n", classification_report(test_labels, knn_pred_count))

logistic_classifier.fit(X_train_count, train_labels)
logistic_pred_count = logistic_classifier.predict(X_test_count)
print("Logistic Regression Accuracy (CountVectorizer):", accuracy_score(test_labels, logistic_pred_count))
print("Logistic Regression Classification Report (CountVectorizer):\n", classification_report(test_labels, logistic_pred_count))

Training and testing with CountVectorizer:
KNN Accuracy (CountVectorizer): 0.6200407170167337
KNN Classification Report (CountVectorizer):
               precision    recall  f1-score   support

           0       0.48      0.22      0.30       760
           1       0.52      0.26      0.35      3287
           2       0.64      0.93      0.76     11129
           3       0.58      0.25      0.35      3982
           4       0.56      0.15      0.23       981

    accuracy                           0.62     20139
   macro avg       0.55      0.36      0.40     20139
weighted avg       0.60      0.62      0.57     20139

Logistic Regression Accuracy (CountVectorizer): 0.6651770197129947
Logistic Regression Classification Report (CountVectorizer):
               precision    recall  f1-score   support

           0       0.50      0.21      0.30       760
           1       0.54      0.37      0.44      3287
           2       0.71      0.90      0.79     11129
           3       0.59  

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [10]:
# @title Обучение и тестирование с помощью TfidfVectorizer
#Training and testing with TfidfVectorizer

print("\nTraining and testing with TfidfVectorizer:")
knn_classifier.fit(X_train_tfidf, train_labels)
knn_pred_tfidf = knn_classifier.predict(X_test_tfidf)
print("KNN Accuracy (TfidfVectorizer):", accuracy_score(test_labels, knn_pred_tfidf))
print("KNN Classification Report (TfidfVectorizer):\n", classification_report(test_labels, knn_pred_tfidf))

logistic_classifier.fit(X_train_tfidf, train_labels)
logistic_pred_tfidf = logistic_classifier.predict(X_test_tfidf)
print("Logistic Regression Accuracy (TfidfVectorizer):", accuracy_score(test_labels, logistic_pred_tfidf))
print("Logistic Regression Classification Report (TfidfVectorizer):\n", classification_report(test_labels, logistic_pred_tfidf))



Training and testing with TfidfVectorizer:
KNN Accuracy (TfidfVectorizer): 0.630766175083172
KNN Classification Report (TfidfVectorizer):
               precision    recall  f1-score   support

           0       0.40      0.30      0.35       760
           1       0.50      0.40      0.44      3287
           2       0.68      0.85      0.76     11129
           3       0.55      0.37      0.44      3982
           4       0.51      0.22      0.30       981

    accuracy                           0.63     20139
   macro avg       0.53      0.43      0.46     20139
weighted avg       0.61      0.63      0.61     20139

Logistic Regression Accuracy (TfidfVectorizer): 0.6521674363175928
Logistic Regression Classification Report (TfidfVectorizer):
               precision    recall  f1-score   support

           0       0.57      0.12      0.20       760
           1       0.54      0.33      0.41      3287
           2       0.69      0.90      0.78     11129
           3       0.58  

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
