In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report


In [24]:
# Load the training corpus. Column names are supplied explicitly, so the first
# line of the file is read as data rather than as a header row.
data = pd.read_csv(
    '../raw_data/fulltrain.csv',
    header=None,
    names=['label', 'text'],
)
data.shape

(48854, 2)

In [25]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffle (and therefore the split) repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    data['text'],
    data['label'],
    test_size=0.2,
    random_state=42,
)

In [26]:
# Largest n-gram size to extract: 1 = unigrams only; set to 2 or 3 to also
# include bigrams / trigrams.
n = 1

# Fit the TF-IDF vocabulary and weights on the training texts only, then encode
# the held-out texts with that same fitted vectorizer.
vectorizer = TfidfVectorizer(
    ngram_range=(1, n),
)
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

In [27]:
# Inspect the size of the TF-IDF training matrix: (training documents, vocabulary terms).
X_train_vectorized.shape

(39083, 202377)

In [28]:
#naive bayes classifier
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_vectorized, y_train)

MultinomialNB()

In [29]:
# Predict labels for the held-out split.
y_pred = nb_classifier.predict(X_test_vectorized)

# Overall accuracy is stored in a variable; the per-class precision / recall /
# F1 table is what gets printed.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           1       0.84      0.71      0.77      2793
           2       0.99      0.08      0.15      1371
           3       0.55      1.00      0.71      3587
           4       1.00      0.39      0.56      2020

    accuracy                           0.66      9771
   macro avg       0.84      0.54      0.55      9771
weighted avg       0.79      0.66      0.62      9771



In [32]:
# Evaluate on the separate balancedtest file, reusing the already-fitted
# vectorizer and classifier (nothing is re-fitted here).
test_data = pd.read_csv(
    '../raw_data/balancedtest.csv',
    header=None,
    names=['label', 'text'],
)
Y_test_main = test_data['label']
X_test_main = vectorizer.transform(test_data['text'])

y_pred_main = nb_classifier.predict(X_test_main)
print(classification_report(y_true=Y_test_main, y_pred=y_pred_main))

              precision    recall  f1-score   support

           1       0.62      0.32      0.42       750
           2       0.80      0.01      0.01       750
           3       0.30      1.00      0.47       750
           4       1.00      0.20      0.33       750

    accuracy                           0.38      3000
   macro avg       0.68      0.38      0.31      3000
weighted avg       0.68      0.38      0.31      3000

