In [1]:
import sys
import os

# Make the repository root importable so that `src.*` resolves from this notebook.
# Everything before the first occurrence of "notebooks" in the working directory
# is taken as the project root; if "notebooks" does not occur at all, the working
# directory itself is used.
project_dir = os.getcwd().split("notebooks")[0]
# Re-running this cell must not keep stacking duplicate entries on sys.path.
if project_dir not in sys.path:
    sys.path.append(project_dir)


### Test load_reviews

In [2]:
from src.handle_data import load_reviews

In [3]:
# Expected number of rows when loading with load_all=True
# (presumably the complete, unsplit dataset — confirm in src.handle_data).
n_df_all = 1600
df_all = load_reviews("../data", load_all=True)
assert n_df_all == len(df_all)

In [4]:
# Default call (no flags) — presumably returns the training portion; confirm in src.handle_data.
df_train = load_reviews("../data")
n_df_train = len(df_train)
# The training portion must be exactly 80% of the full dataset. The check is done
# in integer arithmetic: `0.8 * n_df_all` is a float, and exact equality against
# a float product can fail from rounding even when the ratio is right.
assert n_df_train * 10 == n_df_all * 8

In [5]:
# Load with eval=True; the rows returned here plus the rows counted in
# n_df_train must add up to the full dataset size.
df_eval = load_reviews("../data", eval=True)
assert n_df_train + len(df_eval) == n_df_all

### Test Classifier

In [6]:
from src.classifier import TextNaiveBayes
from src.preprocessing import vectorize_data, prepare_dataset
from sklearn.model_selection import train_test_split

# Build the classifier; smoothing=2.0 is passed straight to the constructor.
classifier = TextNaiveBayes(smoothing=2.0)

# Vectorise the training frame. The returned vectorizer is kept so the same
# transformation can be applied to other data later.
# NOTE(review): the vectorizer is produced before the split below, so it has
# presumably seen the held-out rows as well — confirm whether that is intended.
X, y, vectorizer = vectorize_data(
    df_train,
    max_feats=3000,
    ngram=2,
)

# Hold out 20% of the vectorised rows; the fixed random_state keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

classifier.train(X_train, y_train)

In [7]:
# Score the classifier on the held-out split produced by train_test_split.
accuracy, report = classifier.evaluate(X_test, y_test)
# These are X_test/y_test results, so the label says "Test" rather than "Train".
print(f"Test accuracy: {accuracy * 100:.1f}%")

Train accuracy: 89.1%


In [8]:
# `report` comes from evaluating X_test/y_test, so it is labelled as the test report.
print("test report")
print(report)

train report
              precision    recall  f1-score   support

           0       0.93      0.84      0.88       127
           1       0.86      0.94      0.90       129

    accuracy                           0.89       256
   macro avg       0.89      0.89      0.89       256
weighted avg       0.89      0.89      0.89       256



### Evaluating the model with the eval dataset

In [9]:
# Turn the eval frame into X_eval / y_eval, then run X_eval through the
# vectorizer returned earlier by vectorize_data (transform only) before scoring.
X_eval, y_eval = prepare_dataset(df_eval)
X_eval_vec = vectorizer.transform(X_eval)
# NOTE: this rebinds `report`, replacing the value from the X_test evaluation.
acc, report = classifier.evaluate(X_eval_vec, y_eval)

In [10]:
# `acc` was computed on the eval dataset, so label it accordingly.
print(f"Eval accuracy: {acc * 100:.1f}%")

Train accuracy: 87.2%


In [11]:
# `report` holds the eval-dataset results here (it was rebound when the eval
# dataset was scored), so it is labelled as the eval report.
print("eval report")
print(report)

train report
              precision    recall  f1-score   support

           0       0.88      0.86      0.87       160
           1       0.87      0.88      0.87       160

    accuracy                           0.87       320
   macro avg       0.87      0.87      0.87       320
weighted avg       0.87      0.87      0.87       320

