## Import necessary modules

In [1]:
from time import perf_counter
from time import time

import nltk
import numpy as np
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

from utils import DatasetReader

# Make sure the NLTK stop-word corpus is available locally; it is needed by
# `stopwords.words('english')` when the vectorizer is built further down.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/pc/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Load and split the data

In [2]:
# Directories the training and evaluation frames are read from.
TRAIN_DIR = '../data/augmentation'
TEST_DIR = '../data/csv'

# The same reader instance loads both sets, so both go through the same
# label handling (presumably an integer encoding of `Topic` - see the
# displayed frame; confirm against `DatasetReader`).
reader = DatasetReader(encode_labels=True)
train = reader.read_dirs(TRAIN_DIR)
test = reader.read_dirs(TEST_DIR)

# NOTE(review): the training data comes from an "augmentation" directory while
# the test data comes from "csv". If the augmented questions are paraphrases
# of the questions in the test set, the reported scores are optimistic -
# TODO confirm the two sets do not overlap.

# Show the training frame as the cell output.
train

Unnamed: 0,Questions,Topic
0,"what is the essence of a brand, and how does i...",3
1,what is the concept of a brand that distinguis...,3
2,"what is meant by a brand, and how does it dist...",3
3,"what is a brand, and how does it differ from a...",3
4,what was a crucial gamble jack ma took in esta...,0
...,...,...
2247,true or false: do entrepreneurial motivations ...,0
2248,how do the approaches of a long-term planner v...,0
2249,what distinguishes an economist's long-term ap...,0
2250,what are the key distinctions between individu...,0


In [3]:
# Pull the question texts (features) and topic labels (targets) out of the
# already separate training and testing frames as NumPy arrays.
X_train = np.array(train['Questions'])
y_train = np.array(train['Topic'])

X_test = np.array(test['Questions'])
y_test = np.array(test['Topic'])

## Training the model

We use Multinomial Naive Bayes, which often serves as a baseline model for NLP classification problems.
The questions are vectorized with TF-IDF (Term Frequency - Inverse Document Frequency), a statistical measure of how relevant a word is to a document within a collection of documents.

In [4]:
# Build the two pipeline stages separately, then chain them:
#   1. TF-IDF vectorizer that drops NLTK's English stop words
#   2. Multinomial Naive Bayes classifier on the resulting features
english_stop_words = stopwords.words('english')
vectorizer = TfidfVectorizer(stop_words=english_stop_words)
classifier = MultinomialNB()

model = make_pipeline(vectorizer, classifier)

In [5]:
# Fit the pipeline on the training questions, then predict the topic of every
# test question. `fit` returns the fitted pipeline, so the calls can be chained.
y_pred = model.fit(X_train, y_train).predict(X_test)

## Evaluating the model

In [6]:
# Compare the predictions against the true test labels: overall accuracy plus
# the per-class precision / recall / F1 report.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", report)

Accuracy: 0.872113676731794
Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.87      0.83       103
           1       0.99      0.77      0.86        98
           2       0.93      0.90      0.92       103
           3       0.86      0.91      0.88       133
           4       0.85      0.89      0.87       126

    accuracy                           0.87       563
   macro avg       0.88      0.87      0.87       563
weighted avg       0.88      0.87      0.87       563



## Save model weights

In [7]:
import joblib
import json
from pathlib import Path

# Make sure the output directory exists before writing: on a fresh checkout
# `weights/` may be missing, and both `joblib.dump` and `open(..., 'w')` would
# then fail with FileNotFoundError.
Path('weights').mkdir(parents=True, exist_ok=True)

# Persist the fitted pipeline (vectorizer + classifier) as a single artifact.
joblib.dump(model, 'weights/naive_bayes.joblib')

# Store the reader's label encodings next to the model so that predictions
# can be related back to the original topic labels at inference time.
with open('weights/naive_bayes_encodings.json', 'w') as f:
    json.dump(reader.get_encodings(), f)

## Summary

### Inference time

In [8]:
# Number of times the combined dataset is doubled to build a benchmark input
# large enough for a stable timing (2 ** 8 = 256 copies of train + test).
N_DOUBLINGS = 8

# Repeat the combined train + test questions end to end; this yields the same
# array as concatenating the data with itself N_DOUBLINGS times.
base_data = np.concatenate((X_train, X_test), axis=0)
all_data = np.tile(base_data, 2 ** N_DOUBLINGS)

print('N samples:\t\t', all_data.shape)

# `perf_counter` is a monotonic, high-resolution clock intended for measuring
# short durations; wall-clock `time()` can jump if the system clock is adjusted.
start = perf_counter()
model.predict(all_data)
elapsed = perf_counter() - start

n_samples = all_data.shape[0]
print(f'Inference time: {elapsed / n_samples * 1000:0.9f} [ms/sample]')
print(f'Inference time: {n_samples / elapsed:9.4f} [samples/sec]')

N samples:		 (720640,)
Inference time: 0.008218431 [ms/sample]
Inference time: 121677.7205 [samples/sec]


### Metrics

In [9]:
# Repeat the per-class report in the summary section for quick reference.
summary_report = classification_report(y_test, y_pred)
print("Classification Report:\n", summary_report)

Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.87      0.83       103
           1       0.99      0.77      0.86        98
           2       0.93      0.90      0.92       103
           3       0.86      0.91      0.88       133
           4       0.85      0.89      0.87       126

    accuracy                           0.87       563
   macro avg       0.88      0.87      0.87       563
weighted avg       0.88      0.87      0.87       563

