## Import necessary modules

In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import make_pipeline
from utils import DatasetReader
from nltk.corpus import stopwords
import nltk

# Make sure the NLTK stop-word corpus is available locally; the vectorizer
# is later built from stopwords.words('english').
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/pc/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Load and split the data

In [16]:
# Project-local dataset reader (from utils). encode_labels=True presumably maps
# topic names to integer ids -- the Topic column displayed below is integer-valued;
# confirm against utils.DatasetReader.
reader = DatasetReader(encode_labels=True)
# Read the dataset from ../data (path is relative to the notebook's working directory).
data = reader.read_dir('../data')
# Bare last expression: shows the loaded frame as the cell output.
data

Unnamed: 0,Questions,Topic
0,define the term brand,3
1,explain one risk jack ma may have taken when s...,0
2,analyse two factors that may have increased de...,2
3,discuss if profit maximisation is the main bus...,0
4,assess the advantages of a paternalistic style...,1
...,...,...
558,what is meant by the term emotional intelligence?,0
559,true or false? entrepreneurs are often driven ...,0
560,what is meant by the term figurehead?,0
561,true or false? many entrepreneurs are driven m...,0


In [17]:
# Hold out 25% of the questions for evaluation; the fixed seed keeps the
# shuffle (and therefore the reported metrics) reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    data['Questions'],
    data['Topic'],
    test_size=0.25,
    random_state=0,
)

## Training the model

We use Multinomial Naive Bayes, which is commonly used as a baseline model for text-classification problems.
To vectorize the questions for Multinomial NB, we use TF-IDF (Term Frequency - Inverse Document Frequency), a statistical measure of how relevant a word is to a document within a collection of documents.

In [18]:
# Two-stage pipeline: raw question text -> TF-IDF features -> Multinomial Naive Bayes
model = make_pipeline(
    # Vectorize each question with TF-IDF, dropping NLTK's English stop words.
    TfidfVectorizer(stop_words=stopwords.words('english')),
    # Classify the resulting TF-IDF vectors into topic labels.
    MultinomialNB(),
)

In [19]:
# Training the model
# Fits both pipeline stages (TF-IDF vectorizer, then the Naive Bayes classifier)
# on the training questions only, so nothing is learned from the test split.
model.fit(X_train, y_train)

# Making predictions
# Predicted topic labels for the held-out questions; these are compared
# against y_test when the model is evaluated.
y_pred = model.predict(X_test)

## Evaluating the model

In [20]:
# Evaluating the model
# Overall fraction of held-out questions whose predicted topic matches the true topic.
print("Accuracy:", accuracy_score(y_test, y_pred))
# Per-topic precision, recall, F1 and support, plus macro and weighted averages.
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.7730496453900709
Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.79      0.84        24
           1       1.00      0.55      0.71        29
           2       0.88      0.82      0.85        28
           3       0.60      0.89      0.71        28
           4       0.72      0.81      0.76        32

    accuracy                           0.77       141
   macro avg       0.82      0.77      0.78       141
weighted avg       0.82      0.77      0.77       141



## Save model weights

In [21]:
import os

import joblib

# joblib.dump does not create missing parent directories, so make sure
# `weights/` exists; otherwise a fresh checkout fails with FileNotFoundError.
os.makedirs('weights', exist_ok=True)

# Persist the whole fitted pipeline (TF-IDF vectorizer + Naive Bayes classifier)
# so it can be restored later with joblib.load('weights/naive_bayes.joblib').
# NOTE: joblib files are pickle-based -- only load them from trusted sources.
joblib.dump(model, 'weights/naive_bayes.joblib')

['weights/naive_bayes.joblib']