## Import necessary modules

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import make_pipeline
from utils import DatasetReader
from nltk.corpus import stopwords
import nltk

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/citadel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Load and split the data

In [7]:
reader = DatasetReader()
data = reader.read('../data/csv/clean_data.csv', '../data/parsing/save_my_exams_data.csv')
data

Unnamed: 0,Questions,Topic
0,define the term brand,marketing_mix_and_strategy
1,explain one risk jack ma may have taken when s...,entreprenuers_and_leaders
2,analyse two factors that may have increased de...,market
3,discuss if profit maximisation is the main bus...,entreprenuers_and_leaders
4,assess the advantages of a paternalistic style...,managing_people
...,...,...
492,what is meant by the term emotional intelligence?,entreprenuers_and_leaders
493,true or false? entrepreneurs are often driven ...,entreprenuers_and_leaders
494,what is meant by the term figurehead?,entreprenuers_and_leaders
495,true or false? many entrepreneurs are driven m...,entreprenuers_and_leaders


In [9]:
# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data['Questions'], data['Topic'], test_size=0.25, random_state=0)

## Training the model

We use Multinomial Naive Bayes, which often stands as the baseline model in various NLP-specific problems. 
As the vectorizer for Multinomial NB, we use TF-IDF (Term Frequency - Inverse Document Frequency), which is a statistical measure that evaluates how relevant a word is to a document in a collection of documents.

In [10]:
# Creating a text processing and model pipeline
model = make_pipeline(
    TfidfVectorizer(stop_words=stopwords.words('english')), 
    MultinomialNB()  
)

In [11]:
# Training the model
model.fit(X_train, y_train)

# Making predictions
y_pred = model.predict(X_test)

## Evaluating the model

In [12]:
# Evaluating the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.776
Classification Report:
                             precision    recall  f1-score   support

 entreprenuers_and_leaders       0.82      0.64      0.72        22
           managing_people       1.00      0.65      0.79        20
                    market       0.83      0.86      0.84        22
marketing_mix_and_strategy       0.84      0.84      0.84        38
    meeting_customer_needs       0.56      0.83      0.67        23

                  accuracy                           0.78       125
                 macro avg       0.81      0.76      0.77       125
              weighted avg       0.81      0.78      0.78       125

