# In this notebook we use Multinomial Naive Bayes for text classification

# 1. Load the dataset and divide it into training, validation, and test sets

In [1]:
from datasets import load_dataset

# Fetch AG News, then pull the text and label columns out of the train and test splits.
ds = load_dataset("ag_news")
train_texts, train_labels = (ds["train"][column] for column in ("text", "label"))
test_texts, test_labels = (ds["test"][column] for column in ("text", "label"))


In [2]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the training data as a stratified validation split.
# A fixed random_state makes the split (and every score computed downstream)
# reproducible across kernel restarts.
# NOTE: this rebinds train_texts/train_labels, so re-running this cell without
# re-running the loading cell first splits the already-reduced training set again.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts,
    train_labels,
    stratify=train_labels,
    test_size=0.2,
    random_state=42,
)

In [3]:
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, make_scorer

# Two-stage pipeline: TF-IDF features (English stop words dropped) feeding Multinomial NB
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('clf', MultinomialNB()),
])

# Search space; the `<step>__<param>` keys route each value to the matching pipeline step
param_grid = dict(
    tfidf__ngram_range=[(1, 1), (1, 2)],
    tfidf__max_features=[20000, 40000, 60000],
    tfidf__min_df=[1, 2, 3],
    tfidf__max_df=[0.9, 0.95, 1.0],
    clf__alpha=[0.1, 0.5, 1.0],
)

# Model selection is driven by macro-averaged F1
scorer = make_scorer(f1_score, average='macro')

# 3-fold cross-validation over every grid combination, one job at a time, with per-fit logging
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring=scorer,
    cv=3,
    n_jobs=1,
    verbose=2,
)
grid.fit(train_texts, train_labels)

print("Best params:", grid.best_params_)
print("Best CV score:", grid.best_score_)


Fitting 3 folds for each of 162 candidates, totalling 486 fits
[CV] END clf__alpha=0.1, tfidf__max_df=0.9, tfidf__max_features=20000, tfidf__min_df=1, tfidf__ngram_range=(1, 1); total time=   3.0s
[CV] END clf__alpha=0.1, tfidf__max_df=0.9, tfidf__max_features=20000, tfidf__min_df=1, tfidf__ngram_range=(1, 1); total time=   3.0s
[CV] END clf__alpha=0.1, tfidf__max_df=0.9, tfidf__max_features=20000, tfidf__min_df=1, tfidf__ngram_range=(1, 1); total time=   3.0s
[CV] END clf__alpha=0.1, tfidf__max_df=0.9, tfidf__max_features=20000, tfidf__min_df=1, tfidf__ngram_range=(1, 2); total time=   8.2s
[CV] END clf__alpha=0.1, tfidf__max_df=0.9, tfidf__max_features=20000, tfidf__min_df=1, tfidf__ngram_range=(1, 2); total time=   8.1s
[CV] END clf__alpha=0.1, tfidf__max_df=0.9, tfidf__max_features=20000, tfidf__min_df=1, tfidf__ngram_range=(1, 2); total time=   8.7s
[CV] END clf__alpha=0.1, tfidf__max_df=0.9, tfidf__max_features=20000, tfidf__min_df=2, tfidf__ngram_range=(1, 1); total time=   3.4s

In [9]:
preds = grid.best_estimator_.predict(test_texts)
f1_score = f1_score(y_true=test_labels,y_pred=preds,average="macro")

In [13]:
print(f"With Multinomial_NB, we get {(f1_score * 100):.3} f1_score")

With Multinomial_NB, we get 90.9 f1_score
