In [None]:
import numpy as np
import pandas as pd
import spacy
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

In [None]:
# Read the labelled dataset from the Excel workbook; later cells use its
# "Text" and "Class" columns.
df = pd.read_excel(io="Data.xlsx")

In [None]:
# Keep 80% of the rows for training and the remainder for validation.
# The fixed random_state makes the split reproducible across runs.
train_x, val_x, train_y, val_y = train_test_split(
    df["Text"],
    df["Class"],
    random_state=1,
    train_size=0.8,
)
print(f"Training Data Size: {len(train_x)}")
print(f"Validation Data Size: {len(val_x)}")

In [None]:
# Load the small English spaCy pipeline once; `tokenizer` reuses this object
# for every document.
nlp = spacy.load(name="en_core_web_sm")

In [None]:
# spaCy pipeline components switched off while tokenising.
exclude = ["ner", "parser"]


def tokenizer(doc):
    """Turn one document into a list of lemmas using the shared ``nlp`` pipeline.

    The pipes named in ``exclude`` are disabled for the duration of the call.
    A token contributes its ``lemma_`` only when it is flagged alphabetic and
    is not flagged as punctuation, whitespace or a stop word.

    Args:
        doc: Text of a single document.

    Returns:
        list: The lemmas of the retained tokens, in document order.
    """
    lemmas = []
    with nlp.disable_pipes(*exclude):
        for token in nlp(doc):
            # Drop anything that is not a plain content word.
            if token.is_punct or token.is_space or token.is_stop:
                continue
            if not token.is_alpha:
                continue
            lemmas.append(token.lemma_)
    return lemmas

In [None]:
# A custom tokenizer is supplied, so the regex token pattern is not used;
# it is set to None explicitly.
vectorizer = TfidfVectorizer(
    token_pattern=None,
    tokenizer=tokenizer,
)
# Learn the vocabulary / IDF weights on the training texts and vectorise them.
train_vectors = vectorizer.fit_transform(train_x)

In [None]:
# Baseline model with default hyper-parameters, fitted on the TF-IDF vectors.
naive_bayes = MultinomialNB().fit(train_vectors, train_y)
# Show the hyper-parameters the baseline was fitted with.
naive_bayes.get_params()

In [None]:
# Macro-averaged F1 of the baseline on the data it was trained on.
train_preds = naive_bayes.predict(train_vectors)
print(f"F1 Score on Initial Training Set: {f1_score(train_y, train_preds, average='macro')}")

In [None]:
# Vectorise the held-out texts with the already-fitted vectorizer (transform
# only, no refit) and score the baseline on them.
val_vectors = vectorizer.transform(val_x)
val_preds = naive_bayes.predict(val_vectors)
print(f"F1 Score on Initial Validation Set: {f1_score(val_y, val_preds, average='macro')}")

In [None]:
# Confusion matrix of the baseline on the validation set, drawn on a large
# canvas with vertical x tick labels.
fig, ax = plt.subplots(figsize=(15, 15))
disp = ConfusionMatrixDisplay.from_estimator(
    naive_bayes,
    val_vectors,
    val_y,
    ax=ax,
    normalize="true",
    xticks_rotation="vertical",
)

In [None]:
# Candidate values for the `alpha` parameter of MultinomialNB.
params = {"alpha": [0.01, 0.1, 0.5, 1.0, 10.0]}

# 2-fold cross-validated search over `params`, scored by macro F1 and run on
# all available cores.
grid_search = GridSearchCV(
    estimator=MultinomialNB(),
    param_grid=params,
    scoring="f1_macro",
    cv=2,
    n_jobs=-1,
    verbose=5,
)
grid_search.fit(train_vectors, train_y)

In [None]:
# Report the winning hyper-parameters explicitly: a bare expression is only
# echoed by Jupyter when it is the last statement of a cell, which is not the
# case here.
print('Best Parameters: {}'.format(grid_search.best_params_))

# Score the best estimator found by the grid search on the held-out validation set.
best_naive_bayes = grid_search.best_estimator_
val_preds = best_naive_bayes.predict(val_vectors)
print('Validation F1 Score: {}'.format(f1_score(val_y, val_preds, average='macro')))

In [None]:
# Final end-to-end classifier: raw text -> TF-IDF over spaCy lemmas -> Naive Bayes.
# `alpha` is read from the grid search result instead of being a hand-copied
# literal, so the pipeline always uses the value the search actually selected.
text_classifier = Pipeline([
  ('vectorizer', TfidfVectorizer(tokenizer=tokenizer, token_pattern=None)),
  ('classifier', MultinomialNB(alpha=grid_search.best_params_['alpha']))
]).fit(train_x, train_y)

def classify_text(doc):
    """Classify a single document with the fitted ``text_classifier`` pipeline.

    Args:
        doc: Raw text of one document.

    Returns:
        tuple: ``(label, probability)`` where ``label`` is the entry of
        ``text_classifier.classes_`` with the highest predicted probability
        and ``probability`` is that probability.
    """
    # predict_proba works on a batch, so wrap the document in a list and
    # flatten the single resulting row.
    probabilities = text_classifier.predict_proba([doc]).flatten()
    best = np.argmax(probabilities)
    # The argmax is only a column index; predict_proba columns are ordered
    # like classes_, so map the index back to the actual class label.
    return (text_classifier.classes_[best], probabilities[best])

In [None]:
# Smoke test: classify one short example sentence and display the result.
classify_text(doc='Predators be eating')