In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc

In [None]:
# Read the training CSV into a DataFrame. The path is relative, so the notebook
# has to be executed from the directory that contains `data/`.
df = pd.read_csv(filepath_or_buffer="data/hate_train_cleaned.csv")

In [None]:
# Fail fast if either column read later in the notebook ("comment" as the text,
# "isHate" as the score that gets thresholded) is absent from the loaded frame.
assert {"comment", "isHate"}.issubset(df.columns), "Dataset must contain 'comment' and 'isHate' columns"

In [None]:
# Binarise the score: rows with isHate at or above 0.5 get label 1, the rest 0.
# NOTE(review): a missing isHate value compares False and therefore lands in
# class 0 -- confirm the input has no NaNs in this column.
df["label"] = df["isHate"].ge(0.5).astype(int)

In [None]:
# Rows whose comment is missing are excluded before the split: an empty text
# field in a CSV is read back by pandas as NaN, and TfidfVectorizer rejects NaN
# documents with a ValueError. Both Series are taken from the same filtered
# frame so features and labels stay aligned row for row.
_has_text = df.dropna(subset=["comment"])
X = _has_text["comment"].astype(str)  # guarantee str documents for the vectorizer
y = _has_text["label"]

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split reproducible.
# stratify=y keeps the ratio of the two labels the same in both partitions, so
# precision/recall on the test set are not distorted by an unlucky draw of the
# less frequent class.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# TF-IDF over unigrams and bigrams, with the vocabulary capped at 5000 terms.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
# The vectorizer is fitted on the training split only; the test split is
# transformed with that already-fitted vectorizer.
X_train_tfidf = vectorizer.fit_transform(raw_documents=X_train)
X_test_tfidf = vectorizer.transform(raw_documents=X_test)

In [None]:
# Multinomial Naive Bayes with default hyper-parameters, trained on the
# TF-IDF matrix of the training split.
nb_model = MultinomialNB()
nb_model.fit(X=X_train_tfidf, y=y_train)

In [None]:
# Column 1 of predict_proba is taken as the score for label 1.
# NOTE(review): this relies on the fitted classes being ordered [0, 1] --
# confirm both labels occur in y_train.
y_pred_probs = nb_model.predict_proba(X=X_test_tfidf)[:, 1]
# Hard predictions: scores at or above 0.5 become 1, everything else 0.
y_pred_binary = np.greater_equal(y_pred_probs, 0.5).astype(int)

In [None]:
# Score the thresholded predictions against the held-out labels. The metric
# functions are applied in the same order as the names on the left-hand side.
accuracy, precision, recall, f1 = (
    score_fn(y_test, y_pred_binary)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score)
)

In [None]:
# Report the four metrics, one per line, rounded to four decimals.
print(
    "\nModel Evaluation Metrics:",
    f"Accuracy: {accuracy:.4f}",
    f"Precision: {precision:.4f}",
    f"Recall: {recall:.4f}",
    f"F1 Score: {f1:.4f}",
    sep="\n",
)

In [None]:
# ROC operating points are computed from the continuous scores, not from the
# thresholded labels; the third return value is discarded.
fpr, tpr, _ = roc_curve(y_true=y_test, y_score=y_pred_probs)
# Area under the (fpr, tpr) curve.
roc_auc = auc(x=fpr, y=tpr)

In [None]:
# Draw the ROC curve on an explicit Axes, with the diagonal as the chance baseline.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, color="blue", lw=2, label=f"ROC Curve (AUC = {roc_auc:.2f})")
ax.plot([0, 1], [0, 1], color="gray", linestyle="--", label="Random Model (AUC = 0.5)")
ax.set(
    xlabel="False Positive Rate",
    ylabel="True Positive Rate",
    title="Receiver Operating Characteristic (ROC) Curve - Naïve Bayes Model",
)
ax.legend(loc="lower right")
ax.grid(True)
plt.show()