In [7]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split

# Load your data (adjust file paths as needed)
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Identify text column
text_col = [c for c in train_df.columns if c not in ('id','label')][0]
X_text = train_df[text_col].astype(str).tolist()
y = train_df['label'].values
X_test_txt = test_df[text_col].astype(str).tolist()
test_ids = test_df['id'].values

# Preprocess text
def clean(text):
    t = text.lower()
    t = re.sub(r'http\S+|www\S+', '', t)
    t = re.sub(r'@\w+|#\w+', '', t)
    return re.sub(r'\s+', ' ', t).strip()

X_text = [clean(t) for t in X_text]
X_test_txt = [clean(t) for t in X_test_txt]

# TF-IDF Vectorization
print("Vectorizing text data...")
tfidf = TfidfVectorizer(
    max_features=10000,
    ngram_range=(1, 2),
    min_df=3,
    strip_accents='unicode'
)

X_tfidf = tfidf.fit_transform(X_text)
X_test_tfidf = tfidf.transform(X_test_txt)

print(f"Training set shape: {X_tfidf.shape}")
print(f"Test set shape: {X_test_tfidf.shape}")

# Split for validation
X_train, X_val, y_train, y_val = train_test_split(
    X_tfidf, y, test_size=0.2, random_state=42, stratify=y
)

# Train Naive Bayes Model
print("Training Naive Bayes model...")
nb_model = MultinomialNB(alpha=0.1)
nb_model.fit(X_train, y_train)

# Make predictions on validation set
print("Evaluating on validation set...")
y_val_pred = nb_model.predict(X_val)
y_val_proba = nb_model.predict_proba(X_val)

# EVALUATION CODE STARTS HERE
print("\n" + "="*50)
print("NAIVE BAYES EVALUATION RESULTS")
print("="*50)

# Calculate Macro F1 Score
macro_f1 = f1_score(y_val, y_val_pred, average='macro')
print(f"Macro F1 Score: {macro_f1:.4f}")

# Calculate per-class F1 scores
classes = np.unique(y_val)
print(f"\nPer-Class F1 Scores:")
for class_label in classes:
    tp = np.sum((y_val == class_label) & (y_val_pred == class_label))
    fp = np.sum((y_val != class_label) & (y_val_pred == class_label))
    fn = np.sum((y_val == class_label) & (y_val_pred != class_label))
    
    if tp + fp + fn > 0:
        f1_class = tp / (tp + 0.5 * (fp + fn))
        print(f"  Class {class_label}: {f1_class:.4f}")
    else:
        print(f"  Class {class_label}: 0.0000")

# Detailed classification report
print(f"\nDetailed Classification Report:")
print(classification_report(y_val, y_val_pred))

# Confusion Matrix
print(f"\nConfusion Matrix:")
cm = confusion_matrix(y_val, y_val_pred)
print(cm)

# Manual Macro F1 calculation for verification
print(f"\nManual Macro F1 Verification:")
f1_scores = []
for class_label in classes:
    tp = np.sum((y_val == class_label) & (y_val_pred == class_label))
    fp = np.sum((y_val != class_label) & (y_val_pred == class_label))
    fn = np.sum((y_val == class_label) & (y_val_pred != class_label))
    
    if tp + fp + fn > 0:
        f1_class = tp / (tp + 0.5 * (fp + fn))
        f1_scores.append(f1_class)
    else:
        f1_scores.append(0.0)

manual_macro_f1 = np.mean(f1_scores)
print(f"Manual Macro F1: {manual_macro_f1:.4f}")
print(f"Sklearn Macro F1: {macro_f1:.4f}")

# Retrain on full dataset and make final predictions
print(f"\nRetraining on full dataset...")
nb_model_full = MultinomialNB(alpha=0.1)
nb_model_full.fit(X_tfidf, y)

# Make final predictions on test set
print("Making final predictions on test set...")
y_test_pred = nb_model_full.predict(X_test_tfidf)

# Save predictions
submission_df = pd.DataFrame({
    'id': test_ids,
    'label': y_test_pred
})

submission_df.to_csv('NaiveBayes_Prediction.csv', index=False)
print("Predictions saved to NaiveBayes_Prediction.csv")

# Feature analysis
print(f"\nTop Features Analysis:")
feature_names = tfidf.get_feature_names_out()
feature_log_prob = nb_model_full.feature_log_prob_

# Get top features for each class
if len(nb_model_full.classes_) == 2:
    # Class 0 (non-hateful)
    top_features_0 = np.argsort(feature_log_prob[0])[-15:][::-1]
    print(f"\nTop 15 features for non-hateful class:")
    for i, idx in enumerate(top_features_0):
        print(f"  {i+1}. {feature_names[idx]}: {feature_log_prob[0][idx]:.4f}")
    
    # Class 1 (hateful)
    top_features_1 = np.argsort(feature_log_prob[1])[-15:][::-1]
    print(f"\nTop 15 features for hateful class:")
    for i, idx in enumerate(top_features_1):
        print(f"  {i+1}. {feature_names[idx]}: {feature_log_prob[1][idx]:.4f}")

print(f"\nNaive Bayes Training Complete!")
print(f"Final Macro F1 Score on Validation Set: {macro_f1:.4f}")

Vectorizing text data...
Training set shape: (17184, 10000)
Test set shape: (4296, 10000)
Training Naive Bayes model...
Evaluating on validation set...

NAIVE BAYES EVALUATION RESULTS
Macro F1 Score: 0.6868

Per-Class F1 Scores:
  Class 0: 0.7867
  Class 1: 0.5869

Detailed Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.84      0.79      2127
           1       0.67      0.52      0.59      1310

    accuracy                           0.72      3437
   macro avg       0.70      0.68      0.69      3437
weighted avg       0.71      0.72      0.71      3437


Confusion Matrix:
[[1783  344]
 [ 623  687]]

Manual Macro F1 Verification:
Manual Macro F1: 0.6868
Sklearn Macro F1: 0.6868

Retraining on full dataset...
Making final predictions on test set...
Predictions saved to NaiveBayes_Prediction.csv

Top Features Analysis:

Top 15 features for non-hateful class:
  1. the: -4.6158
  2. white: -4.6259
  3. to: -4.9181
  4. is: -5.