In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score

# Load a sample dataset (can replace with your own)
data = fetch_20newsgroups(subset='train', categories=['sci.space', 'rec.sport.hockey'])
X_text, y = data.data, data.target

# Vectorize using TF-IDF
vectorizer = TfidfVectorizer()
X_vec = vectorizer.fit_transform(X_text).toarray()  # GaussianNB requires dense input

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_vec, y, test_size=0.2, random_state=42)

# Train Gaussian Naive Bayes
model = GaussianNB()
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Evaluate
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"🔎 Precision: {precision:.2f}")
print(f"📈 Recall:    {recall:.2f}")
print(f"🏅 F1 Score:  {f1:.2f}")

# Cross-validation (5-fold)
cv_scores = cross_val_score(model, X_vec, y, cv=5, scoring='f1_macro')
print(f"🔁 Cross-Validation F1 Score (macro avg): {cv_scores.mean():.2f}")


🔎 Precision: 0.94
📈 Recall:    0.97
🏅 F1 Score:  0.96
🔁 Cross-Validation F1 Score (macro avg): 0.99
