# Naive Bayes Classification

Notebook ini melakukan klasifikasi sentimen menggunakan Multinomial Naive Bayes.

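**Tahapan:**
1. Persiapan data: memuat data hasil clustering (dengan label sentimen) dan mengubah teks menjadi fitur TF-IDF
2. Train-Test Split
3. Training Multinomial Naive Bayes
4. Evaluasi Model
5. Prediksi pada seluruh data
6. Simpan model dan hasil klasifikasi
7. Contoh prediksi pada teks baru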

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import (
    accuracy_score, 
    precision_score, 
    recall_score, 
    f1_score,
    classification_report,
    confusion_matrix
)
import pickle
import os
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the clustering output; its `sentiment` column is the label used as
# the classification target further down.
clustering_csv = "data/hasil_clustering.csv"
df = pd.read_csv(clustering_csv)

# Report the number of rows and how many rows carry each sentiment label.
print(f"Jumlah data: {len(df)}")
print("\nDistribusi sentimen:")
sentiment_counts = df['sentiment'].value_counts()
print(sentiment_counts)

# Last expression of the cell: the notebook renders the first rows.
df.head()

## 1. Persiapan Data

In [None]:
# Load the previously saved TF-IDF vectorizer (or fit a fallback one), then
# turn the preprocessed text into the feature matrix X and label vector y.
vectorizer_path = 'output/models/tfidf_vectorizer.pkl'

# read_csv parses empty text fields as NaN, and TfidfVectorizer rejects NaN
# documents in both fit() and transform(). Normalise the column to plain
# strings once and use that for every vectorizer call below.
texts = df['preprocessed_text'].fillna('').astype(str)

if os.path.exists(vectorizer_path):
    # NOTE: unpickling can execute arbitrary code. Only load a vectorizer
    # file that was produced by this project's own pipeline.
    with open(vectorizer_path, 'rb') as f:
        tfidf_vectorizer = pickle.load(f)
    print('TF-IDF vectorizer berhasil dimuat')
else:
    print('WARNING: TF-IDF vectorizer tidak ditemukan!')
    print('Silakan jalankan TF-IDF_KMeans.ipynb terlebih dahulu.')
    # Fallback: fit a fresh vectorizer on the current data so the notebook
    # can still run. TfidfVectorizer is already imported in the import cell.
    tfidf_vectorizer = TfidfVectorizer(max_features=1000, min_df=2, max_df=0.95)
    tfidf_vectorizer.fit(texts)

# Transform the text into TF-IDF features; the sentiment column is the target.
X = tfidf_vectorizer.transform(texts)
y = df['sentiment']

print(f"Shape X: {X.shape}")
print(f"Shape y: {y.shape}")

## 2. Train-Test Split

In [None]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both parts; random_state fixes the shuffle.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Size of each part.
n_train = X_train.shape[0]
n_test = X_test.shape[0]
print(f"Training set: {n_train} samples")
print(f"Testing set: {n_test} samples")

# Label counts inside each part.
print("\nDistribusi training set:")
print(y_train.value_counts())

print("\nDistribusi testing set:")
print(y_test.value_counts())

## 3. Training Multinomial Naive Bayes

In [None]:
# Create the Multinomial Naive Bayes classifier (alpha=1.0 is the additive
# smoothing parameter) and fit it on the training split only.
# fit() returns the estimator itself, so construction and fitting chain.
nb_model = MultinomialNB(alpha=1.0).fit(X_train, y_train)

print("Model Naive Bayes berhasil di-training!")
learned_classes = nb_model.classes_
print(f"Classes: {learned_classes}")

## 4. Evaluasi Model

In [None]:
# Predict labels for the held-out test split.
y_pred = nb_model.predict(X_test)

# Accuracy, plus precision / recall / F1 averaged across classes with the
# 'weighted' scheme.
weighted_avg = {'average': 'weighted'}
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, **weighted_avg)
recall = recall_score(y_test, y_pred, **weighted_avg)
f1 = f1_score(y_test, y_pred, **weighted_avg)

# Print the metrics under a banner.
separator = "="*50
print(separator)
print("HASIL EVALUASI MODEL NAIVE BAYES")
print(separator)
print(f"Accuracy:  {accuracy:.4f} ({accuracy*100:.2f}%)")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1-Score:  {f1:.4f}")

In [None]:
# Per-class precision / recall / F1 table for the test split.
print("\nClassification Report:")
print("="*50)
report_text = classification_report(y_test, y_pred)
print(report_text)

In [None]:
# Confusion matrix for the test split (rows: actual label, columns:
# predicted label).
# The label order is pinned to nb_model.classes_ so the matrix always has one
# row and column per class known to the model, in the same order that the
# heatmap uses for its tick labels. Without `labels`, a class that appears in
# neither y_test nor y_pred would be dropped and the axes would not line up.
cm = confusion_matrix(y_test, y_pred, labels=nb_model.classes_)
print("\nConfusion Matrix:")
print(cm)

In [None]:
# Draw the confusion matrix as an annotated heatmap, save it as a PNG and
# show it inline.
figure_path = 'output/confusion_matrix_nb.png'

# This notebook only creates the output/ tree later, when the model is
# saved, so make sure the target directory exists here. Otherwise savefig
# raises FileNotFoundError on a fresh checkout.
os.makedirs(os.path.dirname(figure_path), exist_ok=True)

plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,   # write the count inside every cell
    fmt='d',      # format the annotations as integers
    cmap='Blues',
    xticklabels=nb_model.classes_,
    yticklabels=nb_model.classes_
)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix - Naive Bayes')
plt.tight_layout()
plt.savefig(figure_path, dpi=150, bbox_inches='tight')
plt.show()

## 5. Prediksi pada Seluruh Data

In [None]:
# Run the classifier over every row. X covers the whole dataframe, so this
# includes the rows the model was trained on.
all_predictions = nb_model.predict(X)
df['predicted_sentiment'] = all_predictions

# Cross-tabulate the clustering-derived label against the model prediction.
print("Perbandingan label K-Means vs Prediksi Naive Bayes:")
comparison = pd.crosstab(df['sentiment'], df['predicted_sentiment'])
print(comparison)

In [None]:
# Count the rows where the prediction equals the original label and express
# that count as a percentage of all rows.
labels_agree = df['sentiment'] == df['predicted_sentiment']
match_count = labels_agree.sum()
match_pct = match_count / len(df) * 100
print(f"\nKesesuaian K-Means vs Naive Bayes: {match_count}/{len(df)} ({match_pct:.2f}%)")

In [None]:
# Number of rows assigned to each predicted sentiment.
print("\nDistribusi Prediksi Naive Bayes:")
predicted_counts = df['predicted_sentiment'].value_counts()
print(predicted_counts)

## 6. Simpan Model dan Hasil

In [None]:
# Persist the fitted classifier with pickle, creating the target directory
# first if it does not exist yet.
model_dir = 'output/models'
os.makedirs(model_dir, exist_ok=True)

with open('output/models/naive_bayes_model.pkl', 'wb') as model_file:
    pickle.dump(nb_model, model_file)

print("Model Naive Bayes berhasil disimpan ke output/models/naive_bayes_model.pkl")

In [None]:
# Write the dataframe, now including the predicted_sentiment column, to CSV
# without the index column.
classification_csv = "data/hasil_klasifikasi.csv"
df.to_csv(classification_csv, index=False)
print("Data berhasil disimpan ke data/hasil_klasifikasi.csv")

In [None]:
# Show the first ten rows of the text, its original label and the model's
# prediction side by side (rendered by the notebook as the cell output).
preview_columns = ['cleaned_text', 'sentiment', 'predicted_sentiment']
df[preview_columns].head(10)

## 7. Contoh Prediksi Baru

In [None]:
def predict_sentiment(text, *, vectorizer=None, model=None):
    """Predict the sentiment label of a single new text.

    Args:
        text: The text to classify, as one string. It is handed to the
            vectorizer as-is; no cleaning is applied inside this function.
        vectorizer: Fitted object exposing ``transform``. Defaults to the
            notebook-level ``tfidf_vectorizer``.
        model: Fitted classifier exposing ``predict``, ``predict_proba`` and
            ``classes_``. Defaults to the notebook-level ``nb_model``.

    Returns:
        A ``(prediction, probabilities)`` tuple. ``prediction`` is the
        predicted class label and ``probabilities`` maps every class label
        to its predicted probability as a plain ``float``.

    Raises:
        TypeError: If ``text`` is not a string.
    """
    # Fail early with a clear message instead of an error from deep inside
    # the vectorizer (e.g. when a list of texts or None is passed).
    if not isinstance(text, str):
        raise TypeError(
            f"text must be a single string, got {type(text).__name__}"
        )

    # Fall back to the notebook globals only when nothing was injected, so
    # existing calls of the form predict_sentiment(text) behave as before.
    if vectorizer is None:
        vectorizer = tfidf_vectorizer
    if model is None:
        model = nb_model

    # The vectorizer expects an iterable of documents, hence the one-item list.
    text_tfidf = vectorizer.transform([text])

    # Single input row, so take element 0 of each result.
    prediction = model.predict(text_tfidf)[0]
    probabilities = model.predict_proba(text_tfidf)[0]

    # Convert to built-in floats so the dict prints as plain numbers rather
    # than NumPy scalar reprs.
    return prediction, {
        label: float(prob) for label, prob in zip(model.classes_, probabilities)
    }

# Try the classifier on a few hand-written example sentences.
test_texts = [
    "sertifikasi halal sangat bagus untuk masyarakat",
    "proses sertifikasi halal terlalu rumit dan mahal",
    "informasi tentang sertifikasi halal dari mui",
]

print("Contoh Prediksi:")
print("="*60)
for text in test_texts:
    pred, probs = predict_sentiment(text)
    # One print call, newline-separated: the text, then its predicted label,
    # then the per-class probabilities.
    print(
        f"\nTeks: {text}",
        f"Prediksi: {pred}",
        f"Probabilitas: {probs}",
        sep="\n",
    )

## Ringkasan Hasil Naive Bayes

In [None]:
# Final recap: dataset sizes, the evaluation metrics computed earlier, and
# the share of rows assigned to each predicted sentiment.
banner = "="*60
print(banner)
print("RINGKASAN HASIL NAIVE BAYES CLASSIFICATION")
print(banner)

total_rows = len(df)
print(f"\nJumlah data: {total_rows}")
print(f"Training set: {X_train.shape[0]} samples")
print(f"Testing set: {X_test.shape[0]} samples")

print("\nMetrik Evaluasi:")
print(f"  Accuracy:  {accuracy:.4f} ({accuracy*100:.2f}%)")
print(f"  Precision: {precision:.4f}")
print(f"  Recall:    {recall:.4f}")
print(f"  F1-Score:  {f1:.4f}")

print("\nDistribusi Prediksi:")
prediction_counts = df['predicted_sentiment'].value_counts()
for sentiment, count in prediction_counts.items():
    # Share of all rows that received this predicted label.
    pct = count / total_rows * 100
    print(f"  {sentiment}: {count} ({pct:.1f}%)")