In [15]:
# Import library yang digunakan
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import pickle # Untuk menyimpan model

# --- A. Base estimators ---
# Three different linear-style text classifiers; the two that accept a seed
# are pinned to random_state=42 for reproducible runs.
clf1, clf2, clf3 = (
    MultinomialNB(),
    LogisticRegression(solver='liblinear', random_state=42),
    LinearSVC(random_state=42),
)

# --- B. Combine them in a hard-voting ensemble ---
# With voting='hard' the final label is the majority vote over the labels
# predicted by the three base models.
estimator_names = ('mnb', 'lr', 'svc')
voting_model = VotingClassifier(
    estimators=list(zip(estimator_names, (clf1, clf2, clf3))),
    voting='hard',
)

# --- C. Train the ensemble ---
# X_train_vec / y_train come from earlier notebook cells.
print("Mulai training Voting Classifier...")
voting_model = voting_model.fit(X_train_vec, y_train)  # fit() returns the estimator itself
print("Training selesai.")

# --- D. Predict on the held-out test features ---
y_pred_ensemble = voting_model.predict(X_test_vec)

# --- E. Evaluation and visualisation ---

# 1. Headline metrics
# Compute everything first, then print. The report is built twice on purpose:
# the dict form feeds the bar plot further down, the text form is for display.
accuracy = accuracy_score(y_test, y_pred_ensemble)
report = classification_report(y_test, y_pred_ensemble, output_dict=True)
report_text = classification_report(y_test, y_pred_ensemble)

print("\n--- Evaluasi Model Ensemble (Voting Classifier) pada Data Test ---")
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:\n", report_text)

# 2. Confusion-matrix heatmap
# Take the class order from the fitted ensemble and hand the same list to both
# confusion_matrix and the heatmap. This guarantees each tick label describes
# the row/column it sits next to, even if a class is absent from the test
# split or the label set changes; a hard-coded ['ham', 'spam'] list would be
# silently wrong (or fail on a shape mismatch) in those cases.
class_labels = list(voting_model.classes_)
cm_ensemble = confusion_matrix(y_test, y_pred_ensemble, labels=class_labels)

plt.figure(figsize=(6, 5))
sns.heatmap(cm_ensemble, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_labels,
            yticklabels=class_labels)
plt.xlabel('Prediksi (Predicted)')
plt.ylabel('Asli (Actual)')
plt.title('Confusion Matrix - Voting Classifier (Hard)')
plt.tight_layout()  # same layout handling as the metrics bar plot
plt.savefig('confusion_matrix_voting.png')
plt.close()  # release the figure so it is not rendered inline a second time

# 3. Per-class performance bar plot
# The transposed report has one row per class followed by three summary rows
# (accuracy, macro avg, weighted avg); drop those so only the classes remain.
summary_row_count = 3
plotted_metrics = ['precision', 'recall', 'f1-score']

report_table = pd.DataFrame(report).T
df_report = report_table.iloc[:-summary_row_count].rename_axis('Class')

# Long format (one row per class/metric pair) is what seaborn's hue grouping expects.
long_report = df_report.reset_index().melt(
    id_vars='Class', var_name='Metric', value_name='Score'
)

# Keep the main metrics only ('support' is a count, not a 0-1 score).
df_report_melted = long_report.loc[long_report['Metric'].isin(plotted_metrics)]

fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(data=df_report_melted, x='Metric', y='Score', hue='Class', ax=ax)
ax.set_title('Metrik Performa per Kelas (Voting Classifier)')
ax.set_ylabel('Score')
ax.set_ylim(0.0, 1.0)
ax.tick_params(axis='x', labelrotation=0)
ax.legend(title='Kategori')
fig.tight_layout()
fig.savefig('metrics_bar_plot_voting.png')
plt.close(fig)

# --- F. Model persistence (IMPORTANT for the Streamlit app) ---
# Pickle the vectorizer and the ensemble model, each to its own .pkl file.
# NOTE(review): `vectorizer` is defined in an earlier cell; presumably it is
# the fitted TF-IDF vectorizer that produced X_train_vec -- confirm.
artifacts = (
    ('tfidf_vectorizer.pkl', vectorizer),           # vectorizer
    ('voting_classifier_model.pkl', voting_model),  # ensemble model
)
for artifact_path, artifact in artifacts:
    with open(artifact_path, 'wb') as file:
        pickle.dump(artifact, file)
print("\nModel dan Vectorizer berhasil disimpan sebagai file .pkl.")

Mulai training Voting Classifier...
Training selesai.

--- Evaluasi Model Ensemble (Voting Classifier) pada Data Test ---
Accuracy: 0.9884

Classification Report:
               precision    recall  f1-score   support

         ham       0.99      0.99      0.99        86
        spam       0.99      0.99      0.99        86

    accuracy                           0.99       172
   macro avg       0.99      0.99      0.99       172
weighted avg       0.99      0.99      0.99       172


Model dan Vectorizer berhasil disimpan sebagai file .pkl.
