In [None]:
# 導入所需庫
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# File locations (kept consistent with main_pipeline.py):
#   DATA_PATH       - headerless SMS spam CSV read by the loading cell
#   MODEL_PATH      - destination of the pickled classifier
#   VECTORIZER_PATH - destination of the pickled TF-IDF vectorizer
DATA_PATH, MODEL_PATH, VECTORIZER_PATH = (
    '../data/sms_spam_no_header.csv',
    '../model/spam_classifier_mnb.pkl',
    '../model/tfidf_vectorizer.pkl',
)

In [None]:
print("--- 1. 數據載入與準備 ---")

# The CSV has no header row, so read it with header=None and name the first
# two columns by hand: column 0 is the label, column 1 is the message text.
df = (
    pd.read_csv(DATA_PATH, encoding='latin-1', header=None)
    .rename(columns={0: 'label', 1: 'text'})
    .loc[:, ['label', 'text']]
)

# Encode labels: ham=0, spam=1.
# Series.map turns every value missing from the mapping into NaN, so validate
# here with a clear message instead of failing later inside model.fit.
label_codes = df['label'].map({'ham': 0, 'spam': 1})
unknown_labels = df.loc[label_codes.isna(), 'label'].unique().tolist()
if unknown_labels:
    raise ValueError(
        f"Unexpected label value(s) in {DATA_PATH}: {unknown_labels[:5]!r}; "
        "expected only 'ham' or 'spam'."
    )
df = df.assign(label=label_codes)

# A missing message would only surface later as an error inside the TF-IDF
# vectorizer; report it here, where the cause is obvious.
missing_text_rows = int(df['text'].isna().sum())
if missing_text_rows:
    raise ValueError(
        f"{missing_text_rows} row(s) in {DATA_PATH} have no message text."
    )

# Basic dataset summary. `.get(..., 0)` keeps the report working even when
# one of the two classes is absent from the file.
label_counts = df['label'].value_counts()
print(f"總數據筆數: {len(df)}")
print(f"非垃圾郵件 (HAM): {label_counts.get(0, 0)} 筆")
print(f"垃圾郵件 (SPAM): {label_counts.get(1, 0)} 筆")

# Train/test split (80/20, fixed seed for reproducibility).
# NOTE(review): the split is not stratified; the parameters are left as-is so
# the notebook stays consistent with main_pipeline.py — confirm before changing.
X = df['text']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"\n訓練集大小: {len(X_train)}")
print(f"測試集大小: {len(X_test)}")

In [None]:
print("--- 2. 特徵工程 (TF-IDF) ---")

# Fit the TF-IDF vocabulary on the training split only, then apply the fitted
# transform to the test split so no test-set information leaks into the features.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Save the fitted vectorizer for use by Streamlit. The output directory is
# derived from VECTORIZER_PATH (rather than hard-coded) so that changing the
# constant cannot leave joblib.dump pointing at a directory that was never created.
vectorizer_dir = os.path.dirname(VECTORIZER_PATH)
if vectorizer_dir:  # dirname is '' for a bare filename; os.makedirs('') would raise
    os.makedirs(vectorizer_dir, exist_ok=True)
joblib.dump(tfidf_vectorizer, VECTORIZER_PATH)

print("TF-IDF Vectorizer 訓練完成並已保存。")

In [None]:
print("--- 3. 模型訓練 (Multinomial Naive Bayes) ---")

# Train a Multinomial Naive Bayes classifier on the TF-IDF training features.
model = MultinomialNB()
model.fit(X_train_tfidf, y_train)

# Save the trained model. Create the target directory from MODEL_PATH itself
# so this cell does not depend on another cell having created it.
model_dir = os.path.dirname(MODEL_PATH)
if model_dir:  # dirname is '' for a bare filename; os.makedirs('') would raise
    os.makedirs(model_dir, exist_ok=True)
joblib.dump(model, MODEL_PATH)

# Report the path that was actually written instead of a hard-coded string
# that can drift away from MODEL_PATH.
print(f"模型訓練完成並已保存到 '{MODEL_PATH}'。")

In [None]:
print("--- 4. 模型評估與指標計算 ---")

# Predict labels for the held-out test features.
y_pred = model.predict(X_test_tfidf)

# Score the predictions: the four scalar metrics are computed in the order
# accuracy, precision, recall, F1, followed by the confusion matrix.
accuracy, precision, recall, f1 = (
    score_fn(y_test, y_pred)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score)
)
cm = confusion_matrix(y_test, y_pred)

# Print each metric with four decimal places.
for metric_name, metric_value in (
    ("✅ 準確度 (Accuracy)", accuracy),
    ("✅ 精確度 (Precision)", precision),
    ("✅ 召回率 (Recall)", recall),
    ("✅ F1 分數 (F1 Score)", f1),
):
    print(f"{metric_name}: {metric_value:.4f}")

In [None]:
print("--- 5. 混淆矩陣可視化 ---")

# Draw the confusion matrix with the compact figure size and font sizes that
# were tuned for the Streamlit view, so the plot stays small but legible.
fig, ax = plt.subplots(figsize=(4, 3.5))

predicted_ticks = ['HAM (Pred 0)', 'SPAM (Pred 1)']
actual_ticks = ['HAM (Actual 0)', 'SPAM (Actual 1)']

sns.heatmap(
    cm,
    ax=ax,
    annot=True,
    fmt='d',                     # show cell values as integers
    annot_kws={"fontsize": 14},  # font size of the numbers inside the cells
    cmap='Blues',
    cbar=False,
    xticklabels=predicted_ticks,
    yticklabels=actual_ticks,
)

# Axis labels and tick font sizes; y tick labels are kept horizontal.
ax.set_xlabel('Predicted Label', fontsize=12)
ax.set_ylabel('Actual Label', fontsize=12)
ax.tick_params(axis='x', labelsize=10)
ax.tick_params(axis='y', labelsize=10, rotation=0)

ax.set_title("Confusion Matrix for Test Set", fontsize=14)
fig.tight_layout()

# Render the figure.
plt.show()