In [None]:
# 導入函式庫以及讀取資料
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.tree import export_text, plot_tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
import numpy as np

# Load the water potability dataset from a CSV file given by a relative path
# (resolved against the notebook's working directory).
df = pd.read_csv("water_potability.csv")

In [None]:
# Feature distribution analysis: draw one histogram per column of df
# on a single large figure.
hist_figsize = (15, 15)
df.hist(figsize=hist_figsize)
plt.show()

In [None]:
# Feature dependency analysis: annotated heatmap of the pairwise
# correlation matrix returned by df.corr().
plt.figure(figsize=(8, 8))
sns.heatmap(df.corr(), annot=True, linewidths=1, square=True)
# Render the figure explicitly, as the other plotting cells do; without this
# the cell's last expression echoes the Axes repr alongside the plot.
plt.show()

In [None]:
# Fill missing values.
# Report the number of missing entries per column before imputation.
print(df.isnull().sum())
print("----------------")
print("填補空缺值...")
# Impute each incomplete column with its own median.
# Assign the result back to df instead of calling
# df[col].fillna(..., inplace=True): the latter operates on a column selection
# (chained assignment), which warns in pandas 2.x and does not update df at
# all once Copy-on-Write is the default.
columns_with_gaps = ["ph", "Sulfate", "Trihalomethanes"]
df[columns_with_gaps] = df[columns_with_gaps].fillna(df[columns_with_gaps].median())
# NOTE(review): the medians are computed over the full dataset, before the
# train/test split performed in a later cell, so test rows influence the
# imputed values. Consider imputing after the split using training medians.
print("----------------")
print("確認填補結果：")
# Report the per-column missing counts again to confirm the imputation.
print(df.isnull().sum())

In [None]:
# Separate the target column ("Potability") from the feature columns.
y = df["Potability"]
X = df.drop(columns=["Potability"])
# Hold out 20% of the rows as the test set; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# SVM
# SVMs are not scale-invariant, so standardize the features before fitting.
# The scaler is fitted on the training split only and then applied to the
# test split, so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize the SVM classifier with a linear kernel.
svm_classifier = SVC(kernel="linear")

# Train the SVM model on the standardized training features.
svm_classifier.fit(X_train_scaled, y_train)

# Predict on the (identically standardized) test set.
y_pred = svm_classifier.predict(X_test_scaled)

# Evaluate model accuracy.
accuracy = accuracy_score(y_test, y_pred)
print(f"模型準確率：{accuracy:.2f}")

# Show the classification report.
# zero_division=0 reports 0 for any class that is never predicted (the same
# value the default produces) without emitting an undefined-metric warning.
print("分類報告：")
print(classification_report(y_test, y_pred, zero_division=0))

# Confusion matrix (rows: true labels, columns: predicted labels).
conf_matrix = confusion_matrix(y_test, y_pred)

# Display the confusion matrix as an annotated heatmap.
plt.figure(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt="d",
    cmap="Blues",
    cbar=False,
    xticklabels=["Not Potable", "Potable"],
    yticklabels=["Not Potable", "Potable"],
)
plt.title("confusion_matrix")
plt.xlabel("predicted")
plt.ylabel("true")
plt.show()


# Pair plot of the dataset, colored by the target class.
# Use the existing "Potability" column as the hue instead of copying it into
# a new "Target" column: mutating df would add a duplicate target variable to
# the plot grid and would leak the target into X if the feature/target split
# cell were re-run afterwards.
pair_grid = sns.pairplot(df, hue="Potability", markers=["o", "s"])
# pairplot builds a grid of subplots, so plt.title would only label the last
# subplot; set a figure-level title instead (y > 1 places it above the grid).
pair_grid.figure.suptitle("Pair Plot for Multi-dimensional Data", y=1.02)
plt.show()

# Get the feature weights of the fitted linear SVM.
# coef_[0] is the first row of the coefficient matrix — assumes a binary
# target so that this single row covers the whole model (TODO confirm).
feature_importance = svm_classifier.coef_[0]

# Plot the absolute value of each weight so the bars match the
# "Coefficient Magnitude" axis label (the raw coefficients are signed).
# NOTE(review): magnitudes are only comparable across features when the model
# was fitted on features that share a common scale.
coefficient_magnitude = np.abs(feature_importance)
bar_positions = range(len(coefficient_magnitude))
plt.bar(bar_positions, coefficient_magnitude)
plt.xticks(bar_positions, X.columns, rotation=45)
plt.xlabel("Features")
plt.ylabel("Coefficient Magnitude")
plt.title("Feature Importance for SVM")
plt.show()

In [None]:
# RandomForest
# Build a 100-tree random forest with a fixed seed and fit it on the training
# split (fit returns the estimator itself, so construction and training chain).
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train, y_train
)

# Predict labels for the held-out test set.
y_pred = rf_classifier.predict(X_test)

# Compute and print the model accuracy.
accuracy = accuracy_score(y_test, y_pred)
print(f"模型的準確度: {accuracy}")

# Print the classification report.
report = classification_report(y_test, y_pred)
print("分類報告:\n", report)

# Confusion matrix (rows: true labels, columns: predicted labels).
conf_matrix = confusion_matrix(y_test, y_pred)

# Draw the confusion matrix as an annotated heatmap on an explicit Axes.
class_labels = ["Not Potable", "Potable"]
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt="d",
    cmap="Blues",
    cbar=False,
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_title("confusion_matrix")
ax.set_xlabel("predicted")
ax.set_ylabel("true")
plt.show()