In [None]:
# 導入函式庫以及讀取資料
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, export_text, plot_tree

# Load the dataset from a CSV file located in the working directory.
csv_path = "water_potability.csv"
df = pd.read_csv(csv_path)

In [None]:
# Feature distribution analysis: draw one histogram per column of df.
histogram_size = (15, 15)
df.hist(figsize=histogram_size)
plt.show()

In [None]:
# Feature dependency analysis: annotated heatmap of the pairwise correlation
# matrix returned by df.corr().
plt.figure(figsize=(8, 8))
sns.heatmap(df.corr(), annot=True, linewidths=1, square=True)
plt.title("Feature Correlation Matrix")
# Render explicitly, like the other plotting cells in this notebook, so the
# cell does not echo the Axes repr as its output.
plt.show()

In [None]:
# Impute missing values with the per-column median.
# Show the number of missing entries per column before imputation.
print(df.isnull().sum())
print("----------------")
print("填補空缺值...")
# Assign the filled columns back to the frame instead of calling
# `df[col].fillna(..., inplace=True)`: that form operates on an intermediate
# Series, emits a FutureWarning on pandas 2.x and leaves `df` unchanged once
# Copy-on-Write is active.
# NOTE(review): the medians are computed on the whole dataset, before the
# train/test split performed in a later cell, so test rows influence the
# imputed values. Confirm whether that is acceptable for this analysis.
columns_to_impute = ["ph", "Sulfate", "Trihalomethanes"]
df[columns_to_impute] = df[columns_to_impute].fillna(df[columns_to_impute].median())
print("----------------")
print("確認填補結果：")
# Show the missing-value counts again to confirm the imputation took effect.
print(df.isnull().sum())

In [None]:
# Separate the target column from the feature columns.
target_column = "Potability"
y = df[target_column]
X = df.drop(columns=[target_column])
# Hold out 20% of the rows as the test set; the seed keeps the split repeatable.
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
# Decision tree
# Hyper-parameter grid: tree depth 1..20 and 1..n_features candidate features
# per split.
model_param = {
    "max_depth": list(range(1, 21)),
    "max_features": list(range(1, len(X.columns) + 1)),
}
# A fixed random_state makes the search reproducible: the tree samples the
# features it considers at each split, so without a seed the selected
# parameters and every score below change from run to run.
grid_search = GridSearchCV(DecisionTreeClassifier(random_state=42), model_param, cv=5)
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)
print(grid_search.best_score_)
best_param = grid_search.best_params_
# Build the decision tree with the selected hyper-parameters (same seed as
# the search so the reported scores can be reproduced).
model = DecisionTreeClassifier(
    max_features=best_param["max_features"],
    max_depth=best_param["max_depth"],
    random_state=42,
)
model.fit(X_train, y_train)
# Predict on the held-out test set.
y_pred = model.predict(X_test)
# Evaluate model performance.
accuracy = accuracy_score(y_test, y_pred)
print(f"\nDecisionTree Accuracy: {accuracy}\n")
# Classification report (per-class precision / recall / F1).
class_report = classification_report(y_test, y_pred)
print("DecisionTree Classification Report:")
print(class_report)
# Confusion matrix.
conf_matrix = confusion_matrix(y_test, y_pred)
# Draw the confusion matrix as a seaborn heatmap.
plt.figure(figsize=(5, 3))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt="d",
    cmap="YlOrBr",
    cbar=False,
    xticklabels=["Non-Potable", "Potable"],
    yticklabels=["Non-Potable", "Potable"],
)
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("DecisionTree Confusion Matrix")
plt.show()
# Feature names used to label the tree nodes.
feature_names = X.columns.tolist()
# Visualize the fitted decision tree.
plt.figure(figsize=(15, 10))
plot_tree(
    model,
    feature_names=feature_names,
    class_names=["Non-Potable", "Potable"],
    filled=True,
    rounded=True,
)
plt.show()


# Cross-validation of the selected configuration on the full dataset
# (cross_val_score fits a fresh clone of `model` for each fold).
cross_val_scores = cross_val_score(model, X, y, cv=5)
print(f"Cross-Validation Scores: {cross_val_scores}")
print(f"Mean Accuracy: {cross_val_scores.mean()}")

In [None]:
# KNearestNeighbor
# Standardize the features: fit the scaler on the training split only and
# reuse its statistics for the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


# Grid search for the best K (1..20) with 5-fold cross-validation.
model_param = {"n_neighbors": range(1, 21)}
grid_search = GridSearchCV(KNeighborsClassifier(), model_param, cv=5)
grid_search.fit(X_train_scaled, y_train)
print(grid_search.best_params_)
print(grid_search.best_score_)
best_k = grid_search.best_params_["n_neighbors"]


# Fit a new KNN model with the best K.
knn = KNeighborsClassifier(n_neighbors=best_k)
knn.fit(X_train_scaled, y_train)


# Predict on the standardized test set.
y_pred_best = knn.predict(X_test_scaled)


# Evaluate model performance.
accuracy = accuracy_score(y_test, y_pred_best)
classification_rep = classification_report(y_test, y_pred_best)


# Accuracy and classification report.
print(f"\nKNN Accuracy: {accuracy}\n")
print(f"KNN Classification Report:\n{classification_rep}")


# Draw the confusion matrix with seaborn.
conf_matrix_best = confusion_matrix(y_test, y_pred_best)
plt.figure(figsize=(5, 3))
sns.heatmap(
    conf_matrix_best,
    annot=True,
    fmt="d",
    cmap="YlOrBr",
    cbar=False,
    xticklabels=["Non-Potable", "Potable"],
    yticklabels=["Non-Potable", "Potable"],
)
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("KNN Confusion Matrix")
plt.show()


# Cross-validation.
# The model above was tuned and trained on standardized features, so the
# cross-validation must standardize as well. Passing the raw X to a bare KNN
# would score a different (unscaled) model. The pipeline refits the scaler
# inside every fold, so no fold sees statistics from its own validation rows.
knn_cv_pipeline = make_pipeline(
    StandardScaler(), KNeighborsClassifier(n_neighbors=best_k)
)
cv_val_scores = cross_val_score(knn_cv_pipeline, X, y, cv=5)
print(f"\nCross-Validation Scores: {cv_val_scores}")
print(f"Mean Accuracy: {cv_val_scores.mean()}")

In [None]:
# Use two features ("ph" and "Hardness") so the decision regions can be drawn
# in a plane.
X_2d = X[["ph", "Hardness"]]
# Model configuration: same K as selected by the grid search.
knn_vis = KNeighborsClassifier(n_neighbors=best_k)
# Train on plain arrays: the mesh passed to predict() below is an ndarray, and
# fitting on a DataFrame would make scikit-learn warn about missing feature
# names at prediction time.
knn_vis.fit(X_2d.to_numpy(), y)
# Build the evaluation mesh with a fixed number of points per axis. A fixed
# step in raw feature units (previously 0.02 on both axes) makes the number of
# KNN predictions depend on the feature ranges and can reach millions of
# points for a wide-ranging feature.
grid_points = 300
x_min, x_max = X_2d.iloc[:, 0].min() - 1, X_2d.iloc[:, 0].max() + 1
y_min, y_max = X_2d.iloc[:, 1].min() - 1, X_2d.iloc[:, 1].max() + 1
xx, yy = np.meshgrid(
    np.linspace(x_min, x_max, grid_points),
    np.linspace(y_min, y_max, grid_points),
)
# Predict a class for every mesh point, then restore the mesh shape.
Z = knn_vis.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
# Draw the decision regions and overlay the data points coloured by class.
plt.figure(figsize=(10, 6))
plt.contourf(xx, yy, Z, cmap=plt.cm.coolwarm, alpha=0.3)
plt.scatter(
    X_2d.iloc[:, 0], X_2d.iloc[:, 1], c=y, cmap=plt.cm.coolwarm, edgecolors="k", s=50
)
plt.xlabel("ph")
plt.ylabel("Hardness")
plt.title(f"KNN Decision Boundaries (k={best_k})")
plt.show()

In [None]:
# Logistic regression
# Standardize the features with a scaler fitted on the training split only.
# On a common scale the coefficients can be compared with each other, and the
# default solver is less likely to stop before converging.
lr_scaler = StandardScaler()
X_train_lr = lr_scaler.fit_transform(X_train)
X_test_lr = lr_scaler.transform(X_test)
lr_model = LogisticRegression()
lr_model.fit(X_train_lr, y_train)
lr_y = lr_model.predict(X_test_lr)
# accuracy_score expects (y_true, y_pred).
print(accuracy_score(y_test, lr_y))
# Pair every feature name with its fitted coefficient (coef_[0] is the single
# coefficient row read by the original code).
feat_list = list(zip(X.columns, lr_model.coef_[0]))
# Build a DataFrame of feature names and coefficients.
df_imp = pd.DataFrame(feat_list, columns=["FEATURE", "COEFFICIENT"])
# Last expression of the cell: shown as output, sorted from the largest to
# the smallest coefficient.
df_imp.sort_values(by="COEFFICIENT", ascending=False)