In [None]:
# Drop rows with missing values, then balance the two Potability classes

import pandas as pd


# Load the data set and discard every row that contains a missing value.
# reset_index(drop=True) renumbers the rows 0..n-1 without creating the
# temporary "index" column that previously had to be dropped again.
df = pd.read_csv("water_potability.csv").dropna().reset_index(drop=True)

# Balance the two classes by removing the earliest rows of whichever class
# is over-represented. A single vectorised drop replaces the old row-by-row
# loop, which was quadratic and could only remove class 0 rows (it raised
# IndexError as soon as class 1 was the majority).
n_not_potable = int((df["Potability"] == 0).sum())
n_potable = int((df["Potability"] == 1).sum())
if n_not_potable != n_potable:
    majority_label = 0 if n_not_potable > n_potable else 1
    surplus = abs(n_not_potable - n_potable)
    surplus_rows = df.index[df["Potability"] == majority_label][:surplus]
    df = df.drop(surplus_rows)

In [None]:
# 特徵分布分析
from matplotlib import pyplot as plt


# Draw a grid of histograms for the columns of df on a 15x15-inch figure
# so each feature's distribution can be inspected visually.
df.hist(figsize=(15, 15))
plt.show()

In [None]:
# 分析特徵相依性
# 正數為有正關聯，負數則為負關聯
import numpy as np
import seaborn as sns

plt.figure(figsize=(8, 8))
corr = df.corr()
# Boolean mask that hides the diagonal and the upper triangle, so every
# feature pair is shown exactly once. The mask is built from an all-True
# array rather than from the correlation values themselves: a coefficient
# that happens to be exactly 0 would otherwise be treated as "not masked".
corrMask = np.triu(np.ones_like(corr, dtype=bool))

sns.heatmap(
    corr,
    linewidths=1,
    annot=True,
    square=True,
    mask=corrMask,
    cmap="Blues",
)
plt.show()

In [None]:
# 切分特徵和目標變量
from sklearn.model_selection import train_test_split


# Separate the feature columns from the target label.
X = df.drop("Potability", axis=1)
y = df["Potability"]
# Hold out a test split. A fixed random_state makes the split (and every
# score computed from it) reproducible after Restart & Run All; stratify=y
# keeps the class ratio identical in the training and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, stratify=y
)

In [None]:
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV


def grid_search_best_params(model_name):
    """
    Tune a classifier with GridSearchCV and return the best fitted estimator.

    The training data is not passed in: the function reads the notebook-level
    ``X_train`` and ``y_train`` variables, so the train/test split cell must
    have been executed first.

    Parameters:
    - model_name: Name of the model (case-insensitive). One of "Decision Tree",
      "Random Forest", "SVM", "Logistic Regression", "Bagging Classifier", "KNN".

    Returns:
    - trained_model: The estimator refit on the full training set with the best
      hyperparameters found, or None when ``model_name`` is not recognised.
    """
    # Map names to classes (not instances) so only the requested estimator
    # is constructed.
    model_classes = {
        "decision tree": DecisionTreeClassifier,
        "random forest": RandomForestClassifier,
        "svm": SVC,
        "logistic regression": LogisticRegression,
        "bagging classifier": BaggingClassifier,
        "knn": KNeighborsClassifier,
    }

    print(f"Searching for best hyperparameters for {model_name}...")

    # Normalise once and use the same key for both lookups. Previously the
    # grid lookup received the original spelling (e.g. "Decision Tree"), never
    # matched the lowercase keys, and the search silently ran on an empty grid.
    model_key = model_name.lower()
    model_class = model_classes.get(model_key)

    if model_class is None:
        print(f"Model '{model_name}' not found.")
        return None

    param_grid = get_hyperparameter_grid(model_key)

    # 5-fold cross-validated search. With the default refit=True, GridSearchCV
    # retrains the best configuration on the whole training set, so no second
    # manual fit is needed afterwards.
    grid_search = GridSearchCV(model_class(), param_grid, cv=5)
    grid_search.fit(X_train, y_train)

    best_params = grid_search.best_params_

    print(f"Best hyperparameters for {model_name}: {best_params}")
    print(
        f"Best cross-validation score for {model_name}: {grid_search.best_score_:.2f}\n"
    )

    return grid_search.best_estimator_


def get_hyperparameter_grid(model_name):
    """
    Define hyperparameter grid for each model.
    Modify this function to add more models and their respective hyperparameter grids.

    Parameters:
    - model_name: Name of the model. Matching is case-insensitive, so
      "Decision Tree" and "decision tree" select the same grid.

    Returns:
    - param_grid: Hyperparameter grid for the specified model, or an empty
      dict when the name is unknown.
    """
    param_grids = {
        "decision tree": {"max_depth": [3, 5, 7], "min_samples_split": [2, 5, 10]},
        "random forest": {"n_estimators": [50, 100, 200], "max_depth": [None, 5, 10]},
        # "probability" is intentionally not searched: it was listed as the
        # strings "True"/"False" (not booleans, and rejected by scikit-learn
        # versions that validate parameters), and it only controls whether
        # predict_proba is available, so it cannot change the CV score.
        "svm": {
            "C": [1, 10, 100],
            "kernel": ["linear", "rbf"],
            "gamma": ["scale", "auto"],
        },
        # The default lbfgs solver does not support the l1 penalty, which made
        # half of the candidates fail; liblinear supports both l1 and l2.
        "logistic regression": {
            "C": [0.001, 0.01, 0.1, 1, 10],
            "penalty": ["l1", "l2"],
            "solver": ["liblinear"],
        },
        "bagging classifier": {"n_estimators": [50, 100, 200]},
        "knn": {"n_neighbors": [3, 5, 7], "weights": ["uniform", "distance"]},
    }
    return param_grids.get(model_name.lower(), {})

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import cross_val_score
import numpy as np


def evaluate_model(model):
    """
    Fit a classifier, report its test-set performance and plot diagnostics.

    The data is taken from the notebook-level variables ``X_train``,
    ``y_train``, ``X_test`` and ``y_test``; only the estimator is passed in.
    The estimator is (re)fitted on the training split before predicting.

    Parameters:
    - model: The machine learning model to be trained and evaluated.

    Returns:
    - accuracy: Accuracy of the trained model on the test set.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    # Text summary: per-class precision / recall / F1.
    print("Classification Report:")
    print(classification_report(y_test, predictions))

    # Confusion matrix heatmap, with both axes labelled by the classes
    # present in the test labels.
    class_labels = np.unique(y_test)
    cm = confusion_matrix(y_test, predictions)
    plt.figure(figsize=(8, 6))
    sns.heatmap(
        cm,
        annot=True,
        fmt="d",
        cmap="Blues",
        cbar=False,
        xticklabels=class_labels,
        yticklabels=class_labels,
    )
    plt.title("Confusion Matrix")
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.show()

    # Pick whichever per-feature weights the estimator exposes: importances
    # take precedence, coefficients (first row) are the fallback, and
    # estimators with neither get no bar chart.
    if hasattr(model, "feature_importances_"):
        bar_spec = (model.feature_importances_, "Importance", "Feature Importance")
    elif hasattr(model, "coef_"):
        bar_spec = (model.coef_[0], "Coefficient", "Feature Coefficients")
    else:
        bar_spec = None

    if bar_spec is not None:
        heights, y_label, chart_title = bar_spec
        plt.bar(X_train.columns, heights)
        plt.xlabel("Features")
        plt.ylabel(y_label)
        plt.title(chart_title)
        plt.xticks(rotation=45, ha="right")
        plt.show()

    accuracy = accuracy_score(y_test, predictions)
    print(f"Model Accuracy: {accuracy:.2f}")

    return accuracy


# Example usage:
# evaluate_model reads 'X_train', 'X_test', 'y_train' and 'y_test' from the notebook's
# global variables, so run the train/test split cell first and pass only the estimator:
# accuracy = evaluate_model(your_model)

In [None]:
# 決策樹
# 對水質數據進行分類

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.tree import DecisionTreeClassifier, plot_tree

# Build the decision tree model: grid-search its hyperparameters on the
# training split, then report its metrics on the test split.
dt = grid_search_best_params(model_name="Decision Tree")
dtAccuracy = evaluate_model(model=dt)

In [None]:
# RandomForest
# 創建一個隨機森林分類器
from sklearn.ensemble import RandomForestClassifier


# Grid-search a random forest on the training split, then report its
# metrics on the test split.
rf = grid_search_best_params(model_name="Random Forest")
rfAccuracy = evaluate_model(model=rf)

In [None]:
# 邏輯回歸
from sklearn.linear_model import LogisticRegression


# Grid-search a logistic regression on the training split, then report its
# metrics on the test split.
lr = grid_search_best_params(model_name="Logistic Regression")
lrAccuracy = evaluate_model(model=lr)

In [None]:
# BaggingClassifier
from sklearn.ensemble import BaggingClassifier


# Grid-search a bagging classifier on the training split, then report its
# metrics on the test split.
bagging = grid_search_best_params(model_name="Bagging Classifier")
baggingAccuracy = evaluate_model(model=bagging)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.inspection import permutation_importance


def _plot_permutation_importance(importance, feature_names):
    """Draw a bar chart with one bar per feature for the given importance scores."""
    plt.figure(figsize=(8, 6))
    plt.bar(feature_names, importance)
    plt.xlabel("Features")
    plt.ylabel("Mean decrease in score")
    plt.title("Permutation Importance")
    plt.xticks(rotation=45, ha="right")
    plt.show()


def explain_model(model):
    """
    Explain the predictions of a classifier using Permutation Importance.

    The evaluation data is taken from the notebook-level ``X_test`` and
    ``y_test`` variables; only the fitted estimator is passed in.

    Parameters:
    - model: The trained classifier model.

    Returns:
    - Importance scores (mean Permutation Importance per feature, in the
      column order of ``X_test``).
    """

    # Permutation importance works with any fitted estimator: each feature is
    # shuffled n_repeats times and the resulting score changes are averaged.
    result = permutation_importance(model, X_test, y_test, n_repeats=10, random_state=0)
    importance = result.importances_mean
    # The previous code called plot_feature_importance, which is not defined
    # in this notebook and raised NameError; plot with the local helper instead.
    _plot_permutation_importance(importance, X_test.columns)
    return importance

In [None]:
# KNN

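# NOTE: StandardScaler is imported below, but the features are not actually standardized before fitting KNN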
from sklearn.discriminant_analysis import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

# Grid-search a k-nearest-neighbours classifier on the training split and
# report its metrics on the test split.
knn = grid_search_best_params(model_name="KNN")
knnAccuracy = evaluate_model(model=knn)

# KNN exposes neither feature importances nor coefficients, so inspect it
# through permutation importance instead.
knn_explanation = explain_model(model=knn)

In [None]:
# SVM
# 初始化 SVM 分類器
from sklearn.svm import SVC

# Grid-search a support vector classifier on the training split, then
# report its metrics on the test split.
svm = grid_search_best_params(model_name="SVM")
svmAccuracy = evaluate_model(model=svm)

In [None]:
# Collect the test accuracy of every tuned model, keyed by display name.
accuracy_by_model = {
    "Logistic Regression": lrAccuracy,
    "Decision Tree": dtAccuracy,
    "Random Forest": rfAccuracy,
    "KNN": knnAccuracy,
    "SVM": svmAccuracy,
    "Bagging Classifier": baggingAccuracy,
}
scores = pd.DataFrame(
    {
        "models": list(accuracy_by_model.keys()),
        "accuracy": list(accuracy_by_model.values()),
    }
)
# Show the table sorted from worst to best, shading the accuracy column.
scores.sort_values(by=["accuracy"]).style.background_gradient(subset=["accuracy"])

In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Assume you have trained models: dt, rf, knn, svm, lr, bagging
# Replace them with the actual trained models

# Base estimators tuned in the earlier cells, as (label, estimator) pairs.
models = list(
    {
        "dt": dt,
        "rf": rf,
        "knn": knn,
        "svm": svm,
        "lr": lr,
        "bagging": bagging,
    }.items()
)

# Majority-vote ensemble over all base estimators, fitted on the training split.
voting_model = VotingClassifier(estimators=models, voting="hard")
voting_model.fit(X_train, y_train)

# Score the ensemble on the held-out test split.
voting_predictions = voting_model.predict(X_test)
accuracy = accuracy_score(y_test, voting_predictions)
print(f"Voting Classifier Accuracy: {accuracy:.2f}")

print("Classification Report:")
print(classification_report(y_test, voting_predictions))

# Confusion matrix heatmap with human-readable class names on both axes.
class_names = ["Not Potable", "Potable"]
voting_cm = confusion_matrix(y_test, voting_predictions)
plt.figure(figsize=(8, 6))
cm_ax = sns.heatmap(
    voting_cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    cbar=False,
    xticklabels=class_names,
    yticklabels=class_names,
)
cm_ax.set_title("Voting Classifier Confusion Matrix")
cm_ax.set_xlabel("Prediction")
cm_ax.set_ylabel("Test Data")
plt.show()