# Introduction

This code will build some popular ML models using SKLearn using the same feature engineered dataset to see how well our final FNN performs

In [79]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import random

In [13]:
#set random seeds
random.seed(42)
np.random.seed(42)

In [5]:
#import data
url = 'https://raw.githubusercontent.com/JGasior-AI/WaterPotabilityDeepLearning/refs/heads/main/water_potability_featureengineered2.csv'
df = pd.read_csv(url)
df.head()

Unnamed: 0,1,Solids/Sulfate,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,...,Conductivity Organic_carbon,Conductivity Trihalomethanes,Conductivity Turbidity,Organic_carbon^2,Organic_carbon Trihalomethanes,Organic_carbon Turbidity,Trihalomethanes^2,Trihalomethanes Turbidity,Turbidity^2,Potability
0,1.0,61.695917,8.316766,214.373394,22018.417441,8.059332,356.886136,363.266516,18.436524,100.341674,...,6697.372024,36450.770473,1681.477347,339.905435,1849.951737,85.338441,10068.451614,464.458586,21.425517,0.0
1,1.0,57.971347,9.092223,181.101509,17978.986339,6.5466,310.135738,398.410813,11.558279,31.997993,...,4604.943514,12748.346309,1623.554115,133.593824,369.841742,47.100861,1023.871539,130.394234,16.60624,0.0
2,1.0,88.003036,5.584087,188.313324,28748.687739,7.544869,326.678363,280.467916,8.399735,54.917862,...,2355.856069,15402.698258,717.916032,70.555542,461.295466,21.50087,3015.971549,140.573703,6.552106,0.0
3,1.0,73.031216,10.223862,248.071735,28749.716544,7.513408,393.663396,283.651634,13.789695,84.603556,...,3911.469602,23997.936909,758.197622,190.155697,1166.657262,36.8597,7157.761717,226.144353,7.144869,0.0
4,1.0,45.076331,8.635849,203.361523,13672.091764,4.563009,303.309771,474.607645,12.363817,62.798309,...,5867.961926,29804.557523,2088.949819,152.863963,776.426781,54.418408,3943.627609,276.402029,19.37254,0.0


In [6]:
df.describe()

Unnamed: 0,1,Solids/Sulfate,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,...,Conductivity Organic_carbon,Conductivity Trihalomethanes,Conductivity Turbidity,Organic_carbon^2,Organic_carbon Trihalomethanes,Organic_carbon Turbidity,Trihalomethanes^2,Trihalomethanes Turbidity,Turbidity^2,Potability
count,2011.0,2011.0,2011.0,2011.0,2011.0,2011.0,2011.0,2011.0,2011.0,2011.0,...,2011.0,2011.0,2011.0,2011.0,2011.0,2011.0,2011.0,2011.0,2011.0,2011.0
mean,1.0,67.491504,7.08599,195.968072,21917.441374,7.134338,333.224672,426.526409,14.357709,66.400859,...,6128.139193,28328.060334,1693.980752,217.193672,953.061434,56.956202,4667.419044,263.336378,16.367384,0.403282
std,0.0,30.944347,1.573337,32.635085,8642.239815,1.58482,41.205172,80.712572,3.324959,16.077109,...,1855.420049,8828.715296,469.896324,96.54876,324.909716,17.323296,2156.41573,82.77248,6.233563,0.490678
min,1.0,0.954908,0.227499,73.492234,320.942611,1.390871,129.0,201.619737,2.2,8.577013,...,833.573817,3134.706256,509.729039,4.84,127.974313,9.222966,73.565151,18.712591,2.1025,0.0
25%,1.0,46.022053,6.089723,176.744938,15615.66539,6.138895,307.632511,366.680307,12.124105,55.952664,...,4841.450807,22180.373865,1348.035582,146.993929,720.800272,44.953869,3130.700624,205.532631,11.853667,0.0
50%,1.0,62.99957,7.027297,197.191839,20933.51275,7.143907,332.232177,423.455906,14.322019,66.542198,...,5990.993223,27695.874264,1661.096083,205.120225,932.337186,55.77011,4427.864121,255.851093,15.746429,0.0
75%,1.0,84.026104,8.052969,216.44107,27182.587067,8.109726,359.330555,482.373169,16.683049,77.291925,...,7262.174029,33826.097085,1986.245963,278.324137,1151.832477,68.281524,5974.041765,313.763321,20.37778,1.0
max,1.0,437.896685,14.0,317.338124,56488.672413,13.127,481.030642,753.34262,27.006707,124.0,...,12744.9757,68632.269389,3750.358214,729.362202,2284.878298,134.766473,15376.0,579.133143,42.181759,1.0


In [8]:
#split into X and y
X = df.drop('Potability', axis=1)
y = df['Potability']

In [16]:
#Normalize data
scaler = MinMaxScaler()
X_norm = scaler.fit_transform(X)
X_norm

array([[0.        , 0.13901397, 0.58734916, ..., 0.65315661, 0.79537767,
        0.48212011],
       [0.        , 0.13048979, 0.64365393, ..., 0.06210165, 0.19928185,
        0.36187645],
       [0.        , 0.19922134, 0.38893354, ..., 0.19228354, 0.21744583,
        0.11102017],
       ...,
       [0.        , 0.32651897, 0.81782618, ..., 0.10805756, 0.29061654,
        0.4238594 ],
       [0.        , 0.17086076, 0.42418706, ..., 0.23375376, 0.36224778,
        0.28354526],
       [0.        , 0.30034527, 0.32242529, ..., 0.28581618, 0.49445382,
        0.43848134]])

In [15]:
#split into train and test
X_train, X_test, y_train, y_test = train_test_split(X_norm, y, test_size=0.1, random_state=42)

In [89]:
#create different models

classifiers = {
    "KNeighborsClassifier": {
        "model": KNeighborsClassifier(),
        "params": {
            "n_neighbors": np.arange(1, 25),
            "weights": ["uniform", "distance"],
            "metric": ["euclidean", "manhattan"]
        }
    },
    "LogisticRegression": {
        "model": LogisticRegression(),
        "params": {
            "C": [0.001, 0.01, 0.1, 1, 10, 100, 1000],
            "max_iter": [5000],
            "solver": ["saga", "newton-cg", "lbfgs"],
            "penalty": ["l1", "l2", "elasticnet", "none"]
        }
    },
    "SVC": {
        "model": SVC(),
        "params": {
            "C": [0.001, 0.01, 0.1, 1, 10]
        }
    },
    "DecisionTreeClassifier": {
        "model": DecisionTreeClassifier(),
        "params": {
            "max_depth": [None],
            "criterion": ["gini", "entropy"]
        }
    },
    "GaussianNB": {
        "model": GaussianNB(),
        "params": {}
    },
    "RandomForestClassifier": {
        "model": RandomForestClassifier(),
        "params": {
            "n_estimators": [10, 50, 100, 200],
            "max_depth": [None, 10, 20],
            "criterion": ["gini", "entropy"]
        }
    },
    "GradientBoostingClassifier": {
        "model": GradientBoostingClassifier(),
        "params": {
            "n_estimators": [50, 100, 200],
            "learning_rate": [0.01, 0.1],
            "max_depth": [3, 10]
        }
    }
}

classifiers

{'KNeighborsClassifier': {'model': KNeighborsClassifier(),
  'params': {'n_neighbors': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
          18, 19, 20, 21, 22, 23, 24]),
   'weights': ['uniform', 'distance'],
   'metric': ['euclidean', 'manhattan']}},
 'LogisticRegression': {'model': LogisticRegression(),
  'params': {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
   'max_iter': [5000],
   'solver': ['saga', 'newton-cg', 'lbfgs'],
   'penalty': ['l1', 'l2', 'elasticnet', 'none']}},
 'SVC': {'model': SVC(), 'params': {'C': [0.001, 0.01, 0.1, 1, 10]}},
 'DecisionTreeClassifier': {'model': DecisionTreeClassifier(),
  'params': {'max_depth': [None], 'criterion': ['gini', 'entropy']}},
 'GaussianNB': {'model': GaussianNB(), 'params': {}},
 'RandomForestClassifier': {'model': RandomForestClassifier(),
  'params': {'n_estimators': [10, 50, 100, 200],
   'max_depth': [None, 10, 20],
   'criterion': ['gini', 'entropy']}},
 'GradientBoostingClassifier': {'model': Grad

In [90]:
for name, clf in classifiers.items():
    print(name)

KNeighborsClassifier
LogisticRegression
SVC
DecisionTreeClassifier
GaussianNB
RandomForestClassifier
GradientBoostingClassifier


In [91]:
#grid search throught he classifiers and theu
def grid_search_classifiers(X_train, y_train, X_test, y_test, classifiers):
  model_params = dict()

  for name, clf in classifiers.items():
      print(f"Training {name}...")
      grid_search = GridSearchCV(clf['model'], clf['params'], cv=5, scoring='accuracy')
      grid_search.fit(X_train, y_train)

      #make prediction
      y_pred = grid_search.predict(X_test)

      #calculate metrics
      accuracy = accuracy_score(y_test, y_pred)
      precision = precision_score(y_test, y_pred)
      recall = recall_score(y_test, y_pred)
      f1 = f1_score(y_test, y_pred)
      roc_auc = roc_auc_score(y_test, y_pred)

      #add to dict
      model_params[name] = {
        "model": grid_search,
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "roc_auc": roc_auc
      }

      print(f'Accuracy: {accuracy:.4f} | F1: {f1:.4f} | Precision: {precision:.3f} | Recall: {recall:.3f} | AUC: {roc_auc:.4f}')
      print(f'Best parameters: {grid_search.best_params_}')
      print('---')

  return model_params

In [92]:
model_runs = grid_search_classifiers(X_train, y_train, X_test, y_test, classifiers)

Training KNeighborsClassifier...
Accuracy: 0.6238 | F1: 0.2963 | Precision: 0.552 | Recall: 0.203 | AUC: 0.5484
Best parameters: {'metric': 'manhattan', 'n_neighbors': np.int64(18), 'weights': 'uniform'}
---
Training LogisticRegression...


280 fits failed out of a total of 420.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
35 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/sklearn/linear_model/_logistic.py", line 1193, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Accuracy: 0.7079 | F1: 0.5280 | Precision: 0.717 | Recall: 0.418 | AUC: 0.6560
Best parameters: {'C': 1000, 'max_iter': 5000, 'penalty': 'l2', 'solver': 'lbfgs'}
---
Training SVC...
Accuracy: 0.7178 | F1: 0.5512 | Precision: 0.729 | Recall: 0.443 | AUC: 0.6687
Best parameters: {'C': 10}
---
Training DecisionTreeClassifier...
Accuracy: 0.6040 | F1: 0.5062 | Precision: 0.494 | Recall: 0.519 | AUC: 0.5888
Best parameters: {'criterion': 'gini', 'max_depth': None}
---
Training GaussianNB...
Accuracy: 0.6584 | F1: 0.4298 | Precision: 0.619 | Recall: 0.329 | AUC: 0.5995
Best parameters: {}
---
Training RandomForestClassifier...
Accuracy: 0.7129 | F1: 0.5000 | Precision: 0.784 | Recall: 0.367 | AUC: 0.6510
Best parameters: {'criterion': 'gini', 'max_depth': 10, 'n_estimators': 200}
---
Training GradientBoostingClassifier...
Accuracy: 0.6881 | F1: 0.5039 | Precision: 0.667 | Recall: 0.405 | AUC: 0.6375
Best parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50}
---
