<a href="https://colab.research.google.com/github/IrakizaGaius/PeerGroup8WaterQualityModel/blob/main/Peer_Group_8__formative_II__Gaius_Irakiza.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Exercise - Creating our own custom Model

This is a notebook that provides a quick overview of how to create your own custom model. You will be creating a simple model.
You will be utilizing Keras and TensorFlow.


## Water Quality Dataset

This dataset contains water quality measurements and assessments related to potability, which is the suitability of water for human consumption. The dataset's primary objective is to provide insights into water quality parameters and assist in determining whether the water is potable or not. Each row in the dataset represents a water sample with specific attributes, and the "Potability" column indicates whether the water is suitable for consumption.

https://www.kaggle.com/datasets/uom190346a/water-quality-and-potability?select=water_potability.csv


In [None]:
# Import Necessary dependencies
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, precision_score, recall_score, roc_auc_score, f1_score




# Data Loading

In [None]:
# Read the water-potability CSV (stored under /content) and preview the first ten rows
DATA_PATH = "/content/water_potability.csv"
data = pd.read_csv(DATA_PATH)
data.head(10)

In [None]:
# Summary of the loaded frame: column dtypes, non-null counts and memory usage
data.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns
data.describe()

In [None]:
# Discard fully identical rows, keeping the first occurrence of each
data = data.drop_duplicates(keep="first")

In [None]:
# Share of missing values in each column, expressed as a percentage
missing = data.isna().mean().mul(100)
print(missing)

In [None]:
# MICE imputation to fill in the missing feature values

# Separate the target from the features so the label is never imputed or altered.
features = data.drop(columns='Potability')
target = data.Potability

# random_state pins the imputer so re-running the cell gives the same values.
imputer = IterativeImputer(random_state=0)
features_imputed = imputer.fit_transform(features)

# fit_transform returns a bare array, so rebuild the DataFrame with the original
# column names AND the original row index. After drop_duplicates() the index of
# `data` may have gaps; a fresh 0..n-1 index would not line up with `target` in
# the concat below and would silently introduce NaN rows.
features_imputed = pd.DataFrame(
    features_imputed,
    columns=features.columns,
    index=features.index,
)

# Merge the imputed features back with the untouched target (aligned on the index)
data_imputed = pd.concat([features_imputed, target], axis=1)
data_imputed.head(10)


In [None]:
# Confirm the imputation: inspect the non-null count reported for every column
data_imputed.info()

In [None]:
# Remove outliers that may affect the neural network's accuracy using IQR method
def remove_outliers_iqr(df, column_name):
    """Return the rows of `df` whose `column_name` value lies within the IQR fences.

    The fences are Q1 - 1.5 * IQR and Q3 + 1.5 * IQR, computed from the column
    itself. Rows exactly on a fence are kept. The number and percentage of
    removed outliers are printed as a short summary.

    Args:
        df: DataFrame to filter; it is not modified.
        column_name: Name of the numeric column to test for outliers.

    Returns:
        A DataFrame containing only the rows inside the fences (original index kept).
    """
    values = df[column_name]

    Q1 = values.quantile(0.25)
    Q3 = values.quantile(0.75)
    IQR = Q3 - Q1

    # anything above or below these fences is an outlier
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Report a compact summary only; printing the whole column floods the output.
    n_outliers = int(((values < lower_bound) | (values > upper_bound)).sum())
    n_rows = len(df)
    # Guard the percentage so an empty frame does not raise ZeroDivisionError.
    pct_outliers = n_outliers / n_rows * 100 if n_rows else 0.0
    print(f"Number of outliers: {n_outliers}")
    print(f"Percentage of outliers: {pct_outliers:.2f}%")

    # keep the rows inside the fences (both ends inclusive)
    df_clean = df[(values >= lower_bound) & (values <= upper_bound)]

    return df_clean

# Features that get the IQR outlier filter
columns = [
    'Hardness',
    'Solids',
    'Sulfate',
    'Conductivity',
    'Organic_carbon',
    'Trihalomethanes',
    'Turbidity',
]

# Filter a copy so data_imputed itself stays untouched; each pass works on the
# rows that survived the previous column's filter.
data_imputed_copy = data_imputed.copy()
for outlier_column in columns:
  data_imputed_copy = remove_outliers_iqr(data_imputed_copy, outlier_column)

Plot the Data Appropriately

In [None]:
# Standardise every feature to mean=0 and standard deviation=1

# Feature matrix and Potability label taken from the outlier-filtered frame
y = data_imputed_copy['Potability']
X = data_imputed_copy.drop(columns='Potability')

# NOTE(review): the scaler is fitted on all rows here, before the train/val/test
# split that follows, so validation and test rows influence the scaling
# statistics. Consider fitting on the training split only.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_scaled.shape


In [None]:

# Split the data into training (70%), validation (15%) and test (15%) sets
X_train, X_temp, y_train, y_temp = train_test_split(
    X_scaled, y,
    test_size=0.3,
    random_state=42,
    stratify=y,               # Keep same class distribution in all splits
)

# Halve the held-out 30% into validation and test. random_state is fixed here as
# well: without it this second split was reshuffled on every run, so the
# validation/test membership (and every reported metric) was not reproducible.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp,
    test_size=0.5,
    random_state=42,
    stratify=y_temp,
)

print(f"\n=== FINAL SHAPES ===")
print(f"X_train: {X_train.shape}")
print(f"X_val: {X_val.shape}")
print(f"X_test: {X_test.shape}")
print(f"y_train: {y_train.shape}")
print(f"y_val: {y_val.shape}")
print(f"y_test: {y_test.shape}")



# Each Member Defines their model Here

In [None]:
def model_gaius_irakiza(input_shape):
  """Build and compile the binary potability classifier.

  The network is a Sequential stack of seven hidden blocks with
  128, 64, 32, 24, 16, 8 and 4 units. Every block is
  Dense -> BatchNormalization -> PReLU; the first six also carry an L2 kernel
  regulariser and end with Dropout(0.2). A single sigmoid unit produces the
  output. The model is compiled with Nadam (learning rate 0.0061) and binary
  cross-entropy, tracking accuracy, precision, recall and AUC.

  Args:
    input_shape: Shape of one input sample, passed to the Input layer.

  Returns:
    The compiled tf.keras Sequential model.
  """
  keras_layers = tf.keras.layers

  # (units, L2 factor) of each regularised hidden block, in stacking order
  regularised_blocks = [
    (128, 0.001),
    (64, 0.001),
    (32, 0.006),
    (24, 0.006),
    (16, 0.006),
    (8, 0.006),
  ]

  stack = [keras_layers.Input(shape=input_shape)]
  for units, l2_factor in regularised_blocks:
    stack.extend([
      keras_layers.Dense(units, kernel_regularizer=tf.keras.regularizers.l2(l2_factor)),
      keras_layers.BatchNormalization(),
      keras_layers.PReLU(),
      keras_layers.Dropout(0.2),
    ])

  # Last hidden block: no kernel regulariser and no dropout
  stack.extend([
    keras_layers.Dense(4),
    keras_layers.BatchNormalization(),
    keras_layers.PReLU(),
  ])

  # Single sigmoid unit for the binary output
  stack.append(keras_layers.Dense(1, activation='sigmoid'))

  model = tf.keras.models.Sequential(stack)

  # Metric order matters: evaluate() returns the loss followed by these, in order
  tracked_metrics = [
    'accuracy',
    tf.keras.metrics.Precision(name='precision'),
    tf.keras.metrics.Recall(name='recall'),
    tf.keras.metrics.AUC(name='auc'),
  ]
  model.compile(
    optimizer=tf.keras.optimizers.Nadam(learning_rate=0.0061),
    loss='binary_crossentropy',
    metrics=tracked_metrics,
  )

  return model

# Seed Python, NumPy and TensorFlow before the model is built, so that weight
# initialisation, dropout masks and batch shuffling repeat across runs.
tf.keras.utils.set_random_seed(42)

# Per-sample input shape (everything after the batch dimension)
input_shape = X_train.shape[1:]

gaius_model = model_gaius_irakiza(input_shape)

# Stop once val_loss has not improved by at least min_delta for 30 epochs, and
# roll the model back to the weights of the best epoch seen.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=30,
    min_delta=0.0001,
    restore_best_weights=True
)

# Training
history = gaius_model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=300,
    batch_size=48,
    verbose=1,
    callbacks=[early_stopping]
)

Epoch 1/300
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 29ms/step - accuracy: 0.4250 - auc: 0.4890 - loss: 1.4153 - precision: 0.3876 - recall: 0.8443 - val_accuracy: 0.6110 - val_auc: 0.5154 - val_loss: 1.0929 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00
Epoch 2/300
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.6067 - auc: 0.5070 - loss: 1.0329 - precision: 0.5199 - recall: 0.0311 - val_accuracy: 0.6110 - val_auc: 0.4946 - val_loss: 0.8846 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00
Epoch 3/300
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.5962 - auc: 0.5166 - loss: 0.8616 - precision: 0.3392 - recall: 0.0079 - val_accuracy: 0.6110 - val_auc: 0.4967 - val_loss: 0.7921 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00
Epoch 4/300
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.6254 - auc: 0.5421 - loss: 0.7706 - precision: 

# Model Evaluation

In [None]:
# Epoch with the lowest validation loss (0-based index; printed 1-based)
training_log = history.history
best_epoch = np.argmin(training_log['val_loss'])
train_acc_at_best = training_log['accuracy'][best_epoch]
val_acc_at_best = training_log['val_accuracy'][best_epoch]
print(f"📌 Best Epoch: {best_epoch + 1}")
print(f"Train Accuracy at Best Epoch: {train_acc_at_best:.4f}")
print(f"Val Accuracy at Best Epoch  : {val_acc_at_best:.4f}")

# Held-out test set: evaluate() yields the loss followed by the compiled metrics
test_scores = gaius_model.evaluate(X_test, y_test, verbose=1)
test_loss, test_accuracy, test_precision, test_recall, test_auc = test_scores

print("\n📊 Test Evaluation Metrics")
for metric_label, metric_value in (
    ("Loss      ", test_loss),
    ("Accuracy  ", test_accuracy),
    ("Precision ", test_precision),
    ("Recall    ", test_recall),
    ("AUC       ", test_auc),
):
    print(f"{metric_label}: {metric_value:.4f}")

# Predicted probabilities, then hard labels using a 0.5 threshold
y_pred_probs = gaius_model.predict(X_test)
y_pred_classes = (y_pred_probs > 0.5).astype("int32")

# scikit-learn metrics: labels for precision/recall/F1, probabilities for ROC AUC
precision = precision_score(y_test, y_pred_classes)
recall = recall_score(y_test, y_pred_classes)
f1 = f1_score(y_test, y_pred_classes)
auc = roc_auc_score(y_test, y_pred_probs)

print("\n🧠 Additional Classification Metrics")
print(f"F1 Score  : {f1:.4f}")
print(f"AUC (sklearn): {auc:.4f}")

# Confusion matrix heatmap: actual classes on the rows, predicted on the columns
cm = confusion_matrix(y_test, y_pred_classes)
class_labels = ['Not Potable', 'Potable']
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_title("🧪 Confusion Matrix on Test Data")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()