In [None]:
import pandas as pd
df = pd.read_csv('water_potability.csv')
print("First 5 Rows:")
print(df.head())
print(f"\nDataset Shape: {df.shape}")

First 5 Rows:
         ph    Hardness        Solids  Chloramines     Sulfate  Conductivity  \
0       NaN  204.890455  20791.318981     7.300212  368.516441    564.308654   
1  3.716080  129.422921  18630.057858     6.635246         NaN    592.885359   
2  8.099124  224.236259  19909.541732     9.275884         NaN    418.606213   
3  8.316766  214.373394  22018.417441     8.059332  356.886136    363.266516   
4  9.092223  181.101509  17978.986339     6.546600  310.135738    398.410813   

   Organic_carbon  Trihalomethanes  Turbidity  Potability  
0       10.379783        86.990970   2.963135           0  
1       15.180013        56.329076   4.500656           0  
2       16.868637        66.420093   3.055934           0  
3       18.436524       100.341674   4.628771           0  
4       11.558279        31.997993   4.075075           0  

Dataset Shape: (3276, 10)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler


imputer = SimpleImputer(strategy='median')
df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)


Q1 = df_imputed.quantile(0.25)
Q3 = df_imputed.quantile(0.75)
IQR = Q3 - Q1
df_clean = df_imputed[~((df_imputed < (Q1 - 1.5 * IQR)) | (df_imputed > (Q3 + 1.5 * IQR))).any(axis=1)]


df_clean['Chemical_Balance'] = df_clean['Chloramines'] / (df_clean['Sulfate'] + 1e-5)


X = df_clean.drop('Potability', axis=1)
y = df_clean['Potability']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['Chemical_Balance'] = df_clean['Chloramines'] / (df_clean['Sulfate'] + 1e-5)


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier


pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', RandomForestClassifier(random_state=42))
])

### Primary Model Selection and Justification

##### Random Forest Classifier chosen, because the Water Potability dataset contains complex, non-linear interactions between chemical features like pH and Sulfate. Random Forest is an ensemble method that handles such complexity well, is robust against outliers remaining after cleaning, and naturally handles the variance in tabular data without over-relying on a single feature.

In [None]:
pipeline.fit(X_train, y_train)

Model Training Complete.


In [8]:
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5)
print(f"CV Average Accuracy: {cv_scores.mean():.4f}")
print(f"CV Standard Deviation: {cv_scores.std():.4f}")

CV Average Accuracy: 0.6417
CV Standard Deviation: 0.0141


In [9]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'model__n_estimators': [100, 200],
    'model__max_depth': [10, 20, None],
    'model__min_samples_split': [2, 5]
}

grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Tuning Score: {grid_search.best_score_:.4f}")

Best Parameters: {'model__max_depth': 20, 'model__min_samples_split': 5, 'model__n_estimators': 200}
Best Tuning Score: 0.6491


In [None]:
best_model = grid_search.best_estimator_

In [11]:
from sklearn.metrics import classification_report, confusion_matrix

y_pred = best_model.predict(X_test)
print("\n--- Final Model Evaluation ---")
print(classification_report(y_test, y_pred))


--- Final Model Evaluation ---
              precision    recall  f1-score   support

         0.0       0.69      0.93      0.79       335
         1.0       0.70      0.30      0.42       199

    accuracy                           0.69       534
   macro avg       0.70      0.61      0.60       534
weighted avg       0.69      0.69      0.65       534



In [None]:
import gradio as gr
import joblib


joblib.dump(best_model, 'water_model.pkl')

def predict_water_potability(ph, hardness, solids, chloramines, sulfate, conductivity, organic_carbon, trihalomethanes, turbidity):
    chem_balance = chloramines / (sulfate + 1e-5)
    input_data = pd.DataFrame([[ph, hardness, solids, chloramines, sulfate, conductivity, 
                                organic_carbon, trihalomethanes, turbidity, chem_balance]], 
                              columns=X.columns)
    
    prediction = best_model.predict(input_data)[0]
    return "Potable (Safe for Drinking)" if prediction == 1 else "Not Potable (Unsafe)"


interface = gr.Interface(
    fn=predict_water_potability,
    inputs=[gr.Slider(0, 14, label="pH"), gr.Number(label="Hardness"), gr.Number(label="Solids"), 
            gr.Number(label="Chloramines"), gr.Number(label="Sulfate"), gr.Number(label="Conductivity"), 
            gr.Number(label="Organic Carbon"), gr.Number(label="Trihalomethanes"), gr.Number(label="Turbidity")],
    outputs="text",
    title="Water Potability Predictor",
    description="Enter water quality metrics to determine if it is safe for human consumption."
)


interface.launch()

* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.


