In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
data = pd.read_csv("water_potability.csv")

In [3]:
print("Dataset loaded successfully!")
print("Shape before cleaning:", data.shape)
print("\nMissing values before cleaning:\n", data.isnull().sum())

Dataset loaded successfully!
Shape before cleaning: (3276, 10)

Missing values before cleaning:
 ph                 491
Hardness             0
Solids               0
Chloramines          0
Sulfate            781
Conductivity         0
Organic_carbon       0
Trihalomethanes    162
Turbidity            0
Potability           0
dtype: int64


In [4]:
imputer = KNNImputer(n_neighbors=5)
data_imputed = pd.DataFrame(imputer.fit_transform(data), columns=data.columns)
print("\nMissing values handled using KNN Imputer.")
print("Missing values after imputation:\n", data_imputed.isnull().sum())


Missing values handled using KNN Imputer.
Missing values after imputation:
 ph                 0
Hardness           0
Solids             0
Chloramines        0
Sulfate            0
Conductivity       0
Organic_carbon     0
Trihalomethanes    0
Turbidity          0
Potability         0
dtype: int64


In [5]:
for col in data_imputed.columns[:-1]:  # Skip 'Potability' column
    Q1 = data_imputed[col].quantile(0.25)
    Q3 = data_imputed[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_limit = Q1 - 1.5 * IQR
    upper_limit = Q3 + 1.5 * IQR
    data_imputed[col] = np.clip(data_imputed[col], lower_limit, upper_limit)

print("\nOutliers capped using IQR method.")


Outliers capped using IQR method.


In [6]:
data_imputed['ph'] = data_imputed['ph'].clip(lower=0, upper=14)

In [7]:
scaler = StandardScaler()
scaled_features = scaler.fit_transform(data_imputed.drop('Potability', axis=1))

data_scaled = pd.DataFrame(scaled_features, columns=data_imputed.columns[:-1])
data_scaled['Potability'] = data_imputed['Potability']

print("\nFeatures scaled using StandardScaler.")


Features scaled using StandardScaler.


In [8]:
before = data_scaled.shape[0]
data_scaled = data_scaled.drop_duplicates()
after = data_scaled.shape[0]
print(f"\nRemoved {before - after} duplicate rows (if any).")


Removed 0 duplicate rows (if any).


In [9]:
data_scaled.to_csv("cleaned_water_potability.csv", index=False)
print("\nCleaned dataset saved as 'cleaned_water_potability.csv'")


Cleaned dataset saved as 'cleaned_water_potability.csv'


In [10]:
print("\nFinal Dataset Summary:")
print(data_scaled.describe())
print("\nFinal shape:", data_scaled.shape)


Final Dataset Summary:
                 ph      Hardness        Solids   Chloramines       Sulfate  \
count  3.276000e+03  3.276000e+03  3.276000e+03  3.276000e+03  3.276000e+03   
mean   1.127846e-16 -1.507409e-16 -2.271958e-16 -2.776235e-16 -7.894919e-16   
std    1.000153e+00  1.000153e+00  1.000153e+00  1.000153e+00  1.000153e+00   
min   -2.459160e+00 -2.476150e+00 -2.518320e+00 -2.575035e+00 -2.398398e+00   
25%   -6.201205e-01 -6.104492e-01 -7.321673e-01 -6.440698e-01 -5.990623e-01   
50%   -1.794576e-02  1.796822e-02 -1.198018e-01  5.508795e-03 -1.184244e-02   
75%    6.059060e-01  6.333513e-01  6.256933e-01  6.432406e-01  6.004946e-01   
max    2.444946e+00  2.499052e+00  2.662484e+00  2.574206e+00  2.399830e+00   

       Conductivity  Organic_carbon  Trihalomethanes     Turbidity  \
count  3.276000e+03    3.276000e+03     3.276000e+03  3.276000e+03   
mean  -9.353527e-16    1.735147e-17    -6.897210e-16 -7.157482e-17   
std    1.000153e+00    1.000153e+00     1.000153e+00  