IMPORTING THE LIBRARIES

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

IMPORTING THE DATASET

In [None]:
# Load the water-quality measurements from a CSV file located in the working directory.
df = pd.read_csv('water_potability.csv')
# Preview the first rows to sanity-check the parsed columns (NaNs are already visible here).
df.head()

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,,204.890455,20791.318981,7.300212,368.516441,564.308654,10.379783,86.99097,2.963135,0
1,3.71608,129.422921,18630.057858,6.635246,,592.885359,15.180013,56.329076,4.500656,0
2,8.099124,224.236259,19909.541732,9.275884,,418.606213,16.868637,66.420093,3.055934,0
3,8.316766,214.373394,22018.417441,8.059332,356.886136,363.266516,18.436524,100.341674,4.628771,0
4,9.092223,181.101509,17978.986339,6.5466,310.135738,398.410813,11.558279,31.997993,4.075075,0


In [None]:
# Dimensions of the dataset as a (rows, columns) tuple
df.shape

(3276, 10)

DROP THE DUPLICATE DATA

In [None]:
# `drop_duplicates()` returns a new frame and leaves `df` untouched, so the
# result must be assigned back for the de-duplication to actually take effect.
# The index is rebuilt so row labels stay contiguous after any rows are removed.
df = df.drop_duplicates().reset_index(drop=True)
# Show the resulting (rows, columns) to confirm how many rows remain,
# instead of dumping the whole frame.
df.shape

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,,204.890455,20791.318981,7.300212,368.516441,564.308654,10.379783,86.990970,2.963135,0
1,3.716080,129.422921,18630.057858,6.635246,,592.885359,15.180013,56.329076,4.500656,0
2,8.099124,224.236259,19909.541732,9.275884,,418.606213,16.868637,66.420093,3.055934,0
3,8.316766,214.373394,22018.417441,8.059332,356.886136,363.266516,18.436524,100.341674,4.628771,0
4,9.092223,181.101509,17978.986339,6.546600,310.135738,398.410813,11.558279,31.997993,4.075075,0
...,...,...,...,...,...,...,...,...,...,...
3271,4.668102,193.681735,47580.991603,7.166639,359.948574,526.424171,13.894419,66.687695,4.435821,1
3272,7.808856,193.553212,17329.802160,8.061362,,392.449580,19.903225,,2.798243,1
3273,9.419510,175.762646,33155.578218,7.350233,,432.044783,11.039070,69.845400,3.298875,1
3274,5.126763,230.603758,11983.869376,6.303357,,402.883113,11.168946,77.488213,4.708658,1


In [None]:
# Average of every column within each Potability class.
df.groupby(by='Potability').agg('mean')

Unnamed: 0_level_0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity
Potability,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,7.085378,196.733292,21777.490788,7.092175,334.56429,426.730454,14.364335,66.303555,3.9658
1,7.073783,195.800744,22383.991018,7.169338,332.56699,425.3838,14.160893,66.539684,3.968328


In [None]:
# Number of samples belonging to each Potability class.
potability_column = df['Potability']
potability_column.value_counts()

Unnamed: 0_level_0,count
Potability,Unnamed: 1_level_1
0,1998
1,1278


0 --> Not potable

1 --> Potable

HANDLE MISSING DATA

In [None]:
# Tally the missing (NaN) entries in each column
missing_per_column = df.isna().sum()
print(missing_per_column)

ph                 491
Hardness             0
Solids               0
Chloramines          0
Sulfate            781
Conductivity         0
Organic_carbon       0
Trihalomethanes    162
Turbidity            0
Potability           0
dtype: int64


In [None]:
# Handle missing values using mean imputation: strategy='mean' replaces each
# NaN with the mean of its column.
imputer = SimpleImputer(strategy='mean')
# fit_transform hands back an array, so it is wrapped in a DataFrame again with
# the original column names. Every column of `df` (the Potability target
# included) is passed through the imputer.
# NOTE(review): the imputer is fitted on the full dataset before any train/test
# split, so held-out rows contribute to the fill values — consider fitting on
# the training split only.
df = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

In [None]:
# Re-count NaNs per column to confirm the imputation left nothing missing.
print("Missing values after imputation:")
remaining_missing = df.isna().sum()
print(remaining_missing)

Missing values after imputation:
ph                 0
Hardness           0
Solids             0
Chloramines        0
Sulfate            0
Conductivity       0
Organic_carbon     0
Trihalomethanes    0
Turbidity          0
Potability         0
dtype: int64


ENCODING CATEGORICAL DATA

In [None]:
from sklearn.preprocessing import LabelEncoder

# Gather the object-dtype columns once, up front, so the loop below iterates
# over a fixed list even though `df` may be replaced along the way.
categorical_cols = df.select_dtypes(include=['object']).columns
print("Categorical columns:", categorical_cols)

for column_name in categorical_cols:
    # Columns with more than two distinct values are expanded into one
    # indicator column per value (one-hot encoding).
    if df[column_name].nunique() > 2:
        df = pd.get_dummies(df, columns=[column_name], prefix=column_name)
        continue
    # Otherwise (at most two distinct values) map the values to integer codes in place.
    df[column_name] = LabelEncoder().fit_transform(df[column_name])

# Preview the frame after encoding
df.head()


Categorical columns: Index([], dtype='object')


Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,7.080795,204.890455,20791.318981,7.300212,368.516441,564.308654,10.379783,86.99097,2.963135,0.0
1,3.71608,129.422921,18630.057858,6.635246,333.775777,592.885359,15.180013,56.329076,4.500656,0.0
2,8.099124,224.236259,19909.541732,9.275884,333.775777,418.606213,16.868637,66.420093,3.055934,0.0
3,8.316766,214.373394,22018.417441,8.059332,356.886136,363.266516,18.436524,100.341674,4.628771,0.0
4,9.092223,181.101509,17978.986339,6.5466,310.135738,398.410813,11.558279,31.997993,4.075075,0.0


HANDLING CLASS IMBALANCE USING SMOTE

In [None]:
# Check for class imbalance
print("Class distribution:\n", df['Potability'].value_counts())

# Separate the feature columns from the target label
X = df.drop(columns=['Potability'])
y = df['Potability']

# Oversample the minority class whenever the largest class holds more than 60%
# of the samples. Looking at the largest share (rather than the share of
# label 0) also covers a class-1 majority and cannot raise a KeyError when
# label 0 is absent.
majority_share = y.value_counts(normalize=True).max()
if majority_share > 0.6:
    # Fixed seed so the synthetic samples are the same on every run.
    # NOTE(review): resampling happens before the train/test split, so synthetic
    # points interpolated from future test rows can end up in the training data
    # (and vice versa) — consider applying SMOTE to the training split only.
    smote = SMOTE(random_state=42)
    X, y = smote.fit_resample(X, y)
    print("After applying SMOTE:\n", pd.Series(y).value_counts())


Class distribution:
 Potability
0.0    1998
1.0    1278
Name: count, dtype: int64
After applying SMOTE:
 Potability
0.0    1998
1.0    1998
Name: count, dtype: int64


In [None]:
# Standardize the features: first learn the per-column statistics from X,
# then rescale X with them. X is an array (no column names) after this step.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so held-out rows influence the scaling — consider fitting on the training split only.
scaler = StandardScaler().fit(X)
X = scaler.transform(X)

Train & Test Split data

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the partition repeatable.
X_train_resampled, X_test, y_train_resampled, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Report the dimensions of each piece of the split, in the order:
# training features, training labels, test features, test labels.
for split_part in (X_train_resampled, y_train_resampled, X_test, y_test):
    print(split_part.shape)

(3196, 9)
(3196,)
(800, 9)
(800,)


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column of `df`.
# NOTE: this describes `df`, not the SMOTE-resampled and standardized `X` used for modelling.
df.describe()

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
count,3276.0,3276.0,3276.0,3276.0,3276.0,3276.0,3276.0,3276.0,3276.0,3276.0
mean,7.080795,196.369496,22014.092526,7.122277,333.775777,426.205111,14.28497,66.396293,3.966786,0.39011
std,1.469956,32.879761,8768.570828,1.583085,36.142612,80.824064,3.308162,15.769881,0.780382,0.487849
min,0.0,47.432,320.942611,0.352,129.0,181.483754,2.2,0.738,1.45,0.0
25%,6.277673,176.850538,15666.690297,6.127421,317.094638,365.734414,12.065801,56.647656,3.439711,0.0
50%,7.080795,196.967627,20927.833607,7.130299,333.775777,421.884968,14.218338,66.396293,3.955028,0.0
75%,7.87005,216.667456,27332.762127,8.114887,350.385756,481.792304,16.557652,76.666609,4.50032,1.0
max,14.0,323.124,61227.196008,13.127,481.030642,753.34262,28.3,124.0,6.739,1.0


Models to evaluate

In [None]:
# Candidate classifiers, keyed by the display name used when reporting results.
# The tree-based estimators involve randomness, so they are seeded with the same
# value (42) that the train/test split and the K-Fold splitter use; this makes
# the reported scores repeatable from run to run.
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Support Vector Machine": SVC(),
    "K-Nearest Neighbors": KNeighborsClassifier()
}

In [None]:
# Container for the per-model cross-validation summaries, keyed by model name.
results = dict()
# 5-fold cross-validation splitter; rows are shuffled with a fixed seed so the
# folds are identical on every run.
kfold = KFold(shuffle=True, n_splits=5, random_state=42)

In [None]:
# --- Stage 1: cross-validated accuracy for every candidate model ---
for model_name, model in models.items():
    cv_results = cross_val_score(model, X, y, scoring='accuracy', cv=kfold)
    # Keep the mean and spread of the fold accuracies for later comparison.
    fold_summary = {}
    fold_summary["Accuracy Mean"] = np.mean(cv_results)
    fold_summary["Accuracy Std"] = np.std(cv_results)
    results[model_name] = fold_summary
    print(f"{model_name} - Accuracy: {np.mean(cv_results):.4f} ± {np.std(cv_results):.4f}")

# --- Stage 2: fit on a training split and score on a 20% holdout ---
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Metrics reported for each model, paired with the label printed next to them.
holdout_metrics = (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
)

for model_name, model in models.items():
    # Fitting mutates the estimator stored in `models`, so it stays trained afterwards.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"\n{model_name} Evaluation:")
    for metric_label, metric_fn in holdout_metrics:
        print(metric_label, metric_fn(y_test, y_pred))

Logistic Regression - Accuracy: 0.5005 ± 0.0087
Decision Tree - Accuracy: 0.6134 ± 0.0190
Random Forest - Accuracy: 0.6922 ± 0.0123
Support Vector Machine - Accuracy: 0.6592 ± 0.0169
K-Nearest Neighbors - Accuracy: 0.6446 ± 0.0133

Logistic Regression Evaluation:
Accuracy: 0.515
Precision: 0.5252808988764045
Recall: 0.4605911330049261
F1 Score: 0.49081364829396323

Decision Tree Evaluation:
Accuracy: 0.61875
Precision: 0.6199524940617577
Recall: 0.6428571428571429
F1 Score: 0.6311970979443773

Random Forest Evaluation:
Accuracy: 0.68375
Precision: 0.6946564885496184
Recall: 0.6724137931034483
F1 Score: 0.6833541927409261

Support Vector Machine Evaluation:
Accuracy: 0.66125
Precision: 0.6573426573426573
Recall: 0.6945812807881774
F1 Score: 0.6754491017964072

K-Nearest Neighbors Evaluation:
Accuracy: 0.67
Precision: 0.6666666666666666
Recall: 0.6995073891625616
F1 Score: 0.6826923076923077


Saving best model for further use

In [None]:
import pickle

# Random Forest had the highest cross-validated accuracy above, so a fresh
# forest is trained and persisted. The seed matches the one used for the
# train/test split and K-Fold so the saved model is reproducible.
# (RandomForestClassifier is already imported in the imports cell at the top.)
best_rf = RandomForestClassifier(random_state=42)
best_rf.fit(X_train, y_train)  # Train the best model on the training split (holdout rows are not used)

# NOTE(review): the model is trained on standardized features, but the fitted
# `scaler` is not saved here — anything that loads this pickle must apply the
# same scaling to its inputs before calling predict.
# The context manager guarantees the file is flushed and closed, even if
# pickling fails part-way through.
with open('model_grad.pkl', 'wb') as model_file:
    pickle.dump(best_rf, model_file)