In [5]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score , classification_report , confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import RandomOverSampler

In [6]:
df_water = pd.read_csv('water_potability.csv')
df_water.head()

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,,204.890455,20791.318981,7.300212,368.516441,564.308654,10.379783,86.99097,2.963135,0
1,3.71608,129.422921,18630.057858,6.635246,,592.885359,15.180013,56.329076,4.500656,0
2,8.099124,224.236259,19909.541732,9.275884,,418.606213,16.868637,66.420093,3.055934,0
3,8.316766,214.373394,22018.417441,8.059332,356.886136,363.266516,18.436524,100.341674,4.628771,0
4,9.092223,181.101509,17978.986339,6.5466,310.135738,398.410813,11.558279,31.997993,4.075075,0


In [7]:
potable_samples = df_water[df_water['Potability'] == 1].head(3)
potable_samples

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
250,9.44513,145.805402,13168.529156,9.444471,310.583374,592.659021,8.606397,77.57746,3.875165,1
251,9.024845,128.096691,19859.676476,8.016423,300.150377,451.143481,14.770863,73.778026,3.985251,1
252,,169.974849,23403.637304,8.51973,,475.573562,12.924107,50.861913,2.747313,1


In [8]:
df_water.isna().sum()

ph                 491
Hardness             0
Solids               0
Chloramines          0
Sulfate            781
Conductivity         0
Organic_carbon       0
Trihalomethanes    162
Turbidity            0
Potability           0
dtype: int64

In [9]:
df_water_cleaned = df_water.fillna(df_water.mean())

In [10]:
df_water_cleaned.isna().sum()

ph                 0
Hardness           0
Solids             0
Chloramines        0
Sulfate            0
Conductivity       0
Organic_carbon     0
Trihalomethanes    0
Turbidity          0
Potability         0
dtype: int64

In [11]:
print(df_water['Potability'].value_counts())

Potability
0    1998
1    1278
Name: count, dtype: int64


In [12]:
x = df_water.select_dtypes(include='number')
x.fillna(x.mean(), inplace=True)

In [14]:
# Splitting features and labels
x = df_water.drop('Potability', axis=1)
y = df_water['Potability']

# FEATURE SELECTION

In [24]:
x_train, x_test, y_train, y_test = train_test_split( x, y, test_size=0.2, random_state=42)

In [26]:
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(2620, 9)
(2620,)
(656, 9)
(656,)


# DATA BALANCING

In [29]:
ros = RandomOverSampler(random_state=42)
x_resampled_ros, y_resampled_ros = ros.fit_resample(x_train, y_train)

In [31]:
x_resampled_df = pd.DataFrame(x_resampled_ros, columns=x.columns)
y_resampled_df = pd.DataFrame(y_resampled_ros, columns=['Potability'])

oversampled_data = pd.concat([x_resampled_df, y_resampled_df], axis=1)

print("Shape of oversampled data:", oversampled_data.shape)

print("Class distribution after oversampling:")
print(oversampled_data['Potability'].value_counts())

Shape of oversampled data: (3172, 10)
Class distribution after oversampling:
Potability
0    1586
1    1586
Name: count, dtype: int64


# RandomForest Classifier

In [34]:
model_ros = RandomForestClassifier(random_state=42)
model_ros.fit(x_resampled_ros, y_resampled_ros)

pred_ros = model_ros.predict(x_test)

accurate = round(accuracy_score(y_test, pred_ros) * 100, 2)

print("RandomOverSampler Results:")
print(f"Accuracy {accurate}% ")

print(classification_report(y_test, pred_ros))

RandomOverSampler Results:
Accuracy 66.62% 
              precision    recall  f1-score   support

           0       0.70      0.82      0.75       412
           1       0.57      0.41      0.48       244

    accuracy                           0.67       656
   macro avg       0.64      0.61      0.62       656
weighted avg       0.65      0.67      0.65       656



In [36]:
# Import joblib
import joblib

# Save the trained Random Forest model
joblib.dump(model_ros, 'random_forest_model.joblib')

print("Random Forest model saved as random_forest_model.joblib")

Random Forest model saved as random_forest_model.joblib
