In [2]:
import numpy as np
import pandas as pd
import pickle
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, cross_val_score

import warnings
warnings.filterwarnings("ignore")

In [3]:
# Water-potability dataset (CSV); downloaded over the network each time this cell runs.
DATA_URL = "https://raw.githubusercontent.com/HamedAyani114/dataset/main/water_potability.csv"
data = pd.read_csv(DATA_URL)


## Preprocessing

In [4]:
# Summarise column dtypes and non-null counts of the raw frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3276 entries, 0 to 3275
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ph               2785 non-null   float64
 1   Hardness         3276 non-null   float64
 2   Solids           3276 non-null   float64
 3   Chloramines      3276 non-null   float64
 4   Sulfate          2495 non-null   float64
 5   Conductivity     3276 non-null   float64
 6   Organic_carbon   3276 non-null   float64
 7   Trihalomethanes  3114 non-null   float64
 8   Turbidity        3276 non-null   float64
 9   Potability       3276 non-null   int64  
dtypes: float64(9), int64(1)
memory usage: 256.1 KB


In [5]:
# Count missing values per column.
data.isna().sum()

ph                 491
Hardness             0
Solids               0
Chloramines          0
Sulfate            781
Conductivity         0
Organic_carbon       0
Trihalomethanes    162
Turbidity            0
Potability           0
dtype: int64

In [6]:
# Drop every row that has a missing value in any column.
# NOTE: this rebinds `data`, so the raw frame is no longer reachable after this cell.
data= data.dropna()

In [7]:
# Re-check dtypes and row count after dropping incomplete rows.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2011 entries, 3 to 3271
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ph               2011 non-null   float64
 1   Hardness         2011 non-null   float64
 2   Solids           2011 non-null   float64
 3   Chloramines      2011 non-null   float64
 4   Sulfate          2011 non-null   float64
 5   Conductivity     2011 non-null   float64
 6   Organic_carbon   2011 non-null   float64
 7   Trihalomethanes  2011 non-null   float64
 8   Turbidity        2011 non-null   float64
 9   Potability       2011 non-null   int64  
dtypes: float64(9), int64(1)
memory usage: 172.8 KB


In [8]:
# Class balance of the target column.
data.Potability.value_counts()

0    1200
1     811
Name: Potability, dtype: int64

In [10]:
from sklearn.utils import resample, shuffle

notpotable = data[data['Potability'] == 0]
potable = data[data['Potability'] == 1]

# Balance the classes by upsampling the minority class (potable) with
# replacement until it has as many rows as the majority class. The sample
# count is derived from the data instead of being hard-coded, and the seed is
# fixed so the same rows are drawn on every run.
#
# NOTE(review): the upsampling happens before train_test_split, so copies of
# the same potable row can end up in both the training and the test set.
# Test metrics computed from this frame are therefore optimistic. Prefer
# splitting first and resampling only the training portion.
df_minority_upsampled = resample(
    potable,
    replace=True,
    n_samples=len(notpotable),
    random_state=42,
)

# Recombine both classes and shuffle the row order (seeded for reproducibility).
data = pd.concat([notpotable, df_minority_upsampled])
data = shuffle(data, random_state=42)

In [11]:
# Confirm both classes now have the same number of rows.
data.Potability.value_counts()

0    1200
1    1200
Name: Potability, dtype: int64

In [12]:
# Display the balanced, shuffled frame.
data

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
4,9.092223,181.101509,17978.986339,6.546600,310.135738,398.410813,11.558279,31.997993,4.075075,0
2581,6.363836,207.422436,9844.702293,9.765921,320.630725,466.592544,19.156261,58.265305,4.038984,0
1186,9.484703,122.906991,56351.396304,4.219711,219.553437,480.848063,13.533433,41.731219,4.132274,1
490,6.624806,204.342928,21443.264749,5.980402,312.315229,412.996763,11.965107,62.520333,2.840911,0
2780,7.000523,219.442640,26012.130278,6.280887,395.241329,483.319881,19.538635,46.861384,4.333689,1
...,...,...,...,...,...,...,...,...,...,...
407,8.195765,214.517610,10389.542538,6.295405,327.193898,403.189913,15.067042,72.756812,3.218709,1
3021,10.337671,163.276225,21820.295875,7.899521,356.179637,561.206238,14.116324,56.193061,5.063109,0
135,6.242414,191.908730,28149.411792,7.130437,319.306785,448.242527,19.370091,60.939481,4.508705,0
2836,7.833361,249.340053,13394.731607,8.341670,393.137872,443.629123,10.983344,76.645734,4.270973,1


In [13]:
# The last column (Potability) is the target; every column before it is a feature.
*feature_cols, target_col = data.columns
X = data[feature_cols]
y = data[target_col]

In [14]:
# Hold out 10% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=2,
)

In [15]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 500 trees using the Gini criterion, with at least two
# samples required per leaf. random_state is fixed so that training (and the
# metrics reported from it) are reproducible across runs.
rfc = RandomForestClassifier(
    n_estimators=500,
    criterion='gini',
    min_samples_leaf=2,
    random_state=42,
)

In [16]:
# Train the forest on the training split.
rfc.fit(X_train, y_train)

In [17]:
# Predict on the held-out split and print per-class precision, recall and F1.
predict_rfc = rfc.predict(X_test)
print(classification_report(y_test, predict_rfc))

              precision    recall  f1-score   support

           0       0.91      0.89      0.90       119
           1       0.89      0.91      0.90       121

    accuracy                           0.90       240
   macro avg       0.90      0.90      0.90       240
weighted avg       0.90      0.90      0.90       240



In [18]:
# Persist the classifier that was already fitted and evaluated in the earlier
# cells. It is deliberately not refitted here: a second fit of an unseeded
# forest would produce a different model from the one whose metrics were
# reported.
model = rfc
# save the model to disk
filename = 'model_waterPotability_RFC.pkl'
# The context manager guarantees the file is flushed and closed.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

In [19]:
# load the model from disk
# NOTE: unpickling can execute arbitrary code; only load files you created or trust.
# The context manager closes the file handle once the model has been read.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [20]:
# Feature columns in training order (every dataset column except Potability).
feature_names = [
    "ph", "Hardness", "Solids", "Chloramines", "Sulfate",
    "Conductivity", "Organic_carbon", "Trihalomethanes", "Turbidity",
]

# Two hand-entered samples, one row per sample, values in feature_names order.
datain = [
    [7.692524, 220.819152, 26686.847941, 8.286211, 299.958846, 564.526974, 13.861775, 77.015940, 4.522432],
    [8.291299, 211.217034, 20008.134957, 8.767875, 275.767321, 475.909130, 12.614669, 66.612984, 3.424702],
]

# The model was fitted on a DataFrame with named columns. Passing a labelled
# frame makes the value-to-feature mapping explicit and avoids the
# feature-name mismatch warning newer scikit-learn versions raise for bare lists.
loaded_model.predict(pd.DataFrame(datain, columns=feature_names))

array([0, 1], dtype=int64)