In [29]:
import pandas as pd
import numpy as np


In [30]:
data = pd.read_csv('water_potability.csv')
data.head()

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,,204.890455,20791.318981,7.300212,368.516441,564.308654,10.379783,86.99097,2.963135,0
1,3.71608,129.422921,18630.057858,6.635246,,592.885359,15.180013,56.329076,4.500656,0
2,8.099124,224.236259,19909.541732,9.275884,,418.606213,16.868637,66.420093,3.055934,0
3,8.316766,214.373394,22018.417441,8.059332,356.886136,363.266516,18.436524,100.341674,4.628771,0
4,9.092223,181.101509,17978.986339,6.5466,310.135738,398.410813,11.558279,31.997993,4.075075,0


In [31]:
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3276 entries, 0 to 3275
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ph               2785 non-null   float64
 1   Hardness         3276 non-null   float64
 2   Solids           3276 non-null   float64
 3   Chloramines      3276 non-null   float64
 4   Sulfate          2495 non-null   float64
 5   Conductivity     3276 non-null   float64
 6   Organic_carbon   3276 non-null   float64
 7   Trihalomethanes  3114 non-null   float64
 8   Turbidity        3276 non-null   float64
 9   Potability       3276 non-null   int64  
dtypes: float64(9), int64(1)
memory usage: 256.1 KB


In [32]:
data.isnull().sum()

ph                 491
Hardness             0
Solids               0
Chloramines          0
Sulfate            781
Conductivity         0
Organic_carbon       0
Trihalomethanes    162
Turbidity            0
Potability           0
dtype: int64

In [33]:
data.Potability.value_counts()

0    1998
1    1278
Name: Potability, dtype: int64

In [34]:

data["ph"] = data["ph"].fillna(data["ph"].mean())
data["Sulfate"] = data["Sulfate"].fillna(data["Sulfate"].mean())
data["Trihalomethanes"] = data["Trihalomethanes"].fillna(data["Trihalomethanes"].mean())

In [35]:
data.Potability.value_counts()

0    1998
1    1278
Name: Potability, dtype: int64

In [36]:
notpotable  = data[data['Potability']==0]
potable = data[data['Potability']==1]  

from sklearn.utils import resample
df_minority_upsampled = resample(potable, replace = True, n_samples = 1278) 

from sklearn.utils import shuffle
data = pd.concat([notpotable, df_minority_upsampled])
data = shuffle(data) 

In [37]:
data.Potability.value_counts()

0    1998
1    1278
Name: Potability, dtype: int64

In [38]:
data.shape

(3276, 10)

In [39]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import accuracy_score

In [40]:
x = data.drop(['Potability'], axis = 1)
y = data['Potability']


In [41]:
from sklearn.preprocessing import StandardScaler
st = StandardScaler()
col= x.columns
x[col] = st.fit_transform(x[col])
x[col]

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity
1461,0.239084,0.211885,-0.707132,-0.225229,0.009400,-0.953207,-0.997110,1.571472,-0.073325
1271,-0.010219,1.542854,2.482357,0.480470,-0.252412,-1.135731,0.715099,-0.765246,-0.533001
3052,-0.631674,-0.898155,0.938708,1.197962,0.009400,-0.920203,0.943835,0.558117,-0.386989
1990,0.455316,1.663447,-0.039723,-0.580340,1.693450,-0.868298,-0.003651,0.071920,0.069168
2297,-0.332273,0.284692,-0.273410,0.040780,0.291172,0.054118,-0.665771,-0.569576,-0.017435
...,...,...,...,...,...,...,...,...,...
2519,-0.618624,-0.086403,1.233595,2.404730,0.111863,-0.139156,-0.411760,0.215113,-0.829793
1666,-0.010219,0.542508,-0.041696,0.738969,0.009400,0.583110,0.421564,0.422265,0.333116
2779,1.584188,0.862041,-0.074855,-1.080133,-2.905094,0.067118,0.441694,-0.903402,0.091850
2092,0.435391,0.706895,0.998490,-0.202094,0.009400,-0.260985,-0.485292,0.246752,1.435351


In [42]:
X_train, X_test, Y_train, Y_test = train_test_split(x,y, test_size = 0.1)

In [43]:
knn = KNeighborsClassifier()

dt = DecisionTreeClassifier()

rf = RandomForestClassifier()

In [44]:
para_knn = {'n_neighbors':np.arange(1, 50)}  #parameters of knn
grid_knn = GridSearchCV(knn, param_grid=para_knn, cv=5) #search knn for 5 fold cross validation

#Decision Tree
para_dt = {'criterion':['gini','entropy'],'max_depth':np.arange(1, 50), 'min_samples_leaf':[1,2,4,5,10,20,30,40,80,100]}
grid_dt = GridSearchCV(dt, param_grid=para_dt, cv=5) #grid search decision tree for 5 fold cv
params_rf = {'n_estimators':[100,200, 350, 500], 'min_samples_leaf':[2, 10, 30]}
grid_rf = GridSearchCV(rf, param_grid=params_rf, cv=5)

In [45]:
grid_knn.fit(X_train, Y_train)
grid_dt.fit(X_train, Y_train)
grid_rf.fit(X_train, Y_train)

print("Best parameters for KNN:", grid_knn.best_params_)
print("Best parameters for Decision Tree:", grid_dt.best_params_)
print("Best parameters for Random Forest:", grid_rf.best_params_)

Best parameters for KNN: {'n_neighbors': 1}
Best parameters for Decision Tree: {'criterion': 'entropy', 'max_depth': 45, 'min_samples_leaf': 1}
Best parameters for Random Forest: {'min_samples_leaf': 2, 'n_estimators': 350}


In [46]:
models = [('K Nearest Neighbours', knn),('Decision Tree', dt), ('Random Forest', rf)]


for model_name, model in models:
 
    model.fit(X_train, Y_train)    
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(Y_test,y_pred)
    print('{:s} : {:.4f}'.format(model_name, accuracy))

K Nearest Neighbours : 0.6677
Decision Tree : 0.6982
Random Forest : 0.8598
