Model Libraries

In [2]:
import numpy as np # type: ignore
import pandas as pd # type: ignore
import matplotlib.pyplot as plt # type: ignore
from sklearn.preprocessing import StandardScaler # type: ignore
from sklearn.model_selection import train_test_split # type: ignore
from sklearn.linear_model import LogisticRegression # type: ignore
from sklearn.tree import DecisionTreeClassifier # type: ignore
from sklearn.ensemble import RandomForestClassifier # type: ignore
from sklearn.neighbors import KNeighborsClassifier # type: ignore
from sklearn.svm import SVC # type: ignore
from sklearn.metrics import accuracy_score, confusion_matrix # type: ignore

Loading Dataset

In [3]:
water_data = pd.read_csv('water_potability.csv')
water_data.head()

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,,204.890456,20791.31898,7.300212,368.516441,564.308654,10.379783,86.99097,2.963135,0
1,3.71608,129.422921,18630.05786,6.635246,,592.885359,15.180013,56.329076,4.500656,0
2,8.099124,224.236259,19909.54173,9.275884,,418.606213,16.868637,66.420093,3.055934,0
3,8.316766,214.373394,22018.41744,8.059332,356.886136,363.266516,18.436525,100.341674,4.628771,0
4,9.092223,181.101509,17978.98634,6.5466,310.135738,398.410813,11.558279,31.997993,4.075075,0


In [4]:
water_data.tail()

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
3271,4.668102,193.681736,47580.9916,7.166639,359.948574,526.424171,13.894419,66.687695,4.435821,1
3272,7.808856,193.553212,17329.80216,8.061362,,392.44958,19.903225,,2.798243,1
3273,9.41951,175.762646,33155.57822,7.350233,,432.044783,11.03907,69.8454,3.298875,1
3274,5.126763,230.603758,11983.86938,6.303357,,402.883113,11.168946,77.488213,4.708658,1
3275,7.874671,195.102299,17404.17706,7.509306,,327.459761,16.140368,78.698446,2.309149,1


In [5]:
water_data.shape

(3276, 10)

In [6]:
water_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3276 entries, 0 to 3275
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ph               2785 non-null   float64
 1   Hardness         3276 non-null   float64
 2   Solids           3276 non-null   float64
 3   Chloramines      3276 non-null   float64
 4   Sulfate          2495 non-null   float64
 5   Conductivity     3276 non-null   float64
 6   Organic_carbon   3276 non-null   float64
 7   Trihalomethanes  3114 non-null   float64
 8   Turbidity        3276 non-null   float64
 9   Potability       3276 non-null   int64  
dtypes: float64(9), int64(1)
memory usage: 256.1 KB


In [7]:
water_data.describe()

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
count,2785.0,3276.0,3276.0,3276.0,2495.0,3276.0,3276.0,3114.0,3276.0,3276.0
mean,7.080795,196.369496,22014.092526,7.122277,333.775777,426.205111,14.28497,66.396293,3.966786,0.39011
std,1.59432,32.879761,8768.570828,1.583085,41.41684,80.824064,3.308162,16.175008,0.780382,0.487849
min,0.0,47.432,320.942611,0.352,129.0,181.483754,2.2,0.738,1.45,0.0
25%,6.093092,176.850538,15666.6903,6.127421,307.699498,365.734414,12.065801,55.844536,3.439711,0.0
50%,7.036752,196.967627,20927.833605,7.130299,333.073546,421.884968,14.218338,66.622485,3.955028,0.0
75%,8.062066,216.667456,27332.762125,8.114887,359.95017,481.792305,16.557652,77.337473,4.50032,1.0
max,14.0,323.124,61227.19601,13.127,481.030642,753.34262,28.3,124.0,6.739,1.0


In [8]:
water_data.isnull().sum()

ph                 491
Hardness             0
Solids               0
Chloramines          0
Sulfate            781
Conductivity         0
Organic_carbon       0
Trihalomethanes    162
Turbidity            0
Potability           0
dtype: int64

In [9]:
water_data['Potability'].count()

3276

In [10]:
water_data["Potability"].value_counts()

Potability
0    1998
1    1278
Name: count, dtype: int64

Preprocess DataSet

In [11]:
# Filling Missing Values
water_data['ph'] = water_data['ph'].fillna(water_data['ph'].mean())
water_data['Sulfate'] = water_data['Sulfate'].fillna(water_data['Sulfate'].mean())
water_data['Trihalomethanes'] = water_data['Trihalomethanes'].fillna(water_data['Trihalomethanes'].mean())

In [12]:
# Splitting Data to indepndent (x), dependent (y) Features
x = water_data.iloc[:, :-1]
y = water_data.iloc[:,-1]

In [13]:
x.shape,y.shape

((3276, 9), (3276,))

In [14]:
# Feature Scaling
scaler = StandardScaler()
x = scaler.fit_transform(x)

In [15]:
# Splitting Data to X train , X test, Y train, Y test
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=.2,random_state=0)
x_train.shape, x_test.shape

((2620, 9), (656, 9))

In [16]:
y_train.shape, y_test.shape

((2620,), (656,))

Logistic Regression Algorithm

In [17]:
lr_model = LogisticRegression()
lr_model.fit(x_train,y_train)
lr_predict = lr_model.predict(x_test)
# LR Accuracy Score 
lr_accuracy_score = accuracy_score(y_test,lr_predict)
print("LR Accuracy Score {:.2f}".format(lr_accuracy_score*100))

LR Accuracy Score 62.80


In [34]:
# Confusion Matrix To Logistic Regression 
cm = confusion_matrix(y_test, lr_predict)

TN, FP, FN, TP = cm.ravel()
print('True Positive(TP)  = ', TP)
print('False Positive(FP) = ', FP)
print('True Negative(TN)  = ', TN)
print('False Negative(FN) = ', FN)

True Positive(TP)  =  0
False Positive(FP) =  0
True Negative(TN)  =  412
False Negative(FN) =  244


Decision Tree Algorithm

In [19]:
dt_model = DecisionTreeClassifier(max_depth=4)
dt_model.fit(x_train,y_train)
dt_predict = dt_model.predict(x_test)
dt_accuracy = accuracy_score(y_test,dt_predict)
print("DT Accuracy Score {:.2f}".format(dt_accuracy*100))

DT Accuracy Score 64.02


In [33]:
# Confusion Matrix To Decision Tree
cm = confusion_matrix(y_test, dt_predict)

TN, FP, FN, TP = cm.ravel()

print('True Positive(TP)  = ', TP)
print('False Positive(FP) = ', FP)
print('True Negative(TN)  = ', TN)
print('False Negative(FN) = ', FN)
dt_precision = TP / (TP+FP)
print("DT Percision {:.2f}".format(dt_precision*100))

True Positive(TP)  =  18
False Positive(FP) =  10
True Negative(TN)  =  402
False Negative(FN) =  226
DT Percision 64.29


Random Forest Classifiers

In [21]:
rf_model = RandomForestClassifier()
rf_model.fit(x_train,y_train)
rf_predict = rf_model.predict(x_test)
# RF Accuracy Score
rf_accuracy = accuracy_score(y_test,rf_predict)
print("RFC Accuracy Score {:.2f}".format(rf_accuracy*100))

RFC Accuracy Score 70.27


In [42]:
# Confusion Matrix to Random Forest Classifiers
cm = confusion_matrix(y_test, rf_predict)

TN, FP, FN, TP = cm.ravel()

print('True Positive(TP)  = ', TP)
print('False Positive(FP) = ', FP)
print('True Negative(TN)  = ', TN)
print('False Negative(FN) = ', FN)
rf_precision = TP / (TP+FP)
print("DT Percision {:.2f}".format(rf_precision*100))

True Positive(TP)  =  86
False Positive(FP) =  37
True Negative(TN)  =  375
False Negative(FN) =  158
DT Percision 69.92


K_Neighbours Classifiers

In [23]:
knn_model = KNeighborsClassifier()
knn_model.fit(x_train,y_train)
knn_predict = knn_model.predict(x_test)
knn_accuracy = accuracy_score(y_test,knn_predict)
print("RFC Accuracy Score {:.2f}".format(knn_accuracy*100))

RFC Accuracy Score 63.57


In [44]:
# Confusion Matrix to K_Neighbors Classifiers
cm1 = confusion_matrix(y_test, knn_predict)

TN, FP, FN, TP = cm1.ravel()

print('True Positive(TP)  = ', TP)
print('False Positive(FP) = ', FP)
print('True Negative(TN)  = ', TN)
print('False Negative(FN) = ', FN)
knn_precision = TP / (TP+FP)
print("DT Percision {:.2f}".format(knn_precision*100))

True Positive(TP)  =  94
False Positive(FP) =  89
True Negative(TN)  =  323
False Negative(FN) =  150
DT Percision 51.37


Support Vector Machine

In [25]:
svm_model = SVC()
svm_model.fit(x_train,y_train)
svm_predict = svm_model.predict(x_test)
svm_accuracy = accuracy_score(y_test,svm_predict)
print("SVM Accuracy Score {:.2f}".format(svm_accuracy*100))

SVM Accuracy Score 68.60


In [47]:
# Confusion Matrix to Support Vector Machine
cm1 = confusion_matrix(y_test, svm_predict)

TN, FP, FN, TP = cm1.ravel()

print('True Positive(TP)  = ', TP)
print('False Positive(FP) = ', FP)
print('True Negative(TN)  = ', TN)
print('False Negative(FN) = ', FN)
svm_precision = TP / (TP+FP)
print("DT Percision {:.2f}".format(svm_precision*100))

True Positive(TP)  =  66
False Positive(FP) =  28
True Negative(TN)  =  384
False Negative(FN) =  178
DT Percision 70.21


In [48]:
# Models Evaluation
models = pd.DataFrame({
    "Model": ["Logistic Regression","Decision Tree","Random Forest","K_Neighbours","Support Vector Machine"],
    "Accuracy Score": [lr_accuracy_score*100, dt_accuracy*100, rf_accuracy*100, knn_accuracy*100, svm_accuracy*100],
    "Precision ": ["Null", dt_precision*100, rf_precision*100, knn_precision*100, svm_precision*100]
})
models

Unnamed: 0,Model,Accuracy Score,Precision
0,Logistic Regression,62.804878,Null
1,Decision Tree,64.02439,64.285714
2,Random Forest,70.27439,69.918699
3,K_Neighbours,63.567073,51.36612
4,Support Vector Machine,68.597561,70.212766


In [49]:
# Making a Prediction
new_data = (7.080795,204.890456,20791.31898,7.300212,368.516441,564.308654,10.379783,86.990970,2.963135)
new_data_arr = np.array(new_data)
new_data_arr_reshape = new_data_arr.reshape(1,-1)
predict = svm_model.predict(new_data_arr_reshape)
if predict ==0:
    print("Water is Safe")
else:
    print("Water is not Safe")

Water is not Safe
