In [116]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [117]:
# Load the raw water-quality dataset from a CSV file in the working directory
# and preview its first rows.
df = pd.read_csv('Water Quality Prediction.csv')
df.head()

Unnamed: 0,Index,pH,Iron,Nitrate,Chloride,Lead,Zinc,Color,Turbidity,Fluoride,...,Chlorine,Manganese,Total Dissolved Solids,Source,Water Temperature,Air Temperature,Month,Day,Time of Day,Potability
0,0,8.332988,8.3e-05,8.605777,122.799772,3.71e-52,3.434827,Colorless,0.022683,0.607283,...,3.708178,2.27e-15,332.118789,,,43.493324,January,29.0,4.0,0
1,1,6.917863,8.1e-05,3.734167,227.029851,7.85e-94,1.245317,Faint Yellow,0.019007,0.622874,...,3.292038,8.02e-07,284.641984,Lake,15.348981,71.220586,November,26.0,16.0,0
2,2,5.443762,0.020106,3.816994,230.99563,5.29e-76,0.52828,Light Yellow,0.319956,0.423423,...,3.560224,0.07007989,570.054094,River,11.643467,44.89133,January,31.0,8.0,0
3,3,7.955339,0.143988,8.224944,178.12994,4e-176,4.027879,Near Colorless,0.166319,0.208454,...,3.516907,0.02468295,100.043838,Ground,10.092392,60.843233,April,1.0,21.0,0
4,4,8.091909,0.002167,9.925788,186.540872,4.17e-132,3.807511,Light Yellow,0.004867,0.222912,...,3.177849,0.003296139,168.075545,Spring,15.249416,69.336671,June,29.0,7.0,0


In [118]:
# Work on a random sample of 10,000 rows; the fixed random_state makes the
# same rows come back on every run.
df = df.sample(n=10000, random_state=42)

In [119]:
# Number of missing (NaN) values in each column of the sample
df.isna().sum()

Index                       0
pH                        207
Iron                       67
Nitrate                   175
Chloride                  307
Lead                       36
Zinc                      265
Color                      12
Turbidity                  87
Fluoride                  335
Copper                    360
Odor                      320
Sulfate                   335
Conductivity              305
Chlorine                   82
Manganese                 175
Total Dissolved Solids      5
Source                    159
Water Temperature         271
Air Temperature            49
Month                     142
Day                       175
Time of Day               196
Potability                  0
dtype: int64

In [120]:
# (rows, columns) of the sampled frame, before missing rows are removed
df.shape

(10000, 24)

In [121]:
# Discard every row that has at least one missing value.
# Rebinding `df` (instead of inplace=True) keeps the step an ordinary expression.
df = df.dropna()

In [122]:
# Check for fully duplicated rows (a row whose every column equals an earlier row).
# `duplicated()` flags each repeated row; `.empty` asks directly whether any
# were found, instead of counting the non-null cells of the duplicates.
duplicate_rows = df[df.duplicated()]
if duplicate_rows.empty:
    print("No duplicate rows")
else:
    print("Duplicate rows are present")

No duplicate rowa


In [123]:
# Separate the predictors (X) from the target (Y).
# The target itself, the calendar fields and the row identifier are not used as features.
non_feature_cols = ['Potability', 'Month', 'Day', 'Time of Day', 'Index']
X = df.drop(columns=non_feature_cols)
Y = df['Potability']
X

Unnamed: 0,pH,Iron,Nitrate,Chloride,Lead,Zinc,Color,Turbidity,Fluoride,Copper,Odor,Sulfate,Conductivity,Chlorine,Manganese,Total Dissolved Solids,Source,Water Temperature,Air Temperature
781974,7.004799,0.000006,7.114755,120.527769,4.620000e-168,1.564359,Near Colorless,0.613998,1.758451,0.255472,2.092090,120.745502,241.446886,3.099394,4.469775e-02,257.717511,Reservoir,22.900917,54.310518
937737,8.299823,0.001846,4.713490,220.284903,9.040000e-120,2.196456,Faint Yellow,1.543039,4.341496,0.316341,3.077392,228.707208,282.409585,3.749201,8.410000e-05,92.378364,Ground,64.103574,72.016863
907828,8.077128,0.001998,6.999236,157.332074,1.740000e-137,1.900052,Faint Yellow,0.363389,0.537449,0.032343,1.306127,136.219129,214.876158,2.215031,1.954040e-04,187.093504,Well,26.600483,74.400507
784628,7.813995,0.001145,6.168141,200.820979,1.970000e-06,4.961851,Near Colorless,0.731114,0.073730,0.700787,0.440061,72.935525,342.590598,3.030572,4.894026e-02,334.951667,Ground,16.434954,98.879709
662460,6.691067,0.506861,8.280426,143.161413,7.280000e-111,5.546948,Colorless,0.026614,1.532804,0.161636,2.780277,300.992636,291.962088,3.105734,3.490000e-18,211.253831,Lake,12.467716,46.854295
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
412869,7.221867,0.000001,6.107437,107.967559,5.340000e-36,0.156808,Near Colorless,0.048596,0.553593,0.123673,2.017747,162.248631,448.665008,2.941815,2.430376e-03,0.240609,Reservoir,13.493582,54.268061
54671,8.417396,0.044799,3.310804,184.082701,4.480000e-113,0.020234,Near Colorless,0.049878,0.008072,0.146716,2.488662,229.582491,514.969735,3.907563,2.649672e-03,499.705619,Aquifer,37.613858,75.033865
293804,6.963769,0.269067,3.639983,207.828247,3.160000e-100,0.754415,Light Yellow,0.398530,0.314208,0.098960,0.443538,187.925301,580.625621,4.377139,1.780000e-07,437.006004,Stream,9.822570,50.734422
739757,8.926421,0.020836,8.802112,230.161377,5.620000e-46,1.560606,Yellow,1.404549,0.988206,0.374856,3.618372,64.318623,698.267993,2.686135,8.140000e-12,7.044163,Reservoir,12.909049,49.704333


In [124]:
# Names of the feature columns that hold text categories
categorical_cols = ['Color', 'Source']

# All feature column names, in DataFrame order
cols = list(X.columns)

# Every feature column that is not one of the categorical ones
new_list = [name for name in cols if name not in categorical_cols]
new_list

['pH',
 'Iron',
 'Nitrate',
 'Chloride',
 'Lead',
 'Zinc',
 'Turbidity',
 'Fluoride',
 'Copper',
 'Odor',
 'Sulfate',
 'Conductivity',
 'Chlorine',
 'Manganese',
 'Total Dissolved Solids',
 'Water Temperature',
 'Air Temperature']

In [125]:
# scikit-learn is fed NumPy arrays, so convert features and target.
# Positional slice: skip the first column (Index) and the last four
# (Month, Day, Time of Day, Potability); the target is the last column.
feature_frame = df.iloc[:, 1:-4]
target_series = df.iloc[:, -1]
X = feature_frame.to_numpy()
Y = target_series.to_numpy()

X

array([[7.004799273, 6.13e-06, 7.114755278, ..., 'Reservoir',
        22.90091727, 54.31051792],
       [8.299822905, 0.001845755, 4.713490289, ..., 'Ground',
        64.10357372, 72.01686324],
       [8.077127655, 0.001998475, 6.999235945, ..., 'Well', 26.60048284,
        74.40050714],
       ...,
       [6.963768514, 0.269067017, 3.639982552, ..., 'Stream', 9.82257044,
        50.73442198],
       [8.926421479, 0.020835562, 8.802112028, ..., 'Reservoir',
        12.90904898, 49.70433311],
       [8.07131408, 0.002899706, 4.85910161, ..., 'Stream', 30.22027379,
        64.10764584]], dtype=object)

In [126]:
# Replace the two text columns with integer codes (0, 1, 2, ...).
# Column 6 is Color and column 16 is Source; each gets its own fitted encoder
# so the same mapping can be re-applied to new samples later.
le1, le2 = LabelEncoder(), LabelEncoder()
for col_idx, encoder in ((6, le1), (16, le2)):
    X[:, col_idx] = encoder.fit_transform(X[:, col_idx])
X[0]


array([7.004799273, 6.13e-06, 7.114755278, 120.5277688, 4.62e-168,
       1.564359234, 3, 0.613997908, 1.758450685, 0.255472008, 2.092090468,
       120.745502, 241.4468855, 3.099393646, 0.044697746, 257.7175114, 3,
       22.90091727, 54.31051792], dtype=object)

In [127]:
# One-hot encode the two categorical columns (0, 1, 0 ...), one at a time.
# ct1 expands column 6 (Color); its one-hot columns are placed first and the
# remaining columns are passed through after them.
ct1 = ColumnTransformer(transformers=[('encode', OneHotEncoder(), [6])], remainder='passthrough')
X = ct1.fit_transform(X)

# After ct1 the Source column is no longer at index 16. Index 20 is where it
# lands when Color expands to 5 columns (5 + its position 15 among the
# passthrough columns) - this depends on the number of distinct Color values
# in the sample, so re-check it if the data changes.
ct2 = ColumnTransformer(transformers=[('encode', OneHotEncoder(), [20])], remainder='passthrough')
X = ct2.fit_transform(X)


In [128]:
# Split the data set: 20% of the rows are held out for testing.
# random_state fixes the shuffle so the same split is produced on every run.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=1)
x_train

array([[0.0, 0.0, 0.0, ..., 386.0548765, 12.14368357, 75.54909434],
       [0.0, 0.0, 0.0, ..., 575.1262244, 7.69282191, 81.53348652],
       [0.0, 0.0, 0.0, ..., 268.9093395, 12.05040711, 81.79239602],
       ...,
       [0.0, 1.0, 0.0, ..., 133.8839866, 16.64957917, 56.04885349],
       [0.0, 0.0, 0.0, ..., 495.7069391, 18.2373115, 70.62304544],
       [0.0, 0.0, 0.0, ..., 340.0161669, 29.56017489, 53.23461465]],
      dtype=object)

In [129]:
# Standardize the numeric features (per column: subtract the mean, divide by
# the standard deviation).
# Columns 0-12 are the one-hot dummy columns (8 for Source followed by 5 for
# Color in this sample) and are left untouched; the numeric features start at
# column 13.
# The scaler is fitted on the training split only and then re-used on the test
# split, so test-set statistics never influence the fit.
# NOTE: x_train / x_test are rescaled in place - re-running this cell without
# re-running the split cell would scale already-scaled values.
NUMERIC_START = 13  # index of the first non-dummy column

sc = StandardScaler()
x_train[:, NUMERIC_START:] = sc.fit_transform(x_train[:, NUMERIC_START:])
x_test[:, NUMERIC_START:] = sc.transform(x_test[:, NUMERIC_START:])

print("X TRAIN", x_train[0])
print("Y TRAIN", y_train)

X TRAIN [0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.9361780759622257
 -0.2661931013105642 -0.4038227947950076 0.8577434462842697
 -0.05530364631558721 -0.8694596931713295 -0.4815787345649024
 -0.005306199903703585 -0.0814839991152836 0.7869865507075614
 0.4885550479922847 1.4983108431298098 -0.4756626981609242
 -0.2109070693005881 0.8045106177812631 -0.620739602184889
 0.8634505007786758]
Y TRAIN [0 1 0 ... 0 0 1]


### Create the Logistic Regression classification model

In [130]:
# Logistic regression baseline, trained on the scaled training split
lr_classifier = LogisticRegression(random_state=0)
lr_classifier.fit(X=x_train, y=y_train)

In [131]:
# Predicted class labels for the held-out test rows
y_pred_lr = lr_classifier.predict(x_test)

In [132]:
# Show predictions next to the true labels: column 0 = predicted, column 1 = actual
np.set_printoptions(precision=2)
side_by_side = np.column_stack((y_pred_lr, y_test))
print(side_by_side)

[[1 0]
 [0 0]
 [0 0]
 ...
 [0 0]
 [0 0]
 [0 1]]


In [133]:
# Confusion matrix for logistic regression: rows are true classes, columns are predicted classes
confMatrix = confusion_matrix(y_true=y_test, y_pred=y_pred_lr)
print(confMatrix)

[[950  58]
 [207 111]]


In [134]:
# Fraction of test rows that logistic regression labelled correctly
lr_model_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_lr)
print(lr_model_accuracy)

0.8001508295625943


### Create the Support Vector Machine classification model

In [135]:
# Support vector classifier with a linear kernel (the library default would be 'rbf')
svm_classifier = SVC(kernel='linear', random_state=0)
svm_classifier.fit(X=x_train, y=y_train)

In [136]:
# Predicted class labels for the held-out test rows
y_pred_svm = svm_classifier.predict(x_test)

In [137]:
# Confusion matrix for the SVM: rows are true classes, columns are predicted classes
confMatrix = confusion_matrix(y_true=y_test, y_pred=y_pred_svm)
print(confMatrix)

[[949  59]
 [207 111]]


In [138]:
# Fraction of test rows that the SVM labelled correctly
svm_model_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_svm)
print(svm_model_accuracy)

0.799396681749623


### Create the K-Nearest Neighbour classification model

In [139]:
# 5-nearest-neighbours classifier; Minkowski distance with p=2 is the classic Euclidean distance
knn_classifier = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
knn_classifier.fit(X=x_train, y=y_train)

In [140]:
# Predicted class labels for the held-out test rows
y_pred_knn = knn_classifier.predict(x_test)

In [141]:
# Confusion matrix for K-NN: rows are true classes, columns are predicted classes
confMatrix = confusion_matrix(y_true=y_test, y_pred=y_pred_knn)
print(confMatrix)

[[949  59]
 [218 100]]


In [142]:
# Fraction of test rows that K-NN labelled correctly
knn_model_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_knn)
print(knn_model_accuracy)

0.7911010558069381


### Create the Decision Tree classification model

In [143]:
# Single decision tree split on Gini impurity; random_state makes the fit repeatable.
# DecisionTreeClassifier is imported with the other dependencies in the
# notebook's first cell, so every import lives in one place.
dtree_classifier = DecisionTreeClassifier(criterion='gini', random_state=0)
dtree_classifier.fit(x_train, y_train)

In [144]:
# Predicted class labels for the held-out test rows
y_pred_dtree = dtree_classifier.predict(x_test)

In [145]:
# Confusion matrix for the decision tree: rows are true classes, columns are predicted classes
confMatrix = confusion_matrix(y_true=y_test, y_pred=y_pred_dtree)
print(confMatrix)

[[902 106]
 [124 194]]


In [146]:
# Fraction of test rows that the decision tree labelled correctly
dtc_model_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_dtree)
print(dtc_model_accuracy)

0.8265460030165912


### Create the Random Forest classification model

In [162]:
# Ensemble of 100 decision trees using the entropy split criterion;
# random_state makes the fit repeatable.
# RandomForestClassifier is imported with the other dependencies in the
# notebook's first cell, so every import lives in one place.
rfc_classifier = RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=0)
rfc_classifier.fit(x_train, y_train)

In [163]:
# Predicted class labels for the held-out test rows
y_pred_rdf = rfc_classifier.predict(x_test)

In [164]:
# Confusion matrix for the random forest: rows are true classes, columns are predicted classes
confMatrix = confusion_matrix(y_true=y_test, y_pred=y_pred_rdf)
print(confMatrix)

[[855 153]
 [ 47 271]]


In [165]:
# Fraction of test rows that the random forest labelled correctly
rfc_model_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_rdf)
print(rfc_model_accuracy)

0.8491704374057315


### Preprocess the input data

In [166]:
def preprocess_sample(raw_values):
    """Encode and scale one raw observation the same way as the training data.

    The steps mirror the training pipeline and reuse the objects fitted there
    (``le1``, ``le2``, ``ct1``, ``ct2``, ``sc``), so nothing is re-fitted here.

    Parameters
    ----------
    raw_values : sequence
        The feature values of a single observation in training column order,
        with the Color string at position 6 and the Source string at position 16.

    Returns
    -------
    numpy.ndarray
        A one-row 2-D array that can be passed to a fitted classifier's ``predict``.
    """
    # One-row frame -> NumPy array so the same positional indexing as in training applies
    sample = pd.DataFrame([raw_values]).values

    # Label encode the two categorical values with the fitted encoders
    sample[:, 6] = le1.transform(sample[:, 6])
    sample[:, 16] = le2.transform(sample[:, 16])

    # One-hot encode them with the fitted column transformers (same order as in training)
    sample = ct1.transform(sample)
    sample = ct2.transform(sample)

    # Scale the numeric columns (everything after the 13 dummy columns) with the fitted scaler
    sample[:, 13:] = sc.transform(sample[:, 13:])
    return sample


values = [8.510801988, 9.16E-05, 5.920902064, 304.4845891, 3.60E-07, 1.635760979, 'Faint Yellow', 3.739693, 0.559295096, 0.880587373, 3.965759996, 62.38685835, 580.4796606, 3.84064004, 2.00E-09, 346.8499604, 'Reservoir', 12.80967626, 61.24561392]

# Preprocess the input data
input_data = preprocess_sample(values)

print(input_data)

[[0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.2504505300104902
  -0.26600904679056847 -0.0569959209147435 1.9368778111107363
  -0.055277520511392765 0.08005165641726997 3.6783504600273798
  -0.4715796702703018 0.6601795354185298 2.0726231283358834
  -1.1895507601927064 0.8378210438812618 0.8620912303901445
  -0.22820335724081833 0.5473112721228695 -0.564394244878262
  0.07335674055373219]]


In [167]:
# Predict the potability class of the single preprocessed sample.
# NOTE(review): this uses the K-NN model, although the accuracy summary at the
# end of the notebook reports Random Forest as the best performer - confirm
# that this choice is intentional.
y_single = knn_classifier.predict(input_data)

print(y_single)

[1]


## CONCLUSIONS

In [168]:
# Summary table: test-set accuracy of every model, as a percentage with two decimals.
# The trailing tabs in each label keep the numbers aligned in the printed output.
print("ACCURACY SCORES OF EACH MODEL\n")

model_scores = [
    ("Logistic Regression Classifier\t", lr_model_accuracy),
    ("SVM Classifier\t\t\t", svm_model_accuracy),
    ("K-NN Classifier\t\t\t", knn_model_accuracy),
    ("Decision Tree Classifier\t", dtc_model_accuracy),
    ("Random Forest Classifier\t", rfc_model_accuracy),
]
for label, accuracy in model_scores:
    print(label, round(accuracy * 100, 2))

ACCURACY SCORES OF EACH MODEL

Logistic Regression Classifier	 80.02
SVM Classifier			 79.94
K-NN Classifier			 79.11
Decision Tree Classifier	 82.65
Random Forest Classifier	 84.92
