In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Read the water-quality records from the CSV file (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer='Water Quality Prediction.csv')
# Preview the first five rows to check that the file loaded as expected.
df.head(n=5)

In [None]:
# Work on a reproducible random sample of up to 10,000 rows (fixed seed).
# The request is capped at the number of available rows because DataFrame.sample
# raises ValueError when n exceeds the population and replace=False.
SAMPLE_SIZE = 10000
df = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=42)

In [None]:
# Count the missing values in each column.
df.isnull().sum(axis=0)

In [None]:
# Dimensions of the current frame as (rows, columns).
row_count, column_count = df.shape
(row_count, column_count)

In [None]:
# Discard every row that contains at least one NaN value.
# `df` is rebound to the cleaned frame so later cells keep using the same name.
df = df.dropna(axis=0, how='any')

In [None]:
# Check for fully duplicated rows (rows equal in every column to an earlier row).
duplicate_rows = df[df.duplicated()]
# Test for the presence of rows directly: counting non-null cells would report
# "no duplicates" for duplicated rows that consist only of missing values.
if duplicate_rows.empty:
    print("No duplicate rows")
else:
    print("Duplicate rows are present")

In [None]:
# Separate the predictors (X) from the label (Y).
# The label and the Month / Day / Time of Day / Index columns are excluded from the features.
non_feature_columns = ['Potability', 'Month', 'Day', 'Time of Day', 'Index']
X = df.drop(columns=non_feature_columns)
Y = df.loc[:, 'Potability']
X

In [None]:
# Names of all feature columns as a plain Python list.
cols = list(X.columns)

categorical_cols = ['Color', 'Source']

# Keep the name of every feature column that is not listed as categorical.
new_list = list(filter(lambda name: name not in categorical_cols, cols))
new_list

In [None]:
# The encoders and models below work on NumPy arrays, so convert X and Y.
# Columns are selected by name rather than by position (previously `1:-4` and `-1`),
# so the arrays do not silently change if the CSV column order changes.
# NOTE(review): the hard-coded column indices in the encoding cells still assume the
# original column order within the feature set - confirm if the CSV layout changes.
X = df.drop(['Potability', 'Month', 'Day', 'Time of Day', 'Index'], axis=1).to_numpy()
Y = df['Potability'].to_numpy()

X

In [None]:
# Replace the values in feature columns 6 and 16 with integer codes (0, 1, 2, ...).
# NOTE(review): 6 and 16 are presumably the 'Color' and 'Source' columns - confirm against the CSV layout.
le1, le2 = LabelEncoder(), LabelEncoder()
for encoder, column_index in ((le1, 6), (le2, 16)):
    X[:, column_index] = encoder.fit_transform(X[:, column_index])
# Show the first row to inspect the encoded values.
X[0]


In [None]:
# One-hot encode the categorical columns into 0/1 indicator columns.
def _one_hot_column(position):
    """Build a transformer that one-hot encodes the column at `position` and passes the rest through."""
    return ColumnTransformer(
        transformers=[('encode', OneHotEncoder(), [position])],
        remainder='passthrough',
    )


ct1 = _one_hot_column(6)
X = ct1.fit_transform(X)

# NOTE(review): index 20 presumably locates the second categorical column after the first
# transform placed its indicator columns at the front - confirm if the category count changes.
ct2 = _one_hot_column(20)
X = ct2.fit_transform(X)


In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
split_options = dict(test_size=0.2, random_state=1)
x_train, x_test, y_train, y_test = train_test_split(X, Y, **split_options)
x_train

In [None]:
# Standardize the feature columns from index 13 onward (zero mean, unit variance).
# NOTE(review): columns 0-12 are presumably the one-hot indicator columns and are left unscaled - confirm.
sc = StandardScaler()
scaled_columns = slice(13, None)
# Fit the scaler on the training split only, then apply the same statistics to the test split.
x_train[:, scaled_columns] = sc.fit_transform(x_train[:, scaled_columns])
x_test[:, scaled_columns] = sc.transform(x_test[:, scaled_columns])

print("X TRAIN", x_train)
print("Y TRAIN", y_train)

### Create the Logistic Regression classification model

In [None]:
# Fit a logistic-regression classifier on the training split.
classifier = LogisticRegression(random_state=0)
classifier = classifier.fit(x_train, y_train)  # fit() returns the fitted estimator itself
classifier

In [None]:
# Predict the label of every test row with the fitted logistic-regression model.
y_pred_lr = classifier.predict(X=x_test)

In [None]:
# Print predictions (left column) next to the true labels (right column).
np.set_printoptions(precision=2)
predicted_vs_actual = np.column_stack((y_pred_lr, y_test))
print(predicted_vs_actual)

In [None]:
# Confusion matrix for logistic regression: rows are true classes, columns are predicted classes.
confMatrix = confusion_matrix(y_true=y_test, y_pred=y_pred_lr)
print(confMatrix)

In [30]:
# Fraction of test rows that the logistic-regression model classified correctly.
lr_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_lr)
print(lr_accuracy)

0.8001508295625943


### Create the Support Vector Machine classification model

In [None]:
# Fit a support vector classifier with a linear kernel (SVC would otherwise default to 'rbf').
svm_options = dict(kernel='linear', random_state=0)
classifier = SVC(**svm_options)
classifier = classifier.fit(x_train, y_train)  # fit() returns the fitted estimator itself
classifier

In [None]:
# Predict the label of every test row with the fitted support vector classifier.
y_pred_svm = classifier.predict(X=x_test)

In [None]:
# Confusion matrix for the SVM: rows are true classes, columns are predicted classes.
confMatrix = confusion_matrix(y_true=y_test, y_pred=y_pred_svm)
print(confMatrix)

In [29]:
# Fraction of test rows that the support vector classifier classified correctly.
svm_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_svm)
print(svm_accuracy)

0.799396681749623


### Create the K-Nearest Neighbour classification model

In [None]:
# Fit a k-nearest-neighbours classifier with k=5.
# Minkowski distance with p=2 is the ordinary Euclidean distance.
knn_options = dict(n_neighbors=5, metric='minkowski', p=2)
classifier = KNeighborsClassifier(**knn_options)
classifier = classifier.fit(x_train, y_train)  # fit() returns the fitted estimator itself
classifier

In [None]:
# Predict the label of every test row with the fitted k-nearest-neighbours model.
y_pred_knn = classifier.predict(X=x_test)

In [None]:
# Confusion matrix for k-NN: rows are true classes, columns are predicted classes.
confMatrix = confusion_matrix(y_true=y_test, y_pred=y_pred_knn)
print(confMatrix)

In [28]:
# Fraction of test rows that the k-nearest-neighbours model classified correctly.
knn_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_knn)
print(knn_accuracy)

0.7911010558069381
