In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Load the raw dataset from the CSV file in the working directory
# and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='Water Quality Prediction.csv')
df.head(5)

In [None]:
# Down-sample to at most 10,000 rows so the models below train quickly.
# Capping n at len(df) keeps this cell from raising ValueError when the
# file holds fewer rows; the fixed random_state makes the draw reproducible.
df = df.sample(n=min(10000, len(df)), random_state=42)

In [None]:
# Number of missing values in each column
df.isnull().sum(axis=0)

In [None]:
# (number of rows, number of columns) of the current frame
df.shape

In [None]:
# Discard every row that contains at least one missing (NaN) value.
# The result is rebound to `df` rather than mutating the frame in place.
df = df.dropna(axis=0, how='any')

In [None]:
# Check for fully duplicated rows (a row equal to an earlier row in every column).
# `duplicated()` flags the repeats; selecting them keeps the offending rows
# available for inspection in `duplicate_rows`.
duplicate_rows = df[df.duplicated()]

# `.empty` tests the number of rows directly; counting non-null cells would
# report "no duplicates" for duplicated rows made up entirely of NaN.
if duplicate_rows.empty:
    print("No duplicate rows")
else:
    print("Duplicate rows are present")

In [None]:
# Separate the feature columns (X) from the target column (Y).
# The target and the index/date-time columns are excluded from the features.
excluded_cols = ['Potability', 'Month', 'Day', 'Time of Day', 'Index']
X = df.drop(columns=excluded_cols)
Y = df.loc[:, 'Potability']
X

In [None]:
# Names of the columns that hold categorical (string) values
categorical_cols = ['Color', 'Source']

# All feature column names as a plain Python list
cols = list(X.columns)

# Keep only the feature names that are not listed as categorical
new_list = [name for name in cols if name not in categorical_cols]
new_list

In [None]:
# Convert features and target to NumPy arrays for scikit-learn.
# Features: every column except the first one and the last four.
# Target: the last column.
# NOTE(review): this positional slicing presumably selects the same columns
# as the name-based drop of Index/Month/Day/Time of Day/Potability — verify
# against the CSV's column order.
feature_frame = df.iloc[:, 1:-4]
target_series = df.iloc[:, -1]

X = feature_frame.to_numpy()
Y = target_series.to_numpy()

X

In [None]:
# Label-encode the two categorical feature columns (positions 6 and 16) into
# integer codes; LabelEncoder numbers the classes 0, 1, 2, ...
# Both fitted encoders are kept so new samples can be encoded the same way.
le1 = LabelEncoder()
le2 = LabelEncoder()

X[:, 6] = le1.fit_transform(X[:, 6])
X[:, 16] = le2.fit_transform(X[:, 16])

# Show the first encoded row as a sanity check
X[0]


In [None]:
# One-hot encode the two label-encoded categorical columns (0/1 dummy columns).
# Step 1: encode column 6. ColumnTransformer places the dummy columns first and
# appends the untouched (passthrough) columns after them in their original order.
ct1 = ColumnTransformer(transformers=[('encode', OneHotEncoder(), [6])], remainder='passthrough')
X = ct1.fit_transform(X)

# Step 2: encode the second categorical column. It was at index 16 before step 1;
# with column 6 removed it is the 16th passthrough column (offset 15), and it sits
# behind however many dummy columns step 1 produced. Deriving the index from the
# fitted encoder avoids hard-coding 20, which is only right for exactly 5 categories.
n_first_dummies = len(ct1.named_transformers_['encode'].categories_[0])
second_cat_idx = n_first_dummies + 15

ct2 = ColumnTransformer(transformers=[('encode', OneHotEncoder(), [second_cat_idx])], remainder='passthrough')
X = ct2.fit_transform(X)


In [None]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle
# so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=1,
)
x_train

In [31]:
# Standardize the numeric features.
# StandardScaler rescales each column to zero mean and unit variance
# (the result is not bounded to a fixed range).
sc = StandardScaler()

# Columns 0-12 are the one-hot dummy columns and are left unscaled;
# everything from column 13 onward is scaled.
numeric_part = slice(13, None)

# Fit on the training split only, then apply the same statistics to the
# test split so no information from the test rows reaches the scaler.
x_train[:, numeric_part] = sc.fit_transform(x_train[:, numeric_part])
x_test[:, numeric_part] = sc.transform(x_test[:, numeric_part])

print("X TRAIN", x_train[0])
print("Y TRAIN", y_train)

X TRAIN [0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.9361780759622343
 -0.2661931013105643 -0.40382279479500627 0.8577434462842768
 -0.05530364631559166 -0.8694596931713279 -0.4815787345649022
 -0.0053061999037031385 -0.08148399911528374 0.7869865507075632
 0.48855504799228644 1.4983108431298104 -0.4756626981609095
 -0.21090706930058872 0.8045106177812643 -0.620739602184893
 0.8634505007786829]
Y TRAIN [0 1 0 ... 0 0 1]


### Create the Logistic Regression classification model

In [None]:
# Build and fit the logistic-regression model on the training split.
# `fit` returns the estimator itself, so the assignment keeps the fitted model.
classifier = LogisticRegression(random_state=0).fit(x_train, y_train)
classifier

In [None]:
# Predict labels for the held-out test features with the model currently
# bound to `classifier`.
y_pred_lr = classifier.predict(x_test)

In [None]:
# Print predictions (left column) next to the true labels (right column).
np.set_printoptions(precision=2)
pred_vs_actual = np.column_stack((y_pred_lr, y_test))
print(pred_vs_actual)

In [None]:
# Confusion matrix for the logistic-regression predictions
# (rows: true labels, columns: predicted labels).
confMatrix = confusion_matrix(y_true=y_test, y_pred=y_pred_lr)
print(confMatrix)

In [None]:
# Fraction of test rows the logistic-regression model labelled correctly
lr_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_lr)
print(lr_accuracy)

### Create the Support Vector Machine classification model

In [None]:
# Support vector classifier with a linear kernel (scikit-learn's default is RBF).
# This rebinds `classifier`, replacing whichever model it referred to before.
classifier = SVC(kernel='linear', random_state=0).fit(x_train, y_train)
classifier

In [None]:
# Predict labels for the held-out test features with the model currently
# bound to `classifier`.
y_pred_svm = classifier.predict(x_test)

In [None]:
# Confusion matrix for the SVM predictions
# (rows: true labels, columns: predicted labels).
confMatrix = confusion_matrix(y_true=y_test, y_pred=y_pred_svm)
print(confMatrix)

In [None]:
# Fraction of test rows the SVM labelled correctly
svm_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_svm)
print(svm_accuracy)

### Create the K-Nearest Neighbour classification model

In [None]:
# K-nearest-neighbours model using the 5 closest training points.
# The Minkowski metric with p=2 is the ordinary Euclidean distance.
# This rebinds `classifier`, replacing whichever model it referred to before.
classifier = KNeighborsClassifier(
    n_neighbors=5,
    metric='minkowski',
    p=2,
).fit(x_train, y_train)
classifier

In [None]:
# Predict labels for the held-out test features with the model currently
# bound to `classifier`.
y_pred_knn = classifier.predict(x_test)

In [None]:
# Confusion matrix for the KNN predictions
# (rows: true labels, columns: predicted labels).
confMatrix = confusion_matrix(y_true=y_test, y_pred=y_pred_knn)
print(confMatrix)

In [None]:
# Fraction of test rows the KNN model labelled correctly
knn_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_knn)
print(knn_accuracy)

### Preprocess the input data

In [30]:
# One raw sample: 19 feature values in the same order as the training features,
# with the two categorical values given as strings at positions 6 and 16.
values = [8.510801988, 9.16E-05, 5.920902064, 304.4845891, 3.60E-07, 1.635760979, 'Faint Yellow', 3.739693, 0.559295096, 0.880587373, 3.965759996, 62.38685835, 580.4796606, 3.84064004, 2.00E-09, 346.8499604, 'Reservoir', 12.80967626, 61.24561392]

# Wrap the sample in a one-row DataFrame, then take it as a (1, 19) NumPy array
input_frame = pd.DataFrame([values])
input_data = input_frame.to_numpy()

# Replay the training-time preprocessing with the already-fitted objects
# (transform only, never fit): first the label encoders ...
for col_idx, encoder in ((6, le1), (16, le2)):
    input_data[:, col_idx] = encoder.transform(input_data[:, col_idx])

# ... then the two one-hot column transformers, in the order they were fitted ...
for transformer in (ct1, ct2):
    input_data = transformer.transform(input_data)

# ... and finally the scaler on the non-dummy columns (index 13 onward).
numeric_part = slice(13, None)
input_data[:, numeric_part] = sc.transform(input_data[:, numeric_part])

print(input_data)

[[0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.2504505300104902
  -0.26600904679056847 -0.0569959209147435 1.9368778111107363
  -0.055277520511392765 0.08005165641726997 3.6783504600273798
  -0.4715796702703018 0.6601795354185298 2.0726231283358834
  -1.1895507601927064 0.8378210438812618 0.8620912303901445
  -0.22820335724081833 0.5473112721228695 -0.564394244878262
  0.07335674055373219]]


In [33]:
# Predict the label of the single preprocessed sample.
# NOTE(review): `classifier` is rebound by each model section, so this uses
# whichever model was fitted last in the running kernel (the KNN model when
# the cells are run top to bottom) — confirm that is the intended model.
y_single = classifier.predict(input_data)

print(y_single)

[1]
