In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import joblib

In [None]:
# Read the raw records from the CSV file (path is relative to the notebook's
# working directory) and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='Water Quality Prediction.csv')
df.head(5)

In [None]:
# Work on a reproducible random subset of at most 100,000 rows.
# The request is capped at the number of available rows because
# DataFrame.sample raises ValueError when n exceeds len(df) without replacement.
SAMPLE_SIZE = 100000
df = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=42)

In [None]:
# Number of missing (NaN) entries in each column of the sampled frame.
missing_per_column = df.isna().sum()
missing_per_column

In [None]:
# (rows, columns) of the frame before rows with missing values are removed
df.shape

In [None]:
# Discard every row that contains at least one NaN value and rebind `df`
# to the filtered frame.
df = df.dropna()

In [None]:
# Check for fully duplicated rows (rows equal to an earlier row in every column).
# `.empty` tests for the absence of rows directly; counting non-NA cells would
# miss duplicated rows whose cells are all missing.
duplicate_rows = df[df.duplicated()]
if duplicate_rows.empty:
    print("No duplicate rows")
else:
    print("Duplicate rows are present")

In [None]:
# Separate the feature columns (X) from the target column (Y).
# The target plus the listed bookkeeping columns are excluded from the features.
dropped_columns = ['Potability', 'Month', 'Day', 'Time of Day', 'Index']
X = df.drop(columns=dropped_columns)
Y = df.loc[:, 'Potability']
X

In [None]:
# All feature column names, as a plain Python list
cols = list(X.columns)

# Columns that are encoded as categories further down
categorical_cols = ['Color', 'Source']

# Keep every feature name that is not one of the categorical columns
new_list = []
for column_name in cols:
    if column_name not in categorical_cols:
        new_list.append(column_name)
new_list

In [None]:
# Switch to NumPy arrays, selecting by position:
#   X -> every column except the first one and the last four
#   Y -> the last column
# NOTE(review): presumably this matches the name-based selection made earlier
# (first column 'Index', last column 'Potability') — confirm against the CSV layout.
X = df.iloc[:, 1:-4].to_numpy()
Y = df.iloc[:, -1].to_numpy()

X

In [None]:
# Replace the text categories in columns 6 and 16 with integer codes.
# One encoder per column; both are kept because the fitted mappings are
# reused when new input is preprocessed later.
# NOTE(review): presumably column 6 is 'Color' and column 16 is 'Source' — confirm.
le1, le2 = LabelEncoder(), LabelEncoder()

X[:, 6] = le1.fit_transform(X[:, 6])
X[:, 16] = le2.fit_transform(X[:, 16])

# show the first encoded row
X[0]


In [None]:
# One-hot encode the two label-encoded columns, one ColumnTransformer at a time.
# Each transformer places its encoded columns first and appends the untouched
# (passthrough) columns after them, so column positions shift after ct1.
ct1 = ColumnTransformer(
    transformers=[('encode', OneHotEncoder(), [6])],
    remainder='passthrough',
)
X = ct1.fit_transform(X)

# NOTE(review): index 20 presumes the first encoding produced 5 dummy columns,
# moving the second categorical column from 16 to 20 — confirm via ct1 output width.
ct2 = ColumnTransformer(
    transformers=[('encode', OneHotEncoder(), [20])],
    remainder='passthrough',
)
X = ct2.fit_transform(X)


In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=1,
)
x_train

In [None]:
# Standardize the columns from index 13 onward (zero mean, unit variance).
# NOTE(review): presumably columns 0-12 are the one-hot dummy columns created
# above and are deliberately left unscaled — confirm the dummy-column count.
# The scaler is fitted on the training rows only and then reused, unchanged,
# for the test rows.
sc = StandardScaler()
sc.fit(x_train[:, 13:])
x_train[:, 13:] = sc.transform(x_train[:, 13:])
x_test[:, 13:] = sc.transform(x_test[:, 13:])

print("X TRAIN", x_train[0])
print("Y TRAIN", y_train)

### Create the Logistic Regression classification model

In [None]:
# Fit a logistic regression classifier on the training split
# (seeded so the fit is repeatable).
lr_classifier = LogisticRegression(random_state=0)
lr_classifier.fit(X=x_train, y=y_train)

In [None]:
# Persist the fitted logistic regression model to disk
joblib.dump(value=lr_classifier, filename='trained_LR_model.joblib')

In [None]:
# Logistic regression predictions for the held-out test rows
y_pred_lr = lr_classifier.predict(X=x_test)

In [None]:
# np.set_printoptions(precision=2)
# print(np.concatenate([y_pred_lr.reshape(len(y_pred_lr), 1), y_test.reshape(len(y_test), 1)], axis=1))

In [None]:
# confMatrix = confusion_matrix(y_test, y_pred_lr)
# print(confMatrix)

In [None]:
# Fraction of test rows the logistic regression model labelled correctly
lr_model_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_lr)
print(lr_model_accuracy)

### Create the Support Vector Machine classification model

In [None]:
# Support vector classifier with a linear kernel (scikit-learn's default
# kernel would be 'rbf'), fitted on the training split.
svm_classifier = SVC(random_state=0, kernel='linear')
svm_classifier.fit(X=x_train, y=y_train)

In [None]:
# Persist the fitted SVM model to disk
joblib.dump(value=svm_classifier, filename='trained_SVM_model.joblib')

In [None]:
# SVM predictions for the held-out test rows
y_pred_svm = svm_classifier.predict(X=x_test)

In [None]:
# confMatrix = confusion_matrix(y_test, y_pred_svm)
# print(confMatrix)

In [None]:
# Fraction of test rows the SVM model labelled correctly
svm_model_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_svm)
print(svm_model_accuracy)

### Create the K-Nearest Neighbour classification model

In [None]:
# 5-nearest-neighbour classifier; Minkowski distance with p=2 is the
# ordinary Euclidean distance.
knn_classifier = KNeighborsClassifier(
    n_neighbors=5,
    metric='minkowski',
    p=2,
)
knn_classifier.fit(X=x_train, y=y_train)

In [None]:
# Persist the fitted K-NN model to disk
joblib.dump(value=knn_classifier, filename='trained_KNN_model.joblib')

In [None]:
# K-NN predictions for the held-out test rows
y_pred_knn = knn_classifier.predict(X=x_test)

In [None]:
# confMatrix = confusion_matrix(y_test, y_pred_knn)
# print(confMatrix)

In [None]:
# Fraction of test rows the K-NN model labelled correctly
knn_model_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_knn)
print(knn_model_accuracy)

### Create the Decision Tree classification model

In [None]:
# Decision tree classifier using Gini impurity as the split criterion,
# fitted on the training split (seeded so the fit is repeatable).
# DecisionTreeClassifier is imported in the notebook's top import cell so that
# all dependencies are declared in one place.
dtree_classifier = DecisionTreeClassifier(criterion='gini', random_state=0)
dtree_classifier.fit(x_train, y_train)

In [None]:
# Persist the fitted decision tree model to disk
joblib.dump(value=dtree_classifier, filename='trained_DTR_model.joblib')

In [None]:
# Decision tree predictions for the held-out test rows
y_pred_dtree = dtree_classifier.predict(X=x_test)

In [None]:
# confMatrix = confusion_matrix(y_test, y_pred_dtree)
# print(confMatrix)

In [None]:
# Fraction of test rows the decision tree model labelled correctly
dtc_model_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_dtree)
print(dtc_model_accuracy)

### Create the Random Forest classification model

In [None]:
# Random forest of 100 trees using entropy as the split criterion,
# fitted on the training split (seeded so the fit is repeatable).
# RandomForestClassifier is imported in the notebook's top import cell so that
# all dependencies are declared in one place.
rfc_classifier = RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=0)
rfc_classifier.fit(x_train, y_train)

In [None]:
# Persist the fitted random forest model to disk
joblib.dump(value=rfc_classifier, filename='trained_RDF_model.joblib')

In [None]:
# Random forest predictions for the held-out test rows
y_pred_rdf = rfc_classifier.predict(X=x_test)

In [None]:
# confMatrix = confusion_matrix(y_test, y_pred_rdf)
# print(confMatrix)

In [None]:
# Fraction of test rows the random forest model labelled correctly
rfc_model_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_rdf)
print(rfc_model_accuracy)

### Preprocess the input data

In [None]:
# One raw observation to classify. The entries must follow the same column
# order as the training matrix: positions 6 and 16 hold the text categories
# that were label-encoded during training.
values = [
    8.510801988, 9.16E-05, 5.920902064, 304.4845891, 3.60E-07, 1.635760979,
    'Faint Yellow',
    3.739693, 0.559295096, 0.880587373, 3.965759996, 62.38685835,
    580.4796606, 3.84064004, 2.00E-09, 346.8499604,
    'Reservoir',
    12.80967626, 61.24561392,
]

# Wrap the record as a single-row 2-D array (via a one-row DataFrame)
input_data = pd.DataFrame([values]).to_numpy()

# Apply the already-fitted preprocessing steps in the same order as in
# training; nothing is re-fitted here.
# 1) label encode the two categorical positions
for col_idx, encoder in ((6, le1), (16, le2)):
    input_data[:, col_idx] = encoder.transform(input_data[:, col_idx])

# 2) one-hot encode with the fitted column transformers
for transformer in (ct1, ct2):
    input_data = transformer.transform(input_data)

# 3) scale the columns from index 13 onward with the fitted scaler
input_data[:, 13:] = sc.transform(input_data[:, 13:])

print(input_data)

In [None]:
# Classify the single preprocessed observation with the random forest model
y_single = rfc_classifier.predict(X=input_data)

print(y_single)

## CONCLUSIONS

In [None]:
# print("ACCURACY SCORES OF EACH MODEL\n")

# print("Logistic Regression Classifier\t", round(lr_model_accuracy * 100, 2))
# print("SVM Classifier\t\t\t", round(svm_model_accuracy * 100, 2))
# print("K-NN Classifier\t\t\t", round(knn_model_accuracy * 100, 2))
# print("Decision Tree Classifier\t", round(dtc_model_accuracy * 100, 2))
# print("Random Forest Classifier\t", round(rfc_model_accuracy * 100, 2))