In [26]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import joblib
from sklearn.impute import SimpleImputer

from sklearn.model_selection import learning_curve
import matplotlib.pyplot as plt

In [27]:
# Show every column when a DataFrame is rendered instead of truncating wide frames.
pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)  # uncomment to also show every row

# Read the raw dataset from the notebook's working directory.
df = pd.read_csv('Water Quality Prediction.csv')

In [28]:
# Work on a reproducible (random_state=42) random subset of at most 300,000 rows.
# n is capped at len(df) because DataFrame.sample raises a ValueError when asked
# for more rows than exist (smaller input file, or this cell being re-run after
# later cells have already dropped rows).
df = df.sample(n=min(300000, len(df)), random_state=42)

In [29]:
# Count the missing entries in each column.
df.isnull().sum()

Index                         0
pH                         5890
Iron                       2029
Nitrate                    5355
Chloride                   8814
Lead                       1327
Zinc                       8022
Color                       292
Turbidity                  2497
Fluoride                   9599
Copper                    10082
Odor                       9006
Sulfate                    9767
Conductivity               8356
Chlorine                   2862
Manganese                  5409
Total Dissolved Solids       88
Source                     4411
Water Temperature          8477
Air Temperature            1464
Month                      4767
Day                        4951
Time of Day                5814
Potability                    0
dtype: int64

In [30]:
# df.dropna(inplace=True)

In [31]:
# Report whether any row is an exact repeat of an earlier row.
# `duplicated()` flags the repeats; testing `.empty` looks at the number of
# flagged rows, whereas counting non-null cells would miss duplicated rows
# that consist only of missing values.
duplicate_rows = df[df.duplicated()]
if duplicate_rows.empty:
    print("No duplicate rows")
else:
    print("Duplicate rows are present")

No duplicate rows


In [32]:
# Discard the 'Month' and 'Index' columns; `columns=` names the axis explicitly.
df = df.drop(columns=['Month', 'Index'])


In [33]:
# Keep only the rows in which both "Color" and "Source" are present;
# a row is removed as soon as either of the two is missing.
df = df.dropna(axis=0, how="any", subset=["Color", "Source"])

In [34]:
# Every column with a numeric dtype takes part in the imputation.
numeric_columns = df.select_dtypes(include='number').columns

# Learn each numeric column's median, then fill that column's gaps with it.
# NOTE(review): the medians are learned from all rows, i.e. before the
# train/test split that happens further down in the notebook.
imputer = SimpleImputer(strategy='median')
imputer.fit(df[numeric_columns])
df[numeric_columns] = imputer.transform(df[numeric_columns])


In [35]:
# Convert to NumPy arrays: every column except the last becomes the feature
# matrix X, the last column becomes the target vector Y.
X = df.iloc[:, :-1].to_numpy()
Y = df.iloc[:, -1].to_numpy()

In [36]:
# Replace the two text columns with integer codes (0, 1, 2, ...).
# Column 6 holds Color and column 16 holds Source. One encoder is kept per
# column (le1, le2) so the same mapping can be applied to new samples later.
le1 = LabelEncoder().fit(X[:, 6])
X[:, 6] = le1.transform(X[:, 6])

le2 = LabelEncoder().fit(X[:, 16])
X[:, 16] = le2.transform(X[:, 16])

# Show the first encoded row as a sanity check.
X[0]


array([7.004799273, 6.13e-06, 7.114755278, 120.5277688, 4.62e-168,
       1.564359234, 3, 0.613997908, 1.758450685, 0.255472008, 2.092090468,
       120.745502, 241.4468855, 3.099393646, 0.044697746, 257.7175114, 3,
       22.90091727, 54.31051792, 7.0, 6.0], dtype=object)

In [37]:
# One-hot encode the two label-encoded categorical columns (0/1 indicator columns).
# A ColumnTransformer puts the encoded columns first and appends the untouched
# ("passthrough") columns after them in their original order.

# Step 1: Color (column 6).
ct1 = ColumnTransformer(transformers=[('encode', OneHotEncoder(), [6])], remainder='passthrough')
X = ct1.fit_transform(X)

# Step 2: Source. It was column 16 originally; after step 1 it sits behind the
# Color indicator columns and one slot lower because Color was removed from the
# passthrough part, i.e. at n_color_categories + 15. Deriving the index from the
# fitted encoder (instead of hard-coding 20, which is only right for exactly
# 5 colours) prevents silently one-hot encoding a numeric column when the set
# of colours in the data changes.
# NOTE(review): the scaling and single-sample cells still slice at a fixed
# column 13, so they assume the same total number of indicator columns.
n_color_categories = len(ct1.named_transformers_['encode'].categories_[0])
source_col = n_color_categories + 15
ct2 = ColumnTransformer(transformers=[('encode', OneHotEncoder(), [source_col])], remainder='passthrough')
X = ct2.fit_transform(X)

X

array([[0.0, 0.0, 0.0, ..., 54.31051792, 7.0, 6.0],
       [0.0, 1.0, 0.0, ..., 72.01686324, 6.0, 1.0],
       [0.0, 0.0, 0.0, ..., 74.40050714, 6.0, 14.0],
       ...,
       [1.0, 0.0, 0.0, ..., 44.94757749, 16.0, 17.0],
       [0.0, 0.0, 1.0, ..., 50.26662387, 3.0, 9.0],
       [1.0, 0.0, 0.0, ..., 20.89056308, 16.0, 0.0]], dtype=object)

In [38]:
# Hold out 20 % of the rows for evaluation; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=1,
)

# Preview the training features.
x_train

array([[0.0, 0.0, 0.0, ..., 25.80560754, 4.0, 17.0],
       [0.0, 0.0, 0.0, ..., 45.31816114, 16.0, 8.0],
       [0.0, 0.0, 0.0, ..., 75.71125799, 16.0, 19.0],
       ...,
       [0.0, 0.0, 0.0, ..., 60.3323639, 19.0, 8.0],
       [0.0, 0.0, 0.0, ..., 45.24376613, 21.0, 21.0],
       [0.0, 0.0, 0.0, ..., 36.67733054, 1.0, 7.0]], dtype=object)

In [39]:
# Standardize the feature columns from index 13 onward; columns 0-12 are left as they are.
# NOTE(review): 13 presumably equals the number of one-hot indicator columns created above - confirm if the category sets change.

sc = StandardScaler()  # rescales each column to zero mean / unit variance; the result is not bounded to a fixed range
x_train[:, 13:] = sc.fit_transform(x_train[:, 13:])  # scaling statistics are learned from the training rows only
x_test[:, 13:] = sc.transform(x_test[:, 13:])  # test rows reuse the statistics learned from the training rows

print("X TRAIN", x_train[0])
print("Y TRAIN", y_train)

X TRAIN [0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.08964742005388752
 0.22250950554342272 0.15996135975192427 -0.6381875866222126
 -0.042288407823216426 -0.8072420424603828 0.06677454259727253
 1.3226740845347544 -0.8490761665366783 0.9445235266694276
 -0.7634630583229903 -0.1073774995704066 -0.46782054283570396
 -0.20849567405751046 -1.018199837239626 -0.5522348960288287
 -1.8997789033146155 -1.3448568906243474 0.804588997837725]
Y TRAIN [0. 0. 0. ... 0. 0. 0.]


In [45]:
from sklearn.metrics import f1_score, precision_score, recall_score

# Load the previously saved model from disk.
# NOTE(review): joblib files are pickle-based - only load model files from a trusted source.
classifier = joblib.load('trained_RDF_model_3.joblib')

# Predict the held-out rows and show the raw predictions.
y_pred_rdf = classifier.predict(x_test)
print(y_pred_rdf)

# Compare the predictions (second argument) with the true labels (first argument).
f1 = f1_score(y_test, y_pred_rdf)
precision = precision_score(y_test, y_pred_rdf)
recall = recall_score(y_test, y_pred_rdf)

for label, value in (("F1 Score:", f1), ("Precision:", precision), ("Recall:", recall)):
    print(label, value)

[1. 0. 0. ... 1. 0. 0.]
F1 Score: 0.7731525911708254
Precision: 0.6480969380059329
Recall: 0.9580081753994798


### Create the Logistic Regression classification model

In [None]:
# lr_classifier = LogisticRegression(random_state=0)
# lr_classifier.fit(x_train, y_train)

In [None]:
# save the trained model
# joblib.dump(lr_classifier, 'trained_LR_model.joblib')

In [None]:
# y_pred_lr = lr_classifier.predict(x_test)

In [None]:
# np.set_printoptions(precision=2)
# print(np.concatenate([y_pred_lr.reshape(len(y_pred_lr), 1), y_test.reshape(len(y_test), 1)], axis=1))

In [None]:
# confMatrix = confusion_matrix(y_test, y_pred_lr)
# print(confMatrix)

In [None]:
# lr_model_accuracy = accuracy_score(y_test, y_pred_lr)
# print(lr_model_accuracy)

### Create the Support Vector Machine classification model

In [None]:
# support vector classifier
# svm_classifier = SVC(kernel='linear', random_state=0)  # default is rbf
# svm_classifier.fit(x_train, y_train)

In [None]:
# save the trained model
# joblib.dump(svm_classifier, 'trained_SVM_model.joblib')

In [None]:
# y_pred_svm = svm_classifier.predict(x_test)

In [None]:
# confMatrix = confusion_matrix(y_test, y_pred_svm)
# print(confMatrix)

In [None]:
# svm_model_accuracy = accuracy_score(y_test, y_pred_svm)
# print(svm_model_accuracy)

### Create the K-Nearest Neighbour classification model

In [None]:
# knn_classifier = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)  # classic euclidean distance
# knn_classifier.fit(x_train, y_train)

In [None]:
# save the trained model
# joblib.dump(knn_classifier, 'trained_KNN_model.joblib')

In [None]:
# y_pred_knn = knn_classifier.predict(x_test)

In [None]:
# knn_model_accuracy = accuracy_score(y_test, y_pred_knn)
# print(knn_model_accuracy)

### Create the decision tree classification model

In [None]:
# from sklearn.tree import DecisionTreeClassifier

# dtree_classifier = DecisionTreeClassifier(criterion='gini', random_state=0)
# dtree_classifier.fit(x_train, y_train)

In [None]:
# save the trained model
# joblib.dump(dtree_classifier, 'trained_DTR_model.joblib')

In [None]:
# y_pred_dtree = dtree_classifier.predict(x_test)

In [None]:
# dtc_model_accuracy = accuracy_score(y_test, y_pred_dtree)
# print(dtc_model_accuracy)

### Create the random forest classification model

In [None]:
from sklearn.ensemble import RandomForestClassifier
# rfc_classifier = RandomForestClassifier(n_estimators = 100, criterion='entropy', random_state = 0)
# rfc_classifier.fit(x_train, y_train)

In [None]:
# save the trained model
# joblib.dump(rfc_classifier, 'trained_RDF_model_3.joblib')

In [None]:
# y_pred_rdf = rfc_classifier.predict(x_test)

In [None]:
# rfc_model_accuracy = accuracy_score(y_test, y_pred_rdf)
# print(rfc_model_accuracy)

# Accuracy

# Dropping every row that has a missing value - 0.8642686209991026
# Dropping rows with missing categorical values and replacing the other missing values with the median - 0.87615241140421

### Preprocess the input data

In [None]:
# Reference sample with a pH of 7.66857169, kept for comparison:
# values = [7.66857169, 7.06e-08, 7.541255359, 198.1312335, 1.31e-95,
#        0.767179279, 'Colorless', 0.137766996, 1.008886456, 2.391833449,
#        0.750761234, 148.9474344, 242.7039915, 3.709734571, 2.301398715,
#        100.9851033, 'Stream', 9.674425593, 35.25315137, 12.0, 1.0]

# One raw sample laid out like a training row before encoding:
# 21 values, with the colour text at position 6 and the source text at position 16.
values = [1, 7.06e-08, 7.541255359, 198.1312335, 1.31e-95,
       0.767179279, 'Colorless', 0.137766996, 1.008886456, 2.391833449,
       0.750761234, 148.9474344, 242.7039915, 3.709734571, 2.301398715,
       100.9851033, 'Stream', 9.674425593, 35.25315137, 12.0, 1.0]

# Load the saved model that will score the sample.
classifier = joblib.load('trained_RDF_model_3.joblib')

# Turn the list into a single-row NumPy array (via a one-row DataFrame).
input_data = pd.DataFrame([values]).values

# Re-apply the fitted preprocessing in the same order as for the training data.
# 1) integer-code the two text columns with their fitted label encoders
for column, encoder in ((6, le1), (16, le2)):
    input_data[:, column] = encoder.transform(input_data[:, column])

# 2) one-hot encode with the two fitted column transformers, first ct1 then ct2
for transformer in (ct1, ct2):
    input_data = transformer.transform(input_data)

# 3) scale the columns from index 13 onward with the fitted scaler
input_data[:, 13:] = sc.transform(input_data[:, 13:])

# print(input_data)

# Predict and show the class of the single sample.
y_single = classifier.predict(input_data)

print(y_single[0])

In [None]:
# y_single = rfc_classifier.predict(input_data)

# print(y_single[0])

### Check overfitting

In [None]:
# Score the classifier on increasing amounts of training data, using 5-fold
# cross-validation on x_train / y_train. The "test" scores returned here come
# from the held-out folds of that cross-validation.
train_sizes, train_scores, test_scores = learning_curve(
    estimator=classifier,
    X=x_train,
    y=y_train,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)

# Average over the folds so there is one value per training-set size.
train_scores_mean = train_scores.mean(axis=1)
test_scores_mean = test_scores.mean(axis=1)

# Draw both curves on one set of axes.
fig, ax = plt.subplots(figsize=(10, 6))
ax.set_title("Learning Curve")
ax.set_xlabel("Training Examples")
ax.set_ylabel("Accuracy")
ax.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training Accuracy")
ax.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Test Accuracy")
ax.legend(loc="best")
ax.grid()
plt.show()

## CONCLUSIONS

In [None]:
# print("ACCURACY SCORES OF EACH MODEL\n")

# print("Logistic Regression Classifier\t", round(lr_model_accuracy * 100, 2))
# print("SVM Classifier\t\t\t", round(svm_model_accuracy * 100, 2))
# print("K-NN Classifier\t\t\t", round(knn_model_accuracy * 100, 2))
# print("Decision Tree Classifier\t", round(dtc_model_accuracy * 100, 2))
# print("Random Forest Classifier\t", round(rfc_model_accuracy * 100, 2))