In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import joblib

In [6]:
df = pd.read_csv('Water Quality Prediction.csv')
df.head()

Unnamed: 0,Index,pH,Iron,Nitrate,Chloride,Lead,Zinc,Color,Turbidity,Fluoride,...,Chlorine,Manganese,Total Dissolved Solids,Source,Water Temperature,Air Temperature,Month,Day,Time of Day,Potability
0,0,8.332988,8.3e-05,8.605777,122.799772,3.71e-52,3.434827,Colorless,0.022683,0.607283,...,3.708178,2.27e-15,332.118789,,,43.493324,January,29.0,4.0,0
1,1,6.917863,8.1e-05,3.734167,227.029851,7.85e-94,1.245317,Faint Yellow,0.019007,0.622874,...,3.292038,8.02e-07,284.641984,Lake,15.348981,71.220586,November,26.0,16.0,0
2,2,5.443762,0.020106,3.816994,230.99563,5.29e-76,0.52828,Light Yellow,0.319956,0.423423,...,3.560224,0.07007989,570.054094,River,11.643467,44.89133,January,31.0,8.0,0
3,3,7.955339,0.143988,8.224944,178.12994,4e-176,4.027879,Near Colorless,0.166319,0.208454,...,3.516907,0.02468295,100.043838,Ground,10.092392,60.843233,April,1.0,21.0,0
4,4,8.091909,0.002167,9.925788,186.540872,4.17e-132,3.807511,Light Yellow,0.004867,0.222912,...,3.177849,0.003296139,168.075545,Spring,15.249416,69.336671,June,29.0,7.0,0


In [7]:
# get a sample of 1000 rows
df = df.sample(n=100000, random_state=42)

In [None]:
df.isna().sum()

In [None]:
df.shape

In [8]:
# remove all rows with Nan values

df.dropna(inplace=True) # modifies the dataframe in place

In [None]:
# check for duplicate rows
duplicate_rows = df[df.duplicated()]
if duplicate_rows.count().sum() == 0:
   print("No duplicate rowa")
else:
   print("Duplicate rows are present")

In [9]:
# split the dataset into x and y
X = df.drop(['Potability', 'Month', 'Day', 'Time of Day', 'Index'], axis=1) # axis=1 indicates we are dropping a column, not a row
Y = df['Potability']
X

Unnamed: 0,pH,Iron,Nitrate,Chloride,Lead,Zinc,Color,Turbidity,Fluoride,Copper,Odor,Sulfate,Conductivity,Chlorine,Manganese,Total Dissolved Solids,Source,Water Temperature,Air Temperature
781974,7.004799,6.130000e-06,7.114755,120.527769,4.620000e-168,1.564359,Near Colorless,0.613998,1.758451,0.255472,2.092090,120.745502,241.446886,3.099394,4.469775e-02,257.717511,Reservoir,22.900917,54.310518
937737,8.299823,1.845755e-03,4.713490,220.284903,9.040000e-120,2.196456,Faint Yellow,1.543039,4.341496,0.316341,3.077392,228.707208,282.409585,3.749201,8.410000e-05,92.378364,Ground,64.103574,72.016863
907828,8.077128,1.998475e-03,6.999236,157.332074,1.740000e-137,1.900052,Faint Yellow,0.363389,0.537449,0.032343,1.306127,136.219129,214.876158,2.215031,1.954040e-04,187.093504,Well,26.600483,74.400507
784628,7.813995,1.145000e-03,6.168141,200.820979,1.970000e-06,4.961851,Near Colorless,0.731114,0.073730,0.700787,0.440061,72.935525,342.590598,3.030572,4.894026e-02,334.951667,Ground,16.434954,98.879709
662460,6.691067,5.068609e-01,8.280426,143.161413,7.280000e-111,5.546948,Colorless,0.026614,1.532804,0.161636,2.780277,300.992636,291.962088,3.105734,3.490000e-18,211.253831,Lake,12.467716,46.854295
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
914455,8.120564,1.170000e-09,7.355084,96.230006,2.890000e-08,0.810747,Colorless,0.002375,0.373423,0.006626,1.416142,188.546064,405.156118,3.177922,1.552390e-04,481.708343,Ground,46.682743,68.763669
80809,8.318530,4.287108e-03,6.326976,72.114445,6.400000e-243,1.071254,Colorless,0.332185,1.934085,0.677406,3.186319,60.223525,407.393556,3.182622,7.210000e-10,330.291977,Aquifer,11.003228,75.988999
671949,5.339506,4.004972e+00,2.171121,190.990878,8.620000e-129,1.982674,Faint Yellow,0.031429,0.516413,0.012284,4.122576,90.129952,577.525383,3.218093,5.987480e-01,320.506649,Spring,27.909774,18.998035
499176,7.255511,1.295289e-01,3.661624,127.477024,1.000000e-06,1.755221,Near Colorless,0.000847,1.630685,0.052678,0.111688,159.647437,521.888509,2.118294,1.900254e-03,28.497639,Aquifer,8.461737,40.424152


In [10]:
cols = X.columns
# cols = cols.to_list()

categorical_cols = ['Color', 'Source']

# Create a new list that contains only non-string elements
new_list = [item for item in cols if item not in categorical_cols]
new_list

['pH',
 'Iron',
 'Nitrate',
 'Chloride',
 'Lead',
 'Zinc',
 'Turbidity',
 'Fluoride',
 'Copper',
 'Odor',
 'Sulfate',
 'Conductivity',
 'Chlorine',
 'Manganese',
 'Total Dissolved Solids',
 'Water Temperature',
 'Air Temperature']

In [11]:
# we need x, y values as numpy arrays
X = df.iloc[:, 1:-4].values
Y = df.iloc[:, -1].values

X

array([[7.004799273, 6.13e-06, 7.114755278, ..., 'Reservoir',
        22.90091727, 54.31051792],
       [8.299822905, 0.001845755, 4.713490289, ..., 'Ground',
        64.10357372, 72.01686324],
       [8.077127655, 0.001998475, 6.999235945, ..., 'Well', 26.60048284,
        74.40050714],
       ...,
       [5.33950555, 4.004971858, 2.171120684, ..., 'Spring', 27.90977427,
        18.99803523],
       [7.255510881, 0.129528938, 3.661623969, ..., 'Aquifer',
        8.461736843, 40.42415154],
       [7.063004388, 0.04632639, 8.890720149, ..., 'River', 10.47610764,
        24.28328988]], dtype=object)

In [12]:
# Label Encode categorical values (1, 2, 3 ... values)
le1 = LabelEncoder()
X[:, 6] = le1.fit_transform(X[:, 6])

le2 = LabelEncoder()
X[:, 16] = le2.fit_transform(X[:, 16])
X[0]


array([7.004799273, 6.13e-06, 7.114755278, 120.5277688, 4.62e-168,
       1.564359234, 3, 0.613997908, 1.758450685, 0.255472008, 2.092090468,
       120.745502, 241.4468855, 3.099393646, 0.044697746, 257.7175114, 3,
       22.90091727, 54.31051792], dtype=object)

In [13]:
# Column transform categorical columns (0, 1, 0 ...)
ct1 = ColumnTransformer(transformers=[('encode', OneHotEncoder(), [6])], remainder='passthrough')
X = ct1.fit_transform(X)

ct2 = ColumnTransformer(transformers=[('encode', OneHotEncoder(), [20])], remainder='passthrough')
X = ct2.fit_transform(X)


In [14]:
# Splitting the data set
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=1)
x_train

array([[0.0, 0.0, 0.0, ..., 425.254835, 8.76875878, 53.03647794],
       [0.0, 0.0, 0.0, ..., 157.5866344, 18.75635626, 49.39815037],
       [0.0, 0.0, 0.0, ..., 314.171592, 7.330355678, 44.73654664],
       ...,
       [0.0, 1.0, 0.0, ..., 402.6288004, 8.976191009, 36.22737099],
       [0.0, 0.0, 0.0, ..., 201.9966985, 16.44010914, 51.02353447],
       [0.0, 1.0, 0.0, ..., 399.1981209, 13.58481817, 55.4285618]],
      dtype=object)

In [15]:
# Normalize input values

sc = StandardScaler()  # range: -3 to +3
x_train[:, 13:] = sc.fit_transform(x_train[:, 13:])  # 0,1,2 are dummy variables
x_test[:, 13:] = sc.transform(x_test[:, 13:])

print("X TRAIN", x_train[0])
print("Y TRAIN", y_train)

X TRAIN [0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 -2.026182568461654
 -0.2529507762163661 -0.5797267348310291 2.314937586419409
 -0.04106201582840569 -0.8088429221020856 -0.5497859234001802
 0.42247855341433305 -0.46977029185120317 -1.5607185071376262
 1.8026731972032313 -1.1509555181761484 -0.337940235403826
 0.09338824401324686 1.0293839305850232 -0.8996875109646076
 -0.3884205285232413]
Y TRAIN [1 1 1 ... 0 0 0]


### Create the Logistic Regression classification model

In [None]:
lr_classifier = LogisticRegression(random_state=0)
lr_classifier.fit(x_train, y_train)

In [None]:
# save the trained model
joblib.dump(lr_classifier, 'trained_LR_model.joblib')

In [None]:
y_pred_lr = lr_classifier.predict(x_test)

In [None]:
# np.set_printoptions(precision=2)
# print(np.concatenate([y_pred_lr.reshape(len(y_pred_lr), 1), y_test.reshape(len(y_test), 1)], axis=1))

In [None]:
# confMatrix = confusion_matrix(y_test, y_pred_lr)
# print(confMatrix)

In [None]:
lr_model_accuracy = accuracy_score(y_test, y_pred_lr)
print(lr_model_accuracy)

### Create the Support Vector Machine classification model

In [None]:
# support vector classifier
svm_classifier = SVC(kernel='linear', random_state=0)  # default is rbf
svm_classifier.fit(x_train, y_train)

In [None]:
# save the trained model
joblib.dump(svm_classifier, 'trained_SVM_model.joblib')

In [None]:
y_pred_svm = svm_classifier.predict(x_test)

In [None]:
# confMatrix = confusion_matrix(y_test, y_pred_svm)
# print(confMatrix)

In [None]:
svm_model_accuracy = accuracy_score(y_test, y_pred_svm)
print(svm_model_accuracy)

### Create the K-Nearest Neighbour classification model

In [None]:
knn_classifier = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)  # classic euclidean distance
knn_classifier.fit(x_train, y_train)

In [None]:
# save the trained model
joblib.dump(knn_classifier, 'trained_KNN_model.joblib')

In [None]:
y_pred_knn = knn_classifier.predict(x_test)

In [None]:
# confMatrix = confusion_matrix(y_test, y_pred_knn)
# print(confMatrix)

In [None]:
knn_model_accuracy = accuracy_score(y_test, y_pred_knn)
print(knn_model_accuracy)

### Create the decision tree classification model

In [None]:
from sklearn.tree import DecisionTreeClassifier

dtree_classifier = DecisionTreeClassifier(criterion='gini', random_state=0)
dtree_classifier.fit(x_train, y_train)

In [None]:
# save the trained model
joblib.dump(dtree_classifier, 'trained_DTR_model.joblib')

In [None]:
y_pred_dtree = dtree_classifier.predict(x_test)

In [None]:
# confMatrix = confusion_matrix(y_test, y_pred_dtree)
# print(confMatrix)

In [None]:
dtc_model_accuracy = accuracy_score(y_test, y_pred_dtree)
print(dtc_model_accuracy)

### Create the random forest classification model

In [None]:
from sklearn.ensemble import RandomForestClassifier
rfc_classifier = RandomForestClassifier(n_estimators = 100, criterion='entropy', random_state = 0)
rfc_classifier.fit(x_train, y_train)

In [None]:
# save the trained model
joblib.dump(rfc_classifier, 'trained_RDF_model.joblib')

In [None]:
y_pred_rdf = rfc_classifier.predict(x_test)

In [None]:
# confMatrix = confusion_matrix(y_test, y_pred_rdf)
# print(confMatrix)

In [None]:
rfc_model_accuracy = accuracy_score(y_test, y_pred_rdf)
print(rfc_model_accuracy)

In [22]:
# Feature importance for RDF model
X_original_1 = ct1.named_transformers_['encode'].inverse_transform(X[:, :5])

# rfc_model = joblib.load('trained_RDF_model.joblib')


# # Get feature importances
# feature_importances = rfc_model.feature_importances_
# print(feature_importances)

# # Create a DataFrame to store feature names and their importances
# importance_df = pd.DataFrame({'Feature': cols[1:-4], 'Importance': feature_importances})


# # Sort the DataFrame by importance in descending order
# importance_df = importance_df.sort_values(by='Importance', ascending=False)

# # Print or visualize the feature importances
# print(importance_df)

ValueError: Samples [    2    10    12 ... 66849 66850 66855] can not be inverted when drop=None and handle_unknown='error' because they contain all zeros

### Preprocess the input data

In [None]:
values = [8.510801988, 9.16E-05, 5.920902064, 304.4845891, 3.60E-07, 1.635760979, 'Faint Yellow', 3.739693, 0.559295096, 0.880587373, 3.965759996, 62.38685835, 580.4796606, 3.84064004, 2.00E-09, 346.8499604, 'Reservoir', 12.80967626, 61.24561392]

# Convert the input values to a DataFrame
input_data = pd.DataFrame([values])

# Preprocess the input data
input_data = input_data.values  # Convert to NumPy array

# label encode the values
input_data[:, 6] = le1.transform(input_data[:, 6])
input_data[:, 16] = le2.transform(input_data[:, 16])

# column transform the values
input_data = ct1.transform(input_data)
input_data = ct2.transform(input_data)

# scale the values
input_data[:, 13:] = sc.transform(input_data[:, 13:])

print(input_data)

In [None]:
y_single = rfc_classifier.predict(input_data)

print(y_single)

## CONCLUSIONS

In [None]:
# print("ACCURACY SCORES OF EACH MODEL\n")

# print("Logistic Regression Classifier\t", round(lr_model_accuracy * 100, 2))
# print("SVM Classifier\t\t\t", round(svm_model_accuracy * 100, 2))
# print("K-NN Classifier\t\t\t", round(knn_model_accuracy * 100, 2))
# print("Decision Tree Classifier\t", round(dtc_model_accuracy * 100, 2))
# print("Random Forest Classifier\t", round(rfc_model_accuracy * 100, 2))