In [135]:
import joblib
import numpy as np
import pandas as pd

from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix,accuracy_score


In [136]:
def label_solution(ph):
    """Classify a pH reading as acidic or basic.

    Returns 0 when ``ph`` is below 7 (acidic), 1 when it is above 7 (basic),
    and ``None`` otherwise -- i.e. for exactly 7, or for a value such as NaN
    where both comparisons are false.
    """
    if ph > 7:
        return 1   # Basic
    if ph < 7:
        return 0   # Acidic
    # Neither strictly above nor strictly below 7: no label.
    return None




In [137]:
def report(models,y_test):
    """Print a confusion matrix and accuracy score for each set of predictions.

    models: iterable of (name, predictions) pairs.
    y_test: true labels that every prediction set is scored against.
    """
    for entry in models:
        label, predicted = entry
        matrix = confusion_matrix(y_test, predicted)
        accuracy = accuracy_score(y_test, predicted)
        print(f"{label} Report\n{matrix}\n Accuracy:{accuracy}\n")

In [None]:
# NOTE(review): this list is never used in this cell, and "pH" does not match
# the "ph" column name used below -- confirm whether it was meant to drive the
# feature selection.
features = ["pH", "Hardness", "Solids", "Conductivity", "Sulfate"]

# Load the data, discard the original target, and derive the acidic/basic label
# from the measured pH. Rows without a pH reading cannot be labelled.
df = (
    pd.read_csv("water_potability.csv")
    .drop(columns=["Potability"])
    .dropna(subset=["ph"])
    .assign(**{"Acidic/Basic": lambda frame: frame["ph"].apply(label_solution)})
    # label_solution returns None when ph is exactly 7; such rows would leave
    # missing values in the target and break classifier fitting, so drop them.
    .dropna(subset=["Acidic/Basic"])
    .astype({"Acidic/Basic": int})
)

# "ph" is removed from the inputs because the label is computed directly from it.
x = df.drop(columns=["Acidic/Basic", "Chloramines", "Trihalomethanes", "Turbidity", "ph"])

# Fill remaining gaps in the feature columns with each column's median.
# NOTE(review): the imputer is fitted on all rows before the train/test split,
# so held-out rows contribute to the medians -- consider fitting it on the
# training partition only.
imputer = SimpleImputer(missing_values=np.nan, strategy="median")
x = imputer.fit_transform(x)
y = df["Acidic/Basic"]


In [139]:
# Hold out 20% of the rows for evaluation; random_state pins the shuffle.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both partitions.
sc = StandardScaler().fit(x_train)
x_train = sc.transform(x_train)
x_test = sc.transform(x_test)

In [140]:
# Logistic regression with default settings, fitted on the scaled training rows.
log_reg = LogisticRegression().fit(x_train, y_train)
log_pred = log_reg.predict(x_test)

# Mean accuracy on the held-out rows (shown as the cell output).
log_reg.score(x_test, y_test)

0.5745062836624776

In [141]:
# Support-vector classifier with an RBF kernel.
svm = SVC(kernel="rbf").fit(x_train, y_train)
svm_pred = svm.predict(x_test)

In [142]:
# Gaussian naive Bayes classifier.
bayes = GaussianNB().fit(x_train, y_train)
bayes_pred = bayes.predict(x_test)

In [143]:
# k-nearest-neighbours classifier: 5 neighbours, Euclidean distance.
knn = KNeighborsClassifier(n_neighbors=5, metric="euclidean").fit(x_train, y_train)
knn_pred = knn.predict(x_test)

In [144]:
# Decision tree; random_state is fixed so repeated runs build the same tree.
dt = DecisionTreeClassifier(random_state=0).fit(x_train, y_train)
dt_pred = dt.predict(x_test)


In [145]:
# Random forest of 150 trees. random_state is pinned (matching the split and
# the decision tree) so the bootstrap samples and feature draws -- and hence
# the reported accuracy -- are reproducible across re-runs.
rf = RandomForestClassifier(n_estimators=150, random_state=0)
rf.fit(x_train, y_train)
rf_pred = rf.predict(x_test)

# Mean accuracy on the held-out rows (shown as the cell output).
rf.score(x_test, y_test)

0.5493716337522442

In [146]:
# (display name, held-out predictions) for every fitted model, in report order.
prediction = (
    ["LogisticRegression", log_pred],
    ["SVM", svm_pred],
    ["Naive Bayes", bayes_pred],
    ["KNeighborsClassifier", knn_pred],
    ["Decision Tree", dt_pred],
    ["Random Forest", rf_pred],
)

In [147]:
# Print the confusion matrix and accuracy of each model against the held-out labels.
report(models=prediction, y_test=y_test)

LogisticRegression Report
[[132 153]
 [ 84 188]]
 Accuracy:0.5745062836624776

SVM Report
[[140 145]
 [ 89 183]]
 Accuracy:0.5798922800718133

Naive Bayes Report
[[147 138]
 [ 97 175]]
 Accuracy:0.578096947935368

KNeighborsClassifier Report
[[143 142]
 [116 156]]
 Accuracy:0.5368043087971275

Decision Tree Report
[[136 149]
 [134 138]]
 Accuracy:0.4919210053859964

Random Forest Report
[[143 142]
 [109 163]]
 Accuracy:0.5493716337522442



In [149]:
# The classifier was trained on imputed, standardized features, so the fitted
# imputer and scaler are saved too; without them the pickled model cannot be
# applied to raw measurements later.
joblib.dump({"imputer": imputer, "scaler": sc}, "acid_base_preprocessing.pkl")

# Persist the fitted logistic-regression model. Kept as the last expression so
# the cell still displays the list containing the model file name.
joblib.dump(log_reg, "acid_base.pkl")


['acid_base.pkl']