### Water Potability

In [21]:
# import modules
import pandas as pd

# train-test
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC  # suppport vector machine
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

Data Processing

In [6]:
df = pd.read_csv('../datasets/water_potability.csv')

df.head()

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,,204.890455,20791.318981,7.300212,368.516441,564.308654,10.379783,86.99097,2.963135,0
1,3.71608,129.422921,18630.057858,6.635246,,592.885359,15.180013,56.329076,4.500656,0
2,8.099124,224.236259,19909.541732,9.275884,,418.606213,16.868637,66.420093,3.055934,0
3,8.316766,214.373394,22018.417441,8.059332,356.886136,363.266516,18.436524,100.341674,4.628771,0
4,9.092223,181.101509,17978.986339,6.5466,310.135738,398.410813,11.558279,31.997993,4.075075,0


In [7]:
df.shape

(3276, 10)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3276 entries, 0 to 3275
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ph               2785 non-null   float64
 1   Hardness         3276 non-null   float64
 2   Solids           3276 non-null   float64
 3   Chloramines      3276 non-null   float64
 4   Sulfate          2495 non-null   float64
 5   Conductivity     3276 non-null   float64
 6   Organic_carbon   3276 non-null   float64
 7   Trihalomethanes  3114 non-null   float64
 8   Turbidity        3276 non-null   float64
 9   Potability       3276 non-null   int64  
dtypes: float64(9), int64(1)
memory usage: 256.0 KB


In [9]:
df.isnull().sum()

ph                 491
Hardness             0
Solids               0
Chloramines          0
Sulfate            781
Conductivity         0
Organic_carbon       0
Trihalomethanes    162
Turbidity            0
Potability           0
dtype: int64

In [10]:
# fill missing values
df["ph"].fillna(value = df["ph"].mean(), inplace = True)
df["Sulfate"].fillna(value = df["Sulfate"].mean(), inplace = True)
df["Trihalomethanes"].fillna(value = df["Trihalomethanes"].mean(), inplace = True)

In [11]:
df.isnull().sum()

ph                 0
Hardness           0
Solids             0
Chloramines        0
Sulfate            0
Conductivity       0
Organic_carbon     0
Trihalomethanes    0
Turbidity          0
Potability         0
dtype: int64

Ml Training

In [19]:
X = df.drop('Potability',axis=1)
Y = df['Potability']

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33)

In [30]:
print("x - train shape:",X_train.shape)
print("x - test shape:",X_test.shape)
print("y - train shape:",y_train.shape)
print("y - test shape:",y_test.shape)

x - train shape: (2194, 9)
x - test shape: (1082, 9)
y - train shape: (2194,)
y - test shape: (1082,)


In [29]:
# normalizing values 
min_max_scale = MinMaxScaler()
X_train = min_max_scale.fit_transform(X_train)
X_test = min_max_scale.fit_transform(X_test)

array([[0.50577104, 0.15813584, 0.8132832 , ..., 0.3258781 , 0.33895476,
        0.57834489],
       [0.5679608 , 0.45007002, 0.09648588, ..., 0.74217904, 0.64121875,
        0.51017301],
       [0.50791751, 0.56873263, 0.43606198, ..., 0.61251473, 0.69645632,
        0.89243596],
       ...,
       [0.50577104, 0.48358899, 0.23536226, ..., 0.29740754, 0.45897908,
        0.46544586],
       [0.50577104, 0.4820426 , 0.30574557, ..., 0.32114444, 0.45897908,
        0.50845248],
       [0.64477055, 0.73269368, 0.48845695, ..., 0.6003052 , 0.39068106,
        0.11913167]])

In [24]:
# various models
models = [('Logistic Reg.',LogisticRegression()),
          ('SVM',SVC()),
          ('Decision Tree',DecisionTreeClassifier()),
          ('Random Foresr',RandomForestClassifier())]

In [31]:
res = []

for name,model in models:

    model.fit(X_train,y_train)
    score = model.score(X_test,y_test)
    res.append((name,score))

res

[('Logistic Reg.', 0.6044362292051756),
 ('SVM', 0.634011090573013),
 ('Decision Tree', 0.532347504621072),
 ('Random Foresr', 0.6404805914972274)]

In [None]:
# Do Hyperparameter training Later...

In [33]:
# Saving model
import joblib
model = RandomForestClassifier().fit(X_train,y_train)
joblib.dump(model,'water_quality')

['water_quality']

In [34]:
mj = joblib.load('water_quality')
mj.score(X_test,y_test)

0.6201478743068392

In [36]:
#Predicitng results
res = [8.316766,214.373394,22018.417441,8.059332,356.886136,363.266516,18.436524,100.341674,4.628771]

mj.predict([res])[0]

1