Importing the Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split as tts
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

Loading the Dataset

In [None]:
# Read the water-quality measurements from the CSV that sits next to the notebook.
df = pd.read_csv(filepath_or_buffer='water_potability.csv')
# Show the first five rows as the cell's rendered output.
df.head(n=5)

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,,204.890455,20791.318981,7.300212,368.516441,564.308654,10.379783,86.99097,2.963135,0
1,3.71608,129.422921,18630.057858,6.635246,,592.885359,15.180013,56.329076,4.500656,0
2,8.099124,224.236259,19909.541732,9.275884,,418.606213,16.868637,66.420093,3.055934,0
3,8.316766,214.373394,22018.417441,8.059332,356.886136,363.266516,18.436524,100.341674,4.628771,0
4,9.092223,181.101509,17978.986339,6.5466,310.135738,398.410813,11.558279,31.997993,4.075075,0


Data Pre-processing

In [None]:
# Boolean mask flagging rows that exactly repeat an earlier row in every column.
duplicate_mask = df.duplicated()
# Summing the mask counts the flagged rows.
dup = duplicate_mask.sum()
print('Any Duplicate Value:', dup)

Any Duplicate Value: 0


In [None]:
# Missing-value count per column (isna is the same method as isnull).
df.isna().sum()

ph                 491
Hardness             0
Solids               0
Chloramines          0
Sulfate            781
Conductivity         0
Organic_carbon       0
Trihalomethanes    162
Turbidity            0
Potability           0
dtype: int64

In [None]:
# Replace missing readings in the three incomplete columns with that column's mean.
#
# The filled Series is assigned back to the frame rather than calling
# fillna(inplace=True) on df[column]: in-place methods on a column selection are
# chained assignment, which pandas 2.x deprecates and which leaves df unchanged
# once Copy-on-Write is active.
#
# NOTE(review): the means are computed over the whole dataset, including rows
# that the later train/test split places in the test set.
for column in ("ph", "Sulfate", "Trihalomethanes"):
    df[column] = df[column].fillna(df[column].mean())

In [None]:
# Re-count missing values per column to confirm the imputation left none behind.
df.isna().sum()

ph                 0
Hardness           0
Solids             0
Chloramines        0
Sulfate            0
Conductivity       0
Organic_carbon     0
Trihalomethanes    0
Turbidity          0
Potability         0
dtype: int64

In [None]:
# Feature matrix: every column except the label.
# `columns=` states the intent directly; the previous `axis=True` only selected
# the column axis because True compares equal to 1.
x = df.drop(columns=['Potability'])
# Target vector: the 0/1 potability label.
y = df['Potability']

Splitting the Data into Train and Test Sets

In [None]:
# Hold out 32% of the rows for evaluation; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = tts(
    x,
    y,
    test_size=0.32,
    random_state=50,
)

In [None]:
# Summary statistics of the full feature matrix (computed on x, not on the train split).
x.describe()

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity
count,3276.0,3276.0,3276.0,3276.0,3276.0,3276.0,3276.0,3276.0,3276.0
mean,7.080795,196.369496,22014.092526,7.122277,333.775777,426.205111,14.28497,66.396293,3.966786
std,1.469956,32.879761,8768.570828,1.583085,36.142612,80.824064,3.308162,15.769881,0.780382
min,0.0,47.432,320.942611,0.352,129.0,181.483754,2.2,0.738,1.45
25%,6.277673,176.850538,15666.690297,6.127421,317.094638,365.734414,12.065801,56.647656,3.439711
50%,7.080795,196.967627,20927.833607,7.130299,333.775777,421.884968,14.218338,66.396293,3.955028
75%,7.87005,216.667456,27332.762127,8.114887,350.385756,481.792304,16.557652,76.666609,4.50032
max,14.0,323.124,61227.196008,13.127,481.030642,753.34262,28.3,124.0,6.739


In [None]:
# Summary statistics of the label column; its mean is the share of rows labelled 1.
y.describe()

count    3276.000000
mean        0.390110
std         0.487849
min         0.000000
25%         0.000000
50%         0.000000
75%         1.000000
max         1.000000
Name: Potability, dtype: float64

Random Forest Classifier

In [None]:
# Baseline random forest with default hyperparameters.
# A fixed random_state makes the bootstrap samples and feature sub-sampling
# repeatable, so the metrics reported below survive a kernel restart.
RFC = RandomForestClassifier(random_state=50)
RFC.fit(x_train, y_train)
# Class predictions for the held-out rows.
y_RFC = RFC.predict(x_test)

In [None]:
# Fraction of held-out rows classified correctly.
# Arguments follow scikit-learn's (y_true, y_pred) order; accuracy is symmetric,
# so the value is unchanged, but the order now matches the other metric calls.
Acc_rfc = accuracy_score(y_test, y_RFC)
print(Acc_rfc)

0.6587225929456625


In [None]:
# Per-class precision / recall / F1 on the held-out rows.
# classification_report expects (y_true, y_pred). With the arguments reversed,
# precision and recall are swapped and "support" counts predictions instead of
# true labels.
print(classification_report(y_test, y_RFC))

              precision    recall  f1-score   support

           0       0.88      0.66      0.76       847
           1       0.31      0.63      0.42       202

    accuracy                           0.66      1049
   macro avg       0.60      0.65      0.59      1049
weighted avg       0.77      0.66      0.69      1049



In [None]:
# Rows are true classes, columns are predicted classes.
confusion_matrix(y_true=y_test, y_pred=y_RFC)

array([[563,  74],
       [284, 128]])

AutoML Classifier

In [None]:
import autosklearn.classification as classifier

# auto-sklearn time limits are expressed in seconds: 520 s for the whole search,
# at most 40 s for any single candidate model.
ac = classifier.AutoSklearnClassifier(
    time_left_for_this_task=520,
    per_run_time_limit=40,
)

In [None]:
# Run the AutoML search on the training split, within the time limits set when `ac` was constructed.
ac.fit(x_train,y_train)

AutoSklearnClassifier(ensemble_class=<class 'autosklearn.ensembles.ensemble_selection.EnsembleSelection'>,
                      per_run_time_limit=40, time_left_for_this_task=520)

In [None]:
# Class predictions of the fitted AutoML model for the held-out rows.
ac_pred = ac.predict(x_test)

AutoML Evaluation and Random Forest Hyperparameter Tuning

In [None]:
from sklearn import (
    model_selection,
    preprocessing,
    linear_model,
    naive_bayes,
    metrics,
    svm,
)

# Held-out accuracy of the AutoML model, passed as (true labels, predictions).
metrics.accuracy_score(y_test, ac_pred)

0.6549094375595805

In [None]:
# Hyperparameter search space, keyed by a display name.
# Each entry holds the estimator to tune ("model") and the candidate values
# RandomizedSearchCV samples from ("params").
model_params = {
    "Random Forest": {
        # Seeded so each sampled configuration gives the same score on re-run.
        "model": RandomForestClassifier(random_state=50),
        "params": {
            "n_estimators": [10, 50, 100],
            # "auto" is dropped: for classifiers it was an alias of "sqrt", and
            # scikit-learn 1.3 removed it, so every candidate using it fails to
            # fit. None (consider all features) takes its place as a third option.
            "max_features": ["sqrt", "log2", None],
            # Depths 1, 4, 7, ..., 19.
            "max_depth": list(range(1, 21, 3)),
        },
    },
}

In [None]:
from sklearn.model_selection import RandomizedSearchCV, RepeatedStratifiedKFold, train_test_split

# 5-fold stratified CV repeated twice (10 fits per candidate). The seed fixes
# the fold assignment so scores are comparable between runs.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=50)

# One [model name, best parameters, best mean CV score] entry per tuned model.
scores = []
for model_name, params in model_params.items():
    # Sample 10 configurations from the grid; seeding the sampler makes the
    # chosen configurations, and therefore the winner, reproducible.
    rs = RandomizedSearchCV(
        params["model"],
        params["params"],
        cv=cv,
        n_iter=10,
        random_state=50,
    )
    rs.fit(x_train, y_train)
    scores.append([model_name, dict(rs.best_params_), rs.best_score_])

In [None]:
# Display the collected [model name, best parameters, best CV score] entries.
scores

[['Random Forest',
  {'n_estimators': 100, 'max_features': 'log2', 'max_depth': 16},
  0.6679387312944022]]

In [None]:
import joblib

# Serialise the fitted search object left in `rs` by the tuning loop.
# joblib.dump returns the list of files it wrote, which the cell displays.
joblib.dump(value=rs, filename="model.pkl")

['model.pkl']