<a href="https://colab.research.google.com/github/MrSR3112/ml-final-project/blob/main/Quality_prediction_of_(Water).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Importing required libraries**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


import plotly
import plotly.express as px
import plotly.graph_objs as go
import plotly.offline as py
from plotly.offline import iplot
from plotly.subplots import make_subplots
import plotly.figure_factory as ff

import warnings
warnings.filterwarnings("ignore")

# **Exploratory Data Analysis (EDA)**

In [None]:
# Load the water-potability dataset from CSV and preview the first rows.
# The file location lives in one named constant so it can be changed in a
# single place when the notebook is run somewhere the '/content' folder
# does not exist.
DATA_PATH = '/content/water_potability.csv'

df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Dimensions of the dataset, shown as a (rows, columns) tuple

df.shape

In [None]:
# Data info: concise summary of the DataFrame (columns, non-null counts, dtypes)

df.info()

In [None]:
# Data description: summary statistics of the numeric columns

df.describe()

In [None]:
# Skewness of every feature column (the 'Potability' target is excluded)

features_only = df.drop(columns=['Potability'])
features_only.skew()

In [None]:
# One histogram per feature; the 'Potability' target column is left out.
# The trailing semicolon keeps the returned axes array out of the cell output.

feature_frame = df.drop(columns='Potability')
feature_frame.hist(figsize=(18, 10));

In [None]:
# Pairwise correlation between the columns of df (the target is not dropped here)

df.corr()

In [None]:
# Heat map of the correlation between each pair of columns.
# The correlation matrix is computed once and reused for both axes and for the
# cell values, instead of calling df.corr() three separate times.

corr_matrix = df.corr()
corr_labels = corr_matrix.columns.tolist()
fig = go.Figure(go.Heatmap(x=corr_labels, y=corr_labels, z=corr_matrix, colorscale='bluered'))
fig.show()

In [None]:
# Target variable: relative frequency of each 'Potability' value

df['Potability'].value_counts(normalize=True)


In [None]:
# Bar chart of the class balance of the target.
# value_counts() orders its result by frequency, so pairing it with a fixed
# label list only works while the first label happens to be the majority
# class. Sorting by the class value makes the pairing independent of the
# class proportions.
# NOTE(review): assumes the smaller 'Potability' code means "Not Potable"
# (e.g. 0 = not potable, 1 = potable) - confirm against the dataset.
class_share = df['Potability'].value_counts(normalize=True).sort_index()
fig = px.bar(x=['Not Potable', 'Potable'],
             y=class_share.values,
             labels={'x': 'Class', 'y': 'Share of samples'},
             title="Target Variable Value Count")
fig.show()

In [None]:
# Missing values: number of null entries in each column

df.isnull().sum()

In [None]:
# Share of missing entries per column, expressed in percent.
# isnull().mean() yields a fraction between 0 and 1; it is scaled by 100 so
# the plotted values agree with the "Percentage" wording of the axis label
# and the title.
missing_pct = df.isnull().mean() * 100
ax = missing_pct.plot.bar(figsize=(12, 6))
ax.set_ylabel('Percentage of missing values')
ax.set_xlabel('Features')
ax.set_title('Missing Data in Percentages');

In [None]:
# Fill the NaN entries of the three listed columns with a KNN imputer
# (default settings); the imputed values are written back into df.
# NOTE(review): the imputer is fitted on the whole DataFrame, before the
# train/test split performed later in the notebook, so rows that end up in
# the test set contribute to the imputed values. Consider fitting it on the
# training rows only.

from sklearn.impute import KNNImputer

imputed_columns = ['ph', 'Sulfate', 'Trihalomethanes']
imputer = KNNImputer()
df[imputed_columns] = imputer.fit_transform(df[imputed_columns])

In [None]:
# Re-count the null entries per column now that the imputer has been applied

df.isnull().sum()

In [None]:
# Preview the first rows of df
df.head()

# **Modelling**

In [None]:
# Hold out 25% of the rows as a test set; the fixed seed keeps the split
# repeatable between runs.

from sklearn.model_selection import train_test_split

target_column = 'Potability'
y = df[target_column]
X = df.drop(columns=[target_column])

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

#### Machine Learning Algorithms Params

##### Random Forest Params

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Hyper-parameter grid searched for RandomForestClassifier.
# 'auto' is no longer listed under max_features: scikit-learn deprecated the
# value in 1.1 and removed it in 1.3, and for the classifier it was only an
# alias of 'sqrt', so it either failed or repeated the 'sqrt' fits.
rf_params = {'n_estimators': [100, 200, 300],
             'criterion': ['gini', 'entropy'],
             'min_samples_split': [3, 5, 10],
             'min_samples_leaf': [1, 3, 5],
             'max_features': ['sqrt', 'log2'],
             'bootstrap': [False]
            }

##### Gradient Boosting Classifier Params

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Hyper-parameter grid searched for GradientBoostingClassifier.
# - loss: 'deviance' was renamed to 'log_loss' (deprecated in scikit-learn
#   1.1, removed in 1.3), so the current name is used here. This requires
#   scikit-learn >= 1.1.
# - max_features: 'auto' was removed in scikit-learn 1.3; for the classifier
#   it was documented as sqrt(n_features), i.e. a duplicate of 'sqrt'.
gb_params = {'loss': ['log_loss', 'exponential'],
             'learning_rate': [0.001, 0.01, 0.1, 0.5],
             'n_estimators': [100, 300, 500],
             'min_samples_split': [3, 5, 10],
             'max_depth': [3, 5, 10],
             'max_features': ['sqrt', 'log2']
            }

##### Multi-layer Perceptron (MLP) Classifier Params

In [None]:
from sklearn.neural_network import MLPClassifier

# Search space for MLPClassifier: network shapes, activation functions,
# solvers, L2 penalty strengths (alpha) and learning-rate schedules.
mlp_params = dict(
    hidden_layer_sizes=[(100, 50, 10), (100, 100, 100), (100, 100), (5, 5), (5, 3, 2)],
    activation=['identity', 'logistic', 'tanh', 'relu'],
    solver=['lbfgs', 'sgd', 'adam'],
    alpha=[0.1, 0.01, 0.005, 0.0001, 0.00001],
    learning_rate=['constant', 'invscaling', 'adaptive'],
)

##### KNeighbors Classifier Params

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Search space for KNeighborsClassifier: neighbourhood size, vote weighting,
# neighbour-search algorithm and the Minkowski power parameter p.
kn_params = dict(
    n_neighbors=[3, 5, 10],
    weights=['uniform', 'distance'],
    algorithm=['auto', 'ball_tree', 'kd_tree', 'brute'],
    p=[1, 2, 5],
)

##### XGBoost Params

In [None]:
from xgboost import XGBClassifier

# Hyper-parameter grid searched for XGBClassifier.
# 'gpu_hist' is no longer listed under tree_method: it needs a CUDA-capable
# GPU (those fits fail on a CPU-only runtime) and XGBoost 2.0 deprecated it
# in favour of tree_method='hist' combined with device='cuda'.
xg_params = {'learning_rate': [0.0001, 0.001, 0.01, 0.1, 0.3],
             'min_split_loss': [2, 5, 10],
             'max_depth': [3, 5, 10],
             'n_estimators': [100, 300, 500],
             'tree_method': ['auto', 'exact', 'approx', 'hist']
            }

##### C-Support Vector Classifier (SVC) Params

In [None]:
from sklearn.svm import SVC

# Search space for SVC: regularisation strength C, kernel type, polynomial
# degree and the gamma setting.
svc_params = dict(
    C=[0.001, 0.01, 0.1, 0.3],
    kernel=['linear', 'poly', 'rbf', 'sigmoid'],
    degree=[3, 5, 10],
    gamma=['auto', 'scale'],
)

##### AdaBoost Classifier Params

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# Search space for AdaBoostClassifier: ensemble size and learning rate.
ada_params = dict(
    n_estimators=[50, 100, 200, 300],
    learning_rate=[0.0001, 0.001, 0.01, 0.1, 0.3],
)

##### Logistic Regression Params

In [None]:
from sklearn.linear_model import LogisticRegression

# Hyper-parameter grids searched for LogisticRegression.
# A single Cartesian grid over penalty x solver x dual x l1_ratio is mostly
# made of combinations scikit-learn rejects:
#   - 'l1' and 'elasticnet' penalties are only supported by 'saga' among the
#     solvers listed here ('newton-cg', 'lbfgs' and 'sag' handle 'l2' only);
#   - dual=True is only implemented for the 'liblinear' solver, which is not
#     part of this search, so every dual=True candidate failed;
#   - l1_ratio is only used by the 'elasticnet' penalty, so varying it for
#     the other penalties just repeated identical fits.
# GridSearchCV accepts a list of grids, so the space is split into one valid
# sub-grid per penalty.
_lg_common = {'dual': [False],
              'C': [0.001, 0.01, 0.1, 0.3],
              'max_iter': [100, 200, 300]
             }

lg_params = [
    dict(_lg_common,
         penalty=['l2'],
         solver=['newton-cg', 'lbfgs', 'sag', 'saga']),
    dict(_lg_common,
         penalty=['l1'],
         solver=['saga']),
    dict(_lg_common,
         penalty=['elasticnet'],
         solver=['saga'],
         l1_ratio=[0.001, 0.01, 0.1, 0.3, 0.8]),
]

##### Decision Tree Classifier Params

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Hyper-parameter grid searched for DecisionTreeClassifier.
# 'auto' is no longer listed under max_features: scikit-learn deprecated the
# value in 1.1 and removed it in 1.3, and for the classifier it was only an
# alias of 'sqrt', so it either failed or repeated the 'sqrt' fits.
dt_params = {'criterion': ['gini', 'entropy'],
             'splitter': ['best', 'random'],
             'max_depth': [3, 5, 10],
             'min_samples_split': [2, 5, 10],
             'min_samples_leaf': [1, 3, 5, 10],
             'max_features': ['sqrt', 'log2']
            }

In [None]:
# Grid Search CV to find Optimized model
#
# `models` and `model_params` are parallel lists: the estimator at position k
# is tuned with the grid at position k.
# Every estimator that accepts a seed gets a fixed random_state so that the
# search results do not change from one run to the next (the value matches
# the seed used for the train/test split). KNeighborsClassifier takes no
# random_state argument.
RANDOM_STATE = 42

models = [RandomForestClassifier(random_state=RANDOM_STATE),
         GradientBoostingClassifier(random_state=RANDOM_STATE),
         MLPClassifier(random_state=RANDOM_STATE),
         KNeighborsClassifier(),
         XGBClassifier(random_state=RANDOM_STATE),
         SVC(random_state=RANDOM_STATE),
         AdaBoostClassifier(random_state=RANDOM_STATE),
         LogisticRegression(random_state=RANDOM_STATE),
         DecisionTreeClassifier(random_state=RANDOM_STATE)
        ]

model_params = [rf_params,
                gb_params,
                mlp_params,
                kn_params,
                xg_params,
                svc_params,
                ada_params,
                lg_params,
                dt_params
               ]

##### Standard Scaling


In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the per-feature mean and standard deviation from the training data
# only, then apply that same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

##### Grid Search CV

In [None]:
from sklearn.model_selection import StratifiedKFold, GridSearchCV

# Best cross-validated ROC-AUC found for each model
cv_results = []

# Best estimator found for each model (to use in ensemble modeling)
best_estimators = []

# Display names, listed in the same order as `models` / `model_params`
model_names = ['Random Forest',
               'Gradient Boosting',
               'MLP Classifier',
               'KNeighbors Classifier',
               'XGBoost',
               'SVC',
               'AdaBoost',
               'Logistic Regression',
               'Decision Tree'
              ]

# Fail before the (long) search starts if the parallel lists drifted apart,
# rather than when the results table is built at the very end.
if not (len(models) == len(model_params) == len(model_names)):
    raise ValueError('models, model_params and model_names must have the same length')

# Tune every model on the scaled training data with 2-fold stratified CV,
# scored by ROC-AUC, and keep the best score and the best estimator.
for model, param_grid in zip(models, model_params):
    clf = GridSearchCV(model,
                       param_grid=param_grid,
                       cv=StratifiedKFold(n_splits=2),
                       scoring='roc_auc',
                       n_jobs=-1,
                       verbose=1)

    clf.fit(X_train_scaled, y_train)
    cv_results.append(clf.best_score_)
    best_estimators.append(clf.best_estimator_)
    print('Method: {}  Score: {}'.format(model, clf.best_score_))


# One row per model: best cross-validation score next to its display name
results = pd.DataFrame({'Cross Validation Means': cv_results,
                        'ML Models': model_names})

Fitting 2 folds for each of 162 candidates, totalling 324 fits


In [None]:
# Show the table of best cross-validation scores per model
results

##### Ensemble Model

In [None]:
from sklearn.ensemble import VotingClassifier

# Hard-voting ensemble over the nine tuned estimators. Each short label is
# paired with the estimator stored at the same position in best_estimators.
ensemble_labels = ['rf', 'gb', 'mlp', 'kn', 'xgb', 'svc', 'adb', 'lg', 'dt']
ensemble_members = [(label, best_estimators[position])
                    for position, label in enumerate(ensemble_labels)]

voting = VotingClassifier(
    estimators=ensemble_members,
    voting='hard',
    n_jobs=-1,
)


In [None]:
from sklearn.metrics import accuracy_score

# Fit the voting ensemble on the scaled training data and report its accuracy
# on the scaled test data.
voting = voting.fit(X_train_scaled, y_train)
y_pred = voting.predict(X_test_scaled)
# accuracy_score is declared as accuracy_score(y_true, y_pred); the true
# labels go first. Accuracy itself is symmetric, so the printed value does
# not change, but the order matters as soon as another metric is swapped in.
my_score = accuracy_score(y_test, y_pred)
print(my_score)