In [None]:
import pandas as pd
import numpy as np
# Read the water-potability measurements and preview the first five rows.
df = pd.read_csv(filepath_or_buffer="../input/water-potability/water_potability.csv")
df.head(n=5)

In [None]:
# Shape of the data as a (rows, columns) tuple
df.shape

There are 3276 rows and 10 columns

In [None]:
# Count the missing entries in every column
df.isna().sum()

In [None]:
# Discard every row that has at least one missing value instead of imputing:
# water-quality readings are sensitive, so filling gaps with a mean, median
# or mode would tamper with the measurements.
df = df.dropna(axis=0, how="any")


In [None]:
# Number of rows per Potability class
df["Potability"].value_counts()

In [None]:
#Plots
import matplotlib.pyplot as plt
import seaborn as sns


# Pie chart of the class shares in the Potability column
df["Potability"].value_counts().plot.pie()

This is an imbalanced dataset: class 0 is much more frequent than class 1 (1998 vs. 1278 rows).
So we need to balance the classes to avoid biasing the models towards the majority class.

In [None]:
from sklearn.utils import resample, shuffle

# Split the rows by class so the smaller class can be upsampled.
zero = df[df['Potability'] == 0]  # rows with Potability == 0
one = df[df['Potability'] == 1]   # rows with Potability == 1

# Draw class-1 rows with replacement until there are exactly as many of them
# as class-0 rows. The target size is read from the data instead of being
# hard-coded, so the classes stay balanced even if the row counts change.
# random_state makes the draw repeatable.
df_minority_upsampled = resample(
    one, replace=True, n_samples=len(zero), random_state=42
)

# NOTE(review): upsampling happens before the train/test split, so copies of
# the same class-1 row can end up in both parts and inflate the test scores.
# Consider splitting first and upsampling only the training part.

# Recombine both classes and shuffle so that there is no particular sequence.
df = pd.concat([zero, df_minority_upsampled])
df = shuffle(df, random_state=42)

In [None]:
# Pie chart of the class shares after resampling
df["Potability"].value_counts().plot.pie()

Now the two classes are balanced!

In [None]:
# Annotated heatmap of the pairwise correlations between all columns
fig, ax = plt.subplots(figsize=(15, 9))
sns.heatmap(df.corr(), annot=True, ax=ax)

In [None]:
# ph against Hardness, coloured by the Potability class
sns.scatterplot(data=df, x="ph", y="Hardness", hue="Potability")


In [None]:
# ph against Chloramines, coloured by the Potability class
sns.scatterplot(data=df, x="ph", y="Chloramines", hue="Potability")

There is no visible pattern separating the two Potability classes in these scatter plots!

In [None]:
# Absolute correlation of every column with Potability, strongest first
df.corr()["Potability"].abs().sort_values(ascending=False)

import matplotlib.pyplot as plt
import pandas as pd

# Absolute correlation of every feature with the target, strongest first.
# The target's own entry (always 1.0) is dropped: it is not a feature and it
# would dwarf the feature bars on the chart.
corr_values = (
    df.corr()['Potability']
    .drop('Potability')
    .abs()
    .sort_values(ascending=False)
)

# Bar chart with one bar per feature
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(corr_values.index, corr_values.values, color='blue')

# Chart title and axis labels
ax.set_title('Correlation Between Features and Potability', fontsize=16)
ax.set_xlabel('Features', fontsize=12)
ax.set_ylabel('Correlation Value', fontsize=12)

# Rotate the feature names so they do not overlap
ax.tick_params(axis='x', labelrotation=90)

plt.show()


The feature most correlated with Potability is Solids, with an absolute correlation of only 5.24%.

In [None]:
# Separate the predictors (every column except the target) from the target
y = df["Potability"]
X = df.drop(columns=["Potability"])

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every predictor column and write the scaled values back into X.
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test
# split made later, so statistics of the test rows take part in the scaling.
features = X.columns
sc = StandardScaler()
X[features] = sc.fit(X[features]).transform(X[features])

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC, LinearSVC



from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

In [None]:
# Hold out 10% of the rows for testing. A fixed random_state makes the split
# (and every score computed from it) reproducible; stratify=y keeps the class
# proportions the same in the training and the test part.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42, stratify=y
)

## Hyper-parameter Tuning ;)

In [None]:
# Hyperparameter tuning ;)
#
# One base estimator per model family, followed by the search object that
# would tune it.
# NOTE(review): none of the search objects defined below is fitted in the
# cells shown here. To use the tuned models, call e.g.
# grid_svm.fit(X_train, y_train) and read .best_params_ / .best_estimator_.

# Base estimators. Every estimator with a random component gets the same fixed
# seed as the logistic regression, so repeated runs give the same models.
lr = LogisticRegression(random_state=42)

svm = SVC()

knn = KNeighborsClassifier()

dt = DecisionTreeClassifier(random_state=42)

rf = RandomForestClassifier(random_state=42)

ada = AdaBoostClassifier(random_state=42)

xgb = XGBClassifier(eval_metric='logloss', use_label_encoder=False)

# SVM: regularisation strength, kernel type and kernel coefficient,
# searched exhaustively with 5-fold cross-validation scored on accuracy
svm_param_grid = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf'], 'gamma': ['scale', 'auto']}
grid_svm = GridSearchCV(estimator=svm, param_grid=svm_param_grid, cv=5, scoring='accuracy')

# K nearest neighbours: number of neighbours from 1 to 49, 5-fold cross-validation
para_knn = {'n_neighbors': np.arange(1, 50)}
grid_knn = GridSearchCV(knn, param_grid=para_knn, cv=5)

# Decision tree, 5-fold cross-validation.
# criterion: "gini" for the Gini impurity, "entropy" for the information gain.
# min_samples_leaf: minimum number of samples required at a leaf node;
# larger values have the effect of smoothing the model.
para_dt = {'criterion': ['gini', 'entropy'], 'max_depth': np.arange(1, 50), 'min_samples_leaf': [1, 2, 4, 5, 10, 20, 30, 40, 80, 100]}
grid_dt = GridSearchCV(dt, param_grid=para_dt, cv=5)

# Random forest, 5-fold cross-validation.
# n_estimators: the number of trees in the forest.
params_rf = {'n_estimators': [100, 200, 350, 500], 'min_samples_leaf': [2, 10, 30]}
grid_rf = GridSearchCV(rf, param_grid=params_rf, cv=5)

# AdaBoost, 5-fold cross-validation
params_ada = {'n_estimators': [50, 100, 250, 400, 500, 600], 'learning_rate': [0.2, 0.5, 0.8, 1]}
grid_ada = GridSearchCV(ada, param_grid=params_ada, cv=5)

# XGBoost: randomised search over the grid, 5-fold cross-validation.
# random_state fixes which parameter combinations are sampled.
params_xgb = {'n_estimators': [50, 100, 250, 400, 600, 800, 1000], 'learning_rate': [0.2, 0.5, 0.8, 1]}
rs_xgb = RandomizedSearchCV(xgb, param_distributions=params_xgb, cv=5, random_state=42)


In [None]:
# Pair every display name with its base estimator for the evaluation loops
model_names = ['Logistic Regression', 'SVM', 'K Nearest Neighbours',
               'Decision Tree', 'Random Forest', 'AdaBoost', 'XGBoost']
models = [lr, svm, knn, dt, rf, ada, xgb]
classifiers = list(zip(model_names, models))


In [None]:
from sklearn.metrics import accuracy_score

# Train every model on the training split and print its test-set accuracy
for classifier_name, classifier in classifiers:
    classifier.fit(X_train, y_train)

    # Score the predictions for the held-out rows
    y_pred = classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    print(f'{classifier_name:s} : {accuracy:.2f}')

In [None]:
from sklearn.metrics import classification_report

classifiers = [
    ('Logistic Regression', lr),
    ('SVM', svm),
    ('K Nearest Neighbours', knn),
    ('Decision Tree', dt),
    ('Random Forest', rf),
    ('AdaBoost', ada),
    ('XGBoost', xgb),
]

# Print precision, recall and F1 per class for every model.
# NOTE(review): every model is fitted again here, so any estimator without a
# fixed random_state can differ from the fit scored in the accuracy loop.
for clf_name, clf in classifiers:
    print(f"Classification Report for {clf_name}:")
    y_pred = clf.fit(X_train, y_train).predict(X_test)
    print(classification_report(y_test, y_pred))



In [None]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.metrics import accuracy_score

# Stack the individual models on the water-potability data.
# X_train / X_test / y_train / y_test are the split created earlier in the
# notebook. They are deliberately NOT regenerated here: fitting on
# make_classification() toy data would replace the real data and make the
# stacked score incomparable with the single-model scores.

# Base models. Estimators with a random component get a fixed seed so the
# stacked result is reproducible.
# NOTE: `xgb` below is the xgboost module imported at the top of this cell;
# that import rebinds the name used earlier for the XGBClassifier instance.
lr = LogisticRegression(random_state=42)
svm = SVC(C=10, gamma='scale', kernel='rbf', probability=True, random_state=42)
knn = KNeighborsClassifier(n_neighbors=1)
dt = DecisionTreeClassifier(criterion='gini', max_depth=27, min_samples_leaf=1, random_state=42)
rf = RandomForestClassifier(n_estimators=350, min_samples_leaf=2, random_state=42)
ada = AdaBoostClassifier(n_estimators=600, learning_rate=1, random_state=42)
xgb_clf = xgb.XGBClassifier(n_estimators=300, max_depth=5, learning_rate=0.1, random_state=42)

# Stacking classifier: a logistic regression combines the base-model outputs
estimators = [('lr', lr), ('svm', svm), ('knn', knn), ('dt', dt), ('rf', rf), ('ada', ada), ('xgb', xgb_clf)]
stacking = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())

# Fit on the training split
stacking.fit(X_train, y_train)

# Predict the held-out rows
y_pred = stacking.predict(X_test)

# Accuracy of the stacking classifier on the test split
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

In [None]:
from sklearn.metrics import classification_report

# Predict the test rows with the fitted stacking model and print
# precision, recall and F1 score per class
y_pred = stacking.predict(X_test)
report = classification_report(y_test, y_pred)

print("Classification report:", report, sep="\n")


In [None]:
import matplotlib.pyplot as plt

# Precision per classifier. The values are typed in by hand, not computed
# from the fitted models in this notebook.
precision_by_classifier = [
    ('SVM', 0.70), ('KNN', 0.65), ('DT', 0.85), ('XGB', 0.85),
    ('ADA', 0.65), ('LR', 0.58), ('RF', 0.86), ('SE', 0.97),
]
classifiers = [label for label, _ in precision_by_classifier]
precisions = [score for _, score in precision_by_classifier]

# One bar per classifier, with labelled axes and a title
fig, ax = plt.subplots()
ax.bar(classifiers, precisions)
ax.set(
    xlabel='Classifiers',
    ylabel='Precision',
    title='Precision scores for different classifiers',
)

plt.show()


In [None]:
import matplotlib.pyplot as plt

# Accuracy per classifier. The values are typed in by hand, not computed
# from the fitted models in this notebook.
accuracy_by_classifier = [
    ('SVM', 0.70), ('KNN', 0.63), ('DT', 0.82), ('XGB', 0.84),
    ('ADA', 0.60), ('LR', 0.54), ('RF', 0.87), ('SE', 0.96),
]
classifiers = [label for label, _ in accuracy_by_classifier]
accuracies = [score for _, score in accuracy_by_classifier]

# One bar per classifier, with labelled axes and a title
fig, ax = plt.subplots()
ax.bar(classifiers, accuracies)
ax.set(
    xlabel='Classifiers',
    ylabel='Accuracy',
    title='Accuracy scores for different classifiers',
)

plt.show()


In [None]:
import matplotlib.pyplot as plt

# Recall per classifier. The values are typed in by hand, not computed
# from the fitted models in this notebook.
recall_by_classifier = [
    ('SVM', 0.74), ('KNN', 0.65), ('DT', 0.80), ('XGB', 0.84),
    ('ADA', 0.56), ('LR', 0.51), ('RF', 0.90), ('SE', 0.95),
]
classifiers = [label for label, _ in recall_by_classifier]
recalls = [score for _, score in recall_by_classifier]

# One bar per classifier, with labelled axes and a title
fig, ax = plt.subplots()
ax.bar(classifiers, recalls)
ax.set(
    xlabel='Classifiers',
    ylabel='Recall',
    title='Recall scores for different classifiers',
)

plt.show()


In [None]:
import matplotlib.pyplot as plt

# F1-score per classifier. The values are typed in by hand, not computed
# from the fitted models in this notebook.
f1_by_classifier = [
    ('SVM', 0.72), ('KNN', 0.65), ('DT', 0.83), ('XGB', 0.85),
    ('ADA', 0.60), ('LR', 0.54), ('RF', 0.88), ('SE', 0.96),
]
classifiers = [label for label, _ in f1_by_classifier]
f1_scores = [score for _, score in f1_by_classifier]

# One bar per classifier, with labelled axes and a title
fig, ax = plt.subplots()
ax.bar(classifiers, f1_scores)
ax.set(
    xlabel='Classifiers',
    ylabel='F1-Score',
    title='F1-Score for different classifiers',
)

plt.show()
