<a href="https://colab.research.google.com/github/MahdiFaourr/MahdiFaourr/blob/main/Water_Quality_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Download opendatasets library
!pip install opendatasets

In [None]:
# Import necessary libraries and functions
import opendatasets as od
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import ExtraTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
from flask import Flask, render_template, request
import pickle
import pandas as pd

In [None]:
# Download the dataset from Kaggle into the working environment
DATASET_URL = "https://www.kaggle.com/datasets/devanshibavaria/water-potability-dataset-with-10-parameteres"
od.download(DATASET_URL)

In [None]:
# Read the downloaded CSV and display the first rows.
# The path is relative to the current working directory, which is where
# od.download() places the dataset folder by default. This resolves to the
# same file on Colab (cwd is /content) and also works on a local machine.
data=pd.read_csv("water-potability-dataset-with-10-parameteres/water_potability.csv")
data.head()

In [None]:
# Report the number of rows and columns in the dataset
n_rows, n_cols = data.shape
print(f"This data contains: {n_rows} rows and {n_cols} columns.")

In [None]:
# Data info: summary of the DataFrame (columns, dtypes, non-null counts)
data.info()

In [None]:
# Search for nulls: count the missing values in each column
data.isnull().sum()

In [None]:
# Replace the missing values of each listed column with that column's mean
columns_with_nulls=['ph','Sulfate','Trihalomethanes']
column_means=data[columns_with_nulls].mean()
data[columns_with_nulls]=data[columns_with_nulls].fillna(column_means)

In [None]:
# Check the class balance: number of rows for each value of 'Potability'
data['Potability'].value_counts()

In [None]:
# Split the original data into one subset per class label
potability=data['Potability']
data_drinkabel_water=data.loc[potability.eq(1)]
data_non_drinkabel_water=data.loc[potability.eq(0)]

In [None]:
# Histograms of every feature for the potable samples (Potability == 1).
# Create a 3x3 grid of subplots: one panel per feature column.
fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(15, 6))
# Label the whole figure so it can be told apart from the non-potable one
fig.suptitle('Feature distributions - potable water (Potability = 1)')

# Flatten the 2D array of subplots into a 1D array
axes = axes.flatten()
df=data.drop('Potability',axis=1)
# Loop through each feature column and plot a histogram in the corresponding subplot
for i, column in enumerate(df.columns):
    axes[i].hist(data_drinkabel_water[column], bins=10)  # Adjust the bins as needed
    axes[i].set_title(column)
    axes[i].set_xlabel('Values')
    axes[i].set_ylabel('Frequency')

# Adjust layout to prevent overlapping
plt.tight_layout()

# Show the plots
plt.show()


In [None]:
# Histograms of every feature for the non-potable samples (Potability == 0).
# Create a 3x3 grid of subplots: one panel per feature column.
fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(15, 6))
# Label the whole figure so it can be told apart from the potable one
fig.suptitle('Feature distributions - non-potable water (Potability = 0)')

# Flatten the 2D array of subplots into a 1D array
axes = axes.flatten()
df=data.drop('Potability',axis=1)
# Loop through each feature column and plot a histogram in the corresponding subplot
for i, column in enumerate(df.columns):
    axes[i].hist(data_non_drinkabel_water[column], bins=10)  # Adjust the bins as needed
    axes[i].set_title(column)
    axes[i].set_xlabel('Values')
    axes[i].set_ylabel('Frequency')

# Adjust layout to prevent overlapping
plt.tight_layout()

# Show the plots
plt.show()


In [None]:
# Over sample the minority class (sampling_strategy=0.8 is the requested
# minority/majority ratio after resampling)
x=data.drop('Potability',axis=1).values
y=data['Potability'].values
# A fixed seed makes the resampled dataset, and therefore every metric computed
# from it, reproducible across runs (the split below already uses random_state=42).
# NOTE(review): resampling happens BEFORE the train/test split, so duplicated
# minority rows can land in both parts and inflate the test metrics. Consider
# splitting first and oversampling only the training part.
over=RandomOverSampler(sampling_strategy=0.8,random_state=42)
x,y=over.fit_resample(x,y)

In [None]:
# Hold out 10% of the samples for testing and report the shape of each part
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.1, random_state=42
)
for features, labels in ((x_train, y_train), (x_test, y_test)):
    print(features.shape, labels.shape)

In [None]:
# Create 5 models to handle the classification task.
# The estimators that rely on randomness get a fixed seed so the
# cross-validation and grid-search results are reproducible across runs.
LR=LogisticRegression()

tree=ExtraTreeClassifier(random_state=42)

adab=AdaBoostClassifier(random_state=42)

svc=SVC()

forest=RandomForestClassifier(random_state=42)

In [None]:
# Check the performance of the created models using the cross_validate technique
models = [LR, tree, adab,forest,svc]
for model in models:
    print(type(model).__name__)  # Print the name of the model class
    scores = cross_validate(model, x, y, cv=3, scoring=['accuracy', 'precision', 'recall'])

    # With several scorers, cross_validate stores each one under 'test_<name>';
    # report the mean over the folds of the matching key for every metric.
    print("Accuracy %:", np.mean(scores['test_accuracy'])*100)
    print("Precision %:", np.mean(scores['test_precision'])*100)
    print("Recall %:", np.mean(scores['test_recall'])*100)
    print("----------------")

In [None]:
# Hyperparameter values that GridSearchCV will try for the random forest
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Build the search around the best-performing model (the random forest),
# scored by accuracy with 3-fold cross-validation on all available cores
grid_search = GridSearchCV(forest, param_grid, scoring='accuracy', cv=3, n_jobs=-1)

# Run the search on the training data
grid_search.fit(x_train, y_train)

# Report the winning combination
print("Best Hyperparameters:", grid_search.best_params_)

In [None]:
# Choose the best hyperparameters and redefine your model.
# They are read from grid_search.best_params_ instead of being copied by hand,
# so this cell stays in sync with whatever the search above actually found.
# A fixed seed keeps the trained forest reproducible.
# NOTE: the name `tree` is kept because the following cells use it; from here
# on it refers to this RandomForestClassifier.
tree=RandomForestClassifier(**grid_search.best_params_,random_state=42)
tree.fit(x_train,y_train)

In [None]:
# Test the performance of your model on the held-out test split.
# scikit-learn metrics take (y_true, y_pred) in that order; passing them the
# other way round silently swaps precision and recall.
y_hat=tree.predict(x_test)
print("accuracy%:",accuracy_score(y_test,y_hat)*100)
print("precision%:",precision_score(y_test,y_hat)*100)
print("recall%:",recall_score(y_test,y_hat)*100)

In [None]:
# Final fit before saving: train on ALL labelled data (training + testing parts).
# fit() retrains the forest from scratch, so fitting on x_test alone would
# replace the trained model with one that has only seen the 10% test split.
tree.fit(np.concatenate([x_train,x_test]),np.concatenate([y_train,y_test]))

In [None]:
# Persist the trained model to tree.pkl with pickle
with open('tree.pkl', mode='wb') as model_file:
    pickle.dump(tree, model_file)

In [None]:
# Create a folder named `template` (the name must match template_folder='template' in the Flask app) and save the following inside it as index.html:
<!DOCTYPE html>
<!-- Input form: one text field per name in feature_columns, POSTed to /predict. -->
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Water Potability Prediction</title>
    <style>
        /* Center the heading and the form in the viewport */
        body {
            display: grid;
            place-items: center;
            height: 100vh;
            margin: 0;
        }

        /* Lay the labels and inputs out in a single row */
        form {
            display: flex;
            flex-direction: row;
            gap: 10px;
            align-items: center;
        }

        label {
            text-align: right;
            padding-right: 10px;
        }

        input {
            width: 100px;  /* Adjust the width as needed */
        }
    </style>
</head>
<body>
    <h1>Water Potability Prediction</h1>
    <form action="/predict" method="post">
        {# The input id matches the label's "for" attribute so the two are associated. #}
        {% for column in feature_columns %}
        <label for="{{ column }}">{{ column }}:</label>
        <input type="text" id="{{ column }}" name="{{ column }}" required>
        {% endfor %}
        <input type="submit" value="Predict">
    </form>
</body>
</html>

In [None]:
# Inside the same `template` folder, save the following as pass.html (the file name the /predict route renders):
<!DOCTYPE html>
<!-- Result page: displays the `prediction` value passed in by the /predict route. -->
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Water Potability Prediction Result</title>
</head>
<body>
    <h1>Water Potability Prediction Result</h1>
    <!-- `prediction` is the first element of model.predict() for the submitted row -->
    <p>The predicted potability is: {{ prediction }}</p>
</body>
</html>

In [None]:
# Create the Flask application; HTML templates are loaded from the 'template' folder
app = Flask(__name__, template_folder='template')

# Restore the trained model from tree.pkl.
# Only unpickle files you created yourself: unpickling can execute arbitrary code.
with open('tree.pkl', mode='rb') as model_file:
    model = pickle.load(model_file)

# Column names for the water quality features
# (presumably in the same order as the training columns — TODO confirm against the CSV)
feature_columns = [
    'ph',
    'Hardness',
    'Solids',
    'Chloramines',
    'Sulfate',
    'Conductivity',
    'Organic_carbon',
    'Trihalomethanes',
    'Turbidity',
]
# Home route
@app.route('/')
def home():
    """Render index.html, passing it the list of feature column names."""
    return render_template('index.html', feature_columns=feature_columns)
# Results route
@app.route('/predict', methods=['POST'])
def predict():
    """Predict potability from the submitted form values.

    Reads one value per name in ``feature_columns`` from the POSTed form,
    converts each to ``float`` and passes them to the model as a single row.

    Returns:
        The rendered ``pass.html`` page showing the prediction, or a plain-text
        message with HTTP status 400 when a value is not a finite number.
    """
    import math  # local import keeps this route self-contained

    # Form data is untrusted: answer non-numeric input with a 400 instead of
    # letting float() raise and turn the request into a 500 error page.
    try:
        input_data = [float(request.form[column]) for column in feature_columns]
    except ValueError:
        return "Invalid input: every feature must be a number.", 400

    # float() accepts 'nan' and 'inf'; reject them before they reach the model.
    if not all(math.isfinite(value) for value in input_data):
        return "Invalid input: every feature must be a finite number.", 400

    # The model expects a 2D input, so wrap the single row in a list
    prediction = model.predict([input_data])[0]

    # Display the prediction
    return render_template('pass.html', prediction=prediction)


if __name__ == '__main__':
    import os

    # Debug mode enables the interactive Werkzeug debugger, which lets anyone
    # who can reach the server execute code, and starts the auto-reloader.
    # It is therefore opt-in: set the environment variable FLASK_DEBUG=1 to enable it.
    app.run(debug=os.environ.get('FLASK_DEBUG') == '1')
