In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# water potability model with PyCaret

In this notebook we analyse a water-quality dataset with several features and build a machine learning classification model that predicts whether a water sample is drinkable or not.

In [2]:
# Data-handling and plotting libraries used throughout the notebook.
import numpy as np  # conventional alias; later code should refer to numpy as `np`
import pandas as pd

import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns

In [3]:
# Load the water-quality measurements from the Kaggle input directory.
df = pd.read_csv(filepath_or_buffer='../input/water-dataset/water_dataset.csv')
# Preview the first 15 rows (last expression, so the notebook renders it).
df.head(n=15)

# Data cleaning :

We can see NaN values in our data. To fix that, we will remove all the rows that contain null values; alternatively, you can choose to replace NaN values with the mean value of the column.

In [4]:
# Drop every row that has at least one missing value; `df` is rebound to the
# cleaned frame, so the original rows are no longer reachable under this name.
df = df.dropna(axis=0, how='any')
# Per-column count of missing values still present in `df`.
df.isna().sum()

Now we can be sure that there are no NaN values left in our data.

# potability distribution :
The main goal is to predict the potability of water. The target takes one of two values, 1 or 0: if the water meets the standards the potability is 1, otherwise it is 0. Let's look at the distribution of these two values.

In [26]:
# Bar chart of how many samples fall into each Potability class (0 or 1).
plt.figure(figsize=(15,15))
# Name the column via `x=` and pass the frame via `data=`: current seaborn
# releases do not interpret a positional Series as the x variable.
sns.countplot(x="Potability", data=df)
plt.title('the distribution of potability values')
# Actually call show(); a bare `plt.show` only evaluates to the function object.
plt.show()

Note that the two classes are not balanced: there are more non-potable samples (0) than potable samples (1).
In the next step we will explore each column with a histogram to see how it relates to potability.

# explore columns

In [6]:
# `data` is a second name for the same DataFrame object (no copy is made).
data = df
# Histogram of the ph column, with bars coloured by the Potability class.
figure = px.histogram(
    data_frame=df,
    x="ph",
    color="Potability",
    title="factors affecting water quality : ph",
)
figure.show()

The pH value of the water is an important feature: it must be between 6.5 and 8.5 for the water to be drinkable.

In [7]:
# Histogram of the Hardness column, with bars coloured by the Potability class.
figure = px.histogram(
    data_frame=df,
    x="Hardness",
    color="Potability",
    title="factors affecting water quality : Hardness",
)
figure.show()

The hardness of water depends on its source, but for drinkable water the hardness must be between 120 and 200 milligrams.

In [8]:
# Histogram of the Solids column, with bars coloured by the Potability class.
figure = px.histogram(
    data_frame=df,
    x="Solids",
    color="Potability",
    title="factors affecting water quality : Solids",
)
figure.show()

The figure above represents the distribution of total dissolved solids in water in the dataset. All organic and inorganic minerals present in water are called dissolved solids. Water with a very high number of dissolved solids is highly mineralized.

In [9]:
# Histogram of the Chloramines column, with bars coloured by the Potability class.
figure = px.histogram(
    data_frame=df,
    x="Chloramines",
    color="Potability",
    title="factors affecting water quality : Chloramines",
)
figure.show()

Chloramine and chlorine are disinfectants used in public water systems.

In [10]:
 figure = px.histogram(df, x="Sulfate", color= "Potability", title="factors affecting water quality : Sulfate")
figure.show()

Water containing less than 500 milligrams of sulfate is safe to drink.

In [11]:
 figure = px.histogram(df, x="Conductivity", color= "Potability", title="factors affecting water quality : Conductivity")
figure.show()

Water is a good conductor of electricity, but the purest form of water is not a good conductor of electricity. Water with an electrical conductivity of less than 500 is drinkable.

In [12]:
 figure = px.histogram(df, x="Organic_carbon", color= "Potability", title="factors affecting water quality : Organic_carbon")
figure.show()

organic carbon comes from the breakdown of natural organic materials and synthetic sources. Water containing less than 25 milligrams of organic carbon is considered safe to drink.

In [13]:
 figure = px.histogram(df, x="Trihalomethanes", color= "Potability", title="factors affecting water quality : Trihalomethanes")
figure.show()

THMs are chemicals found in chlorine-treated water. Water containing less than 80 milligrams of THMs is considered safe to drink.

In [14]:
 figure = px.histogram(df, x="Turbidity", color= "Potability", title="factors affecting water quality : Turbidity")
figure.show()

The turbidity of water depends on the amount of solids present in suspension. Water with a turbidity of less than 5 NTU (nephelometric turbidity units) is considered drinkable.

# Water Quality Prediction Model using PyCaret 
In the section above we explored all the columns that affect the potability of water, to get a clear idea of their influence on our target, Potability. Now the purpose is to train a machine learning model using PyCaret for the task of water quality analysis.

# install pycaret

In [21]:
!pip install pycaret --ignore-installed llvmlite

#### First, let's check the correlation between the features and our target, "Potability"

In [18]:
# Pairwise correlation between the DataFrame's columns.
corr_matrix = df.corr()
# Correlation of every column with the target, largest value first.
corr_matrix.loc[:, "Potability"].sort_values(ascending=False)

In [19]:
# Full correlation matrix (Pearson is the pandas default, spelled out here),
# rendered as a table whose cell backgrounds follow the 'coolwarm' colormap.
corr = df.corr(method="pearson")
corr.style.background_gradient(cmap='coolwarm')

##### Now let's see which machine learning algorithm is the best for our dataset using PyCaret

In [23]:
# NOTE(review): the wildcard import is what brings `setup`, `compare_models`
# and the PyCaret functions used by later cells into the notebook namespace.
from pycaret.classification import *

# Initialise the PyCaret experiment on `df` with "Potability" as the target.
# Per the PyCaret docs, silent=True skips the interactive dtype confirmation
# and session_id seeds the experiment's randomness — verify against the
# installed PyCaret version.
clsf = setup(data=df, target="Potability", silent=True, session_id=786)

# Evaluate the available classifiers and display the comparison table.
compare_models()

##### According to the results, the CatBoost Classifier is the best for our data, so let's create and train our model and predict the results.

In [25]:
# Build and train the CatBoost classifier inside the PyCaret experiment.
model = create_model(estimator="catboost")
# Score the whole cleaned frame with the trained model.
# NOTE(review): `df` is the same frame that was handed to setup(), so these
# predictions include rows the model was trained on — confirm this is intended.
predict = predict_model(estimator=model, data=df)
# Show the first 30 rows of the prediction output.
predict.head(n=30)