<a href="https://colab.research.google.com/github/MokshithReddy17/Login-Page/blob/main/Water_quality_potability_model(Mokshith_Reddy).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import necessary libraries

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Step 1: Load the dataset

In [9]:
# Read the water-quality measurements from the CSV in the working directory.
data = pd.read_csv(filepath_or_buffer='water_potability.csv')
# Echo the frame so its columns and missing values can be inspected.
print(data)

            ph    Hardness        Solids  Chloramines     Sulfate  \
0          NaN  204.890455  20791.318981     7.300212  368.516441   
1     3.716080  129.422921  18630.057858     6.635246         NaN   
2     8.099124  224.236259  19909.541732     9.275884         NaN   
3     8.316766  214.373394  22018.417441     8.059332  356.886136   
4     9.092223  181.101509  17978.986339     6.546600  310.135738   
...        ...         ...           ...          ...         ...   
3271  4.668102  193.681735  47580.991603     7.166639  359.948574   
3272  7.808856  193.553212  17329.802160     8.061362         NaN   
3273  9.419510  175.762646  33155.578218     7.350233         NaN   
3274  5.126763  230.603758  11983.869376     6.303357         NaN   
3275  7.874671  195.102299  17404.177061     7.509306         NaN   

      Conductivity  Organic_carbon  Trihalomethanes  Turbidity  Potability  
0       564.308654       10.379783        86.990970   2.963135           0  
1       592.88535

# Step 2: Preprocess the data

In [10]:
# Discard every row that has at least one missing measurement, then take an
# explicit copy. Later cells add new columns to `data`; copying here makes
# `data` an independent frame, so those assignments do not trigger pandas'
# SettingWithCopyWarning on the filtered result.
data = data.dropna().copy()

# Step 3: Define features (X) and target variable (y)

In [11]:
# Feature matrix: the nine measurement columns, in their original order.
X = data.loc[:, [
    'ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate',
    'Conductivity', 'Organic_carbon', 'Trihalomethanes', 'Turbidity',
]]
# Target vector: the Potability column.
y = data.loc[:, 'Potability']

# Step 4: Split the data into training and testing sets

In [12]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Step 5: Train a Random Forest Classifier

In [13]:
# Random forest of 100 trees with a fixed seed, fitted on the training split.
model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)
model.fit(X=X_train, y=y_train)

# Step 7: Evaluate the model

In [17]:
# Step 6: Predict labels for the held-out test set. The metrics below need
# `y_pred`; defining it here lets the cell run on a fresh kernel instead of
# relying on a variable left over from an earlier session.
y_pred = model.predict(X_test)

# Overall accuracy, per-class precision/recall/F1, and the raw confusion matrix.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.6771523178807947

Classification Report:
               precision    recall  f1-score   support

           0       0.68      0.84      0.75       355
           1       0.66      0.45      0.53       249

    accuracy                           0.68       604
   macro avg       0.67      0.64      0.64       604
weighted avg       0.67      0.68      0.66       604


Confusion Matrix:
 [[298  57]
 [138 111]]


# Step 8: Calculate additional insights


In [15]:
def calculate_purity(row):
    """Return an unnormalised purity score for a single water sample.

    The score is the sum of one term per measurement:

    * pH contributes ``14 - |ph - 7|``, so values near 7 score highest.
    * Every other measurement contributes ``reference - measured``, so lower
      readings score higher.

    The terms are not rescaled, so measurements with large magnitudes
    dominate the total.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by column name (e.g. a DataFrame row) that provides
        'ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity',
        'Organic_carbon', 'Trihalomethanes' and 'Turbidity'.

    Returns
    -------
    number
        The summed score; higher means closer to the reference values.
    """
    # (column, reference value) pairs, kept in the order the terms are summed.
    reference_levels = (
        ('Hardness', 200),
        ('Solids', 500),
        ('Chloramines', 4),
        ('Sulfate', 250),
        ('Conductivity', 250),
        ('Organic_carbon', 5),
        ('Trihalomethanes', 100),
        ('Turbidity', 1),
    )

    # pH is scored by its distance from neutral rather than by its raw value.
    total = 14 - abs(row['ph'] - 7)

    # Accumulate strictly left to right so floating-point rounding matches a
    # plain chained addition of the terms.
    for column, reference in reference_levels:
        total = total + (reference - row[column])
    return total

# Score every sample; axis='columns' passes each row to the function in turn.
data['Purity_Score'] = data.apply(calculate_purity, axis='columns')

def detect_harmful_substances(row):
    """List the warning labels that apply to a single water sample.

    A label is added when a measurement is strictly greater than its limit
    (Solids > 1000, Chloramines > 4, Trihalomethanes > 80, Turbidity > 5), and
    'Extreme pH' is added when pH is more than 2 away from 7.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by column name that provides 'Solids',
        'Chloramines', 'Trihalomethanes', 'Turbidity' and 'ph'.

    Returns
    -------
    list of str
        Labels in a fixed order (solids, chloramines, trihalomethanes,
        turbidity, pH); empty when no limit is exceeded.
    """
    # (column, upper limit, label) triples, checked in this order.
    upper_limits = (
        ('Solids', 1000, 'High Solids'),
        ('Chloramines', 4, 'High Chloramines'),
        ('Trihalomethanes', 80, 'High Trihalomethanes'),
        ('Turbidity', 5, 'High Turbidity'),
    )
    flags = [
        label
        for column, limit, label in upper_limits
        if row[column] > limit
    ]

    # pH is two-sided: both strongly acidic and strongly alkaline are flagged.
    if abs(row['ph'] - 7) > 2:
        flags.append('Extreme pH')
    return flags

# Attach the list of warning labels to every sample, one row at a time.
data['Harmful_Substances'] = data.apply(detect_harmful_substances, axis='columns')


# Step 9: Display a few examples to showcase purity score and harmful substances

In [16]:
print("\nSample data with additional insights:")
# First five rows, limited to the measurements, the label and the two derived
# columns.
print(
    data.loc[:, [
        'ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity',
        'Organic_carbon', 'Trihalomethanes', 'Turbidity', 'Potability',
        'Purity_Score', 'Harmful_Substances',
    ]].head(5)
)

# Persist the fitted classifier to disk so it can be reloaded without
# retraining; the call returns the list of file names written.
import joblib
joblib.dump(value=model, filename='water_quality_potability_model.pkl')


Sample data with additional insights:
          ph    Hardness        Solids  Chloramines     Sulfate  Conductivity  \
3   8.316766  214.373394  22018.417441     8.059332  356.886136    363.266516   
4   9.092223  181.101509  17978.986339     6.546600  310.135738    398.410813   
5   5.584087  188.313324  28748.687739     7.544869  326.678363    280.467916   
6  10.223862  248.071735  28749.716544     7.513408  393.663396    283.651634   
7   8.635849  203.361523  13672.091764     4.563009  303.309771    474.607645   

   Organic_carbon  Trihalomethanes  Turbidity  Potability  Purity_Score  \
3       18.436524       100.341674   4.628771           0 -21761.726554   
4       11.558279        31.997993   4.075075           0 -17600.904570   
5        8.399735        54.917862   2.559708           0 -28294.985429   
6       13.789695        84.603556   2.672989           0 -28462.906819   
7       12.363817        62.798309   4.401425           0 -13415.133110   

                       

['water_quality_potability_model.pkl']