In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import pickle

In [2]:
data = pd.read_csv('balanced_water_quality_data_cowsv2.csv')

In [3]:
data.head()

Unnamed: 0,pH,Nitrate,Sulfate,Total_Dissolved_Solids,Bacterial_Count,Hardness,Temperature,Suitability
0,7.148122,33.370861,35.716704,1952.665419,308,484.954926,23.324426,Suitable
1,6.530848,18.182497,45.851127,912.726729,21,3.533153,15.230624,Suitable
2,7.100381,12.203823,123.794228,103.165563,205,195.530304,16.822361,Suitable
3,7.888404,42.515587,51.985416,1703.100983,476,484.792314,22.751328,Suitable
4,6.73372,1.407982,49.710601,2134.025858,418,385.635173,15.740447,Suitable


In [4]:
data.sum()

pH                                                             28928.843944
Nitrate                                                       160496.562637
Sulfate                                                       500881.896037
Total_Dissolved_Solids                                       6046420.880189
Bacterial_Count                                                     1634992
Hardness                                                      984485.744421
Temperature                                                    79819.555953
Suitability               SuitableSuitableSuitableSuitableSuitableSuitab...
dtype: object

In [5]:
data.count()

pH                        4000
Nitrate                   4000
Sulfate                   4000
Total_Dissolved_Solids    4000
Bacterial_Count           4000
Hardness                  4000
Temperature               4000
Suitability               4000
dtype: int64

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 8 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   pH                      4000 non-null   float64
 1   Nitrate                 4000 non-null   float64
 2   Sulfate                 4000 non-null   float64
 3   Total_Dissolved_Solids  4000 non-null   float64
 4   Bacterial_Count         4000 non-null   int64  
 5   Hardness                4000 non-null   float64
 6   Temperature             4000 non-null   float64
 7   Suitability             4000 non-null   object 
dtypes: float64(6), int64(1), object(1)
memory usage: 250.1+ KB


In [8]:
data.head()

Unnamed: 0,pH,Nitrate,Sulfate,Total_Dissolved_Solids,Bacterial_Count,Hardness,Temperature,Suitability
0,7.148122,33.370861,35.716704,1952.665419,308,484.954926,23.324426,Suitable
1,6.530848,18.182497,45.851127,912.726729,21,3.533153,15.230624,Suitable
2,7.100381,12.203823,123.794228,103.165563,205,195.530304,16.822361,Suitable
3,7.888404,42.515587,51.985416,1703.100983,476,484.792314,22.751328,Suitable
4,6.73372,1.407982,49.710601,2134.025858,418,385.635173,15.740447,Suitable


In [10]:
X = data.drop(['Suitability', 'Nitrate', 'Sulfate', 'Bacterial_Count', 'Hardness'], axis=1)
y = data['Suitability']

In [11]:
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y) 

In [12]:
y

array([0, 0, 0, ..., 1, 1, 1])

# Identifying Encoded Values

In [13]:
original_labels = label_encoder.inverse_transform(y)

result_df = pd.DataFrame({
    'Encoded': y,
    'Original': data.Suitability
})

print("\nEncoded and Original Labels:")
print(result_df)


Encoded and Original Labels:
      Encoded    Original
0           0    Suitable
1           0    Suitable
2           0    Suitable
3           0    Suitable
4           0    Suitable
...       ...         ...
3995        1  Unsuitable
3996        1  Unsuitable
3997        1  Unsuitable
3998        1  Unsuitable
3999        1  Unsuitable

[4000 rows x 2 columns]


In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model Training

In [15]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "SVM": SVC(),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}

In [16]:
# Train and evaluate each model separately
for name, model in models.items():
    print(f"\n{'='*30}")
    print(f"Model: {name}")
    print(f"{'='*30}")
    
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy:.2f}")

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("\n")


Model: Logistic Regression
Accuracy: 0.50

Classification Report:
              precision    recall  f1-score   support

    Suitable       0.54      0.39      0.45       422
  Unsuitable       0.48      0.63      0.54       378

    accuracy                           0.50       800
   macro avg       0.51      0.51      0.50       800
weighted avg       0.51      0.50      0.49       800


Confusion Matrix:
[[163 259]
 [140 238]]



Model: Decision Tree
Accuracy: 0.51

Classification Report:
              precision    recall  f1-score   support

    Suitable       0.53      0.52      0.53       422
  Unsuitable       0.48      0.49      0.49       378

    accuracy                           0.51       800
   macro avg       0.51      0.51      0.51       800
weighted avg       0.51      0.51      0.51       800


Confusion Matrix:
[[219 203]
 [191 187]]



Model: Random Forest
Accuracy: 0.49

Classification Report:
              precision    recall  f1-score   support

    Suitable  

In [17]:
xgb_model = models["XGBoost"]
model_filename = 'xgboost_model.pkl'
with open(model_filename, 'wb') as file:
    pickle.dump(xgb_model, file)  # Save the trained XGBoost model
print(f"Model saved as {model_filename}")

Model saved as xgboost_model.pkl


# Predictions with User inputs

In [25]:
import pandas as pd
import pickle

def load_model():
    model_filename = 'xgboost_model.pkl'
    with open(model_filename, 'rb') as file:
        model = pickle.load(file)
    print("Loaded XGBoost model")
    return model

def predict_with_user_input(model, feature_columns):
    user_data = {}
    
    for column in feature_columns:
        user_data[column] = float(input(f"Enter {column}: "))
    
    user_input_df = pd.DataFrame([user_data])
    user_input_df = user_input_df[feature_columns]

    prediction = model.predict(user_input_df)
    suitability = 'Suitable' if prediction[0] == 0 else 'Unsuitable'
    print(f"\nPredicted Suitability: {suitability}")

model = load_model()
predict_with_user_input(model, X.columns.tolist())


Loaded XGBoost model
Enter pH: 7.519800	
Enter Nitrate: 38.244371
Enter Sulfate: 246.864154
Enter Total_Dissolved_Solids: 2115.063778	
Enter Bacterial_Count: 980	
Enter Hardness: 387.067329
Enter Temperature: 19.029998

Predicted Suitability: Unsuitable


In [14]:
data

Unnamed: 0,pH,Nitrate,Sulfate,Total_Dissolved_Solids,Bacterial_Count,Hardness,Temperature,Suitability
0,7.148122,33.370861,35.716704,1952.665419,308,484.954926,23.324426,Suitable
1,6.530848,18.182497,45.851127,912.726729,21,3.533153,15.230624,Suitable
2,7.100381,12.203823,123.794228,103.165563,205,195.530304,16.822361,Suitable
3,7.888404,42.515587,51.985416,1703.100983,476,484.792314,22.751328,Suitable
4,6.733720,1.407982,49.710601,2134.025858,418,385.635173,15.740447,Suitable
...,...,...,...,...,...,...,...,...
3995,7.519800,38.244371,246.864154,2115.063778,980,387.067329,19.029998,Unsuitable
3996,7.806385,68.398033,240.848345,645.390679,826,315.455215,17.112547,Unsuitable
3997,6.469691,47.123699,222.167150,1404.916462,464,116.064301,15.476124,Unsuitable
3998,7.833196,56.665608,67.899731,436.618594,976,180.630865,15.219706,Unsuitable
