In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import pickle

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


In [5]:
data = pd.read_csv('cow_water_quality_dataset_with_patterns_DR.csv')

In [6]:
data.head()

Unnamed: 0,pH,TDS,Temperature,suitable
0,6.515173,3151.620797,22.766013,0
1,8.591503,1872.212803,6.553427,0
2,6.708148,3340.92403,31.62743,0
3,8.133186,2392.626027,26.592238,1
4,8.055588,1388.7001,9.371154,0


In [7]:
data.sum()

pH             7.051644e+04
TDS            2.899846e+07
Temperature    2.000184e+05
suitable       2.133000e+03
dtype: float64

In [8]:
data.count()

pH             10000
TDS            10000
Temperature    10000
suitable       10000
dtype: int64

In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   pH           10000 non-null  float64
 1   TDS          10000 non-null  float64
 2   Temperature  10000 non-null  float64
 3   suitable     10000 non-null  int64  
dtypes: float64(3), int64(1)
memory usage: 312.6 KB


In [10]:
data.head()

Unnamed: 0,pH,TDS,Temperature,suitable
0,6.515173,3151.620797,22.766013,0
1,8.591503,1872.212803,6.553427,0
2,6.708148,3340.92403,31.62743,0
3,8.133186,2392.626027,26.592238,1
4,8.055588,1388.7001,9.371154,0


In [11]:
X = data.drop(['suitable'], axis=1)
y = data['suitable']

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model Training

In [13]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "SVM": SVC(),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}

In [15]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Train and evaluate each model separately
for name, model in models.items():
    print(f"\n{'='*30}")
    print(f"Model: {name}")
    print(f"{'='*30}")
    
    # Train the model
    model.fit(X_train, y_train)
    
    # Make predictions
    y_pred = model.predict(X_test)
    
    # Calculate and print accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy:.2f}")
    
    # Print classification report
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    
    # Print confusion matrix
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("\n")



Model: Logistic Regression
Accuracy: 0.77

Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.91      0.86      1569
           1       0.46      0.29      0.36       431

    accuracy                           0.77      2000
   macro avg       0.64      0.60      0.61      2000
weighted avg       0.75      0.77      0.75      2000


Confusion Matrix:
[[1420  149]
 [ 304  127]]



Model: Decision Tree
Accuracy: 1.00

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1569
           1       1.00      1.00      1.00       431

    accuracy                           1.00      2000
   macro avg       1.00      1.00      1.00      2000
weighted avg       1.00      1.00      1.00      2000


Confusion Matrix:
[[1567    2]
 [   0  431]]



Model: Random Forest
Accuracy: 1.00

Classification Report:
              precision    recall  f1-score   support

      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy: 0.82

Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.88      0.89      1569
           1       0.58      0.62      0.60       431

    accuracy                           0.82      2000
   macro avg       0.74      0.75      0.74      2000
weighted avg       0.83      0.82      0.82      2000


Confusion Matrix:
[[1376  193]
 [ 162  269]]



Model: XGBoost
Accuracy: 1.00

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1569
           1       0.99      1.00      1.00       431

    accuracy                           1.00      2000
   macro avg       1.00      1.00      1.00      2000
weighted avg       1.00      1.00      1.00      2000


Confusion Matrix:
[[1566    3]
 [   1  430]]




In [17]:
xgb_model = models["XGBoost"]
model_filename = 'xgboost_model.pkl'
with open(model_filename, 'wb') as file:
    pickle.dump(xgb_model, file)  # Save the trained XGBoost model
print(f"Model saved as {model_filename}")

Model saved as xgboost_model.pkl
