Data Treatment

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('src/dataset/water_potability.csv')

In [3]:
print(df.columns)
df = df[['Potability'] + [column for column in df.columns if column != 'Potability']]

Index(['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity',
       'Organic_carbon', 'Trihalomethanes', 'Turbidity', 'Potability'],
      dtype='object')


In [4]:
print(df.columns)

Index(['Potability', 'ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate',
       'Conductivity', 'Organic_carbon', 'Trihalomethanes', 'Turbidity'],
      dtype='object')


In [5]:
df.isnull().sum()

Potability           0
ph                 491
Hardness             0
Solids               0
Chloramines          0
Sulfate            781
Conductivity         0
Organic_carbon       0
Trihalomethanes    162
Turbidity            0
dtype: int64

In [6]:
df = df.fillna({'ph': df.ph.median(),'Sulfate': df.Sulfate.median(), 'Trihalomethanes': df.Trihalomethanes.median()})

In [7]:
X = df.drop(columns=['Potability'])
Y = df['Potability']

In [8]:
X_train, X_test, Y_train, Y_test= train_test_split(X, Y, test_size= 0.2)

XGBoost Training

In [9]:
from xgboost import XGBClassifier

In [10]:
XGBoostModel = XGBClassifier(n_estimators=500, max_depth=16, objective='binary:logistic', eta=0.01, gamma=4.2, subsample = 0.5, random_state=42)

In [11]:
XGBoostModel.fit(X_train, Y_train)

In [12]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [13]:
XGBoostPredictions = XGBoostModel.predict(X_test)

In [14]:
accuracy = accuracy_score(Y_test, XGBoostPredictions)
report = classification_report(Y_test, XGBoostPredictions)
confusion = confusion_matrix(Y_test, XGBoostPredictions)

print("Accuracy:\n", accuracy)
print("Confusion Matrix:\n", confusion)
print("Relatory:\n", report)

Accuracy:
 0.6753048780487805
Confusion Matrix:
 [[374  26]
 [187  69]]
Relatory:
               precision    recall  f1-score   support

           0       0.67      0.94      0.78       400
           1       0.73      0.27      0.39       256

    accuracy                           0.68       656
   macro avg       0.70      0.60      0.59       656
weighted avg       0.69      0.68      0.63       656



RandomForest Training

In [15]:
from sklearn.ensemble import RandomForestClassifier

In [16]:
RandomForestModel = RandomForestClassifier(n_estimators=500, random_state=30)

In [17]:
RandomForestModel.fit(X_train, Y_train)

In [18]:
RandomForestPredictions = RandomForestModel.predict(X_test)

In [None]:
accuracy = accuracy_score(Y_test, RandomForestPredictions)
report = classification_report(Y_test, RandomForestPredictions)
confusion = confusion_matrix(Y_test, RandomForestPredictions)

print("Accuracy:\n", accuracy)
print("Confusion Matrix:\n", confusion)
print("Relatory:\n", report)

Accuracy:
 0.6951219512195121
Confusion Matrix:
 [[368  39]
 [161  88]]
Relatory:
               precision    recall  f1-score   support

           0       0.70      0.90      0.79       407
           1       0.69      0.35      0.47       249

    accuracy                           0.70       656
   macro avg       0.69      0.63      0.63       656
weighted avg       0.69      0.70      0.67       656



Saving The Models

In [19]:
import joblib

In [20]:
XGBoostModel.save_model("src/models/XGBoostModel.json")

In [22]:
joblib.dump(RandomForestModel, "src/models/RandomForestModel.joblib")

['./RandomForestModel.joblib']