In [21]:
# Import Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# ML libraries
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score, precision_score, recall_score
from imblearn.over_sampling import SMOTE

#Code to ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [22]:
# Load the dataset
df = pd.read_csv('hepatitis.csv', na_values="?")
df.head()

Unnamed: 0,ID,target,age,gender,steroid,antivirals,fatigue,malaise,anorexia,liverBig,...,spleen,spiders,ascites,varices,bili,alk,sgot,albu,protime,histology
0,1,2,30,2,1.0,2,2.0,2.0,2.0,1.0,...,2.0,2.0,2.0,2.0,1.0,85.0,18.0,4.0,,1
1,2,2,50,1,1.0,2,1.0,2.0,2.0,1.0,...,2.0,2.0,2.0,2.0,0.9,135.0,42.0,3.5,,1
2,3,2,78,1,2.0,2,1.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,0.7,96.0,32.0,4.0,,1
3,4,2,31,1,,1,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,0.7,46.0,52.0,4.0,80.0,1
4,5,2,34,1,2.0,2,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,1.0,,200.0,4.0,,1


In [23]:
# Drop the ID column
df.drop('ID', axis=1, inplace=True)
df.head()

Unnamed: 0,target,age,gender,steroid,antivirals,fatigue,malaise,anorexia,liverBig,liverFirm,spleen,spiders,ascites,varices,bili,alk,sgot,albu,protime,histology
0,2,30,2,1.0,2,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,1.0,85.0,18.0,4.0,,1
1,2,50,1,1.0,2,1.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,0.9,135.0,42.0,3.5,,1
2,2,78,1,2.0,2,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.7,96.0,32.0,4.0,,1
3,2,31,1,,1,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.7,46.0,52.0,4.0,80.0,1
4,2,34,1,2.0,2,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,,200.0,4.0,,1


In [24]:
# Separate the columns into categorical and numerical
num_cols = ["target", "age", "bili", "alk", "sgot", "albu", "protime"]
cat_cols = ['gender', 'steroid', 'antivirals', 'fatigue', 'malaise', 'anorexia', 'liverBig', 
            'liverFirm', 'spleen', 'spiders', 'ascites', 'varices', 'histology']

In [25]:
# Impute missing values of num_cols and cat_cols by using imputer
num_imputer = SimpleImputer(missing_values=np.nan, strategy='median')
cat_imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

df[num_cols] = num_imputer.fit_transform(df[num_cols])
df[cat_cols] = cat_imputer.fit_transform(df[cat_cols])

In [26]:
# Concatenate num_cols and cat_cols
df = pd.concat([df[num_cols], df[cat_cols]], axis=1)

In [39]:
# Check missing values
df.isnull().sum().sort_values(ascending=False)

target        0
age           0
varices       0
ascites       0
spiders       0
spleen        0
liverFirm     0
liverBig      0
anorexia      0
malaise       0
fatigue       0
antivirals    0
steroid       0
gender        0
protime       0
albu          0
sgot          0
alk           0
bili          0
histology     0
dtype: int64

In [40]:
df.head()

Unnamed: 0,target,age,bili,alk,sgot,albu,protime,gender,steroid,antivirals,fatigue,malaise,anorexia,liverBig,liverFirm,spleen,spiders,ascites,varices,histology
0,2.0,30.0,1.0,85.0,18.0,4.0,61.0,2.0,1.0,2.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,1.0
1,2.0,50.0,0.9,135.0,42.0,3.5,61.0,1.0,1.0,2.0,1.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,1.0
2,2.0,78.0,0.7,96.0,32.0,4.0,61.0,1.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0
3,2.0,31.0,0.7,46.0,52.0,4.0,80.0,1.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0
4,2.0,34.0,1.0,85.0,200.0,4.0,61.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0


In [27]:
# Split the data into X and y column
X = df.drop('target', axis=1)
y = df['target']

In [28]:
# Apply SMOTE for handling class imbalance
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

In [41]:
# Split the resampled data into train and test
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

In [30]:
# Standardize features by removing the mean and scaling to unit variance
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [31]:
# Model selection and hyperparameter tuning with GridSearchCV
param_grid = {'C': [0.1, 1, 10, 100], 'gamma': [0.1, 0.01, 0.001, 0.0001], 'kernel': ['rbf']}
svm_grid = GridSearchCV(SVC(), param_grid, cv=5, n_jobs=-1)
svm_grid.fit(X_train, y_train)
svm_best = svm_grid.best_estimator_

In [32]:
# Train the model
model= svm_best.fit(X_train, y_train)

In [33]:
# Predict the model
y_pred = svm_best.predict(X_test)

In [34]:
y_pred

array([2., 1., 1., 1., 1., 1., 1., 1., 2., 1., 1., 1., 1., 1., 2., 1., 1.,
       1., 1., 2., 1., 2., 2., 1., 1., 1., 1., 2., 1., 2., 1., 1., 2., 2.,
       1., 2., 2., 1., 2., 2., 1., 1., 2., 1., 2., 2., 2., 1., 2., 1.])

In [35]:
# Evaluate the model
print('Accuracy Score (SVM): ', accuracy_score(y_test, y_pred))
print('Recall Score (SVM): ', recall_score(y_test, y_pred))
print('Precision Score (SVM): ', precision_score(y_test, y_pred))
print('F1 Score (SVM): ', f1_score(y_test, y_pred))
print('Confusion Matrix (SVM): \n', confusion_matrix(y_test, y_pred))
print('Classification Report (SVM): \n', classification_report(y_test, y_pred))

Accuracy Score (SVM):  0.98
Recall Score (SVM):  1.0
Precision Score (SVM):  0.967741935483871
F1 Score (SVM):  0.9836065573770492
Confusion Matrix (SVM): 
 [[30  0]
 [ 1 19]]
Classification Report (SVM): 
               precision    recall  f1-score   support

         1.0       0.97      1.00      0.98        30
         2.0       1.00      0.95      0.97        20

    accuracy                           0.98        50
   macro avg       0.98      0.97      0.98        50
weighted avg       0.98      0.98      0.98        50



In [36]:
# RandomForestClassifier
rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_train, y_train)
y_pred_rfc = rfc.predict(X_test)

In [37]:
# Evaluate the RandomForestClassifier
print('\nAccuracy Score (Random Forest): ', accuracy_score(y_test, y_pred_rfc))
print('Recall Score (Random Forest): ', recall_score(y_test, y_pred_rfc))
print('Precision Score (Random Forest): ', precision_score(y_test, y_pred_rfc))
print('F1 Score (Random Forest): ', f1_score(y_test, y_pred_rfc))
print('Confusion Matrix (Random Forest): \n', confusion_matrix(y_test, y_pred_rfc))
print('Classification Report (Random Forest): \n', classification_report(y_test, y_pred_rfc))


Accuracy Score (Random Forest):  0.94
Recall Score (Random Forest):  0.9333333333333333
Precision Score (Random Forest):  0.9655172413793104
F1 Score (Random Forest):  0.9491525423728815
Confusion Matrix (Random Forest): 
 [[28  2]
 [ 1 19]]
Classification Report (Random Forest): 
               precision    recall  f1-score   support

         1.0       0.97      0.93      0.95        30
         2.0       0.90      0.95      0.93        20

    accuracy                           0.94        50
   macro avg       0.94      0.94      0.94        50
weighted avg       0.94      0.94      0.94        50



In [38]:
# save the model
import pickle
pickle.dump(model, open('./saved_models/02_model_hapitits.pkl', 'wb'))