# Deteksi Hepatitis MLP

## Import Library

In [None]:
# Data Manipulation
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

# Data Visualization
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
import missingno as msno
import seaborn as sns

# MLP
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from keras.models import Sequential
from keras.utils import np_utils
from keras.layers.core import Dense, Activation, Dropout
from keras.callbacks import EarlyStopping

# Utils
import json
from keras.utils import np_utils

## Load Data

In [None]:
# Read the hepatitis dataset from the CSV file and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='HepatitisDataset.csv')
df.head(n=5)

In [None]:
# Print the dataframe summary (columns, non-null counts, dtypes).
df.info()

## Data Preprocessing

### Remove Unnecessary Columns

In [None]:
# Drop the 'Unnamed: 0' column, which this notebook treats as unnecessary.
df = df.drop(columns=['Unnamed: 0'])

### Missing Values

In [None]:
# Draw missingno's matrix plot for df to inspect missing values visually.
msno.matrix(df)

In [None]:
# Count missing values per feature, broken down by the class label.
# The null mask must be grouped by the *original* Category values: grouping
# the mask by its own 'Category' column would group on "is Category null?"
# (True/False) rather than on the actual category.
df.drop(columns='Category').isnull().groupby(df['Category']).sum()

In [None]:
# Number of missing entries in each column.
df.isna().sum()

In [None]:
# Missing entries per column as a percentage of the number of rows.
df.isna().sum().mul(100).div(len(df))

Karena jumlah data yang kosong (null) terbilang sedikit, kami memutuskan untuk menghapus baris yang memiliki missing values.

In [None]:
# Discard every row that contains a missing value, renumber the index from
# zero, then show the per-column null counts to confirm none remain.
df = df.dropna().reset_index(drop=True)
df.isna().sum()

### Convert to Numerical Data

In [None]:
# Encode the two string columns as integers.
#
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on df['col']: that chained in-place form
# operates on an intermediate object and does not update df under pandas'
# Copy-on-Write behaviour.
sex_codes = {'m': 1, 'f': 0}
df['Sex'] = df['Sex'].replace(sex_codes).astype(int)

# Binary target: 1 for the hepatitis/fibrosis/cirrhosis labels, 0 for the
# blood-donor labels. A dict is used because a set is not a documented
# `to_replace` type.
category_codes = {
    "1=Hepatitis": 1,
    "2=Fibrosis": 1,
    "3=Cirrhosis": 1,
    "0=Blood Donor": 0,
    "0s=suspect Blood Donor": 0,
}
# astype(int) raises if an unexpected label is left unconverted, instead of
# silently keeping a mixed string/int column.
df['Category'] = df['Category'].replace(category_codes).astype(int)

In [None]:
# Show the last rows of df after the Sex/Category encoding step.
df.tail()

## Data Visualization

### Macam-macam Feature

Terdapat beberapa macam feature yaitu:

#### Categorical Feature

Categorical feature adalah feature yang berhubungan dengan kategori. Kategori bisa terdiri dari 2 atau lebih. Misalnya, jenis kelamin yang memiliki 2 kategori yaitu pria dan wanita. Feature ini biasanya disebut juga sebagai Nominal Feature.

#### Ordinal Feature

Ordinal feature mirip dengan categorical feature, namun nilai feature ini dapat diurutkan. Misalnya, kepuasan konsumen yang memiliki 4 kategori yang dapat diurutkan yaitu tidak puas, puas sedang, puas, dan sangat puas.

#### Continuous Feature

Continuous feature adalah feature numerik yang nilainya dapat berupa angka berapa pun dalam suatu rentang. Misalnya, umur.

### Feature Analysis

### Category Visualization

In [None]:
# Pie chart of the class balance (0 = Healthy, 1 = Hepatitis).
fig, ax = plt.subplots(1, 1, figsize=(6, 6))
fig.patch.set_facecolor('#f5f5f5')
sns.set_palette(sns.color_palette(['green', 'purple', 'orange', 'red']))

# value_counts() orders by frequency; sort by class value so the wedges are
# always in (0, 1) order and line up with the hard-coded labels and colours.
class_counts = df['Category'].value_counts().sort_index()

ax.pie(class_counts.values,
       labels=['Healthy', 'Hepatitis'],
       autopct='%1.1f%%', explode=[0.1, 0.1],
       colors=['tab:green', 'tab:red'])

# Soften the wedge colours.
for piece in ax.patches:
    piece.set_alpha(0.5)

plt.title('Hepatitis Dataset', {'font':'Serif', 'weight':'bold','color': 'black', 'size':25})

plt.show()

#### Age -> Continuous Feature

In [None]:
# Print the maximum, minimum and mean of the Age column, one per line.
age = df['Age']
for caption, value in (
    ('Oldest Age:', age.max()),
    ('Youngest Age:', age.min()),
    ('Average Age:', age.mean()),
):
    print(caption, value, 'Years')

In [None]:
# Overlaid age density curves for the two classes.
fig, ax = plt.subplots(1, 1, figsize=(12, 5))
fig.patch.set_facecolor('#f5f5f5')
sns.set_palette(sns.color_palette(['green', 'purple', 'orange', 'red']))

# Pass `data` and `x` by keyword: current seaborn takes `data` as the first
# positional parameter, so a positional column name clashes with `data=`.
# `fill` replaces the deprecated `shade` argument.
sns.kdeplot(data=df[df.Category == 0], x='Age', ax=ax, fill=True,
            color='tab:green', alpha=1, label='Healthy')
sns.kdeplot(data=df[df.Category == 1], x='Age', ax=ax, fill=True,
            color='tab:red', alpha=0.9, label='Hepatitis')

ax.legend()
ax.set_xlabel('Age', {'font':'Serif', 'fontsize':16,'fontweight':'bold', 'color':'black'})

plt.title('Age Distribution', {'font':'Serif', 'fontsize':20,'fontweight':'bold', 'color':'black'})

plt.show()

#### Sex -> Categorical Feature

In [None]:
# Two pie charts: class balance among males (Sex == 1) and females (Sex == 0).
fig = plt.figure(figsize=(24, 10), dpi=60)
gs = GridSpec(ncols=13, nrows=5, left=0.05, right=0.5, wspace=0.2, hspace=0.1)
fig.patch.set_facecolor('#f5f5f5')
sns.set_palette(sns.color_palette(['green', 'purple', 'orange', 'red']))

ax1 = fig.add_subplot(gs[:, 0:5])
ax2 = fig.add_subplot(gs[:, 8:])

heading_style = {'font':'Serif', 'weight':'bold','color': 'black', 'size':25}

# One entry per pie: (axes, Sex code, heading text, heading x-position).
panels = [
    (ax1, 1, 'Males', 0.1),
    (ax2, 0, 'Females', 0.38),
]

for ax, sex_code, heading, heading_x in panels:
    # Hide the frame and y-axis; only the pie should be visible.
    ax.get_yaxis().set_visible(False)
    ax.set_facecolor('#f5f5f5')
    for loc in ['left', 'right', 'top', 'bottom']:
        ax.spines[loc].set_visible(False)

    # Force both classes to be present, in (0, 1) order, so the values
    # always match the two labels even if a class is missing for this sex.
    counts = (df.loc[df.Sex == sex_code, 'Category']
              .value_counts()
              .reindex([0, 1], fill_value=0))

    # Same Healthy=green / Hepatitis=red scheme as the other charts.
    wedges, label_texts, percent_texts = ax.pie(
        counts.values,
        labels=['Healthy', 'Hepatitis'],
        autopct='%1.1f%%', explode=[0.1, 0.1],
        colors=['tab:green', 'tab:red'])

    for piece in wedges:
        piece.set_alpha(0.5)

    for text in label_texts:
        text.set_weight('bold')
        text.set_size(14)

    for text in percent_texts:
        text.set_weight('bold')
        text.set_size(12)

    fig.text(heading_x, 0.75, heading, heading_style)

plt.show()

#### ALB, ALP, ALT, AST, BIL, CHE, CHOL, CREA, GGT, PROT -> Continuous Feature

In [None]:
# 5x2 grid of density plots: one panel per laboratory feature, with the
# Healthy (Category == 0) and Hepatitis (Category == 1) curves overlaid.
fig = plt.figure(figsize=(16, 16), dpi=60, constrained_layout=True)
spec = GridSpec(ncols=2, nrows=5, figure=fig)

lab_features = ['ALB', 'ALP', 'ALT', 'AST', 'BIL',
                'CHE', 'CHOL', 'CREA', 'GGT', 'PROT']
panel_title_style = {'font':'Serif', 'weight':'bold','color': 'black', 'size':25}

healthy = df[df.Category == 0]
hepatitis = df[df.Category == 1]

for position, feature in enumerate(lab_features):
    # Fill the grid row by row, two panels per row.
    grid_row, grid_col = divmod(position, 2)
    ax = fig.add_subplot(spec[grid_row, grid_col])

    # `data`/`x` are passed by keyword and `fill` replaces the deprecated
    # `shade`; the positional-column form fails on current seaborn.
    sns.kdeplot(data=healthy, x=feature, ax=ax, fill=True,
                color='tab:green', alpha=1, label='Healthy', legend=False)
    sns.kdeplot(data=hepatitis, x=feature, ax=ax, fill=True,
                color='tab:red', alpha=0.9, label='Hepatitis', legend=False)
    ax.set_title(feature + ' vs Category', panel_title_style)

plt.show()

### Correlation Matrix Between Independent Variables

In [None]:
# Annotated heatmap of the pairwise correlations between df's columns.
correlations = df.corr()
plt.figure(figsize=(16, 8))
sns.heatmap(correlations, annot=True)
plt.show()

In [None]:
# Reference table for interpreting correlation coefficients.
# Every bucket uses inclusive bounds so that boundary values such as 0.40,
# 0.60 and 0.80 belong to exactly one bucket.
correlation_scale = pd.DataFrame(
    [
        ["0.00 <= r <= 0.19", "Very Low Correlation"],
        ["0.20 <= r <= 0.39", "Low Correlation"],
        ["0.40 <= r <= 0.59", "Moderate Correlation"],
        ["0.60 <= r <= 0.79", "High Correlation"],
        ["0.80 <= r <= 1.00", "Very High Correlation"],
    ],
    columns=['Scale of Correlation', 'Value'],
)
# Bare expression so the notebook still renders the table as cell output.
correlation_scale

Pada matriks korelasi di atas dapat dilihat bahwa korelasi tertinggi antar variabel independen adalah ALB dengan PROT dengan nilai 0.56, GGT dengan AST dengan nilai 0.49, dan GGT dengan ALP dengan nilai 0.45. Karena ketiga korelasi tersebut masih masuk ke dalam kategori moderate, kami memutuskan untuk menggunakan semua variabel independen yang ada untuk proses training.

## Separate Label and Features

In [None]:
# Target vector: the Category column.
y = df["Category"]
# Feature matrix: every column except Category.
X = df.drop(columns=['Category'])

In [None]:
# Display the feature matrix as the cell output.
X

In [None]:
# Display the target vector as the cell output.
y

## Train Test Split

In [None]:
# Hold out 30% of the rows for testing. The split is stratified on the label
# and uses a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=46,
)

## Data sampling

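Karena dataset yang digunakan imbalance (jumlah data pada tiap kelas tidak seimbang), data training di-resample menggunakan pipeline yang terdiri dari SMOTE, random oversampling, dan random undersampling. Resampling hanya diterapkan pada data training, sedangkan data testing tidak diubah.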

In [None]:
# Resampling pipeline: SMOTE, then random over-sampling, then random
# under-sampling. Every sampler gets a fixed seed so that the resampled
# training set (and the results built on it) is reproducible between runs.
resample_seed = 46
over = SMOTE(random_state=resample_seed)
overs = RandomOverSampler(random_state=resample_seed)
under = RandomUnderSampler(random_state=resample_seed)
# NOTE(review): with the default sampling_strategy, SMOTE presumably already
# balances the classes, which would leave the two following samplers with
# nothing to do -- confirm against the imbalanced-learn documentation.
steps = [('o', over), ('os', overs), ('u', under)]
pipeline = Pipeline(steps=steps)

In [None]:
# Keep references to the training split as it was before resampling, then
# replace X_train / y_train with the resampled versions.
X_train_og = X_train
y_train_og = y_train
X_train, y_train = pipeline.fit_resample(X_train_og, y_train_og)

### Perbandingan Distribusi Kelas Sebelum dan Sesudah Sampling

In [None]:
# Two pie charts comparing the training-label balance before and after
# resampling.
fig = plt.figure(figsize=(24, 10), dpi=60)
gs = GridSpec(ncols=13, nrows=5, left=0.05, right=0.5, wspace=0.2, hspace=0.1)
fig.patch.set_facecolor('#f5f5f5')
sns.set_palette(sns.color_palette(['green', 'purple', 'orange', 'red']))

ax1 = fig.add_subplot(gs[:, 0:5])
ax2 = fig.add_subplot(gs[:, 8:])

heading_style = {'font':'Serif', 'weight':'bold','color': 'black', 'size':25}

# One entry per pie: (axes, labels, heading text, heading x-position, explode).
panels = [
    (ax1, y_train_og, 'Before', 0.1, [0.1, 0.1]),
    (ax2, y_train, 'After', 0.38, [0.05, 0]),
]

for ax, class_labels, heading, heading_x, explode in panels:
    # Hide the frame and y-axis; only the pie should be visible.
    ax.get_yaxis().set_visible(False)
    ax.set_facecolor('#f5f5f5')
    for loc in ['left', 'right', 'top', 'bottom']:
        ax.spines[loc].set_visible(False)

    # value_counts() orders by frequency, which is ambiguous once the classes
    # are balanced; sort by class value so wedge 0 is always 'Healthy'.
    counts = class_labels.value_counts().sort_index()

    wedges, label_texts, percent_texts = ax.pie(
        counts.values,
        labels=['Healthy', 'Hepatitis'],
        autopct='%1.1f%%', explode=explode,
        colors=['green', 'red'])

    for piece in wedges:
        piece.set_alpha(0.5)

    for text in label_texts:
        text.set_weight('bold')
        text.set_size(14)

    for text in percent_texts:
        text.set_weight('bold')
        text.set_size(12)

    fig.text(heading_x, 0.75, heading, heading_style)

plt.show()

## Data Normalization

In [None]:
# Keep references to the unscaled feature sets; they are used later for the
# normalised-vs-unnormalised experiment.
X_train_unormal = X_train
X_test_unormal = X_test

In [None]:
# Scaler used to standardise the features.
standard_sc = preprocessing.StandardScaler()

In [None]:
# Fit the scaler on the training features only, then apply the same
# transformation to both the training and the test features.
X_train = standard_sc.fit_transform(X_train)
X_test = standard_sc.transform(X_test)

## One Hot Encoding

In [None]:
# One-hot encode the binary labels into two columns, as required by the
# 2-unit softmax output and categorical cross-entropy loss.
y_train = np_utils.to_categorical(y_train, num_classes=2)
y_test = np_utils.to_categorical(y_test, num_classes=2)

## Bikin Model

In [None]:
# Early-stopping callback that monitors validation accuracy with a patience
# of 5 epochs.
# NOTE(review): the fit calls in this notebook pass the test split as
# validation_data, so this callback stops training based on test-set
# accuracy -- consider a separate validation split.
earlystop = EarlyStopping(monitor='val_accuracy', patience=5, verbose=1, mode='auto')

In [None]:
# MLP: 12 inputs -> Dense(12) -> Dense(128) -> 2-way softmax, with ReLU
# activations and 20% dropout after each hidden layer.
model = Sequential([
    Dense(12, input_dim=12),
    Activation('relu'),
    Dropout(0.2),
    Dense(128),
    Activation('relu'),
    Dropout(0.2),
    Dense(2),
    Activation('softmax'),
])

model.compile(
    optimizer='sgd',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train for up to 100 epochs in batches of 8 with the early-stopping callback.
# NOTE(review): the test split is used as validation data here, so early
# stopping is driven by test-set accuracy.
model.fit(
    X_train,
    y_train,
    batch_size=8,
    epochs=100,
    verbose=1,
    validation_data=(X_test, y_test),
    callbacks=[earlystop],
)

In [None]:
# Plot the training and validation curves, first for accuracy and then for
# loss, each in its own figure.
history = model.history.history
curves = (
    ('accuracy', 'val_accuracy', 'model accuracy'),
    ('loss', 'val_loss', 'model loss'),
)
for train_key, val_key, title in curves:
    plt.plot(history[train_key])
    plt.plot(history[val_key])
    plt.title(title)
    plt.ylabel(train_key)
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper left')
    plt.show()

In [None]:
# Evaluate on the test split; `score` holds [loss, accuracy] and is reused
# later in the notebook.
score = model.evaluate(X_test, y_test, verbose=0)
test_loss, test_accuracy = score
print('Test loss:', test_loss)
print('Test accuracy:', test_accuracy)

### Validasi Model

In [None]:
# Model outputs for the test features; later cells take argmax over axis 1
# to obtain the predicted class.
y_test_pred = model.predict(X_test)

In [None]:
# Convert one-hot labels and model outputs back to class indices, then print
# the per-class precision/recall/F1 report.
true_classes = y_test.argmax(axis=1)
predicted_classes = y_test_pred.argmax(axis=1)
print(classification_report(true_classes, predicted_classes))

In [None]:
# Class display names, indexed by class id (also used by the
# wrong-prediction listing further down).
names = ['Non Hepatitis','Hepatitis']

# Confusion matrix of true vs. predicted class indices on the test split.
cm = confusion_matrix(y_test.argmax(axis=1), y_test_pred.argmax(axis=1))
fig, ax = plt.subplots(figsize=(10, 10))
# `linewidths` is the keyword documented by seaborn.heatmap.
sns.heatmap(cm, annot=True, linewidths=.5, linecolor="r", fmt=".0f", ax=ax)
plt.title("Confusion Matrix", size=25)
plt.xlabel("y_pred")
plt.ylabel("y_true")
ax.set_xticklabels(names)
ax.set_yticklabels(names)

plt.show()

In [None]:
# Collect every misclassified test sample as (features, predicted, actual).
# The loop variable is called `features` rather than `input` so the builtin
# input() is not shadowed for the rest of the notebook session.
wrong_pred = [
    (features, prediction, label)
    for features, prediction, label in zip(
        X_test, y_test_pred.argmax(axis=1), y_test.argmax(axis=1)
    )
    if prediction != label
]

if not wrong_pred:
    print("No wrong predictions yayy 🎉🎉")
else:
    print("Wrong predictions:")
    for features, prediction, label in wrong_pred:
        # `features` is the row of X_test that was fed to the model.
        print("Input:", features)
        print("Prediction:", names[prediction])
        print("Actual:", names[label])
        print()

## Experiment

### Normalized Data vs Unnormalized Data

In [None]:
# Same architecture as the main model, to be trained on the unscaled
# features for comparison.
model_unnormal = Sequential([
    Dense(12, input_dim=12),
    Activation('relu'),
    Dropout(0.2),
    Dense(128),
    Activation('relu'),
    Dropout(0.2),
    Dense(2),
    Activation('softmax'),
])

model_unnormal.compile(
    optimizer='sgd',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train the comparison model on the unscaled features with the same settings
# as the main model.
model_unnormal.fit(
    X_train_unormal,
    y_train,
    batch_size=8,
    epochs=100,
    verbose=1,
    validation_data=(X_test_unormal, y_test),
    callbacks=[earlystop],
)

In [None]:
# Evaluate the unnormalised model on the *unscaled* test features, i.e. the
# same feature space it was trained and validated on. Evaluating it on the
# standardised X_test would make the comparison meaningless.
score_unormal = model_unnormal.evaluate(X_test_unormal, y_test, verbose=0)

In [None]:
# Bar chart comparing test accuracy with and without feature normalisation.
# x positions of the bars
left = [1, 2]
# bar heights: test accuracy of each model (index 1 of the evaluate() result)
height = [score[1], score_unormal[1]]
# labels for bars
tick_label = ['Normalized Data', 'Unnormalized Data']
# plotting a bar chart
plt.bar(left, height, tick_label=tick_label,
        width=0.8, color=['tab:red', 'tab:blue'])
# The bars are vertical: the methods are on the x-axis and the accuracy is
# on the y-axis, so label the axes accordingly.
plt.xlabel('Method')
plt.ylabel('Accuracy')
# plot title
plt.title('Comparison of Accuracy')
# function to show the plot
plt.show()