# Imports

In [None]:
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Input, Dense, Normalization, Dropout
from keras.optimizers import Adam, SGD
from keras.losses import SparseCategoricalCrossentropy
from keras.metrics import FalsePositives

import shap
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import wandb
from wandb.keras import WandbCallback

In [None]:
# Hyperparameter search space for the W&B sweep. Entries with 'values' are
# discrete choices; entries with 'min'/'max' are sampled from that range.
_sweep_parameters = {
    'optimizer': {'values': ['adam', 'sgd']},
    'nodes': {'values': [64, 128, 256, 512, 1024]},
    'epochs': {'max': 30, 'min': 10},
    'learning_rate': {'max': 0.01, 'min': 0.00001},
    'batch_size': {'values': [16, 32, 64]},
    'layers': {'max': 6, 'min': 1},
    'dropout': {'values': [0.3, 0.4, 0.5]},
}

# Sweep definition: 'bayes' search method, minimising the metric named 'loss'.
sweep_config = {
    'method': 'bayes',
    'metric': {'name': 'loss', 'goal': 'minimize'},
    'parameters': _sweep_parameters,
}

In [None]:
# Authenticate without embedding a secret in the notebook: with no `key`
# argument, wandb.login() uses the WANDB_API_KEY environment variable or
# previously stored credentials, and otherwise prompts interactively.
# NOTE(review): an API key used to be committed on this line; it should be
# revoked and rotated in the W&B account settings.
wandb.login()

# Register the sweep described by sweep_config under the "parodontitis" project.
sweep_id = wandb.sweep(sweep_config, project="parodontitis")

In [None]:
# Fixed seed passed to train_test_split so the train/test split is reproducible.
RANDOM_STATE = 1

# Load data

In [None]:
# Alternative loader kept for reference:
# df = convert_xml_to_dataframe()
df = pd.read_csv('../data/patients-v2.csv')

# Columns used as model inputs.
# PATIENT_ID, SEX, BIRTH_DATE and VISIT_DATE are not used as features.
feature_columns = [
    'TREATING_PROVIDER_DENTIST',
    'TREATING_PROVIDER_FACULTY',
    'TREATING_PROVIDER_STUDENT',
    'PROCEDURE_A',
    'PROCEDURE_B',
    'BLEEDING_ON_PROBING',
    'NR_OF_POCKET',
    'NR_OF_FURCATION',
    'NR_OF_MOBILITY',
    'TOTAL_LOSS_OF_ATTACHMENT_LEVEL',
]

# Label column; selecting with a list keeps `y` a one-column DataFrame.
target_columns = ['HAS_PARODONTITIS']

X = df[feature_columns]
y = df[target_columns]
# df.head()

# Split train and test data

In [None]:
# Hold out 30% of the rows for testing; stratify on the label so the split is
# made with respect to the class distribution of `y`.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=RANDOM_STATE,
)

In [None]:
# Report the share of negative labels (HAS_PARODONTITIS == 0) in each split.
train_is_negative = y_train["HAS_PARODONTITIS"] == 0
test_is_negative = y_test["HAS_PARODONTITIS"] == 0

y_train_without_parodontitis = y_train[train_is_negative]
y_test_without_parodontitis = y_test[test_is_negative]

train_negative_pct = (100/len(y_train)) * len(y_train_without_parodontitis)
test_negative_pct = (100/len(y_test)) * len(y_test_without_parodontitis)

print(f'y_train: {train_negative_pct}% without parodontitis')
print(f'y_test: {test_negative_pct}% without parodontitis')

# Normalization

In [None]:
# Normalization layer over the last axis, adapted on the training split only.
train_features = np.array(X_train)
normalizer = Normalization(axis=-1)
normalizer.adapt(train_features)

# Training

In [None]:
def train():
    """Train one classifier for a W&B run and log the saved model as an artifact.

    Hyperparameters are read from ``wandb.config``; ``default_config`` below is
    passed to ``wandb.init`` as the run's default configuration. The function
    uses the module-level ``X_train``, ``y_train`` and ``normalizer``.

    The network is: input -> shared normalization layer -> ``layers`` blocks of
    Dense(``nodes``, relu) + Dropout(``dropout``) -> Dense(2, softmax). It is
    compiled with sparse categorical cross-entropy and an accuracy metric.

    Side effects: starts a W&B run, writes
    ``models/parodontitis_sparse_categorical_classifier.h5`` (overwritten on
    every call) and logs that file as a W&B artifact named
    ``run_<run id>_model``.
    """
    import os  # local import: only needed to make sure the output folder exists

    default_config = {
        "optimizer": 'adam',
        "nodes": 128,
        "epochs": 1,
        "learning_rate": 1e-2,
        "batch_size": 32,
        "layers": 2,
        "dropout": 0.4,
    }

    wandb.init(config=default_config)
    config = wandb.config

    # Build the optimizer with the `learning_rate` keyword. The older `lr`
    # alias is deprecated and is ignored or rejected by current Keras
    # optimizers, which would leave the swept learning rate unused.
    optimizer_classes = {'adam': Adam, 'sgd': SGD}
    optimizer_cls = optimizer_classes.get(config.optimizer)
    if optimizer_cls is not None:
        optimizer = optimizer_cls(learning_rate=config.learning_rate)
    else:
        # Unrecognised name: pass the configured value through to compile()
        # unchanged, as before.
        optimizer = config.optimizer

    model = Sequential()
    model.add(Input(shape=(X_train.shape[1],)))
    model.add(normalizer)

    for _ in range(config.layers):
        model.add(Dense(config.nodes, activation='relu'))
        model.add(Dropout(config.dropout))

    # Two output units, one per class label (0 / 1).
    model.add(Dense(2, activation='softmax'))

    model.compile(
        optimizer=optimizer,
        loss=SparseCategoricalCrossentropy(),
        metrics=['accuracy'],
    )

    model.fit(
        X_train,
        y_train,
        epochs=config.epochs,
        batch_size=config.batch_size,
        callbacks=[WandbCallback()],
    )

    model_path = 'models/parodontitis_sparse_categorical_classifier.h5'
    # Saving fails if the target folder is missing, so create it first.
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    model.save(model_path, overwrite=True)
    wandb.log_artifact(model_path, name='run_' + wandb.run.id + '_model', type='model')

In [None]:
# Run the sweep agent: calls train() for up to five sweep runs.
wandb.agent(sweep_id, function=train, count=5)

# Loading a saved model

In [None]:
# Optional: download a model version that a sweep run logged to W&B
# instead of using a local file.
# wandb.init()
# path = wandb.use_artifact('parodontitis/run_9c859hs6_model:v0').download()

# Load a previously saved Keras model from disk.
# NOTE(review): train() saves to
# 'models/parodontitis_sparse_categorical_classifier.h5', but a different file
# is loaded here; the validation cell expects two output columns. Confirm
# this is the intended model.
from keras.models import load_model

saved_model_path = 'models/parodontitis_binary_classifier.h5'
model = load_model(saved_model_path)

## Validation using the test set

In [None]:
# Model outputs for the test set, one row per sample; columns 0 and 1 are
# compared below to pick the predicted label.
result = pd.DataFrame(model.predict(X_test))

print(result)

# Predict 0 only when column 0 is strictly larger than column 1; otherwise 1.
class0_wins = result[0] > result[1]
result['PREDICTION'] = (~class0_wins).astype('int64')

# reset_index() gives y_test the same 0..n-1 row labels as `result`, so the
# column assignment below lines up row by row.
# NOTE(review): this rebinds y_test, so re-running the cell applies
# reset_index() to the already-reset frame again.
y_test = y_test.reset_index()
y_test['PREDICTION'] = result['PREDICTION']

validation_result = y_test[['PREDICTION', 'HAS_PARODONTITIS']]

In [None]:
# Rows where the predicted label differs from the true label.
is_wrong = validation_result["HAS_PARODONTITIS"] != validation_result['PREDICTION']
wrong_prediction = validation_result[is_wrong]

print(f"Predicted wrong: {len(wrong_prediction)}/{len(validation_result)}")

# SHAP

In [None]:
# Initialise SHAP's JavaScript support for interactive plots in the notebook.
shap.initjs()

In [None]:
# Kernel SHAP explainer for the loaded model, using the first 100 training
# rows as background data.
shap_explainer = shap.KernelExplainer(model, X_train[:100])

# Explain a single training row (positional index 1).
sample = np.array(X_train.iloc[1])
# sample = np.array([2,2,0,0,1]) # <- playable sample

shap_values = shap_explainer.shap_values(sample)

# One force plot per output class:
#   index 0 -> label "has not parodontitis"
#   index 1 -> label "has parodontitis"
for class_index in (0, 1):
    shap.force_plot(
        shap_explainer.expected_value[class_index],
        shap_values[class_index],
        sample,
        matplotlib=True,
        show=False,
        plot_cmap=['#77dd77', '#f99191'],
        feature_names=X_train.columns,
    )

# show=False above defers rendering, so both figures are displayed here.
plt.show()