In [None]:
# Step 1. Loading the Data and required libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.layers import BatchNormalization
from imblearn.over_sampling import SMOTE
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_recall_curve
from sklearn.preprocessing import label_binarize


import sys
import json
sys.path.append("/home/guilherme/Documents/GitHub/Tese/MDCompass/")

from ML_Algorithm.Functions.compute_symptom_relatedness import compute_relatedness, compute_relatedness_matrix

# Severity lookup: the JSON file is parsed into `severity_mapping`, which
# get_severity_scores() queries by symptom code.
severity_scores_path = "/home/guilherme/Documents/GitHub/Tese/MDCompass/ML_Algorithm/json_files/severity_scores.json"
with open(severity_scores_path, "r") as severity_file:
    severity_mapping = json.load(severity_file)

# Dataset: read the "Augmented Data" sheet of the workbook into `df`.
file_path = "/home/guilherme/Documents/GitHub/Tese/Documentation/Dataset_Augmentation/Augmented_database_29_09_2023.xlsx"
df = pd.read_excel(file_path, sheet_name="Augmented Data")


In [None]:
# 2. Data Transformation and Encoding

# Symptom columns are named Symptom_1 ... Symptom_25.
symptom_cols = [f'Symptom_{i}' for i in range(1, 26)]

# Missing symptom cells become the 'None' placeholder string so every cell
# can be compared against it below.
df[symptom_cols] = df[symptom_cols].fillna('None')


def _present_symptoms(row):
    """Return the values of one row, leaving out the 'None' placeholder."""
    return [value for value in row if value != 'None']


# One list of symptom codes per record.
df['symptoms'] = df[symptom_cols].apply(_present_symptoms, axis=1)

# Multi-hot encode the symptom lists; `mlb` is kept so the same encoding can
# be applied to new inputs later.
mlb = MultiLabelBinarizer()
X = mlb.fit_transform(df['symptoms'])

# Additional features: per-symptom severity scores (relatedness is computed below)
def get_severity_scores(symptom_list):
    """Look up the severity score of each symptom, preserving input order.

    Symptoms that have no entry in ``severity_mapping`` are scored 0.
    """
    scores = []
    for code in symptom_list:
        scores.append(severity_mapping.get(code, 0))
    return scores


# Engineered per-record features, both derived from the symptom list:
# a relatedness matrix and a list of severity scores.
for feature_column, feature_fn in (
    ('relatedness_values', compute_relatedness_matrix),
    ('severity_values', get_severity_scores),
):
    df[feature_column] = df['symptoms'].apply(feature_fn)

# The largest relatedness matrix (by len) fixes the common padded size.
max_dim = max(len(matrix) for matrix in df['relatedness_values'])

def pad_matrix_to_shape(matrix, max_dim):
    """Zero-pad a 2-D matrix on its bottom and right edges to ``(max_dim, max_dim)``.

    Args:
        matrix: 2-D array-like (anything ``np.asarray`` accepts). An input
            with no elements produces an all-zero result.
        max_dim: Target number of rows and of columns.

    Returns:
        ``numpy.ndarray`` of shape ``(max_dim, max_dim)`` holding the input in
        its top-left corner and zeros everywhere else.

    Raises:
        ValueError: If a non-empty input is not 2-D, or if it is larger than
            ``max_dim`` along either axis.
    """
    matrix = np.asarray(matrix)

    # Nothing to place: return the all-zero target (also covers 1-D empties,
    # which np.pad cannot handle with a 2-axis pad spec).
    if matrix.size == 0:
        return np.zeros((max_dim, max_dim), dtype=matrix.dtype)

    if matrix.ndim != 2:
        raise ValueError(f"expected a 2-D matrix, got {matrix.ndim} dimension(s)")

    n_rows, n_cols = matrix.shape
    if n_rows > max_dim or n_cols > max_dim:
        raise ValueError(
            f"matrix of shape {matrix.shape} does not fit into ({max_dim}, {max_dim})"
        )

    # Each axis gets its own pad amount so the result is always
    # (max_dim, max_dim), not only when the input happens to be square.
    return np.pad(matrix, ((0, max_dim - n_rows), (0, max_dim - n_cols)), 'constant')

# Persist max_dim so code running outside this notebook can pad new inputs to
# the same size that was used for training.
with open('max_dim.json', 'w') as max_dim_file:
    json.dump({"max_dim": max_dim}, max_dim_file)

n_records = len(df)

# Bring every relatedness matrix to (max_dim, max_dim).
df['relatedness_values_padded'] = df['relatedness_values'].apply(
    lambda matrix: pad_matrix_to_shape(matrix, max_dim)
)

# Bring every severity list to length max_dim, zeros appended at the end.
padded_severity = pad_sequences(
    df['severity_values'],
    maxlen=max_dim,
    dtype='float32',
    padding='post',
)

# One flat row per record for each feature group.
relatedness_features = np.array(df['relatedness_values_padded'].tolist()).reshape(n_records, -1)
severity_features = padded_severity.reshape(n_records, -1)

# Final column layout: binarized symptoms | flattened relatedness | severity.
# Any code that feeds the trained model must reproduce this exact layout.
X = np.hstack((X, relatedness_features, severity_features))

# Integer-encode the target disease codes.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['ICD 11'])


In [None]:
# Step 3. Train/Test Split

# Hold out 20% of the records for testing; the fixed seed makes the split repeatable.
split_arrays = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_arrays

# Introducing SMOTE
# Only the training portion is resampled; the test portion stays as split.
smote = SMOTE(random_state=42, k_neighbors=4)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [None]:
# Step 4: Hyperparameter tuning

# Import the necessary libraries
from keras_tuner.tuners import RandomSearch
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Number of distinct encoded labels; sizes the softmax output layer in build_model.
n_classes = np.unique(y).size

# Define the model-building function
def build_model(hp):
    """Build and compile one candidate classifier for the hyperparameter search.

    Args:
        hp: Hyperparameter container supplied by the tuner. The search space is
            declared through its ``Int`` / ``Float`` calls below.

    Returns:
        A compiled ``Sequential`` model whose input width is the number of
        columns of ``X_train_resampled`` and whose softmax output has
        ``n_classes`` units.
    """
    n_features = X_train_resampled.shape[1]

    model = Sequential()

    # Input block: width from 256 to 1024 in steps of 256.
    input_units = hp.Int('input_units', min_value=256, max_value=1024, step=256)
    model.add(Dense(units=input_units, activation='relu', input_shape=(n_features,)))

    # 1 to 4 hidden blocks, each a Dense layer followed by Dropout, with
    # independently searched width and dropout rate.
    n_hidden_blocks = hp.Int('n_layers', 1, 4)
    for i in range(n_hidden_blocks):
        hidden_units = hp.Int(f'hidden_units_{i}', min_value=128, max_value=512, step=128)
        model.add(Dense(units=hidden_units, activation='relu'))

        dropout_rate = hp.Float(f'dropout_{i}', min_value=0.1, max_value=0.5, step=0.1)
        model.add(Dropout(rate=dropout_rate))

    # Output block: one probability per class.
    model.add(Dense(n_classes, activation='softmax'))

    # Sparse loss because the labels are integer-encoded.
    compile_options = {
        'optimizer': 'adam',
        'loss': 'sparse_categorical_crossentropy',
        'metrics': ['accuracy'],
    }
    model.compile(**compile_options)
    return model

# Define the tuner
# Random search over the space declared inside build_model: at most 10
# hyperparameter combinations (max_trials), each one built and trained twice
# (executions_per_trial), ranked by validation accuracy.
# Search state is stored under hyperparam_search/disease_prediction.
# NOTE(review): Keras Tuner may reuse results found in that directory from an
# earlier run instead of searching again — confirm this is what you want.
tuner = RandomSearch(
    build_model,  # No lambda function required here
    objective='val_accuracy',
    max_trials=10,
    executions_per_trial=2,
    directory='hyperparam_search',
    project_name='disease_prediction'
)

# Summarize the search space
# tuner.search_space_summary()

# Search for the best model
# The search runs on the SMOTE-resampled training arrays; validation_split
# carves the validation portion out of those same resampled arrays.
tuner.search(X_train_resampled, y_train_resampled, epochs=10, validation_split=0.2)

# Get the best hyperparameters and build the model
# `model` is constructed by calling the hypermodel's build step again with the
# winning hyperparameters; the actual training call is in Step 5.
best_hyperparameters = tuner.get_best_hyperparameters()[0]
print(best_hyperparameters.values)
model = tuner.hypermodel.build(best_hyperparameters)

model.summary()



In [None]:
## Step 5. Training the Model

# Callbacks
# Both callbacks watch the validation loss:
# - early_stopping ends training after 5 epochs without improvement and
#   restores the weights of the best epoch;
# - reduce_lr multiplies the learning rate by 0.2 after 3 epochs without
#   improvement, never going below 1e-6.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3, min_lr=1e-6)

# NOTE(review): this fits on X_train / y_train (the split *before* SMOTE),
# while the hyperparameter search above ran on X_train_resampled /
# y_train_resampled. The commented-out call below is the resampled variant —
# confirm which training set is intended.
history = model.fit(X_train, y_train, epochs=20, batch_size=32, 
                    validation_split=0.2, callbacks=[early_stopping, reduce_lr])

# history = model.fit(X_train_resampled, y_train_resampled, epochs=20, batch_size=32, validation_split=0.2)


In [None]:
from sklearn.metrics import classification_report, accuracy_score

# Step 6. Evaluating the Model

# Loss and accuracy on the held-out test split.
loss, accuracy = model.evaluate(X_test, y_test)

print(f"Accuracy of the model: {accuracy * 100:.2f}%")

# Per-class probabilities for the test split; the most probable class is the prediction.
y_pred_probabilities = model.predict(X_test)
y_pred_classes = y_pred_probabilities.argmax(axis=1)

# Report on every encoded class, including classes that do not occur in the
# test split, so the rows line up with the label encoder's class names.
class_names = label_encoder.classes_
all_classes = np.arange(len(class_names))
report = classification_report(
    y_test,
    y_pred_classes,
    labels=all_classes,
    target_names=class_names,
)
print(report)



In [None]:
# Step 7: Model Deployment
import joblib

# Save the model, label encoder, and binarizer for later use
# All three artifacts are written to the current working directory with joblib
# and immediately read back; the loaded_* objects are the ones used by
# predict_disease_nn below.
# NOTE(review): `model` is a Keras model. Pickling Keras models through joblib
# is not reliable across all TensorFlow/Keras versions; Keras' own
# model.save(...) / load_model(...) is the documented way to persist one.
# Confirm this round-trip works on the target environment.
# NOTE(review): joblib.load unpickles arbitrary objects — only load files
# produced by this notebook.
model_filename = 'NN_with_features.pkl'
label_encoder_filename = 'disease_label_encoder.pkl'
binarizer_filename = 'symptoms_binarizer.pkl'

joblib.dump(model, model_filename)
joblib.dump(label_encoder, label_encoder_filename)
joblib.dump(mlb, binarizer_filename)

loaded_model = joblib.load(model_filename)
loaded_label_encoder = joblib.load(label_encoder_filename)
loaded_binarizer = joblib.load(binarizer_filename)


In [None]:
def predict_disease_nn(symptoms_list):
    """
    Given a list of symptoms (ICD-11 codes), predict potential diseases using the trained neural network.

    The feature vector is built with the same column layout as the training
    matrix: binarized symptoms, then the relatedness matrix zero-padded to
    ``max_dim x max_dim`` and flattened row by row, then the severity scores
    zero-padded to length ``max_dim``.

    Args:
        symptoms_list: Sequence of symptom codes. If it yields more than
            ``max_dim`` relatedness rows / severity scores, only the first
            ``max_dim`` are kept for those two feature groups (the binarized
            part still reflects every known symptom).

    Returns:
        A list of up to five ``(disease_code, "NN.NN%")`` tuples, ordered from
        the most to the least probable disease.
    """
    # Multi-hot encoding of the symptoms (one row).
    symptoms_encoded = loaded_binarizer.transform([symptoms_list])

    # Calculate the relatedness and severity features
    relatedness_values = np.asarray(compute_relatedness_matrix(symptoms_list))
    severity_values = get_severity_scores(symptoms_list)

    # Pad the matrix BEFORE flattening. Training flattens an already padded
    # (max_dim, max_dim) matrix, so each matrix row occupies max_dim columns;
    # flattening first and padding the tail would shift every cell after the
    # first row into the wrong column.
    padded_matrix = np.zeros((max_dim, max_dim))
    if relatedness_values.ndim == 2:
        n_rows = min(relatedness_values.shape[0], max_dim)
        n_cols = min(relatedness_values.shape[1], max_dim)
        padded_matrix[:n_rows, :n_cols] = relatedness_values[:n_rows, :n_cols]
    padded_relatedness = padded_matrix.reshape(1, -1)

    # Same padding as in training; truncating='post' keeps the first max_dim
    # scores so that an over-long input is cut consistently with the matrix.
    padded_severity = pad_sequences(
        [severity_values], maxlen=max_dim, padding='post', truncating='post', dtype='float32'
    ).reshape(1, -1)

    # Combine all the features. The severity block is required: the model was
    # trained on symptoms + relatedness + severity, and leaving it out makes
    # the input width differ from what the model expects.
    combined_features = np.hstack((symptoms_encoded, padded_relatedness, padded_severity))

    # Predict using the trained model
    disease_probs = loaded_model.predict(combined_features)

    # Sort probabilities (descending) and keep the top 5 class indices
    top_5_disease_indices = np.argsort(disease_probs[0])[::-1][:5]

    # Decode the predicted disease codes
    top_5_disease_predictions = loaded_label_encoder.inverse_transform(top_5_disease_indices)

    # Extract the associated probabilities
    top_5_confidences = disease_probs[0][top_5_disease_indices]

    results = [(disease, f"{confidence * 100:.2f}%") for disease, confidence in zip(top_5_disease_predictions, top_5_confidences)]

    return results

# Example query: ten symptom codes run through the prediction helper.
sample_symptoms = [
    'MC15', '9D9Z', '9D90.6', '9C80.0', 'LD20.4',
    '8A68.Z', '9B73.3', '9B65.2', '1D01.Y', 'MA01.Z',
]
predicted_diseases = predict_disease_nn(sample_symptoms)

print("Given the symptoms, the top 5 predicted diseases are:")
for prediction in predicted_diseases:
    # Each prediction is a (disease code, formatted confidence) pair.
    disease, confidence = prediction
    print(f"Disease ICD-11 code: {disease} with confidence: {confidence}")


In [None]:
# Confusion Matrix Visualization

# Counts of (true label, predicted label) pairs on the test split.
conf_mat = confusion_matrix(y_test, y_pred_classes)

plt.figure(figsize=(12, 12))

# Integer annotations in every cell, blue colour scale.
heatmap_options = {'annot': True, 'fmt': "d", 'cmap': plt.cm.Blues}
sns.heatmap(conf_mat, **heatmap_options)

plt.title("Confusion Matrix")
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()


In [None]:
## 8. Visualization

# We can also visualize the training progress.

import matplotlib.pyplot as plt

# Training curves: accuracy on the left panel, loss on the right panel.
plt.figure(figsize=(12, 4))

# (history key for training, its legend label,
#  history key for validation, its legend label, panel title)
panels = (
    ('accuracy', 'Train Accuracy', 'val_accuracy', 'Validation Accuracy', 'Accuracy Over Epochs'),
    ('loss', 'Train Loss', 'val_loss', 'Validation Loss', 'Loss Over Epochs'),
)

for position, (train_key, train_label, val_key, val_label, panel_title) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.plot(history.history[train_key], label=train_label)
    plt.plot(history.history[val_key], label=val_label)
    plt.legend()
    plt.title(panel_title)

plt.tight_layout()
plt.show()
