In [None]:
# Step 1: Loading the Data

import joblib
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder

import json
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import tensorflow as tf
import sys
sys.path.append("/home/guilherme/Documents/GitHub/Tese/MDCompass/")
from keras.preprocessing.sequence import pad_sequences

from ML_Algorithm.Functions.compute_symptom_relatedness import compute_relatedness, compute_relatedness_matrix

# Root of the thesis checkout. Defaults to the original machine-specific
# location; export TESE_ROOT to run the notebook from another checkout.
TESE_ROOT = os.environ.get("TESE_ROOT", "/home/guilherme/Documents/GitHub/Tese")
AUGMENTATION_DIR = os.path.join(TESE_ROOT, "Documentation", "Dataset_Augmentation")

# Loading the severity scores (consumed by get_severity_scores via .get(symptom, 0))
with open(os.path.join(TESE_ROOT, "MDCompass", "ML_Algorithm", "json_files", "severity_scores.json"), "r") as f:
    severity_mapping = json.load(f)

# Raw workbook: only its "Original Data" sheet is used.
file_path = os.path.join(AUGMENTATION_DIR, "Raw_Augmented_database_13_10_2023.xlsx")
df_raw = pd.read_excel(file_path, sheet_name="Original Data")

# Augmented workbook: both sheets come from the same file, so parse it once
# and pick the two sheets out of the returned {sheet name: DataFrame} dict.
file_path = os.path.join(AUGMENTATION_DIR, "Augmented_database_29_09_2023.xlsx")
_augmented_sheets = pd.read_excel(file_path, sheet_name=["Original Data", "Augmented Data"])
df = _augmented_sheets["Original Data"]
df_augmented = _augmented_sheets["Augmented Data"]

## Loading the models
# Label encoders, one per model family.
NoFeature_label_encoder = "loaded_models/no_features_label_encoder.pkl"
WithFeature_label_encoder = "loaded_models/with_features_label_encoder.pkl"
# Despite its name this points at raw_X_padded.pkl; predict_disease only reads
# `.shape[1]` from the loaded object to get the padding length.
Embeddings_feature_label_encoder = "loaded_models/raw_X_padded.pkl"
BothWithFeature_label_encoder = "loaded_models/both_features_label_encoder.pkl"


# Symptom binarizers, one per model family.
NoFeature_symptom_binarizer = "loaded_models/no_features_symptoms_binarizer.pkl"
WithFeature_symptom_binarizer = "loaded_models/with_features_symptoms_binarizer.pkl"
BothWithFeature_symptom_binarizer = "loaded_models/both_features_symptoms_binarizer.pkl"

# Classifiers.
KNN_NoFeature = "loaded_models/KNN_no_features.pkl"
KNN_WithFeature = "loaded_models/KNN_with_features.pkl"
RTF_NoFeature = "loaded_models/RTF_no_features.pkl"
RTF_WithFeature = "loaded_models/RTF_with_features.pkl"
NN_NoFeature = "loaded_models/NN_no_features.pkl"
NN_WithFeature = "loaded_models/NN_with_features.pkl"

NN_BothFeatures = "loaded_models/NN_with_features_BOTH.pkl"

Embedding_model = "loaded_models/Raw_Embedded_model.pkl"
w2v_model = "loaded_models/raw_w2v_model.pkl"


# Models compared by the cells below. Alternative selections are kept as
# commented-out toggles.
# model_paths = [KNN_NoFeature, RTF_NoFeature, NN_NoFeature] # Add paths to your models
# model_paths = [KNN_WithFeature, RTF_WithFeature, NN_WithFeature] # Add paths to your models
model_paths = [NN_NoFeature, NN_WithFeature, Embedding_model] # Add paths to your models

# Both lists are indexed by the `val` the prediction cells derive from the
# model path: 0 = no features, 1 = with features, 2 = embedded (the Word2Vec
# model and the padded matrix stand in for binarizer / label encoder),
# 3 = both features.
binarizer_paths = [NoFeature_symptom_binarizer, WithFeature_symptom_binarizer, w2v_model, BothWithFeature_symptom_binarizer] # Corresponding binarizers
label_encoder_paths = [NoFeature_label_encoder, WithFeature_label_encoder, Embeddings_feature_label_encoder, BothWithFeature_label_encoder] # Corresponding label encoders

In [None]:
# Step 2: Extracting the sample data to feed the function and data initialization


# Keep only diseases whose row has at least 11 non-null cells.
# NOTE(review): the slices below skip the first 3 non-null cells of a row, so
# a row needs 13 non-null cells to yield a full 10 symptoms; rows with 11-12
# contribute fewer symptoms. Confirm whether the threshold should be 13.
filtered_data = df[df.notnull().sum(axis=1) >= 11]

# Draw a fixed-size, seeded sample so every "Restart & Run All" evaluates the
# same diseases.
SAMPLE_SIZE = 15
SAMPLE_SEED = 42
sample_data = filtered_data.sample(SAMPLE_SIZE, random_state=SAMPLE_SEED)
sample_diseases = sample_data["ICD 11"].tolist()

max_dim = 24 #Max number of symptoms

# Collect up to 10 symptoms per sampled disease, both in encoded form (from
# `df`) and in raw form (from `df_raw`).
sample_symptoms_list = []
sample_symptoms_raw_list = []
for _, row in sample_data.iterrows():
    encoded_symptoms = row.dropna().values[3:13].tolist()

    # Build the mask from df_raw itself: a mask taken from `df` only selects
    # the right row when both frames happen to share index and row order.
    raw_rows = df_raw.loc[df_raw['ICD 11'] == row['ICD 11']]
    if raw_rows.empty:
        raise ValueError(f"ICD 11 code {row['ICD 11']!r} not found in the raw dataset")
    raw_symptoms = raw_rows.dropna(axis=1).values[0][3:13].tolist()

    sample_symptoms_list.append(encoded_symptoms)
    sample_symptoms_raw_list.append(raw_symptoms)


In [None]:
# Step 3: Existing function for predicting diseases given a model

# Width of each symptom embedding vector handed to symptoms_to_embedding.
embedding_size = 128

# Label encoder fitted on the augmented ICD 11 column. predict_disease uses it
# to decode the predictions of the embedding-based model.
encoder = LabelEncoder().fit(df_augmented["ICD 11"])
y_encoded = encoder.transform(df_augmented["ICD 11"])


def _top_predictions(disease_probabilities, decoder, top_n):
    """Return the ``top_n`` (label, probability) pairs of the first sample, most probable first."""
    if top_n <= 0:
        # `[-0:]` would select every class instead of none.
        return []
    first_row = disease_probabilities[0]
    top_indices = np.argsort(first_row)[-top_n:][::-1]
    return list(zip(decoder.inverse_transform(top_indices), first_row[top_indices]))


def _padded_relatedness(symptoms_list, max_dim):
    """Flatten the symptom relatedness matrix and zero-pad it to ``max_dim * max_dim`` values."""
    flattened_relatedness = compute_relatedness_matrix(symptoms_list).reshape(-1)
    return pad_sequences([flattened_relatedness], maxlen=max_dim*max_dim, padding='post', dtype='float32').reshape(1, -1)


def predict_disease(model, binarizer, label_encoder, symptoms_list, raw_symptoms_list, model_path, max_dim=max_dim, top_n=5):
    """
    Given a list of symptoms (ICD-11 codes), predict the top potential diseases along with their confidence.

    The feature layout is inferred from ``model_path``:

    * contains ``"Embedded"``: ``raw_symptoms_list`` is embedded with
      ``binarizer`` (used as the Word2Vec model) and padded to
      ``label_encoder.shape[1]``; labels are decoded with the module-level
      ``encoder``.
    * contains ``"both_features"`` or ``"features_both"`` (case-insensitive):
      binarized symptoms + padded relatedness + padded severity scores.
    * contains ``"with_features"``: binarized symptoms + padded relatedness
      (only when ``max_dim`` is truthy).
    * otherwise: binarized symptoms only.

    Args:
        model: Fitted estimator exposing ``predict_proba`` or ``predict``.
        binarizer: Object with ``transform`` (or the Word2Vec model for
            embedded models).
        label_encoder: Object with ``inverse_transform`` (or, for embedded
            models, an array-like whose ``shape[1]`` is the padded length).
        symptoms_list: Encoded symptoms of one case.
        raw_symptoms_list: Raw symptoms of the same case (embedded models only).
        model_path: Path the model was loaded from; drives the dispatch above.
        max_dim: Maximum number of symptoms used to size the padded features.
        top_n: Number of predictions to return.

    Returns:
        List of ``(disease, probability)`` tuples, most probable first.
    """

    if "Embedded" in model_path:
        embedded_symptoms = symptoms_to_embedding(raw_symptoms_list, binarizer, embedding_size)
        embedded_symptoms = np.expand_dims(embedded_symptoms, 0)  # Add batch dimension

        # Pad the embedded symptoms.
        # NOTE(review): pad_sequences defaults to dtype='int32', which would
        # truncate float embeddings. Left unchanged in case the model was
        # trained on data produced by the same call; confirm.
        features = pad_sequences(embedded_symptoms, maxlen=label_encoder.shape[1], padding='post')

        return _top_predictions(model.predict(features), encoder, top_n)

    symptoms_encoded = binarizer.transform([symptoms_list])
    features = symptoms_encoded

    # The "both" check must come first and be case-insensitive: a path such as
    # "NN_with_features_BOTH.pkl" also contains "with_features" and never
    # contains the lowercase literal "both_features".
    path_key = model_path.lower()
    if "both_features" in path_key or "features_both" in path_key:
        severity_values = get_severity_scores(symptoms_list)
        padded_severity = pad_sequences([severity_values], maxlen=max_dim, padding='post', dtype='float32').reshape(1, -1)
        features = np.hstack((symptoms_encoded, _padded_relatedness(symptoms_list, max_dim), padded_severity))
    elif "with_features" in model_path:
        if max_dim:
            features = np.hstack((symptoms_encoded, _padded_relatedness(symptoms_list, max_dim)))

    # Prefer class probabilities; fall back to predict() for models without
    # predict_proba. Looking the method up (instead of catching AttributeError
    # around the call) keeps unrelated attribute errors visible.
    predict_fn = getattr(model, "predict_proba", None)
    if predict_fn is None:
        predict_fn = model.predict
    disease_probabilities = predict_fn(features)

    return _top_predictions(disease_probabilities, label_encoder, top_n)


# Load a model together with its binarizer and label encoder from pickle files
def load_model_from_files(model_path, binarizer_path, label_encoder_path):
    """
    Load the three artefacts needed for prediction with joblib.

    Returns a ``(model, binarizer, label_encoder)`` tuple, loaded in that order
    from the corresponding paths.
    """
    artefact_paths = (model_path, binarizer_path, label_encoder_path)
    return tuple(joblib.load(path) for path in artefact_paths)

def get_disease_name_from_code(code):
    """
    Look up the 'Disease' value of the first `df` row whose 'ICD 11' equals `code`.

    Falls back to returning `code` unchanged when no row matches.
    """
    matching_names = df.loc[df['ICD 11'] == code, 'Disease']
    if matching_names.empty:
        # Unknown code: hand it back as-is
        return code
    return matching_names.iloc[0]

def symptoms_to_embedding(symptoms, model, embedding_size):
    """
    Build a (len(symptoms), embedding_size) matrix of symptom embeddings.

    Each symptom is converted to ``str`` once and that same key is used for
    both the vocabulary check and the lookup in ``model.wv``. Previously the
    check used the raw value and the lookup used ``str(symptom)``, so
    non-string symptoms were silently zeroed even when their string form was
    in the vocabulary. Symptoms missing from the vocabulary keep an all-zero
    row.

    Args:
        symptoms: Iterable (with ``len``) of symptom identifiers.
        model: Object exposing a ``wv`` mapping that supports ``in`` and
            ``[]`` (e.g. a gensim Word2Vec model).
        embedding_size: Number of columns of the returned matrix.

    Returns:
        numpy.ndarray of shape ``(len(symptoms), embedding_size)``.
    """
    embedding_matrix = np.zeros((len(symptoms), embedding_size))
    for i, symptom in enumerate(symptoms):
        key = str(symptom)
        if key in model.wv:
            embedding_matrix[i] = model.wv[key]
    return embedding_matrix


def get_severity_scores(symptom_list):
    """Return the severity score of each symptom, in order; symptoms missing from `severity_mapping` score 0."""
    scores = []
    for symptom in symptom_list:
        scores.append(severity_mapping.get(symptom, 0))
    return scores
    

In [None]:
# Print the top predictions of every model for every sampled disease.
val = 0
for i in range(len(model_paths)):

    # Pick the index into binarizer_paths / label_encoder_paths from the model
    # path. The "both" check runs first and is case-insensitive: a path such
    # as "NN_with_features_BOTH.pkl" also contains "with_features" and never
    # contains the lowercase literal "both_features".
    path_key = model_paths[i].lower()
    if "both_features" in path_key or "features_both" in path_key:
        val = 3
    elif "with_features" in model_paths[i]:
        val = 1
    elif "Embedded" in model_paths[i]:
        val = 2
    else:
        val = 0

    model, binarizer, label_encoder = load_model_from_files(model_paths[i], binarizer_paths[val], label_encoder_paths[val])
    print(f"\n\nModel {i+1} Predictions:")
    for j, symptoms in enumerate(sample_symptoms_list):
        raw_symptoms = sample_symptoms_raw_list[j]  # Get the corresponding raw symptoms
        predicted_diseases_with_confidence = predict_disease(model, binarizer, label_encoder, symptoms, raw_symptoms, model_path=model_paths[i])
        print(f"\nFor actual disease {sample_diseases[j]}:")
        for disease, confidence in predicted_diseases_with_confidence:
            print(f"Predicted Disease: {disease} with confidence: {confidence*100:.2f}%")
    print("==========================================")


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from tabulate import tabulate

# Top-prediction confidence per model (outer list) and per sampled disease
# (inner list), plus the disease codes used as x tick labels.
model_confidences = [[] for _ in model_paths]
all_diseases = []

for i in range(len(model_paths)):

    # Pick the index into binarizer_paths / label_encoder_paths from the model
    # path. The "both" check runs first and is case-insensitive: a path such
    # as "NN_with_features_BOTH.pkl" also contains "with_features" and never
    # contains the lowercase literal "both_features".
    path_key = model_paths[i].lower()
    if "both_features" in path_key or "features_both" in path_key:
        val = 3
    elif "with_features" in model_paths[i]:
        val = 1
    elif "Embedded" in model_paths[i]:
        val = 2
    else:
        val = 0

    model, binarizer, label_encoder = load_model_from_files(model_paths[i], binarizer_paths[val], label_encoder_paths[val])

    for j, symptoms in enumerate(sample_symptoms_list):
        raw_symptoms = sample_symptoms_raw_list[j]
        predicted_diseases_with_confidence = predict_disease(model, binarizer, label_encoder, symptoms,raw_symptoms, model_path=model_paths[i])

        # If it's the first model, add the disease to all_diseases list
        if i == 0:
            all_diseases.append(sample_diseases[j])

        # Append the top confidence value for the current model
        model_confidences[i].append(predicted_diseases_with_confidence[0][1])

# Plotting: one group of bars per disease, one bar per model.
bar_width = 0.25
index = np.arange(len(all_diseases))
plt.figure(figsize=(15, 7))

for i, confidences in enumerate(model_confidences):
    plt.bar(index + i*bar_width, [c*100 for c in confidences], bar_width, label=f'Model {i+1}', alpha=0.8)

plt.xlabel('Disease', fontsize=13)
plt.ylabel('Top Prediction Confidence (%)', fontsize=13)
# Centre each tick under its group for any number of models (for three models
# this is the original `index + bar_width`).
plt.xticks(index + bar_width * (len(model_confidences) - 1) / 2, all_diseases, rotation=45)
plt.legend()
plt.tight_layout()
plt.show()


In [None]:
# Heatmap of top-prediction confidence: one row per model, one column per disease.
# Confidences are scaled to percentages first.
heatmap_data = 100 * np.array(model_confidences)
model_labels = [f'Model {i+1}' for i in range(len(model_paths))]

plt.figure(figsize=(15, 7))
sns.heatmap(
    heatmap_data,
    annot=True,
    fmt=".0f",
    cmap='YlGnBu',
    cbar_kws={'label': 'Prediction Confidence (%)'},
    xticklabels=all_diseases,
    yticklabels=model_labels,
)
plt.yticks(rotation=0)  # keep the model labels horizontal
plt.ylabel('Model', fontsize=13)
plt.xlabel('Disease', fontsize=13)
plt.show()



In [None]:
# 4. Tabulated Predictions
# ---------------------
# One row per sampled disease; each model appends its top prediction to the row.
table_data = [[] for _ in sample_symptoms_list]

# Number of correct top predictions per model
correct_predictions = [0] * len(model_paths)

# For each model, collect the top prediction and add it to the respective disease's row
for i in range(len(model_paths)):

    # Pick the index into binarizer_paths / label_encoder_paths from the model
    # path. The "both" check runs first and is case-insensitive: a path such
    # as "NN_with_features_BOTH.pkl" also contains "with_features" and never
    # contains the lowercase literal "both_features".
    path_key = model_paths[i].lower()
    if "both_features" in path_key or "features_both" in path_key:
        val = 3
    elif "with_features" in model_paths[i]:
        val = 1
    elif "Embedded" in model_paths[i]:
        val = 2
    else:
        val = 0

    model, binarizer, label_encoder = load_model_from_files(model_paths[i], binarizer_paths[val], label_encoder_paths[val])

    for j, symptoms in enumerate(sample_symptoms_list):
        raw_symptoms = sample_symptoms_raw_list[j]
        predicted_diseases_with_confidence = predict_disease(model, binarizer, label_encoder, symptoms, raw_symptoms, model_path=model_paths[i])
        top_prediction = predicted_diseases_with_confidence[0]

        # Notation to mark if the prediction was correct or incorrect
        notation = "Incorrect"
        if top_prediction[0] == sample_diseases[j]:
            notation = "Correct"
            correct_predictions[i] += 1

        # Add the top prediction for this model to the disease's row
        table_data[j].append(f"{top_prediction[0]} ({top_prediction[1]*100:.2f}%) {notation}")

# Prefix each row with the ICD 11 code and the actual disease name
for j in range(len(sample_symptoms_list)):
    disease_name = get_disease_name_from_code(sample_diseases[j])
    table_data[j].insert(0, disease_name)
    table_data[j].insert(0, sample_diseases[j])  # Inserting code as a separate column entry

headers = ['Code']  + ['Actual Disease'] +  [f"Model {i+1}" for i in range(len(model_paths))]

print(tabulate(table_data, headers=headers))

# Compute and display accuracy for each model
total_predictions = len(sample_symptoms_list)
for i, correct in enumerate(correct_predictions):
    # Guard against an empty sample so the cell reports 0% instead of raising ZeroDivisionError
    accuracy = (correct / total_predictions) * 100 if total_predictions else 0.0
    print(f"Accuracy for Model {i+1}: {accuracy:.2f}%")
