In [None]:
# Step 1: Import dependencies and load the dataset

import numpy as np
import pandas as pd
import json
from keras.models import Sequential
from keras.layers import Dense, Embedding, Flatten, Dropout
from gensim.models import Word2Vec
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder



# Input locations. These are absolute paths on the author's machine; point
# them at your own copies before running the notebook elsewhere.
file_path = "/home/guilherme/Documents/GitHub/Tese/Documentation/Dataset_Augmentation/Augmented_database_22_09_2023.xlsx"
severity_path = "/home/guilherme/Documents/GitHub/Tese/MDCompass/ML_Algorithm/json_files/severity_scores.json"

# One row per sample, read from the "Augmented Data" sheet.
df = pd.read_excel(file_path, sheet_name="Augmented Data")

# Read the JSON explicitly as UTF-8 so decoding does not depend on the
# platform's default locale encoding.
with open(severity_path, "r", encoding="utf-8") as f:
    severity_mapping = json.load(f)



In [None]:
# Step 2: Define the feature-engineering helper functions

def get_severity_scores(symptom_list):
    """Return the severity score of every entry in ``symptom_list``.

    Scores are looked up in the module-level ``severity_mapping``; entries
    that are not keys of that mapping get a score of 0. The result has one
    score per input entry, in the same order.
    """
    scores = []
    for code in symptom_list:
        scores.append(severity_mapping.get(code, 0))
    return scores


def compute_relatedness(code1, code2):
    """
    Score how related two ICD-11 codes are.

    Returns 0 when either argument is not a string (e.g. NaN), 1.0 when the
    codes are identical, and otherwise a value derived from the prefix the
    two codes share.
    """
    # Anything that is not a string (NaN, None, numbers) is unrelated.
    if not (isinstance(code1, str) and isinstance(code2, str)):
        return 0

    if code1 == code2:
        return 1.0

    head1, head2 = code1[:4], code2[:4]
    same_head = head1 == head2

    # Custom (non ICD-11) codes are only compared through their 4-char prefix.
    custom_prefixes = ("AAAA", "BBBB", "CCCC")
    if head1 in custom_prefixes or head2 in custom_prefixes:
        return 0.5 if same_head else 0

    # Length of the common leading run of characters.
    shared = 0
    for char1, char2 in zip(code1, code2):
        if char1 != char2:
            break
        shared += 1

    longest = max(len(code1), len(code2))

    # Matching first four characters earn one bonus point; the denominator
    # grows by one as well so the ratio stays below 1.
    if same_head:
        return (shared + 1) / (longest + 1)
    return shared / longest


def compute_relatedness_matrix(symptoms_list):
    """Return the pairwise relatedness of all entries as a NumPy array.

    Element ``[i][j]`` is ``compute_relatedness`` applied to the i-th and
    j-th entries of ``symptoms_list``.
    """
    return np.array(
        [
            [compute_relatedness(row_code, col_code) for col_code in symptoms_list]
            for row_code in symptoms_list
        ]
    )

In [None]:
# Step 3: Prepare the data: Create a list of symptom sequences for each disease

symptom_cols = [f"Symptom_{i}" for i in range(1, 26)]

# Drop empty cells from every row: NaN is the only value for which
# ``v == v`` is False, so the comparison keeps everything else.
symptoms_data = [[v for v in row if v == v] for row in df[symptom_cols].values.tolist()]

# The relatedness matrix is built over all symptom columns; empty (NaN)
# cells are non-strings and therefore score 0 against everything.
df['relatedness_values'] = df[symptom_cols].apply(compute_relatedness_matrix, axis=1)

# Score the NaN-filtered lists so that severity_values[row][i] always belongs
# to symptoms_data[row][i]. Scoring the raw row would shift the scores
# whenever an empty cell sits between two filled ones.
df['severity_values'] = pd.Series(
    [get_severity_scores(codes) for codes in symptoms_data],
    index=df.index,
)


In [None]:
from gensim.models import Word2Vec
import numpy as np

# Step 4: Convert symptom sequences to embeddings using Word2Vec and process severity metrics

embedding_size = 128

# Each row's symptom list is treated as one "sentence". The seed is fixed so
# repeated runs start from the same initial vectors; note that gensim only
# promises fully deterministic training with a single worker thread, so small
# run-to-run differences can remain with workers=4.
w2v_model = Word2Vec(
    sentences=symptoms_data,
    vector_size=embedding_size,
    window=5,
    min_count=1,
    workers=4,
    seed=42,
)

def symptoms_to_embedding(symptom_codes, w2v_model, embedding_size, severity_scores):
    """Encode a symptom list as one flat feature vector.

    Every symptom contributes ``embedding_size + 1`` values: its Word2Vec
    vector (all zeros when the code is not in the model's vocabulary)
    followed by its severity score.

    Args:
        symptom_codes: Sequence of symptom codes.
        w2v_model: Object whose ``wv`` attribute supports ``code in wv`` and
            ``wv[code]``.
        embedding_size: Length of the zero vector used for unknown codes.
        severity_scores: Indexable of scores; ``severity_scores[i]`` is used
            for ``symptom_codes[i]``.

    Returns:
        1-D NumPy array of length ``len(symptom_codes) * (embedding_size + 1)``
        (assuming the model's vectors have length ``embedding_size``); an
        empty array when ``symptom_codes`` is empty.

    Raises:
        IndexError: If ``severity_scores`` has fewer entries than
            ``symptom_codes``.
    """
    vectors = w2v_model.wv

    per_symptom = []
    for position, code in enumerate(symptom_codes):
        embedding = vectors[code] if code in vectors else np.zeros(embedding_size)
        # Append the severity score as one extra trailing feature.
        per_symptom.append(np.concatenate([embedding, [severity_scores[position]]]))

    # np.hstack rejects an empty list, so handle "no symptoms" explicitly.
    if not per_symptom:
        return np.zeros(0)

    return np.hstack(per_symptom)

# One flat feature vector per row. Rows have different lengths (one chunk per
# symptom), so they are kept in a plain list: building a regular ndarray from
# ragged rows raises ValueError on recent NumPy versions, and pad_sequences
# accepts a list of sequences directly.
X_embedded = [
    symptoms_to_embedding(symptoms, w2v_model, embedding_size, severity)
    for symptoms, severity in zip(symptoms_data, df['severity_values'].tolist())
]


In [None]:
# Step 5: Prepare the data for training by combining the features

# Side length of the largest per-row relatedness matrix (kept for inspection).
max_dim = max([m.shape[0] for m in df['relatedness_values'].tolist()])
relatedness_features = np.array([m.flatten() for m in df['relatedness_values'].tolist()])

# pad_sequences defaults to dtype='int32', which would truncate the float
# embedding values (mostly to 0); request a float dtype explicitly.
X_padded = pad_sequences(X_embedded, padding='post', dtype='float32')

# Take the row count from the data instead of hard-coding the dataset size.
X_padded_reshaped = X_padded.reshape(len(X_padded), -1)

# Both feature groups must describe the same rows before they are joined.
assert X_padded_reshaped.shape[0] == relatedness_features.shape[0], (
    "embedding and relatedness features have different row counts"
)

X_combined = np.hstack((X_padded_reshaped, relatedness_features))


# Encode the ICD-11 target codes as consecutive integer class ids.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(df["ICD 11"])


In [None]:
# Step 6: Define and compile the neural network model

input_shape = (X_combined.shape[1],)
n_classes = len(set(y_encoded))  # one softmax unit per distinct encoded label

# Two ReLU hidden layers, each followed by 50% dropout, then a softmax head.
model = Sequential([
    Dense(512, activation='relu', input_shape=input_shape),
    Dropout(0.5),
    Dense(256, activation='relu'),
    Dropout(0.5),
    Dense(n_classes, activation='softmax'),
])
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)



In [None]:
# Step 7: Split data and train the model

X_train, X_test, y_train, y_test = train_test_split(X_combined, y_encoded, test_size=0.2, random_state=42)

# Fix the one-hot width to the full number of classes. Without num_classes,
# to_categorical sizes each matrix from the largest label present in that
# split, which can disagree with the model's output layer (and between the
# train and test matrices).
num_classes = len(label_encoder.classes_)
y_train_onehot = to_categorical(y_train, num_classes=num_classes)
y_test_onehot = to_categorical(y_test, num_classes=num_classes)

epochs = 50
batch_size = 64
# NOTE(review): the held-out test split doubles as validation data here, so
# the reported test metrics are not independent of any tuning done on them.
history = model.fit(X_train, y_train_onehot, validation_data=(X_test, y_test_onehot), epochs=epochs, batch_size=batch_size)


In [None]:
# Step 8: Evaluate the model and visualize the training progress

loss, accuracy = model.evaluate(X_test, y_test_onehot)
print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy}")

# One panel per metric: (history key, panel title, y-axis label).
panels = [
    ('accuracy', 'Model accuracy', 'Accuracy'),
    ('loss', 'Model loss', 'Loss'),
]

fig, axes = plt.subplots(1, 2, figsize=(12, 4))
for ax, (metric, title, ylabel) in zip(axes, panels):
    # Training curve first, then the curve for the validation_data split.
    ax.plot(history.history[metric])
    ax.plot(history.history['val_' + metric])
    ax.set_title(title)
    ax.set_ylabel(ylabel)
    ax.set_xlabel('Epoch')
    ax.legend(['Train', 'Test'], loc='upper left')
fig.tight_layout()
plt.show()



In [None]:
def predict_top_diseases_with_confidence(symptom_codes, top_n=5):
    """Rank the most likely diseases for a list of symptom codes.

    The feature vector is assembled the same way as a training row: the
    embedding-plus-severity block, zero-padded to the training width,
    followed by the flattened symptom relatedness matrix.

    Args:
        symptom_codes: Iterable of symptom code strings.
        top_n: Maximum number of predictions to return.

    Returns:
        List of ``(icd_code, probability)`` tuples ordered by decreasing
        probability. Probabilities are the model's raw outputs (fractions,
        not percentages).

    Raises:
        ValueError: If ``symptom_codes`` is empty.
    """
    symptom_codes = list(symptom_codes)
    if not symptom_codes:
        raise ValueError("symptom_codes must contain at least one symptom code")

    # Widths of the two feature groups, taken from the training matrices
    # rather than hard-coded.
    relatedness_width = relatedness_features.shape[1]
    embedding_width = X_combined.shape[1] - relatedness_width

    embedded_symptoms = symptoms_to_embedding(
        symptom_codes, w2v_model, embedding_size, get_severity_scores(symptom_codes)
    )
    # Pad as floats: the default int32 dtype would truncate the embedding
    # values before the model ever sees them.
    padded_symptoms = pad_sequences(
        [embedded_symptoms],
        maxlen=embedding_width,
        padding='post',
        truncating='post',
        dtype='float32',
    )

    # Training rows build the relatedness matrix over every symptom column,
    # with empty cells scoring 0; None placeholders reproduce that here.
    n_slots = len(symptom_cols)
    slot_codes = symptom_codes[:n_slots] + [None] * (n_slots - len(symptom_codes))
    relatedness = compute_relatedness_matrix(slot_codes).reshape(1, -1)

    features = np.hstack((padded_symptoms, relatedness)).astype(np.float32)

    prediction = model.predict(features)
    probabilities = prediction[0]

    # Sort descending, then cut: unlike a negative slice this also behaves
    # for top_n == 0 (returns nothing instead of every class).
    top_n = max(int(top_n), 0)
    top_n_indices = np.argsort(probabilities)[::-1][:top_n]
    top_n_probabilities = probabilities[top_n_indices]
    top_n_icd_codes = label_encoder.inverse_transform(top_n_indices)
    results = list(zip(top_n_icd_codes, top_n_probabilities))
    return results

# Example query; the expected diagnosis for these symptoms is 1F57.Z (Toxoplasmosis).
symptom_input = ['MC15', '9D9Z', '9D90.6', '9C80.0', 'LD20.4', '8A68.Z', '9B73.3', '9B65.2', '1D01.Y', 'MA01.Z']
predictions_with_confidence = predict_top_diseases_with_confidence(symptom_input)
for icd, confidence in predictions_with_confidence:
    # confidence is a probability in [0, 1]; the % format spec scales it to a
    # percentage (0.93 -> "93.00%") instead of printing "0.93%".
    print(f"Predicted disease (ICD 11 Code): {icd} with confidence: {confidence:.2%}")
