In [None]:
# Step 1: Import necessary libraries and load the dataset
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_recall_curve
from sklearn.preprocessing import label_binarize
import sys
import json
sys.path.append("/home/guilherme/Documents/GitHub/Tese/MDCompass/")
from keras.preprocessing.sequence import pad_sequences

from ML_Algorithm.Functions.compute_symptom_relatedness import compute_relatedness, compute_relatedness_matrix
# from ML_Algorithm.Functions.compute_severity import get_severity_scores

# Severity lookup table, read from JSON (queried later via .get(symptom, 0))
# NOTE(review): both paths below are absolute and machine-specific.
severity_scores_path = "/home/guilherme/Documents/GitHub/Tese/MDCompass/ML_Algorithm/json_files/severity_scores.json"
with open(severity_scores_path, "r") as severity_file:
    severity_mapping = json.load(severity_file)

# Dataset: the "Augmented Data" sheet of the augmented Excel workbook
file_path = "/home/guilherme/Documents/GitHub/Tese/Documentation/Dataset_Augmentation/Augmented_database_29_09_2023.xlsx"
df = pd.read_excel(io=file_path, sheet_name="Augmented Data")

In [None]:
# 2. Data Transformation and Encoding

# Columns Symptom_1 .. Symptom_25; missing entries receive the 'None' placeholder
symptom_cols = [f'Symptom_{i}' for i in range(1, 26)]
df[symptom_cols] = df[symptom_cols].fillna('None')

# Gather each row's actual symptoms (placeholder entries dropped) into one list
df['symptoms'] = df[symptom_cols].apply(
    lambda row: [entry for entry in row if entry != 'None'],
    axis=1,
)

# Binarize the symptom lists into the base feature matrix
mlb = MultiLabelBinarizer()
X = mlb.fit_transform(df['symptoms'])

# Helpers for the additional (severity / relatedness) features
def get_severity_scores(symptom_list):
    """Return one severity score per symptom, in input order.

    Scores are looked up in the module-level ``severity_mapping``; a symptom
    that has no entry there gets a score of 0.
    """
    scores = []
    for symptom in symptom_list:
        scores.append(severity_mapping.get(symptom, 0))
    return scores


# Per-row engineered features derived from each symptom list
df['relatedness_values'] = df['symptoms'].apply(compute_relatedness_matrix)
df['severity_values'] = df['symptoms'].apply(get_severity_scores)

# Common target size for padding: the largest len() among the relatedness matrices
max_dim = max(len(matrix) for matrix in df['relatedness_values'])

def pad_matrix_to_shape(matrix, max_dim):
    """Zero-pad a 2-D ``matrix`` at the bottom and on the right.

    The number of rows and columns added is the same for both axes and equals
    ``max_dim - matrix.shape[0]``, so a square input comes back as a
    ``max_dim`` x ``max_dim`` array with the original values in the top-left
    corner.
    """
    extra = max_dim - matrix.shape[0]
    return np.pad(matrix, pad_width=((0, extra), (0, extra)), mode='constant')

# Write the padding size computed on the training data to max_dim.json
with open('max_dim.json', 'w') as max_dim_file:
    json.dump({"max_dim": max_dim}, max_dim_file)

# Bring every relatedness matrix to the common padded size
df['relatedness_values_padded'] = df['relatedness_values'].apply(
    lambda matrix: pad_matrix_to_shape(matrix, max_dim)
)

# Severity score lists, zero-padded at the end to max_dim float32 entries
padded_severity = pad_sequences(
    df['severity_values'], maxlen=max_dim, dtype='float32', padding='post'
)

# Flatten both feature groups to one row per sample
n_samples = len(df)
relatedness_features = np.array(df['relatedness_values_padded'].tolist()).reshape(n_samples, -1)
severity_features = padded_severity.reshape(n_samples, -1)

# Final feature matrix: binarized symptoms followed by the flattened relatedness values.
# severity_features is currently left out; the variant that includes it is:
# X = np.hstack((X, relatedness_features, severity_features))
X = np.concatenate((X, relatedness_features), axis=1)

# Integer-encode the 'ICD 11' column as the prediction target
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['ICD 11'])


In [None]:
# Step 3: Split the Data into Training and Testing Sets

# Hold out 25% of the samples for testing (fixed seed)
split_parts = train_test_split(X, y, test_size=0.25, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Oversample the training portion only, using SMOTE with 4 neighbours
smote = SMOTE(k_neighbors=4, random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)


In [None]:
# Step 4: Model Fine-tuning
# This is a simple example using GridSearchCV for hyperparameter tuning
from sklearn.model_selection import GridSearchCV

# Base estimator; the grid search overrides its hyperparameters per candidate
clf_template = RandomForestClassifier(n_estimators=100, random_state=42)

# Candidate values for each hyperparameter (adjust parameters and ranges accordingly)
param_grid = dict(
    n_estimators=[25, 50, 75, 100],
    max_depth=[None],
    min_samples_split=[2, 5, 10],
    max_features=[2, 5, 10, 15],
    class_weight=[None, 'balanced'],
    bootstrap=[True, False],
)

# Exhaustive 5-fold search scored on accuracy.
# NOTE(review): the folds are drawn from the SMOTE-resampled training set, so
# synthetic samples can land in validation folds — confirm this is acceptable.
grid_search = GridSearchCV(
    estimator=clf_template,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
grid_search.fit(X_train_resampled, y_train_resampled)

# Winning hyperparameter combination, consumed when the final model is built.
# (grid_search.best_estimator_ is deliberately not used.)
best_params = grid_search.best_params_


In [None]:
# Step 5: Train the Random Forest Model

# Build the final forest from the tuned hyperparameters (same fixed seed) and
# fit it on the resampled training data.
tuned_param_names = (
    'n_estimators',
    'max_depth',
    'min_samples_split',
    'max_features',
    'class_weight',
    'bootstrap',
)
clf = RandomForestClassifier(
    random_state=42,
    **{param_name: best_params[param_name] for param_name in tuned_param_names},
)
clf.fit(X_train_resampled, y_train_resampled)

In [None]:
# Step 6: Model Evaluation

# Score the trained model on the untouched test split
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of the model: {accuracy * 100:.2f}%")

# Per-class metrics; labels are given explicitly so every encoded class is listed
report = classification_report(
    y_test,
    y_pred,
    labels=range(len(label_encoder.classes_)),
    target_names=label_encoder.classes_,
)
print(report)

In [None]:
# Confusion Matrix Visualization
# Pass the full set of encoded labels so the matrix always has one row/column
# per class known to label_encoder (a class absent from y_test and y_pred would
# otherwise be dropped and shift every following position), and label the axes
# with the class names instead of bare positions.
class_ids = np.arange(len(label_encoder.classes_))
conf_mat = confusion_matrix(y_test, y_pred, labels=class_ids)
plt.figure(figsize=(12,12))
sns.heatmap(conf_mat, annot=True, fmt="d",
            cmap=plt.cm.Blues,
            xticklabels=label_encoder.classes_,
            yticklabels=label_encoder.classes_)
plt.title("Confusion Matrix")
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.show()

In [None]:
# 7. Model Deployment
import joblib

# File names for the persisted model, label encoder, and symptom binarizer
model_filename = 'RTF_with_features.pkl'
label_encoder_filename = 'with_features_label_encoder.pkl'
binarizer_filename = 'with_features_symptoms_binarizer.pkl'

# Write each fitted object to its file
for fitted_object, target_file in (
    (clf, model_filename),
    (label_encoder, label_encoder_filename),
    (mlb, binarizer_filename),
):
    joblib.dump(fitted_object, target_file)

# Read the three artifacts back from disk
loaded_model, loaded_label_encoder, loaded_binarizer = (
    joblib.load(source_file)
    for source_file in (model_filename, label_encoder_filename, binarizer_filename)
)


In [None]:
# Step 8: Predict the disease

import numpy as np

def predict_disease(symptoms_list, top_n=5):
    """
    Given a list of symptoms (ICD-11 codes), predict the top potential diseases along with their confidence.

    The feature vector is assembled exactly like the training matrix ``X``:
    the binarized symptoms followed by the relatedness matrix, zero-padded to
    ``max_dim`` x ``max_dim`` and only then flattened. Severity scores are not
    appended because ``X`` does not contain them.

    Parameters
    ----------
    symptoms_list : list
        Symptom codes describing one case.
    top_n : int, default 5
        Number of highest-probability classes to return.

    Returns
    -------
    list of tuple
        ``(disease, probability)`` pairs ordered from most to least probable.

    Raises
    ------
    ValueError
        If the relatedness matrix has more rows than ``max_dim`` and therefore
        cannot be laid out in the padded shape used for training.
    """
    # Transform the symptoms list into the appropriate binary vector format
    symptoms_encoded = loaded_binarizer.transform([symptoms_list])

    # Pairwise relatedness of the given symptoms
    relatedness_values = np.asarray(compute_relatedness_matrix(symptoms_list))

    n_rows = relatedness_values.shape[0]
    if n_rows > max_dim:
        raise ValueError(
            f"Relatedness matrix has {n_rows} rows but the model was trained "
            f"with max_dim={max_dim}; provide at most {max_dim} symptoms."
        )

    # Pad the 2-D matrix first and flatten afterwards. Flattening first and
    # padding the 1-D vector would put the values in different columns than the
    # ones the model saw during training whenever n_rows < max_dim.
    padded_relatedness = pad_matrix_to_shape(relatedness_values, max_dim).reshape(1, -1)

    # Combine the features in the same order as the training matrix
    combined_features = np.hstack((symptoms_encoded, padded_relatedness))

    # Predict the probability distribution over classes using the trained model
    disease_probabilities = loaded_model.predict_proba(combined_features)[0]

    # Indices of the top_n classes, most probable first
    top_indices = np.argsort(disease_probabilities)[-top_n:][::-1]

    # Decode these indices to get the actual disease codes
    top_diseases = loaded_label_encoder.inverse_transform(top_indices)

    # Extract their corresponding probabilities
    top_probabilities = disease_probabilities[top_indices]

    return list(zip(top_diseases, top_probabilities))

# Try the prediction function on one example symptom set
sample_symptoms = [
    'MC15', '9D9Z', '9D90.6', '9C80.0', 'LD20.4',
    '8A68.Z', '9B73.3', '9B65.2', '1D01.Y', 'MA01.Z',
]  # 1F57.Z	Toxoplasmosis
predicted_diseases_with_confidence = predict_disease(sample_symptoms)

# Report every returned candidate with its probability as a percentage
for disease, confidence in predicted_diseases_with_confidence:
    print(f"Disease: {disease} with confidence: {confidence*100:.2f}%")
