In [None]:
# Step 1: Import necessary libraries and load the dataset
import os

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_recall_curve
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder
from sklearn.preprocessing import label_binarize

# Loading the Data
# The workbook location can be overridden through the AUGMENTED_DB_PATH
# environment variable so the notebook is not tied to one machine; when the
# variable is unset the original absolute path is used unchanged.
file_path = os.environ.get(
    "AUGMENTED_DB_PATH",
    "/home/guilherme/Documents/GitHub/Tese/Documentation/Dataset_Augmentation/Augmented_database_29_09_2023.xlsx",
)
# Only the "Augmented Data" sheet of the workbook is read.
df = pd.read_excel(file_path, sheet_name="Augmented Data")

In [None]:
# Step 2: Data Transformation and Encoding

# The symptom columns are Symptom_1 ... Symptom_25. Missing cells are replaced
# (in df itself) by the string placeholder 'None'.
symptom_cols = ['Symptom_' + str(i) for i in range(1, 26)]
df[symptom_cols] = df[symptom_cols].fillna('None')


def _present_symptoms(row):
    """Return the values of one row, skipping the 'None' placeholder."""
    return [value for value in row if value != 'None']


# One list of symptom values per row, stored in a new 'symptoms' column.
df['symptoms'] = df[symptom_cols].apply(_present_symptoms, axis=1)

# Features: the per-row symptom lists encoded with a MultiLabelBinarizer.
mlb = MultiLabelBinarizer()
X = mlb.fit_transform(df['symptoms'])

# Target: the 'ICD 11' column mapped to integer class ids.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['ICD 11'])

In [None]:
# Step 3: Train/Test Split

# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

# Introducing SMOTE
# Only the training portion is resampled; X_test / y_test are left untouched.
# NOTE(review): SMOTE presumably interpolates between neighbours, so synthetic
# rows may not be strictly 0/1 like the binarized symptoms - confirm this is acceptable.
smote = SMOTE(random_state=42, k_neighbors=4)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [None]:
# Step 4: Hyperparameter tuning
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Define your RandomForestClassifier
# n_estimators given here is replaced by each grid candidate; random_state is
# not part of the grid, so every candidate is trained with the same seed.
clf_template = RandomForestClassifier(n_estimators=100, random_state=42)

# Set up parameter grid (adjust parameters and ranges accordingly)
# 2 * 2 * 3 * 2 * 2 * 2 = 96 candidate combinations.
param_grid = {
    'n_estimators': [25, 50],
    'max_depth': [None, 10],
    'min_samples_split': [2, 5, 10],
    'max_features': [10, 25],
    'class_weight': [None, 'balanced'],
    'bootstrap': [True, False]
}

# Run grid search
# 96 candidates x 5 folds = 480 fits, so the work is spread over all CPU cores
# with n_jobs=-1; the selected parameters do not depend on the number of jobs
# because the estimator seed is fixed.
# NOTE(review): the search is fitted on the SMOTE-resampled training set, so
# validation folds can contain synthetic samples built from rows in the
# training folds and CV accuracy may be optimistic - consider resampling
# inside each fold (e.g. an imblearn Pipeline); confirm before relying on the scores.
grid_search = GridSearchCV(
    clf_template,
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
grid_search.fit(X_train_resampled, y_train_resampled)

# Get best parameters (used by the training step to build the final model)
best_params = grid_search.best_params_


In [None]:
# Step 5: Train the Random Forest Model

# Build the final forest from the six tuned hyper-parameters found by the grid
# search, with the same fixed seed, and fit it on the resampled training data.
clf = RandomForestClassifier(
    random_state=42,
    **{
        key: best_params[key]
        for key in (
            'n_estimators',
            'max_depth',
            'min_samples_split',
            'max_features',
            'class_weight',
            'bootstrap',
        )
    },
)
clf.fit(X_train_resampled, y_train_resampled)


In [None]:
# Step 6: Model Evaluation

# Predict the held-out rows and print the overall accuracy as a percentage.
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of the model: {accuracy * 100:.2f}%")

# Per-class report. `labels` spans every class id known to the label encoder,
# and `target_names` supplies the matching original class values.
report = classification_report(
    y_test,
    y_pred,
    labels=range(len(label_encoder.classes_)),
    target_names=label_encoder.classes_,
)
print(report)

In [None]:
# Step 7: Confusion Matrix Visualization

# Use the labels that occur in y_test or y_pred, in sorted order. This is the
# same label set confusion_matrix infers when `labels` is omitted, but keeping
# it explicit lets the axes show the decoded class values instead of
# positional indices 0..k-1.
present_labels = sorted(set(y_test) | set(y_pred))
conf_mat = confusion_matrix(y_test, y_pred, labels=present_labels)
tick_names = label_encoder.inverse_transform(present_labels)

fig, ax = plt.subplots(figsize=(12, 12))
sns.heatmap(
    conf_mat,
    annot=True,
    fmt="d",
    cmap=plt.cm.Blues,
    xticklabels=tick_names,
    yticklabels=tick_names,
    ax=ax,
)
ax.set_title("Confusion Matrix")
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')
plt.show()


In [None]:
# Step 8: Model Deployment
import joblib

# Persist the fitted model, label encoder, and binarizer for later use
model_filename = 'RTF_no_features.pkl'
label_encoder_filename = 'no_features_label_encoder.pkl'
binarizer_filename = 'no_features_symptoms_binarizer.pkl'

# Write the three objects, each to its own file.
for _fitted_object, _target_file in (
    (clf, model_filename),
    (label_encoder, label_encoder_filename),
    (mlb, binarizer_filename),
):
    joblib.dump(_fitted_object, _target_file)

# Read the objects back from disk in the same order they were written; the
# prediction step works on these reloaded copies.
loaded_model, loaded_label_encoder, loaded_binarizer = (
    joblib.load(path)
    for path in (model_filename, label_encoder_filename, binarizer_filename)
)



In [None]:
# Step 9: Predict the disease

import numpy as np

def predict_disease(symptoms_list, top_n=5, *, model=None, binarizer=None, label_encoder=None):
    """
    Given a list of symptoms (ICD-11 codes), predict the top potential diseases along with their confidence.

    Parameters
    ----------
    symptoms_list : list
        Symptom codes of a single case; encoded as one sample by the binarizer.
    top_n : int, default 5
        Maximum number of diseases to return. If it exceeds the number of
        classes the model knows, all classes are returned. Values <= 0 give
        an empty list.
    model, binarizer, label_encoder : optional, keyword-only
        Fitted objects to use instead of the module-level ``loaded_model``,
        ``loaded_binarizer`` and ``loaded_label_encoder``. The model must
        provide ``predict_proba`` and ``classes_``.

    Returns
    -------
    list of tuple
        ``(disease, probability)`` pairs ordered from most to least probable.
    """
    # A slice of [-0:] would select every class, so handle "nothing requested" explicitly.
    if top_n <= 0:
        return []

    if model is None:
        model = loaded_model
    if binarizer is None:
        binarizer = loaded_binarizer
    if label_encoder is None:
        label_encoder = loaded_label_encoder

    # Transform the symptoms list into the appropriate binary vector format
    symptoms_encoded = binarizer.transform([symptoms_list])

    # Probability distribution over the model's classes for the single sample
    probabilities = model.predict_proba(symptoms_encoded)[0]

    # Column positions of the top_n probabilities, highest first
    top_columns = np.argsort(probabilities)[-top_n:][::-1]

    # predict_proba columns are ordered like model.classes_, which holds the
    # encoded labels seen during training. That can be a subset of the labels
    # known to the encoder, so a column position is not necessarily the
    # encoded label itself: map through classes_ before decoding.
    encoded_labels = np.asarray(model.classes_)[top_columns]
    top_diseases = label_encoder.inverse_transform(encoded_labels)

    # Extract their corresponding probabilities
    top_probabilities = probabilities[top_columns]

    return list(zip(top_diseases, top_probabilities))

# Test the prediction function
# Original note attached to this sample: 1F57.Z Toxoplasmosis
# (presumably the diagnosis these symptoms belong to - confirm).
sample_symptoms = [
    'MC15',
    '9D9Z',
    '9D90.6',
    '9C80.0',
    'LD20.4',
    '8A68.Z',
    '9B73.3',
    '9B65.2',
    '1D01.Y',
    'MA01.Z',
]
predicted_diseases_with_confidence = predict_disease(sample_symptoms)

# One line per returned (disease, probability) pair, probability shown as a percentage.
for disease, confidence in predicted_diseases_with_confidence:
    print(f"Disease: {disease} with confidence: {confidence*100:.2f}%")
