In [None]:
# Step 1: Import necessary libraries and load the dataset
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_recall_curve
from sklearn.preprocessing import label_binarize

import sys
import json
sys.path.append("C:\\Users\\london\\Documents\\Github\\Tese\\Disease_Dataset_Playground\\Server_Models")
from keras.preprocessing.sequence import pad_sequences

from ML_Algorithm.Functions.compute_symptom_relatedness import compute_relatedness, compute_relatedness_matrix
# from ML_Algorithm.Functions.compute_severity import get_severity_scores

# Severity lookup table, keyed by symptom (consumed by get_severity_scores)
severity_scores_path = "C:\\Users\\london\\Documents\\Github\\Tese\\Disease_Dataset_Playground\\Datasets\\step_4\\severity_scores.json"
with open(severity_scores_path, "r") as severity_file:
    severity_mapping = json.load(severity_file)

# 1. Loading the Data
# Only the "Augmented Data" sheet of the workbook is read into the DataFrame
file_path = "C:\\Users\\london\\Documents\\Github\\Tese\\Documentation\\Dataset_Augmentation\\Augmented_database_29_09_2023.xlsx"
df = pd.read_excel(file_path, sheet_name="Augmented Data")

In [6]:
# 2. Data Transformation and Encoding

# Symptoms live in the columns Symptom_1 .. Symptom_25; missing slots (NaN)
# are replaced by the string placeholder 'None'.
symptom_cols = [f'Symptom_{i}' for i in range(1, 26)]
df[symptom_cols] = df[symptom_cols].fillna('None')

# Collapse the symptom columns into one list per row, dropping the placeholder
df['symptoms'] = df[symptom_cols].apply(
    lambda row: list(filter(lambda symptom: symptom != 'None', row)),
    axis=1,
)

# Multi-label binarization: one indicator column per distinct symptom
mlb = MultiLabelBinarizer()
X = mlb.fit_transform(df['symptoms'])

# Helper for the additional features: looks up a severity score for each symptom
def get_severity_scores(symptom_list):
    """Return the severity score of every symptom, in input order.

    Scores are looked up in the module-level ``severity_mapping``; a symptom
    that has no entry there is scored 0.
    """
    scores = []
    for symptom in symptom_list:
        scores.append(severity_mapping.get(symptom, 0))
    return scores


# Calculating additional features
# Each row's symptom list is turned into a relatedness matrix and a list of severity scores
symptoms_per_row = df['symptoms']
df['relatedness_values'] = symptoms_per_row.apply(compute_relatedness_matrix)
df['severity_values'] = symptoms_per_row.apply(get_severity_scores)

# Padding the relatedness matrix to a consistent shape:
# the longest relatedness entry across all rows defines the target size
max_dim = max(len(relatedness) for relatedness in df['relatedness_values'])

def pad_matrix_to_shape(matrix, max_dim):
    """Zero-pad a 2-D matrix at the bottom and right to shape (max_dim, max_dim).

    Rows and columns are padded independently, so the result always has the
    target shape even when the input is not square.

    Parameters
    ----------
    matrix : array-like
        2-D matrix to pad. An empty input (no elements) yields an all-zero
        result.
    max_dim : int
        Target number of rows and of columns.

    Returns
    -------
    numpy.ndarray
        Array of shape (max_dim, max_dim) holding the input in its top-left
        corner and zeros elsewhere.

    Raises
    ------
    ValueError
        If the input is non-empty and not 2-D, or if it is larger than
        max_dim along either axis.
    """
    matrix = np.asarray(matrix)

    # Nothing to place in the corner: return a zero matrix of the same dtype
    if matrix.size == 0:
        return np.zeros((max_dim, max_dim), dtype=matrix.dtype)

    if matrix.ndim != 2:
        raise ValueError(
            f"expected a 2-D matrix, got an array with {matrix.ndim} dimension(s)"
        )

    n_rows, n_cols = matrix.shape
    if n_rows > max_dim or n_cols > max_dim:
        raise ValueError(
            f"matrix of shape {matrix.shape} does not fit into ({max_dim}, {max_dim})"
        )

    # Pad only after the existing data (bottom rows, right-hand columns)
    return np.pad(matrix, ((0, max_dim - n_rows), (0, max_dim - n_cols)), 'constant')

# Store the max_dim found on the training data in max_dim.json so it can be
# read back later (presumably by the code that serves predictions)
with open('max_dim.json', 'w') as max_dim_file:
    max_dim_file.write(json.dumps({"max_dim": max_dim}))


# Apply padding to each matrix so every row carries a max_dim-sized matrix
df['relatedness_values_padded'] = df['relatedness_values'].apply(
    pad_matrix_to_shape, max_dim=max_dim
)

# Padding the severity values to have the same length (zeros appended at the end)
padded_severity = pad_sequences(
    df['severity_values'],
    maxlen=max_dim,
    dtype='float32',
    padding='post',
)

# Flatten both feature groups to one row per sample
n_samples = len(df)
relatedness_features = np.array(list(df['relatedness_values_padded'])).reshape(n_samples, -1)
severity_features = padded_severity.reshape(n_samples, -1)

# Final feature matrix, column-wise: binarized symptoms | relatedness | severity
X = np.concatenate((X, relatedness_features, severity_features), axis=1)
# X = np.hstack((X, relatedness_features))

# Encode the 'ICD 11' target column as integer class labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['ICD 11'])


In [8]:
# Step 3: Train-test Split
# A quarter of the rows is held out for testing; the fixed seed makes the split repeatable
split_parts = train_test_split(X, y, test_size=0.25, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Introducing SMOTE
# Only the training portion is resampled; the test portion is left as split
smote = SMOTE(random_state=42, k_neighbors=4)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [None]:
# # Step 4: Hyperparameter tuning

# from sklearn.model_selection import cross_val_score
# import numpy as np

# # List of potential k values
# k_values = list(range(1, 51))  # Checking values of k from 1 to 50.

# cv_scores = []

# # Perform cross-validation for each value of k
# for k in k_values:
#     knn = KNeighborsClassifier(n_neighbors=k)
#     scores = cross_val_score(knn, X_train_resampled, y_train_resampled, cv=5, scoring='accuracy')
#     cv_scores.append(np.mean(scores))

# # Find the value of k that gave the highest accuracy
# best_k = k_values[np.argmax(cv_scores)]
# print(f"Optimal value of k is: {best_k}")




In [None]:
# Step 5: Model Training

# Fit a k-nearest-neighbours classifier on the SMOTE-resampled training data.
# n_neighbors is fixed at 4 here; it could instead be chosen by cross-validation.
knn = KNeighborsClassifier(n_neighbors=4).fit(X_train_resampled, y_train_resampled)
knn


In [None]:
# Step 6: Evaluation
# Predict the held-out test rows, then print overall accuracy and the per-class report

y_pred = knn.predict(X_test)

overall_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", overall_accuracy)

per_class_report = classification_report(y_test, y_pred)
print(per_class_report)


In [None]:
# Confusion Matrix Visualization
# The full range of encoded labels is passed explicitly so that row/column i
# always corresponds to encoded class i, even when a class occurs in neither
# y_test nor y_pred (without `labels`, such classes are dropped and indices shift).
class_ids = np.arange(len(label_encoder.classes_))
conf_mat = confusion_matrix(y_test, y_pred, labels=class_ids)

fig, ax = plt.subplots(figsize=(13, 11))
sns.heatmap(conf_mat, annot=True, fmt="d", cmap=plt.cm.Blues, ax=ax)
ax.set_xlabel('Predicted label', fontsize=15)
ax.set_ylabel('True label', fontsize=15)

# Label only every `tick_interval`-th class to keep the axes readable
tick_interval = 20  # Adjust based on how many labels you want to show
tick_ids = class_ids[::tick_interval]

# Heatmap cell i spans [i, i + 1] on both axes, so its centre is at i + 0.5
tick_positions = tick_ids + 0.5
ax.set_xticks(tick_positions)
ax.set_xticklabels(tick_ids, fontsize=10, rotation=90)  # Rotate x labels for clarity
ax.set_yticks(tick_positions)
ax.set_yticklabels(tick_ids, fontsize=10, rotation=0)
plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np
# roc_curve and auc are not imported anywhere else in the notebook
from sklearn.metrics import auc, roc_curve

# Class probabilities for the test set, computed once and shared by every curve below.
# Columns are ordered like knn.classes_.
y_score = knn.predict_proba(X_test)

if len(label_encoder.classes_) == 2:
    # Binary problem: column 1 holds the probability of the second (positive) class
    positive_scores = y_score[:, 1]

    # ROC Curve and AUC (only for binary classification or One-vs-All)
    fpr, tpr, thresholds = roc_curve(y_test, positive_scores)
    plt.figure(figsize=(8,6))
    plt.plot(fpr, tpr, label=f'AUC: {auc(fpr, tpr):.2f}')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.plot([0, 1], [0, 1], color='navy', linestyle='--')  # chance-level diagonal
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend(loc="lower right")
    plt.show()

    # Precision-Recall Curve (especially for imbalanced datasets)
    average_precision = average_precision_score(y_test, positive_scores)
    precision, recall, _ = precision_recall_curve(y_test, positive_scores)
    plt.figure(figsize=(8,6))
    plt.step(recall, precision, where='post')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1.0])
    plt.title(f'2-class Precision-Recall curve: AP={average_precision:0.2f}')
    plt.show()

# For Multiclass Precision-Recall Curve:
else:
    # Binarize against the classes the model actually knows, so the indicator
    # matrix has exactly the same columns (and column order) as y_score.
    y_test_bin = label_binarize(y_test, classes=knn.classes_)

    # A "micro-average": quantifying score on all classes jointly
    precision, recall, _ = precision_recall_curve(y_test_bin.ravel(), y_score.ravel())
    average_precision = average_precision_score(y_test_bin, y_score, average="micro")
    plt.figure(figsize=(8,6))
    plt.step(recall, precision, where='post')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1.0])
    plt.title(f'Micro-averaged Precision-Recall curve: AP={average_precision:0.2f}')
    plt.show()


In [None]:
# Step 8: Model Deployment
import joblib

# File names of the three artifacts written for later use
model_filename = 'KNN_with_features.pkl'
label_encoder_filename = 'disease_label_encoder.pkl'
binarizer_filename = 'symptoms_binarizer.pkl'

# Serialize the classifier, the disease label encoder and the symptom binarizer
for _artifact, _target in (
    (knn, model_filename),
    (label_encoder, label_encoder_filename),
    (mlb, binarizer_filename),
):
    joblib.dump(_artifact, _target)

# Read the artifacts straight back from disk under the loaded_* names
loaded_model, loaded_label_encoder, loaded_binarizer = (
    joblib.load(_source)
    for _source in (model_filename, label_encoder_filename, binarizer_filename)
)



In [None]:
# Step 9: Predict the top diseases based on confidence

def predict_disease(symptoms_list, top_n=5):
    """
    Given a list of symptoms (ICD-11 codes), predict the top potential diseases along with their confidence.

    The feature vector is built the same way as the rows of the training
    matrix: binarized symptoms | relatedness matrix zero-padded to
    (max_dim, max_dim) and flattened | severity scores zero-padded to max_dim.

    Parameters
    ----------
    symptoms_list : list
        Symptom codes of a single case.
    top_n : int, default 5
        Maximum number of diseases to return; values <= 0 return an empty list.

    Returns
    -------
    list of tuple
        (disease, probability) pairs ordered by decreasing probability.

    Raises
    ------
    ValueError
        If more symptoms are supplied than max_dim, the size the model's
        relatedness and severity features were built for.
    """
    if len(symptoms_list) > max_dim:
        raise ValueError(
            f"got {len(symptoms_list)} symptoms, but the model supports at most {max_dim}"
        )

    # Transform the symptoms list into the appropriate binary vector format
    symptoms_encoded = loaded_binarizer.transform([symptoms_list])

    # Calculate the relatedness and severity features
    relatedness_values = compute_relatedness_matrix(symptoms_list)
    severity_values = get_severity_scores(symptoms_list)

    # Pad the matrix in 2-D first and flatten afterwards, as done for the training
    # rows; flattening before padding would place the values in different columns.
    padded_relatedness = pad_matrix_to_shape(relatedness_values, max_dim).reshape(1, -1)

    padded_severity = pad_sequences([severity_values], maxlen=max_dim, padding='post', dtype='float32').reshape(1, -1)

    # Combine all the features; the severity block is required because the
    # training matrix contains it (the model rejects a different column count).
    combined_features = np.hstack((symptoms_encoded, padded_relatedness, padded_severity))

    # Predict the probability distribution over classes using the saved model
    disease_probabilities = loaded_model.predict_proba(combined_features)[0]

    # Column positions of the top_n most probable classes, best first
    top_indices = np.argsort(disease_probabilities)[::-1][:max(top_n, 0)]

    # predict_proba columns follow loaded_model.classes_, so map positions to
    # encoded labels before decoding them to disease codes
    top_classes = loaded_model.classes_[top_indices]
    top_diseases = loaded_label_encoder.inverse_transform(top_classes)

    # Extract their corresponding probabilities
    top_probabilities = disease_probabilities[top_indices]

    return list(zip(top_diseases, top_probabilities))

# Test the prediction function
# Sample case annotated in the original notebook as 1F57.Z (Toxoplasmosis)
sample_symptoms = [
    'MC15', '9D9Z', '9D90.6', '9C80.0', 'LD20.4',
    '8A68.Z', '9B73.3', '9B65.2', '1D01.Y', 'MA01.Z',
]
predicted_diseases_with_confidence = predict_disease(sample_symptoms)

# Report every returned (disease, confidence) pair as a percentage
for disease, confidence in predicted_diseases_with_confidence:
    print(f"Disease: {disease} with confidence: {confidence*100:.2f}%")
