In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, r2_score, roc_curve, auc, cohen_kappa_score, confusion_matrix
import os
import sys
from tabulate import tabulate
from functools import reduce
from pathlib import Path
import re
parent_dir = os.path.abspath('../')
if parent_dir not in sys.path:
    sys.path.append(parent_dir)


In [3]:
from Utils import utils

# reading main config file
config = utils.read_config()

system = 1 # 1 or 2
if system == 1:
    PATH = config["Source"]["paths"]["source_path_system_1"]
elif system == 2:
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    PATH = config["Source"]["paths"]["source_path_system_2"]
else:
    PATH = ""
    print("Invalid system")

In [4]:
def calculate_metrics(pred_prob, GT):
    fpr, tpr, thresholds = roc_curve(GT, pred_prob, pos_label=1)
    optimal_idx = np.argmax(tpr - fpr)
    optimal_threshold = thresholds[optimal_idx]
    pred_labels = (pred_prob >= optimal_threshold).astype(int)
    #print("prediction: ", pred_labels)
    #print("GT: ", GT)

    # Calculate True Positives (TP), True Negatives (TN), False Positives (FP), and False Negatives (FN)
    TP = ((pred_labels == 1) & (GT == 1)).sum()
    TN = ((pred_labels == 0) & (GT == 0)).sum()
    FP = ((pred_labels == 1) & (GT == 0)).sum()
    FN = ((pred_labels == 0) & (GT == 1)).sum()
    sensitivity = TP / (TP + FN)
    # precision = TP / (TP + FP)
    specificity = TN / (TN + FP)
    auc_score = auc(fpr, tpr)

    results = [
        ["True Positives (TP)", TP],
        ["True Negatives (TN)", TN],
        ["False Positives (FP)", FP],
        ["False Negatives (FN)", FN],
        ["Sensitivity", sensitivity],
        # ["Precision", precision],
        ["Specificity", specificity],
        ["AUC", auc_score]
    ]
    # Print results in tabular form
    # print(tabulate(results, headers=["Metric", "Value"], tablefmt="fancy_grid"))
    return auc_score, results

In [5]:
def best_model_selection_from_fold_metricbased(system, type, category, experiment_number, fold_number):
    repls = ("Network_Weights/best_model", "Metrics/epoch"), ("pth.tar", "csv")
    
    if type == "regression":
        path = config["Source"]["paths"][f"source_path_system_{system}"] + config["regression_path"] + f"/Experiment_{experiment_number}/CV_{fold_number}/Network_Weights/"
    else:
        path = config["Source"]["paths"][f"source_path_system_{system}"] + config["classification_path"] + f"/{category}/Experiment_{experiment_number}/CV_{fold_number}/Network_Weights/"
    
    model_files = []
    for dirs, subdirs, files in os.walk(path):
        for file in files:
            file_path = str(os.path.join(dirs, file))
            file_path = file_path.replace('\\','/')
            model_files.append(file_path)

    sorted(model_files)

    model_dict = dict()

          
    if type=="regression":   
        best_metric = 100000000.0
    else:
        best_metric = -1

    for model_path in model_files:
        metric_path = reduce(lambda a, kv: a.replace(*kv), repls, model_path)
        metric_data = pd.read_csv(metric_path)

        if type=="regression":   
            metric = mean_absolute_error(metric_data["GT"], metric_data["prediction (age)"])
            if metric < best_metric:
                best_metric = metric    
        else:
            if category=="Diagnosis":
                metric = cohen_kappa_score(metric_data["prediction"], metric_data["GT"])
                if metric > best_metric:
                    best_metric = metric   
            else:
                auc, _ = calculate_metrics(metric_data["prediction_probability (sex)"], np.array(metric_data["GT"]).astype(int))
                metric = auc
                if metric > best_metric:
                    best_metric = metric   

        epoch_num = model_path.split("_")[-1].split(".")[0]
        #print(metric, epoch_num, model_path)
        if metric in model_dict.keys():
            if epoch_num < model_dict[metric][1]:
                model_dict[metric] = (model_path, epoch_num)
            else:
                pass
        else:
            model_dict[metric] = (model_path, epoch_num)
          
    #print(model_dict)
    best_model_path = model_dict[best_metric][0]
    epoch_to_continue = model_dict[best_metric][1] #best_model_path.split("_")[-1].split(".")[0]
    #print("best: ", best_metric, epoch_to_continue, best_model_path)
    return best_model_path, epoch_to_continue

In [None]:
system=2
type="classification"
category="Sex"
experiment_number=2
fold_number=1
path = config["Source"]["paths"][f"source_path_system_{system}"] + config["classification_path"] + f"/{category}/Experiment_{experiment_number}/CV_{fold_number}/Network_Weights/"
    
model_files = []
for dirs, subdirs, files in os.walk(path):
    for file in files:
        file_path = str(os.path.join(dirs, file))
        file_path = file_path.replace('\\','/')
        model_files.append(file_path)
#sorted(model_files, reverse=True)
model_files
model_files[0].split("_")[-1].split(".")[0]

In [None]:

[int(num) if num else alpha for num, alpha in re.compile(r'(\d+)|(\D+)').findall('/home/ashish/Ashish/UCAN/Results/classification/Sex/Experiment_2/CV_1/Network_Weights/best_model_2.pth.tar')]

In [11]:
tokenize = re.compile(r'(\d+)|(\D+)').findall
def natural_sortkey(string):          
    return tuple(int(num) if num else alpha for num, alpha in tokenize(string.name))

def best_model_selection_from_fold(system, type, category, experiment_number, fold_number):
    if type == "regression":
        path = config["Source"]["paths"][f"source_path_system_{system}"] + config["regression_path"] + f"/Experiment_{experiment_number}/CV_{fold_number}/Network_Weights/"
    else:
        path = config["Source"]["paths"][f"source_path_system_{system}"] + config["classification_path"] + f"/{category}/Experiment_{experiment_number}/CV_{fold_number}/Network_Weights/"
    path_object = Path(path)
    #print(path)
    models = path_object.glob("*")
    models_sorted = sorted(models, key=natural_sortkey)
    for i in models_sorted:
        pass#print(i.name)
    best_model_path = path + [model.name for model in models_sorted][-1]
    epoch_to_continue = best_model_path.split("_")[-1].split(".")[0]
    return best_model_path, epoch_to_continue
best_model_selection_from_fold(2, 'classification', 'Sex', 2, 1)

IndexError: list index out of range

In [9]:
def evaluate_best_models_all_folds(system, type, category, experiment_number, folds_list):
    repls = ("Network_Weights/best_model", "Metrics/epoch"), ("pth.tar", "csv")

    auc_from_all_folds = []
    metric_data_list = []
    for fold_number in folds_list:
        best_model_path, _ = best_model_selection_from_fold(system, type, category, experiment_number, fold_number)
        best_metric_path = reduce(lambda a, kv: a.replace(*kv), repls, best_model_path)
        print(best_metric_path)
        metric_data = pd.read_csv(best_metric_path)
        metric_data_list.append(metric_data)

    all_metric_df = pd.concat(metric_data_list)

    if type=="regression":        
        metric_r_squared = r2_score(all_metric_df["GT"], all_metric_df["prediction (age)"])
        metric_mae = mean_absolute_error(all_metric_df["GT"], all_metric_df["prediction (age)"])
        metrics = {"metric_mae":metric_mae, "metric_r_squared":metric_r_squared}
        return metrics
    else:
        if category=="Diagnosis":
            c_k_score = cohen_kappa_score(all_metric_df["prediction"], all_metric_df["GT"])
            idx_classes = ["C81_GT", "C83_GT", "Others_GT"]
            col_classes = ["C81_Pred", "C83_Pred", "Others_Pred"]
            confusion_matrix_df = pd.DataFrame(confusion_matrix(all_metric_df["GT"], all_metric_df["prediction"]), columns=col_classes, index=idx_classes)
            metrics = {"c_k_score":c_k_score, "confusion_matrix":confusion_matrix_df}
            return metrics
        else:
            for fold in folds_list:
                best_model_path, _ = best_model_selection_from_fold(system, type, category, experiment_number, fold)
                best_metric_path = reduce(lambda a, kv: a.replace(*kv), repls, best_model_path)
                print(best_metric_path)
                metric_data = pd.read_csv(best_metric_path)
                #print(metric_data.columns)
                auc, _  = calculate_metrics(metric_data["prediction_probability (sex)"], np.array(metric_data["GT"]).astype(int) )  #"prediction_probability_male" for exp1, prediction_probability (sex) for exp 2
                auc_from_all_folds.append(auc)
            return np.mean(auc_from_all_folds)

In [7]:
def evaluate_best_models_all_folds_metricbased(system, type, category, experiment_number, folds_list):
    repls = ("Network_Weights/best_model", "Metrics/epoch"), ("pth.tar", "csv")

    auc_from_all_folds = []
    metric_data_list = []
    for fold_number in folds_list:
        best_model_path, _ = best_model_selection_from_fold_metricbased(system, type, category, experiment_number, fold_number)
        best_metric_path = reduce(lambda a, kv: a.replace(*kv), repls, best_model_path)
        print(best_metric_path)
        metric_data = pd.read_csv(best_metric_path)
        metric_data_list.append(metric_data)

    all_metric_df = pd.concat(metric_data_list)

    if type=="regression":        
        metric_r_squared = r2_score(all_metric_df["GT"], all_metric_df["prediction (age)"])
        metric_mae = mean_absolute_error(all_metric_df["GT"], all_metric_df["prediction (age)"])
        metrics = {"metric_mae":metric_mae, "metric_r_squared":metric_r_squared}
        return metrics
    else:
        if category=="Diagnosis":
            c_k_score = cohen_kappa_score(all_metric_df["prediction"], all_metric_df["GT"])
            idx_classes = ["C81_GT", "C83_GT", "Others_GT"]
            col_classes = ["C81_Pred", "C83_Pred", "Others_Pred"]
            confusion_matrix_df = pd.DataFrame(confusion_matrix(all_metric_df["GT"], all_metric_df["prediction"]), columns=col_classes, index=idx_classes)
            metrics = {"c_k_score":c_k_score, "confusion_matrix":confusion_matrix_df}
            return metrics
        else:
            for fold in folds_list:
                best_model_path, _ = best_model_selection_from_fold_metricbased(system, type, category, experiment_number, fold)
                best_metric_path = reduce(lambda a, kv: a.replace(*kv), repls, best_model_path)
                print(best_metric_path)
                metric_data = pd.read_csv(best_metric_path)
                #print(metric_data.columns)
                auc, _  = calculate_metrics(metric_data["prediction_probability (sex)"], np.array(metric_data["GT"]).astype(int) )  #"prediction_probability_male" for exp1, prediction_probability (sex) for exp 2
                auc_from_all_folds.append(auc)
            return np.mean(auc_from_all_folds)

In [None]:
df = pd.read_csv("/home/ashish/Ashish/UCAN/Results/classification/Sex/Experiment_1/CV_0/Metrics/epoch_56.csv")

In [None]:
np.array(df["prediction_probability_male"]).flatten()

### For regression

In [14]:
metrics = evaluate_best_models_all_folds(system=1, type="regression", category=None, experiment_number=3, folds_list=list(range(10)))
metrics

/media/andres/T7 Shield1/UCAN_project/Results/regression/Experiment_3/CV_0/Metrics/epoch_202.csv
/media/andres/T7 Shield1/UCAN_project/Results/regression/Experiment_3/CV_1/Metrics/epoch_324.csv
/media/andres/T7 Shield1/UCAN_project/Results/regression/Experiment_3/CV_2/Metrics/epoch_285.csv
/media/andres/T7 Shield1/UCAN_project/Results/regression/Experiment_3/CV_3/Metrics/epoch_263.csv
/media/andres/T7 Shield1/UCAN_project/Results/regression/Experiment_3/CV_4/Metrics/epoch_261.csv
/media/andres/T7 Shield1/UCAN_project/Results/regression/Experiment_3/CV_5/Metrics/epoch_290.csv
/media/andres/T7 Shield1/UCAN_project/Results/regression/Experiment_3/CV_6/Metrics/epoch_328.csv
/media/andres/T7 Shield1/UCAN_project/Results/regression/Experiment_3/CV_7/Metrics/epoch_191.csv
/media/andres/T7 Shield1/UCAN_project/Results/regression/Experiment_3/CV_8/Metrics/epoch_291.csv
/media/andres/T7 Shield1/UCAN_project/Results/regression/Experiment_3/CV_9/Metrics/epoch_304.csv


{'metric_mae': 6.3549918575434665, 'metric_r_squared': 0.7426546397128754}

In [13]:
metrics = evaluate_best_models_all_folds_metricbased(system=2, type="regression", category=None, experiment_number=3, folds_list=list(range(10)))
metrics

KeyError: 100000000.0

In [None]:
metrics = evaluate_best_models_all_folds_metricbased(system=2, type="regression", category=None, experiment_number=3, folds_list=list(range(10)))
metrics

### For classification

In [18]:
metrics = evaluate_best_models_all_folds(system=1, type="classification", category="Sex", experiment_number=2, folds_list=list(range(10)))
metrics

/media/andres/T7 Shield1/UCAN_project/Results/classification/Sex/Experiment_2/CV_0/Metrics/epoch_27.csv
/media/andres/T7 Shield1/UCAN_project/Results/classification/Sex/Experiment_2/CV_1/Metrics/epoch_92.csv
/media/andres/T7 Shield1/UCAN_project/Results/classification/Sex/Experiment_2/CV_2/Metrics/epoch_44.csv
/media/andres/T7 Shield1/UCAN_project/Results/classification/Sex/Experiment_2/CV_3/Metrics/epoch_45.csv
/media/andres/T7 Shield1/UCAN_project/Results/classification/Sex/Experiment_2/CV_4/Metrics/epoch_39.csv
/media/andres/T7 Shield1/UCAN_project/Results/classification/Sex/Experiment_2/CV_5/Metrics/epoch_99.csv
/media/andres/T7 Shield1/UCAN_project/Results/classification/Sex/Experiment_2/CV_6/Metrics/epoch_80.csv
/media/andres/T7 Shield1/UCAN_project/Results/classification/Sex/Experiment_2/CV_7/Metrics/epoch_59.csv
/media/andres/T7 Shield1/UCAN_project/Results/classification/Sex/Experiment_2/CV_8/Metrics/epoch_96.csv
/media/andres/T7 Shield1/UCAN_project/Results/classification/Sex

0.9985318828488333

In [17]:
metrics = evaluate_best_models_all_folds_metricbased(system=1, type="classification", category="Diagnosis", experiment_number=1, folds_list=list(range(10)))
metrics

/media/andres/T7 Shield1/UCAN_project/Results/classification/Diagnosis/Experiment_1/CV_0/Metrics/epoch_130.csv
/media/andres/T7 Shield1/UCAN_project/Results/classification/Diagnosis/Experiment_1/CV_1/Metrics/epoch_142.csv
/media/andres/T7 Shield1/UCAN_project/Results/classification/Diagnosis/Experiment_1/CV_2/Metrics/epoch_103.csv
/media/andres/T7 Shield1/UCAN_project/Results/classification/Diagnosis/Experiment_1/CV_3/Metrics/epoch_286.csv
/media/andres/T7 Shield1/UCAN_project/Results/classification/Diagnosis/Experiment_1/CV_4/Metrics/epoch_343.csv
/media/andres/T7 Shield1/UCAN_project/Results/classification/Diagnosis/Experiment_1/CV_5/Metrics/epoch_193.csv
/media/andres/T7 Shield1/UCAN_project/Results/classification/Diagnosis/Experiment_1/CV_6/Metrics/epoch_418.csv
/media/andres/T7 Shield1/UCAN_project/Results/classification/Diagnosis/Experiment_1/CV_7/Metrics/epoch_306.csv
/media/andres/T7 Shield1/UCAN_project/Results/classification/Diagnosis/Experiment_1/CV_8/Metrics/epoch_406.csv
/

{'c_k_score': 0.3506130099345308,
 'confusion_matrix':            C81_Pred  C83_Pred  Others_Pred
 C81_GT          230        60           38
 C83_GT           58       120           56
 Others_GT        52        57           91}

In [None]:
df = pd.read_csv('/home/ashish/Ashish/UCAN/Results/classification/Sex/Experiment_2/CV_0/Metrics/epoch_27.csv')
df['prediction_probability (sex)'].round().to_list()

In [None]:
confusion_matrix_df = metrics["confusion_matrix"]
pred_class= list(confusion_matrix_df.columns)
GT_class= list(confusion_matrix_df.index)
print(pred_class, GT_class)
metrics["confusion_matrix"]

In [None]:
image = np.load("/home/ashish/Ashish/UCAN/Results/classification/Sex/Experiment_1/CV_9/AUC.npy")
import matplotlib.pyplot as plt
plt.plot(image)

In [None]:
specificity = []
for cls in GT_class:
    cls_orig = cls.split('_')[0]
    FP = np.sum(np.array(metrics["confusion_matrix"].loc[[i for i in GT_class if i!=cls], [cls_orig + "_Pred"]]))
    TN = np.sum(np.array(metrics["confusion_matrix"].loc[[i for i in GT_class if i!=cls], [i for i in pred_class if i!=cls_orig + "_Pred"]]))
    specificity.append(TN/(TN+FP))
specificity

In [None]:
data["prediction_probability (diagnosis)"].hist()

### Check reshaped projections

In [None]:
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
from tqdm import tqdm
image = np.load("/media/andres/T7 Shield1/UCAN_project/collages/reshaped_collages/lpr385705046400/20140313/CT_adipose.npy")
plt.imshow(image, cmap="gray")
plt.show()

In [None]:
path = "/media/andres/T7 Shield1/UCAN_project/collages/reshaped_collages/lpr415675513429/20190201/"
#lpr415675513429_20190201
lst1 = ["SUV_", "CT_"]
lst2 = ["MIP.npy","bone.npy","lean.npy","adipose.npy","air.npy"]

In [None]:
fig, axs = plt.subplots(nrows=2, ncols=5, figsize=(20,10))

cv=0
for i in range(2):
    for j in range(5):
        img = np.load(path + lst1[i] + lst2[j])
        axs[i, j].imshow(img,cmap="gray")
        cv += 1
plt.show()

In [None]:
img = "/media/andres/T7 Shield1/UCAN_project/collages/reshaped_collages/npr113096472044/20110318/SUV_MIP.npy"
image_max = np.load(img)
plt.imshow(image_max, cmap="gray")
plt.show()

In [None]:
img = "/media/andres/T7 Shield1/UCAN_project/collages/reshaped_collages/npr113096472044/20110318/SUV_lean.npy"
image_mean = np.load(img)
plt.imshow(image_mean, cmap="gray")
plt.show()

In [None]:
(image_max == image_mean).all()

In [None]:
df1 = pd.read_excel(r'/home/ashish/Ashish/UCAN/Dataframes/ClinicalData/Lymphoma_Octopus_Export 2023-11-29_C80_only_n1291_incl_YYYY_MM_pnr_removed.xlsx')
df2 = pd.read_excel(r'/home/ashish/Ashish/UCAN/Dataframes/LinkedData/dataset_for_training_366patients_baseline_scans_clinical20231129.xlsx')

In [None]:
df1.translatedDiagnosis.value_counts().to_csv('/home/ashish/Ashish/UCAN/Dataframes/Analysis/clinical_data_analysis.csv')

In [None]:
df2.diagnosis.value_counts().to_csv('/home/ashish/Ashish/UCAN/Dataframes/Analysis/clinical_processed_data_analysis.csv')

In [None]:
import torch
efficientnet = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_efficientnet_b7', pretrained=True)