In [None]:
"""
Author: Miriam Cobo Cano
"""

import sys
import torch
import datetime
import numpy as np
from model.train import train_model
from model.test import val_model, test_model
import pandas as pd
from sklearn.utils import class_weight
import logging
from model.plots import plot_training_metrics
sys.path.append('/home/ubuntu/tenerife/data/ZZ_githubRepos/baselinesLungAmbition/ImageModels/LoadData')
from load_LungAmbition3D_test_data import load_lungAmbition
# for parallelization
import torch.optim as optim
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
import matplotlib.pyplot as plt
import psutil
import ast
from monai.networks.nets import DenseNet121
torch.cuda.empty_cache()
import argparse
import os
import json
from types import SimpleNamespace

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
np.set_printoptions(precision=3)
n_folds = 3
# Directory holding one `id2splitfold_<k>.csv` per fold (read in the fold loop).
path_to_folds_csv = f'/home/ubuntu/tenerife/data/ZZ_githubRepos/LungAmbition/Data_stratified_split/folds-def_{n_folds}folds'
# When True, False_Positive patients are removed from df_merged and kept in
# their own X/y/ID variables so they can be evaluated separately.
keep_false_positives_as_separate_test = True

# ---------------------------------------------------------------------------
# Load the merged clinical table
# ---------------------------------------------------------------------------
df_merged = pd.read_csv('/home/ubuntu/tenerife/data/LungAmbition/Excels_merged/LungAmbitionMergedAllGroupUpdated3mar2025.csv')
# Keep only Lung_Cancer, Benign_Nodules and False_Positive according to GroupUpdated.
# NOTE(review): this first filter uses 'GroupUpdated' while every later filter
# uses 'Group' -- confirm that the two columns are meant to be mixed.
df_merged = df_merged[df_merged['GroupUpdated'].isin(['Lung_Cancer', 'Benign_Nodules', 'False_Positive'])]
# .copy() detaches the selection from the frame it was sliced from, so the
# column assignment below does not raise SettingWithCopyWarning.
df_merged = df_merged[['ID_proteinData', 'Group', 'Stage_category', 'NRRD_File', 'SEG_Files', 'Cancer_Status', 'TimeYears_CT_blood']].copy()
# SEG_Files is stored as the text of a Python literal; parse it back into an object.
df_merged['SEG_Files'] = df_merged['SEG_Files'].apply(ast.literal_eval)

if keep_false_positives_as_separate_test:
    # Compute the False_Positive selection once and reuse it for X, y and IDs.
    is_false_positive = df_merged['Group'] == 'False_Positive'
    false_positive_rows = df_merged[is_false_positive]
    # Label every false positive as 0.
    y_false_positives = false_positive_rows['Group'].replace({'False_Positive': 0})
    ID_false_positives = false_positive_rows['ID_proteinData']
    # Filled later with the IDs of false positives the model gets wrong.
    list_ID_wrong_predicted_false_positives = []
    X_false_positives = false_positive_rows.drop(columns=['ID_proteinData', 'Group'])
    # Remove the False_Positive rows from df_merged; .copy() keeps later column
    # assignments on df_merged free of SettingWithCopyWarning.
    df_merged = df_merged[df_merged['Group'].isin(['Lung_Cancer', 'Benign_Nodules'])].copy()
    print("Number of false positives:", X_false_positives.shape[0])
    # Empty frame that will collect the metrics obtained on the false positives.
    false_positive_metrics = pd.DataFrame(columns=['AUC', 'Balanced_accuracy', 'Accuracy', 'Precision', 'Recall', 'F1-score'])
else:
    # False_Positive rows stay in df_merged in this branch.
    df_merged = df_merged[df_merged['Group'].isin(['Lung_Cancer', 'Benign_Nodules', 'False_Positive'])].copy()
    # The previous message said False_Positive rows had been dropped, which is
    # not what this branch does.
    print("Dimensions excel after filtering Group to Lung_Cancer, Benign_Nodules and False_Positive:", df_merged.shape)

# Sanity checks on the label columns:
#   * every patient with TimeYears_CT_blood == 0 must have Cancer_Status == 1
#   * every patient with TimeYears_CT_blood == 5 must have Cancer_Status == 0
ct_blood_years = df_merged["TimeYears_CT_blood"]
assert (df_merged.loc[ct_blood_years == 0, "Cancer_Status"] == 1).all(), \
    "There are patients with TimeYears_CT_blood = 0 who do not have Cancer_Status = 1"

assert (df_merged.loc[ct_blood_years == 5, "Cancer_Status"] == 0).all(), \
    "There are patients with TimeYears_CT_blood = 5 who do not have Cancer_Status = 0"

# Binary malignancy target: 0 when Cancer_Status is 0, 1 for any other value.
# `assign` returns a new frame instead of writing a column into a frame that
# was produced by boolean filtering, which avoids SettingWithCopyWarning, and
# the vectorised comparison replaces the per-row lambda.
df_merged = df_merged.assign(Malignancy=df_merged['Cancer_Status'].ne(0).astype(int))

# Empty frame that will hold the best metrics of each fold.
fold_metrics_df = pd.DataFrame(columns=['Fold', 'Accuracy', 'Specificity', 'NPV', 'Precision', 'Recall', 'F1-score'])

# Batch size for the exploratory loader built in the loop. The original code
# read `config_args.batch_size`, but `config_args` is never defined in this
# notebook, so a fresh kernel failed with NameError.
# NOTE(review): 1 matches the loader rebuilt in a later cell -- adjust if a
# different value is intended.
batch_size = 1

for fold in range(n_folds):
    print("=" * 80)
    print(f"Fold {fold + 1}:")
    # Read the patient IDs assigned to train / test / val for this fold.
    fold_data = pd.read_csv(os.path.join(path_to_folds_csv, f'id2splitfold_{fold}.csv'))
    train_index = fold_data.loc[fold_data['split'] == 'train', 'ID_proteinData']
    test_index = fold_data.loc[fold_data['split'] == 'test', 'ID_proteinData']
    val_index = fold_data.loc[fold_data['split'] == 'val', 'ID_proteinData']
    # Select the rows of df_merged that belong to each split.
    train = df_merged.loc[df_merged['ID_proteinData'].isin(train_index)]
    test = df_merged.loc[df_merged['ID_proteinData'].isin(test_index)]
    val = df_merged.loc[df_merged['ID_proteinData'].isin(val_index)]
    # Report the class balance (Malignancy 0 vs 1) of every split.
    for split_name, split_df in (("Train", train), ("Val", val), ("Test", test)):
        n_benign = split_df[split_df['Malignancy'] == 0].shape[0]
        n_cancer = split_df[split_df['Malignancy'] == 1].shape[0]
        print(f"{split_name}, total benign nodules", n_benign, "lung cancer", n_cancer)
    # Build the loader from this fold's training split only. The original call
    # passed the whole df_merged, which also contains the val and test patients.
    train_loader = load_lungAmbition(
        train,
        batch_size=batch_size,
        spatial_size=[32, 32, 32],
        shuffle=False,
        type_processing=None,
    )
    # Only the first fold is inspected in this notebook.
    break

Number of false positives: 5
Fold 1:
Train, total benign nodules 40 lung cancer 18
Val, total benign nodules 5 lung cancer 2
Test, total benign nodules 23 lung cancer 11


In [12]:
# Rebuild the loader from the current fold's training split, one sample per
# batch, in fixed (unshuffled) order and with spatial_size 64 x 64 x 64.
train_loader = load_lungAmbition(
    train,
    batch_size=1,
    spatial_size=[64, 64, 64],
    shuffle=False,
    type_processing=None,
)

In [13]:
# Fetch only the first (index, batch) pair from train_loader and print it.
# Nothing is printed, and `i` / `data` are left untouched, if the loader is empty.
first_batch = next(enumerate(train_loader), None)
if first_batch is not None:
    i, data = first_batch
    print(i)
    print(data)

0
{'image': tensor([[[[[0.0286, 0.1157, 0.1779,  ..., 0.7593, 0.7786, 0.7279],
           [0.1143, 0.0379, 0.0479,  ..., 0.5636, 0.6621, 0.7486],
           [0.2129, 0.0121, 0.0571,  ..., 0.8457, 0.7393, 0.6757],
           ...,
           [0.1329, 0.2093, 0.1071,  ..., 0.0879, 0.0571, 0.0893],
           [0.2357, 0.2036, 0.1479,  ..., 0.0307, 0.0279, 0.1771],
           [0.1464, 0.1900, 0.0000,  ..., 0.0471, 0.0000, 0.0307]],

          [[0.1371, 0.0071, 0.0414,  ..., 0.7393, 0.8793, 0.6457],
           [0.0136, 0.0636, 0.0000,  ..., 0.6329, 0.7229, 0.6171],
           [0.0450, 0.1836, 0.0136,  ..., 0.8550, 0.7929, 0.5729],
           ...,
           [0.2236, 0.0000, 0.1043,  ..., 0.1029, 0.1379, 0.0014],
           [0.2243, 0.1136, 0.0850,  ..., 0.1571, 0.0679, 0.0236],
           [0.1257, 0.0229, 0.0314,  ..., 0.0679, 0.0000, 0.0000]],

          [[0.0450, 0.0529, 0.1143,  ..., 0.6443, 0.5529, 0.6679],
           [0.0571, 0.0957, 0.1036,  ..., 0.8371, 0.7343, 0.8250],
           [0.

In [14]:
# Show which fields a batch from train_loader carries.
data.keys()

dict_keys(['image', 'seg', 'id', 'mal', 'seg_path'])

In [17]:
# Shape of the image tensor in the batch
# (presumably batch, channel, then the three spatial dimensions -- TODO confirm axis order).
data['image'].shape

torch.Size([1, 1, 64, 64, 64])

In [18]:
# Shape of the segmentation tensor in the batch; compare with the shape of data['image'].
data['seg'].shape

torch.Size([1, 1, 64, 64, 64])