In [1]:
# Imports
import os
import pandas as pd
import numpy as np


# Clinical data extraction

In [2]:
#Read in the data
data_path = "L:/Basic/divi/jstoker/slicer_pdac/Master Students WS 24/Martijn/data/dataset_CAESAR_dec23_MASTER.xlsx"
all_data = pd.read_excel(data_path)

pat_resp_data_path = "C:/Users/P095550/OneDrive - Amsterdam UMC/Documenten/GitHub/CRLM-morph-features/CAIRO5_path_responseTRG__dec2023.xlsx"
pat_resp_data = pd.read_excel(pat_resp_data_path)

### Training input

In [11]:
training_data = all_data[["SubjectKey", "sex", "Age", "largestsize", "distribution", "synchr", "sidedness", "KRAS", "NRAS", "BRAF", "treatment", "CEALEVEL", "TOTMETAS", "nodalstatus"]]
training_data.head()


Unnamed: 0,SubjectKey,sex,Age,largestsize,distribution,synchr,sidedness,KRAS,NRAS,BRAF,treatment,CEALEVEL,TOTMETAS,nodalstatus
0,1,Female,56,62,Bilobar,Synchronous,Left,0,0,1,FOLFOX-B,71.8,50,Positive
1,2,Male,65,34,Bilobar,Synchronous,Left,0,0,0,FOLFOX-P,1200.0,9,Missing
2,3,Male,55,48,Bilobar,Synchronous,Left,1,0,0,FOLFOX-B,357.0,6,Missing
3,4,Female,35,67,Bilobar,Synchronous,Left,0,0,0,FOLFOX-P,8090.0,68,Missing
4,5,Female,57,38,Bilobar,Synchronous,Right,0,0,0,FOLFIRI-B,148.0,9,Positive


In [12]:
# Correctly apply transformations to columns
training_data["male"] = training_data["sex"].apply(lambda x: 1 if x == "Male" else 0)
training_data["synchr"] = training_data["synchr"].apply(lambda x: 1 if x == "Synchronous" else 0)
training_data["left_sidedness"] = training_data["sidedness"].apply(lambda x: 1 if x == "Left" else 0)
training_data["bevacizumab"] = training_data["treatment"].apply(lambda x: 1 if x.endswith("B") else 0)
training_data["bilobar"] = training_data["distribution"].apply(lambda x: 1 if x == "Bilobar" else 0)
#training_data["nodalstatus"] = training_data["nodalstatus"].replace("Missing", np.nan)
training_data["nodalstatus"] = training_data["nodalstatus"].apply(lambda x: 1 if x == "Positive" else (0 if x == "Negative" else np.nan))


# Drop the original columns if they are no longer needed
training_data.drop(columns=["sex", "sidedness", "treatment", "distribution"], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  training_data["male"] = training_data["sex"].apply(lambda x: 1 if x == "Male" else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  training_data["synchr"] = training_data["synchr"].apply(lambda x: 1 if x == "Synchronous" else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  training_data["left_s

In [13]:
nan_counts = training_data.isna().sum()
print(nan_counts)

SubjectKey          0
Age                 0
largestsize         0
synchr              0
KRAS                0
NRAS                0
BRAF                0
CEALEVEL            1
TOTMETAS            0
nodalstatus       361
male                0
left_sidedness      0
bevacizumab         0
bilobar             0
dtype: int64


In [14]:
training_data.head()

Unnamed: 0,SubjectKey,Age,largestsize,synchr,KRAS,NRAS,BRAF,CEALEVEL,TOTMETAS,nodalstatus,male,left_sidedness,bevacizumab,bilobar
0,1,56,62,1,0,0,1,71.8,50,1.0,0,1,1,1
1,2,65,34,1,0,0,0,1200.0,9,,1,1,0,1
2,3,55,48,1,1,0,0,357.0,6,,1,1,1,1
3,4,35,67,1,0,0,0,8090.0,68,,0,1,0,1
4,5,57,38,1,0,0,0,148.0,9,1.0,0,0,1,1


In [15]:
# Subset the data to only include patients which we have CT scans for

paired_scans_path = "L:/Basic/divi/jstoker/slicer_pdac/Master Students WS 24/Martijn/data/Training/paired_scans"
subject_keys = set()
for filename in os.listdir(paired_scans_path):
    subject_keys.add(int(filename[6:9]))

training_data = training_data[training_data["SubjectKey"].isin(subject_keys)]

training_data.drop(columns=["SubjectKey"], inplace=True)

In [9]:
training_data.head()

Unnamed: 0,Age,largestsize,synchr,KRAS,NRAS,BRAF,CEALEVEL,TOTMETAS,male,left_sidedness,bevacizumab,bilobar
0,56,62,1,0,0,1,71.8,50,0,1,1,1
3,35,67,1,0,0,0,8090.0,68,0,1,0,1
6,54,25,1,1,0,0,1.9,3,1,0,1,0
7,33,59,1,0,0,0,231.0,17,0,1,1,1
9,72,85,1,0,0,0,104.2,15,0,1,0,1


In [16]:

# Save the dataframe as a CSV file
training_data.to_csv("training_data/training_data.csv", index=False)

### Training labels

In [19]:
training_targets = all_data[["OSm"]]

# Save log survival times
training_targets['log_OSm'] = np.log(training_targets['OSm'])
training_targets.drop(columns=["OSm"], inplace=True)
training_targets.to_csv('training_data/training_targets_OS_log_whole.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  training_targets['log_OSm'] = np.log(training_targets['OSm'])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  training_targets.drop(columns=["OSm"], inplace=True)


In [21]:
nan_counts_targets = training_targets.isna().sum()
print(nan_counts_targets)

OSSTAT    0
dtype: int64


### Training occurences

In [None]:
training_targets_occurences = all_data[["OSSTAT"]]
training_targets_occurences.to_csv('training_data/training_targets_OS_occurence_whole.csv', index=False)

In [None]:
print(all_data.isna().sum())