## Preprocessing for the .xlsx file

In [1]:
# import libraries
import pandas as pd

### Load data from excel sheet

In [46]:
# reads the patient excel sheet and puts it into a pandas DataFrame
# NOTE: this is an absolute, machine-specific path; adjust it when running the notebook elsewhere
path = "/Users/LennartPhilipp/Desktop/Uni/Prowiss/Dateien/Metas_PatID_Full_SMI private Datei.xlsx"
try:
    patientsDataFrame = pd.read_excel(path)
except ImportError as e:
    # Only printing the error would leave patientsDataFrame undefined and the
    # copy below would fail with an unrelated NameError, so stop here instead.
    raise ImportError(
        "Could not read the Excel sheet because an optional pandas dependency "
        "for Excel files is missing (for .xlsx files this is usually 'openpyxl'): "
        + str(e)
    ) from e

# all following cells work on this copy, so the raw data stays untouched
modifiedPatientsDataFrame = patientsDataFrame.copy()

### Remove unnecessary columns

In [47]:
# step 1: the patients' names must not be part of the dataset
withoutNames = modifiedPatientsDataFrame.drop(columns=["name"])

# step 2: only the first 38 of the remaining columns are kept, everything to the right is unused
leadingColumns = withoutNames.iloc[:, :38]

# step 3: columns inside that range that are not used either
unusedColumns = [
    "T.1",
    "primary_coded",
    "N",
    "M",
    "M_number_organ_systems",
    "TNM",
    "Survival ab ED gesamt",
    "Survival ab ED Hirnmet.",
    "Survival_from_ED_Met (months)",
    "Alter bei MRT (J./kalkuliert)",
    "Dauer Tumorleiden\n(kalkuliert; von ED bis ED Hirnmet.)",
    "Karnofsky (%)",
    "periphere Metastasen zum Zeitpunkt des MRTs ( Keine = 0, Einzelne = 1, Oligo =2, Multipel =3)",
    "im Verlauf neu aufgetretene Metastasen (0 = keine bekannt)",
]
modifiedPatientsDataFrame = leadingColumns.drop(columns=unusedColumns)

# removes patients that did not fit the study requirements
# once all the MRI files are on the server, change the following line to only include patients with MRI files: modifiedPatientsDataFrame["on_server"] != 0
# .copy() makes the filtered result an independent DataFrame, so the column assignments below
# do not write into a slice of the unfiltered frame (avoids SettingWithCopyWarning)
modifiedPatientsDataFrame = modifiedPatientsDataFrame[modifiedPatientsDataFrame["in_study (0 = no, 1 = yes, 2 = tbd, 3 = remarkable)"] != 0].copy()

# turn the preop_MRICE_date column into datetime format
# if any value can't be turned into a date, the whole column is left unchanged; this is the same
# all-or-nothing result the deprecated errors="ignore" option produced, written as an explicit try/except
try:
    modifiedPatientsDataFrame["preop_MRICE_date"] = pd.to_datetime(modifiedPatientsDataFrame["preop_MRICE_date"])
except (ValueError, TypeError) as e:
    print("preop_MRICE_date was left unconverted: " + str(e))

# turn all birthdate, death_date, date_first_diagnosis_primary and date_first_diagnosis_brain_met strings into datetime format, setting all strings that can't be turned into dates to NaT
for column in ["birthdate", "death_date", "date_first_diagnosis_primary", "date_first_diagnosis_brain_met"]:
    modifiedPatientsDataFrame[column] = pd.to_datetime(modifiedPatientsDataFrame[column], errors="coerce")

# turn all weights into numerics, setting values that can't be parsed to NaN
modifiedPatientsDataFrame["Gewicht (kg)"] = pd.to_numeric(modifiedPatientsDataFrame["Gewicht (kg)"], errors="coerce")

### CUP patients

In [16]:
# selects the patients whose in_study code is 3 ("remarkable")
# NOTE(review): presumably code 3 is what marks the CUP patients here - confirm
hasCodeThree = modifiedPatientsDataFrame["in_study (0 = no, 1 = yes, 2 = tbd, 3 = remarkable)"] == 3
cupPatients = modifiedPatientsDataFrame[hasCodeThree]

### Check amount of missing values

In [94]:
# counts the missing values of every column in one pass and prints one line per column
missingPerColumn = modifiedPatientsDataFrame.isna().sum()
for columnName, missingCount in missingPerColumn.items():
    print("# of missing values in " + columnName + ": " + str(missingCount))

# of missing values in on_server: 0
# of missing values in in_study (0 = no, 1 = yes, 2 = tbd, 3 = remarkable): 0
# of missing values in reason: 81
# of missing values in MRI_comments: 567
# of missing values in ID: 0
# of missing values in birthdate: 2
# of missing values in sex (m/w): 4
# of missing values in preop_MRICE_date: 7
# of missing values in primary_full: 6
# of missing values in T: 33
# of missing values in death_date: 361
# of missing values in date_first_diagnosis_primary: 189
# of missing values in date_first_diagnosis_brain_met: 181
# of missing values in OP: 19
# of missing values in Chemotherapie: 22
# of missing values in Radiatio: 18
# of missing values in Immuntherapie: 21
# of missing values in Größe (cm): 205
# of missing values in Gewicht (kg): 190
# of missing values in ECOG: 209
# of missing values in AZ Beschreibung: 582
# of missing values in Resektionsstatus (0 = Komplettresektion; 1 = Rest/weitere Hirnfiliae, 2 = keine Resektion, 3 = unklar): 51
# of miss

### Update ECOG values
We assume that patients who were in good shape ("guter AZ") also have an ECOG value of 0, even though this was not explicitly labeled as such.

In [36]:
# sets ECOG to 0 for every patient without an ECOG value whose condition is described as "guter AZ"
# iterrows() hands out copies of the rows, so assigning to such a row never changes the DataFrame;
# a boolean mask together with .loc writes the value into the DataFrame itself
missingEcogButGoodCondition = (
    modifiedPatientsDataFrame["ECOG"].isna()
    & (modifiedPatientsDataFrame["AZ Beschreibung"] == "guter AZ")
)
modifiedPatientsDataFrame.loc[missingEcogButGoodCondition, "ECOG"] = 0


### Adds a new column for the age when the images were taken

In [1]:
# modifiedPatientsDataFrame["age_at_MRI"] = pd.to_datetime(modifiedPatientsDataFrame["preop_MRICE_date"]) - pd.to_datetime((modifiedPatientsDataFrame["birthdate"]))