In [268]:
# Read the relevant excel files in the folder TCGA_tables, TCGA_survival_time_per_patient_by_ofir_metadata.xlsx, TCGA_AI_scores_per_patient.xlsx, and TCGA_drug_sensitivity_per_patient.xlsx

import pandas as pd
import numpy as np

# Read the relevant excel files
surv_time = pd.read_excel('TCGA_tables/TCGA_survival_time_per_patient_by_ofir_metadata.xlsx')
ai_scores = pd.read_excel('TCGA_tables/TCGA_AI_scores_per_patient.xlsx')
drug_sensitivity = pd.read_excel('TCGA_tables/TCGA_drug_sensitivity_per_patient.xlsx')
gene_expression = pd.read_excel('TCGA_tables/TCGA_gene_sig_per_patient.xlsx')

# Merge the dataframes on the PatientID column
merged = pd.merge(surv_time, ai_scores, on='PatientID')
merged = pd.merge(merged, drug_sensitivity, on='PatientID')
merged = pd.merge(merged, gene_expression, on='PatientID')


In [269]:
relevant_columns = ['PatientID', 'Overall_Survival__Months_', 'Overall_Survival_Status', 'tamoxifen', 'lapatinib', 'label_ER', 'label_PR', 'label_Her2', 'Grade', 'TumorType', 'PGR', 'ESR1', 'ERBB2', 'ERBB2_DESMEDT_18698033', 'Gender', 'Age']
merged = merged[relevant_columns]

In [270]:
# Count rows with nan
print("Nuber of rows with nan values:", (merged.isnull().sum(axis=1) > 0).sum())
# Which columns have nan values
print("Columns with nan values:", merged.isnull().sum())

Nuber of rows with nan values: 206
Columns with nan values: PatientID                      0
Overall_Survival__Months_      1
Overall_Survival_Status        1
tamoxifen                    114
lapatinib                    114
label_ER                       3
label_PR                       6
label_Her2                   138
Grade                         53
TumorType                      0
PGR                           12
ESR1                          12
ERBB2                         12
ERBB2_DESMEDT_18698033         4
Gender                         0
Age                            1
dtype: int64


In [271]:
# Drop missing with few examples
merged = merged.dropna(subset=['label_ER', 'Overall_Survival__Months_', 'Overall_Survival_Status'])
# Fill missing PR with ER
merged['label_PR'] = merged['label_PR'].fillna(merged['label_ER'])
# Fill missing label_Her2, Grade, tamoxifen, and lapatinib with -1
merged = merged.fillna(-1)

In [272]:
# count values in TumorType
print(merged['TumorType'].value_counts())

Infiltrating Ductal Carcinoma                                                                746
Infiltrating Lobular Carcinoma                                                               197
Other, specify                                                                                45
Mucinous Carcinoma                                                                            16
Metaplastic Carcinoma                                                                          9
Mixed Histology (please specify) -Infiltrating ductal and lobular carcinoma                    8
Medullary Carcinoma                                                                            4
Mixed Histology (please specify) -lobular/ductal                                               2
Mixed Histology (please specify) -Lobular/Ductal                                               2
Mixed Histology (please specify) -Mixed invasive ductal and invasive lobular                   2
Mixed Histology (please specif

In [273]:
# Create two new columns, one for ductul carcinoma, and one for lobular carcinoma
merged['ductal_carcinoma'] = merged['TumorType'].apply(lambda x: 1 if 'ductal' in x.lower() else 0)
merged['lobular_carcinoma'] = merged['TumorType'].apply(lambda x: 1 if 'lobular' in x.lower() else 0)
merged = merged.drop('TumorType', axis=1)

In [274]:
# binarize the Overall_Survival_Status column
merged['Overall_Survival_Status'] = merged['Overall_Survival_Status'].apply(lambda x: 1 if x == 'DECEASED' else 0)
merged.rename(columns={'Overall_Survival__Months_': 'time', 'Overall_Survival_Status': 'event'}, inplace=True)

In [275]:
merged['Gender'] = merged['Gender'].apply(lambda x: 1 if x == 'Female' else 0)

In [276]:
# For each PatientID, find a folder in '/SSDStorage/Breast/gigapath_CAT_features/gigapath_features' whose name contains the PatientID
# Inside the folder, find the file final_embed_*.npy. Read it and save the vectors inside to a numpy array

import os
import numpy as np

# Get the list of PatientIDs
PatientIDs = merged['PatientID'].values

# Get the list of folders in the gigapath_features folder
folders = os.listdir('/SSDStorage/Breast/gigapath_CAT_features/gigapath_features')

# Create a dictionary to store the embeddings
embeddings = {}

# Loop over the folders
for folder in folders:
    # Check if the folder name contains a PatientID
    for PatientID in PatientIDs:
        if PatientID in folder:
            # Get the list of files in the folder
            files = os.listdir('/SSDStorage/Breast/gigapath_CAT_features/gigapath_features/' + folder)
            # Loop over the files
            for file in files:
                # Check if the file name contains final_embed_
                if 'final_embed_' in file:
                    # Read the file
                    embeddings[PatientID] = np.load('/SSDStorage/Breast/gigapath_CAT_features/gigapath_features/' + folder + '/' + file)[0]

In [277]:
# Use PCA to reduce the dimensionality of the embeddings to 10
from sklearn.decomposition import PCA

pca = PCA(n_components=3)
X = np.array([embeddings[PatientID] for PatientID in PatientIDs])
X = pca.fit_transform(X)

# Print statistics
print("Explained variance ratio:", pca.explained_variance_ratio_)
print("Explained variance ratio sum:", pca.explained_variance_ratio_.sum())

# Merge the embeddings with the dataframe
X = pd.DataFrame(X, columns=['PC' + str(i) for i in range(1, 4)], index=PatientIDs)
merged = pd.merge(merged, X, left_on='PatientID', right_index=True)

Explained variance ratio: [0.5430749  0.10453925 0.08282101]
Explained variance ratio sum: 0.7304352


In [279]:
# Save the resulting table to a csv file
merged.to_csv('TCGA_tables/TCGA_merged.csv', index=False)

In [280]:
# Print the proportion of subjects that have event == 0, and a duration < 5
print("Proportion of subjects with event == 0 and duration < 8.976:", ((merged['event'] == 0) & (merged['time'] < 8.976)).sum() / (merged['event'] == 0).sum())
print("Proportion of subjects with event == 1 and duration < 8.976:", ((merged['event'] == 1) & (merged['time'] < 8.976)).sum() / (merged['event'] == 0).sum())

Proportion of subjects with event == 0 and duration < 8.976: 0.10398230088495575
Proportion of subjects with event == 1 and duration < 8.976: 0.012168141592920354


In [281]:
# survival 0.1 quantile
print("Censored time 0.1 quantile:", merged['time'].quantile(0.1))
# uncensored 0.1 quantile
print("Uncensored 0.1 quantile:", merged[merged['event'] == 1]['time'].quantile(0.1))
# censored 0.1 quantile
print("Censored 0.1 quantile:", merged[merged['event'] == 0]['time'].quantile(0.1))

Censored time 0.1 quantile: 8.979000000000001
Uncensored 0.1 quantile: 10.458
Censored 0.1 quantile: 8.788000000000002


In [282]:
# Mean and median time
print("Mean time:", merged['time'].mean())
print("Median time:", merged['time'].median())

Mean time: 41.51514367816092
Median time: 27.86
