In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from core import utils

  from .autonotebook import tqdm as notebook_tqdm


## Reading the relevent TaylorX data

In [2]:
patient_data = pd.read_csv("/data/unsynced_data/Breast/TAILORx/NCT00310180-D1-Dataset.csv")
slide_data = pd.read_excel("/data/unsynced_data/Breast/TAILORx/Deid/TAILORx_1/slides_data_TAILORx_1.xlsx")
patient_slide_connection = pd.read_excel("/data/unsynced_data/Breast/TAILORx/PACT1 Blinded Digital Slides Final Manifest - Revised.xlsx")
deleted_slides = pd.read_excel("/mnt/gipmed_new/Data/data_QA_and_bad_slides/HE_TAILORx/slide_review_list_TAILORX.xlsx", 'bad slides')

## Reading current largest metadata

In [3]:
current_meta = pd.read_csv("metadata_csvs/largest_current_metadata.csv")

## data treatment

In [4]:
print(f"current meta keys are: {current_meta.keys()}")

current meta keys are: Index(['Unnamed: 0', 'file', 'patient_barcode', 'id', 'mpp', 'total_tiles',
       'tiles_count', 'legitimate_tiles', 'width', 'height', 'magnification',
       'er_status', 'pr_status', 'her2_status', 'grade', 'tumor_type',
       'ki_67_status', 'onco_ki_67', 'onco_score_11', 'onco_score_18',
       'onco_score_26', 'onco_score_31', 'onco_score_all', 'fold'],
      dtype='object')


In [5]:
print(f"patient data keys are: {patient_data.keys()}") 
# blindid should be associated with connection. 
# should be renamed to patient_barcode.
# All else should be kept.
print(f"patient slide connection keys are: {patient_slide_connection.keys()}")
# Full File Name and Deidentified ID should be associated with connection.
# Full File Name => file
# Deidentified => patient_barcode
# All others need to be dropped.
print(f"slide data keys are: {slide_data.keys()}")
# file should be used for connection.
# Width * Height => Total tiles.
# use _add_tiles_count for dataset tiles_count and legitimate tiles
# randomize 5 folds
# all columns aside from id, file, DX, Objective Power, fold should be dropped.
print(f"deleted_slides keys are: {deleted_slides.keys()}")

patient data keys are: Index(['blindid', 'rxarm', 'InAnalysis', 'osind', 'inel', 'StratTumorSize',
       'StratMeno', 'StratPlannedChemo', 'StratPlannedRT', 'RSgp', 'Strat',
       'RS', 'age', 'meno', 'race', 'ethnicity', 'TumorSize', 'TumorSizeGp',
       'Grade', 'NucGrade', 'ERStatus', 'PRStatus', 'PrimSurg', 'RecChemo',
       'ChemRegGp', 'ChemReg', 'TypeEndocrine', 'ttfET', 'ttlET', 'durET',
       'endET', 'dfs', 'dfsind', 'drfi', 'drfiind', 'rfi', 'rfiind',
       'survtime', 'survstat', 'WithdrawConsent', 'LostFU', 'typefdfs',
       'typefrec', 'cause'],
      dtype='object')
patient slide connection keys are: Index(['Full File Name', 'File ID', 'Deidentified ID', 'EAS Slide ID'], dtype='object')
slide data keys are: Index(['Unnamed: 0.1', 'Unnamed: 0', 'patient barcode', 'id', 'file', 'DX',
       'MPP', 'Width', 'Height', 'Objective Power', 'Scan Date', 'ER status',
       'PR status', 'Her2 status', 'test fold idx',
       'Manipulated Objective Power'],
      dtype='obj

In [6]:
relevant_features = ["blindid", "osind", "inel", "StratMeno", "StratPlannedRT", "rxarm", \
         "RS", "age", "race", "ethnicity", "TumorSize", "Grade", "NucGrade", "ERStatus", \
         "PRStatus", "PrimSurg", "ChemRegGp", "TypeEndocrine", "ttfET", "InAnalysis",\
         "dfs", "dfsind", "drfi", "drfiind", "rfi", "rfiind", "survtime", "survstat",\
         "WithdrawConsent", "LostFU",  "typefdfs", "typefrec", "cause"]
patient_data = patient_data[relevant_features]
patient_data.rename(columns={'blindid': 'patient_barcode', "ERStatus": "er_status", "PRStatus": "pr_status"}, inplace=True)
value_change_dict = {"StratPlannedRT":
                        {1: "Whole Breast, no boost", 2: "Whole breast with boost", 3: "partial breast", 4: "none planned"},
                    "age":
                        {">=90": 90},
                    "race":
                        {1: "White", 3: "Black", 4: "Asian", 5: "Pacific Islander", 6: "Native American", 98: "Multirace", 99: "Not Reported"},
                    "ethnicity":
                        {1: "Hispanic", 2: "Not Hispanic", 99: "Not Reported"},
                    "pr_status":
                        {"Pos": 1, "Neg": 0},
                    "er_status":
                        {"Pos": 1, "Neg": 0},
                    'PrimSurg':
                        {'Mx': 1, 'Tx':0},
                    "cause":
                        {1: "Protocol treatment", 2: "Breast cancer", 3: "Cardiovascular disease", 4: "Other chronic disease", 5: "Other cancer", 6: "Other", 99: pd.NA},
                    "typefrec":
                        {1: "ipsilateral breast recurrence", 2: "recurrence at local-regional site", 3: "recurrence at distant site"},
                    "typefdfs":
                        {1: "ipsilateral breast recurrence", 2: "recurrence at local-regional site", 3: "recurrence at distant site", 
                         4: "new cancer of the opposite breast", 5: "new primary cancer at other than breast or non-melanoma skin cancer", 
                         6: "death without another event reported"},
                    "ChemRegGp": 
                        {'1CMF': 'CMF', '2Anthracycline w/o Taxane': 'Anthracycline w/o Taxane', '3Anthracycline and Taxane': 'Anthracycline and Taxane', '4TC and variations': 'TC and variations', 'Other or Not Specified': pd.NA,  '6None': 'None'},
                    "InAnalysis":
                        {"TRUE": 1, "FALSE": 0}
                    
}
patient_data.replace(value_change_dict, inplace=True)


In [7]:
# Assuming df is your DataFrame
excluded_columns = ["dfs", "dfsind", "drfi", "drfiind", "rfi", "rfiind", "survtime", "survstat", "WithdrawConsent", "LostFU",  "typefdfs", "typefrec", "cause"]
subset_df = patient_data[patient_data.columns.difference(excluded_columns)].loc[patient_data['InAnalysis'] == 1]
num_rows_with_na = subset_df.isnull().sum()
num_rows_with_na

ChemRegGp             0
Grade               289
InAnalysis            0
NucGrade           1977
PrimSurg              0
RS                    0
StratMeno             0
StratPlannedRT        0
TumorSize             3
TypeEndocrine         0
age                   0
er_status             0
ethnicity             0
inel                  0
osind                 0
patient_barcode       0
pr_status           204
race                  0
rxarm                 0
ttfET               175
dtype: int64

In [8]:
# ohe_keys = [*value_change_dict.keys(), "meno", "Grade", "NucGrade", "ERStatus", "PRStatus", "PrimSurg", "ChemRegGp", "TypeEndocrine"]
# ohe_keys.remove("age")
# dummies = pd.get_dummies(patient_data[ohe_keys])
# dummies = dummies.drop(["race_Race Not reported", "ethnicity_Eth. Not Reported", "StratPlannedRT_none planned", "StratPlannedChemo_not applicable"], axis=1, errors='ignore')
# df = pd.concat([patient_data, dummies], axis=1).drop(ohe_keys, axis=1)
# df = df.astype("float64")
# df.iloc[:,1:-1] = df.iloc[:,1:-1].apply(lambda x: (x-x.mean())/ x.std(), axis=0)

In [9]:
patient_slide_connection.rename(columns={'Full File Name': 'file', "Deidentified ID": "patient_barcode"}, inplace=True)
patient_slide_connection = patient_slide_connection[['file', "patient_barcode"]]
patient_slide_connection["patient_barcode"] = patient_slide_connection["patient_barcode"].str.split("-").str[0].fillna(patient_slide_connection["patient_barcode"]).astype('int64')

In [10]:
def _get_tiles_count(row: pd.Series) -> int:
    dataset_path = "/data/unsynced_data/Breast/TAILORx/Deid/TAILORx_1"
    image_file_name_stem = Path(row["file"]).stem
    if utils.check_segmentation_data_exists(dataset_path=dataset_path, desired_magnification=10, image_file_name_stem=image_file_name_stem, tile_size=256) is False:
        return 0
    else:
        segmentation_data = utils.load_segmentation_data(dataset_path=dataset_path, desired_magnification=10, image_file_name_stem=image_file_name_stem, tile_size=256)
        return segmentation_data.shape[0]
        
def _add_tiles_count(df: pd.DataFrame) -> pd.DataFrame:
    df["tiles_count"] = df.apply(
        lambda row: _get_tiles_count(row=row), axis=1
    )
    df["legitimate_tiles"] = df["tiles_count"]
    return df


In [11]:
slide_data = _add_tiles_count(slide_data)

In [12]:
slide_data = slide_data[~slide_data["patient barcode"].isin(deleted_slides["slide"])]

In [13]:
slide_data.rename(columns={'Manipulated Objective Power': 'magnification', "Height": "height", "Width": "width", "MPP": "mpp", "patient barcode": "patient_barcode"}, inplace=True)

slide_data = slide_data[["legitimate_tiles", "tiles_count", "magnification", "height", "width", "mpp","file","id"]]
slide_data

Unnamed: 0,legitimate_tiles,tiles_count,magnification,height,width,mpp,file,id
0,1613,1613,20,45961,49800,0.5026,PACCT1_6753561-1_AperioUUID11795.svs,TAILORx_1
1,2466,2466,20,28247,47808,0.5026,PACCT1_1167281_AperioUUID16657.svs,TAILORx_1
2,2292,2292,20,47823,75696,0.5026,PACCT1_3019140_AperioUUID21278.svs,TAILORx_1
3,2512,2512,20,46531,45816,0.5026,PACCT1_1340169-1_AperioUUID12948.svs,TAILORx_1
4,2419,2419,20,40025,61752,0.5026,PACCT1_1274621_AperioUUID26572.svs,TAILORx_1
...,...,...,...,...,...,...,...,...
9613,2414,2414,20,46772,55776,0.5026,PACCT1_4855283_AperioUUID17094.svs,TAILORx_1
9614,3052,3052,20,49938,65736,0.5026,PACCT1_3942733_AperioUUID26037.svs,TAILORx_1
9615,1138,1138,20,48883,59760,0.5026,PACCT1_3465713_AperioUUID24546.svs,TAILORx_1
9616,2628,2628,20,48232,63744,0.5026,PACCT1_2530740_AperioUUID18060.svs,TAILORx_1


In [14]:
merged = pd.merge(pd.merge(slide_data, patient_slide_connection, on="file"), patient_data, on="patient_barcode")
merged

Unnamed: 0,legitimate_tiles,tiles_count,magnification,height,width,mpp,file,id,patient_barcode,osind,...,drfiind,rfi,rfiind,survtime,survstat,WithdrawConsent,LostFU,typefdfs,typefrec,cause
0,1613,1613,20,45961,49800,0.5026,PACCT1_6753561-1_AperioUUID11795.svs,TAILORx_1,6753561,1,...,0,3249,0,3249,0,0,0,,,
1,1080,1080,20,39541,51792,0.5026,PACCT1_6753561-2_AperioUUID11797.svs,TAILORx_1,6753561,1,...,0,3249,0,3249,0,0,0,,,
2,2795,2795,20,45634,55776,0.5026,PACCT1_6753561-3_AperioUUID22222.svs,TAILORx_1,6753561,1,...,0,3249,0,3249,0,0,0,,,
3,2466,2466,20,28247,47808,0.5026,PACCT1_1167281_AperioUUID16657.svs,TAILORx_1,1167281,1,...,0,3318,0,3326,0,0,0,,,
4,2292,2292,20,47823,75696,0.5026,PACCT1_3019140_AperioUUID21278.svs,TAILORx_1,3019140,1,...,0,3289,0,3289,0,0,0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9552,2181,2181,20,35396,51792,0.5026,PACCT1_5781358_AperioUUID17886.svs,TAILORx_1,5781358,1,...,0,3277,0,3327,0,0,0,,,
9553,2414,2414,20,46772,55776,0.5026,PACCT1_4855283_AperioUUID17094.svs,TAILORx_1,4855283,1,...,0,2510,0,2527,1,0,0,new primary cancer at other than breast or non...,,Other cancer
9554,3052,3052,20,49938,65736,0.5026,PACCT1_3942733_AperioUUID26037.svs,TAILORx_1,3942733,1,...,0,2520,0,2551,0,0,0,,,
9555,1138,1138,20,48883,59760,0.5026,PACCT1_3465713_AperioUUID24546.svs,TAILORx_1,3465713,1,...,0,490,0,1462,0,1,0,,,


In [15]:
import re

def extract_number(s):
    match = re.search(r'PACCT1_(\d+)', s)
    if match:
        return int(match.group(1)) # return the first captured group as int
    else:
        raise Exception('String does not match the expected format')

merged['patient_barcode'] = merged['file'].apply(extract_number).astype('int32')
unique_numbers = merged['patient_barcode'].unique()

# Create a dictionary that maps each extracted number to a random number between 1 and 5
number_to_random_mapping = {number: np.random.randint(1, 5) for number in unique_numbers}
merged['fold'] = merged['patient_barcode'].map(number_to_random_mapping)

merged['mpp'] = 10.0 / merged['magnification']
# print(merged.dtypes)
print(merged.dtypes[merged.dtypes == object])
merged.er_status = merged.er_status.astype('str')
print(merged.er_status.unique())
merged.pr_status = merged.pr_status.astype('str')
merged.file = merged.file.astype('str')
print(merged.dtypes[merged.dtypes == object])

file              object
id                object
StratPlannedRT    object
rxarm             object
age               object
race              object
ethnicity         object
Grade             object
NucGrade          object
ChemRegGp         object
TypeEndocrine     object
typefdfs          object
typefrec          object
cause             object
dtype: object
['1.0' 'nan' '0.0']
file              object
id                object
StratPlannedRT    object
rxarm             object
age               object
race              object
ethnicity         object
Grade             object
NucGrade          object
er_status         object
pr_status         object
ChemRegGp         object
TypeEndocrine     object
typefdfs          object
typefrec          object
cause             object
dtype: object


In [16]:
concated = pd.concat([current_meta, merged])
print(concated.er_status.unique())
concated.pr_status[(concated.pr_status == 'nan') | (concated.pr_status.isna())] = pd.NA
concated.er_status[(concated.er_status == 'nan') | (concated.er_status.isna())] = pd.NA
print(concated.er_status.unique())


['3.0' '1.0' '2.0' 'Negative' 'Positive' nan 'Stained but IHC missing'
 'Missing Data Data' 'Equivocal' 'Indeterminate' 'nan' '0.0']
['3.0' '1.0' '2.0' 'Negative' 'Positive' <NA> 'Stained but IHC missing'
 'Missing Data Data' 'Equivocal' 'Indeterminate' '0.0']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  concated.pr_status[(concated.pr_status == 'nan') | (concated.pr_status.isna())] = pd.NA
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  concated.er_status[(concated.er_status == 'nan') | (concated.er_status.isna())] = pd.NA


In [17]:
HL_pd = pd.read_excel("Summary_data_per_slide_With_HL_noisy.xlsx")

In [18]:
HL_pd.columns

Index(['FoldID', 'DatasetName', 'BatchID', 'SlideName', 'SlideID', 'BlockID',
       'TissueID', 'SampleID', 'PatientID', 'TumorType', 'TissueType',
       'BirthDate', 'BreastSide', 'Gender', 'TissueDate', 'Age', 'Grade',
       'label_ER', 'label_PR', 'label_Her2', 'label_Ki67', 'label_IsCancer',
       'IHC_ER', 'IHC_PR', 'IHC_Her2', 'IHC_Ki67', 'label_ER_corrected',
       'label_PR_corrected', 'IHC_ER_corrected', 'IHC_PR_corrected',
       'PS_CAT_ensemble_ER', 'PS_CAT_foldless_ER', 'PS_CAT_CV_ER',
       'PS_CAT_ensemble_PR', 'PS_CAT_foldless_PR', 'PS_CAT_CV_PR',
       'PS_CAT_ensemble_Her2', 'PS_CAT_foldless_Her2', 'PS_CAT_CV_Her2',
       'PS_CAT_ensemble_ER_or_PR', 'PS_CAT_foldless_ER_or_PR',
       'PS_CAT_CV_ER_or_PR', 'PS_Carmel_foldless_ER', 'PS_Carmel_foldless_PR',
       'PS_Carmel_foldless_Her2', 'PS_Carmel_ensemble_Ki67',
       'PS_Carmel_foldless_Ki67', 'PS_Carmel_CV_Ki67', 'PS_TA_ensemble_ER',
       'PS_TA_foldless_ER', 'PS_TA_CV_ER', 'PS_TA_ensemble_PR',
       '

In [19]:
HL_pd[[f"label_HL_noisy_{i}" for i in range(1,10)]] = HL_pd[[f"label_HL_noisy_ {i}" for i in range(1,10)]]
HL_pd[["er_status", "pr_status", "her2_status"]] = HL_pd[['label_ER', 'label_PR', 'label_Her2']]
HL_pd = HL_pd[HL_pd["BatchID"]!="Carmel11_Rescanned"]
HL_pd.loc[HL_pd["BatchID"]=="Carmel11","SlideID"]

10978     21-8212_1_1_b
10979     21-8662_1_1_b
10980     21-8210_1_1_b
10981     21-8113_1_1_b
10982     21-8488_1_1_b
              ...      
11831    21-1617_2_13_b
11832     21-1827_1_1_b
11833     21-1657_1_7_b
11834     21-1835_2_7_b
11835     21-1869_1_4_a
Name: SlideID, Length: 858, dtype: object

In [20]:
HL_pd = HL_pd[["SlideID", "er_status", "pr_status", "her2_status", "label_HL", *[f"label_HL_noisy_{i}" for i in range(1,21)]]]

In [21]:
HL_pd = HL_pd[HL_pd["label_HL"].notna()]

In [22]:
concated["SlideID"] = concated["file"].str.replace(r'\.[^.]*$', '')
concated["SlideID"]
concated = concated.drop(['er_status', 'pr_status', 'her2_status', 'ki_67_status'], axis=1)
concated.columns

  concated["SlideID"] = concated["file"].str.replace(r'\.[^.]*$', '')


Index(['Unnamed: 0', 'file', 'patient_barcode', 'id', 'mpp', 'total_tiles',
       'tiles_count', 'legitimate_tiles', 'width', 'height', 'magnification',
       'grade', 'tumor_type', 'onco_ki_67', 'onco_score_11', 'onco_score_18',
       'onco_score_26', 'onco_score_31', 'onco_score_all', 'fold', 'osind',
       'inel', 'StratMeno', 'StratPlannedRT', 'rxarm', 'RS', 'age', 'race',
       'ethnicity', 'TumorSize', 'Grade', 'NucGrade', 'PrimSurg', 'ChemRegGp',
       'TypeEndocrine', 'ttfET', 'InAnalysis', 'dfs', 'dfsind', 'drfi',
       'drfiind', 'rfi', 'rfiind', 'survtime', 'survstat', 'WithdrawConsent',
       'LostFU', 'typefdfs', 'typefrec', 'cause', 'SlideID'],
      dtype='object')

In [23]:
concated = pd.merge(concated, HL_pd, on="SlideID", how="left")

In [24]:
concated.loc[(concated["id"] == "TCGA") & (concated["file"].str.contains("-DX")), "id"] = "TCGA_DX"
concated.loc[(concated["id"] == "TCGA") & ~(concated["file"].str.contains("-DX")), "id"] = "TCGA_not_DX"
concated

Unnamed: 0.1,Unnamed: 0,file,patient_barcode,id,mpp,total_tiles,tiles_count,legitimate_tiles,width,height,...,label_HL_noisy_11,label_HL_noisy_12,label_HL_noisy_13,label_HL_noisy_14,label_HL_noisy_15,label_HL_noisy_16,label_HL_noisy_17,label_HL_noisy_18,label_HL_noisy_19,label_HL_noisy_20
0,0.0,GS0117622202.tiff,436.0,SHEBA2,,3605.0,231,231.0,104960.0,35328.0,...,,,,,,,,,,
1,1.0,GS133536102.tiff,355.0,SHEBA2,,2745.0,690,690.0,62464.0,45568.0,...,,,,,,,,,,
2,2.0,GS210801202.tiff,435.0,SHEBA2,,2040.0,567,567.0,51712.0,40448.0,...,,,,,,,,,,
3,3.0,GS212441302.tiff,400.0,SHEBA2,,3854.0,689,689.0,83968.0,48128.0,...,,,,,,,,,,
4,4.0,GS212629102.tiff,364.0,SHEBA2,,2940.0,692,692.0,71680.0,42496.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25169,,PACCT1_5781358_AperioUUID17886.svs,5781358,TAILORx_1,0.5,,2181,2181.0,51792.0,35396.0,...,,,,,,,,,,
25170,,PACCT1_4855283_AperioUUID17094.svs,4855283,TAILORx_1,0.5,,2414,2414.0,55776.0,46772.0,...,,,,,,,,,,
25171,,PACCT1_3942733_AperioUUID26037.svs,3942733,TAILORx_1,0.5,,3052,3052.0,65736.0,49938.0,...,,,,,,,,,,
25172,,PACCT1_3465713_AperioUUID24546.svs,3465713,TAILORx_1,0.5,,1138,1138.0,59760.0,48883.0,...,,,,,,,,,,


In [25]:
concated.to_csv("metadata_csvs/largest_with_taylor.csv", index=False)

In [26]:
taylor_meta = pd.read_csv("metadata_csvs/largest_with_taylor.csv")

  taylor_meta = pd.read_csv("metadata_csvs/largest_with_taylor.csv")


In [27]:
taylor_meta.loc[pd.Index((taylor_meta["id"]=="CARMEL9")), "fold"]

7214    1
7215    1
7216    1
7217    1
7218    1
       ..
8206    1
8207    1
8208    1
8209    1
8210    1
Name: fold, Length: 997, dtype: int64

In [28]:
taylor_meta.loc[pd.Index((taylor_meta["id"]=="CARMEL10")), "fold"]

8211    1
8212    1
8213    1
8214    1
8215    1
       ..
8685    1
8686    1
8687    1
8688    1
8689    1
Name: fold, Length: 479, dtype: int64