In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from core import utils

  from .autonotebook import tqdm as notebook_tqdm


## Reading the relevent TaylorX data

In [17]:
patient_data = pd.read_csv("/data/unsynced_data/Breast/TAILORx/NCT00310180-D1-Dataset.csv")
slide_data = pd.read_excel("/data/unsynced_data/Breast/TAILORx/Deid/TAILORx_1/slides_data_TAILORx_1.xlsx")
patient_slide_connection = pd.read_excel("/data/unsynced_data/Breast/TAILORx/PACT1 Blinded Digital Slides Final Manifest - Revised.xlsx")
deleted_slides = pd.read_excel("/mnt/gipmed_new/Data/data_QA_and_bad_slides/HE_TAILORx/slide_review_list_TAILORX.xlsx", 'bad slides')

## Reading current largest metadata

In [18]:
current_meta = pd.read_csv("metadata_csvs/largest_current_metadata.csv")

## data treatment

In [19]:
print(f"current meta keys are: {current_meta.keys()}")

current meta keys are: Index(['Unnamed: 0', 'file', 'patient_barcode', 'id', 'mpp', 'total_tiles',
       'tiles_count', 'legitimate_tiles', 'width', 'height', 'magnification',
       'er_status', 'pr_status', 'her2_status', 'grade', 'tumor_type',
       'ki_67_status', 'onco_ki_67', 'onco_score_11', 'onco_score_18',
       'onco_score_26', 'onco_score_31', 'onco_score_all', 'fold'],
      dtype='object')


In [20]:
print(f"patient data keys are: {patient_data.keys()}") 
# blindid should be associated with connection. 
# should be renamed to patient_barcode.
# All else should be kept.
print(f"patient slide connection keys are: {patient_slide_connection.keys()}")
# Full File Name and Deidentified ID should be associated with connection.
# Full File Name => file
# Deidentified => patient_barcode
# All others need to be dropped.
print(f"slide data keys are: {slide_data.keys()}")
# file should be used for connection.
# Width * Height => Total tiles.
# use _add_tiles_count for dataset tiles_count and legitimate tiles
# randomize 5 folds
# all columns aside from id, file, DX, Objective Power, fold should be dropped.
print(f"deleted_slides keys are: {deleted_slides.keys()}")

patient data keys are: Index(['blindid', 'rxarm', 'InAnalysis', 'osind', 'inel', 'StratTumorSize',
       'StratMeno', 'StratPlannedChemo', 'StratPlannedRT', 'RSgp', 'Strat',
       'RS', 'age', 'meno', 'race', 'ethnicity', 'TumorSize', 'TumorSizeGp',
       'Grade', 'NucGrade', 'ERStatus', 'PRStatus', 'PrimSurg', 'RecChemo',
       'ChemRegGp', 'ChemReg', 'TypeEndocrine', 'ttfET', 'ttlET', 'durET',
       'endET', 'dfs', 'dfsind', 'drfi', 'drfiind', 'rfi', 'rfiind',
       'survtime', 'survstat', 'WithdrawConsent', 'LostFU', 'typefdfs',
       'typefrec', 'cause'],
      dtype='object')
patient slide connection keys are: Index(['Full File Name', 'File ID', 'Deidentified ID', 'EAS Slide ID'], dtype='object')
slide data keys are: Index(['Unnamed: 0.1', 'Unnamed: 0', 'patient barcode', 'id', 'file', 'DX',
       'MPP', 'Width', 'Height', 'Objective Power', 'Scan Date', 'ER status',
       'PR status', 'Her2 status', 'test fold idx',
       'Manipulated Objective Power'],
      dtype='obj

In [21]:
patient_data.rename(columns={'blindid': 'patient_barcode', "ERStatus": "er_status", "PRStatus": "pr_status"}, inplace=True)
patient_data.er_status[patient_data.er_status == "Pos"] = "Positive"
patient_data.er_status[patient_data.er_status == "Neg"] = "Negative"
patient_data.pr_status[patient_data.pr_status == "Pos"] = "Positive"
patient_data.pr_status[patient_data.pr_status == "Neg"] = "Negative"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  patient_data.er_status[patient_data.er_status == "Pos"] = "Positive"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  patient_data.er_status[patient_data.er_status == "Neg"] = "Negative"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  patient_data.pr_status[patient_data.pr_status == "Pos"] = "Positive"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing

In [22]:
patient_slide_connection.rename(columns={'Full File Name': 'file', "Deidentified ID": "patient_barcode"}, inplace=True)
patient_slide_connection = patient_slide_connection[['file', "patient_barcode"]]
patient_slide_connection["patient_barcode"] = patient_slide_connection["patient_barcode"].str.split("-").str[0].fillna(patient_slide_connection["patient_barcode"]).astype('int64')

In [23]:
def _get_tiles_count(row: pd.Series) -> int:
    dataset_path = "/data/unsynced_data/Breast/TAILORx/Deid/TAILORx_1"
    image_file_name_stem = Path(row["file"]).stem
    if utils.check_segmentation_data_exists(dataset_path=dataset_path, desired_magnification=10, image_file_name_stem=image_file_name_stem, tile_size=256) is False:
        return 0
    else:
        segmentation_data = utils.load_segmentation_data(dataset_path=dataset_path, desired_magnification=10, image_file_name_stem=image_file_name_stem, tile_size=256)
        return segmentation_data.shape[0]
        
def _add_tiles_count(df: pd.DataFrame) -> pd.DataFrame:
    df["tiles_count"] = df.apply(
        lambda row: _get_tiles_count(row=row), axis=1
    )
    df["legitimate_tiles"] = df["tiles_count"]
    return df


In [24]:
slide_data = _add_tiles_count(slide_data)

In [25]:
slide_data = slide_data[~slide_data["patient barcode"].isin(deleted_slides["slide"])]

In [26]:
slide_data.rename(columns={'Manipulated Objective Power': 'magnification', "Height": "height", "Width": "width", "MPP": "mpp", "patient barcode": "patient_barcode"}, inplace=True)

slide_data = slide_data[["legitimate_tiles", "tiles_count", "magnification", "height", "width", "mpp","file","id"]]
slide_data

Unnamed: 0,legitimate_tiles,tiles_count,magnification,height,width,mpp,file,id
0,1613,1613,20,45961,49800,0.5026,PACCT1_6753561-1_AperioUUID11795.svs,TAILORx_1
1,2466,2466,20,28247,47808,0.5026,PACCT1_1167281_AperioUUID16657.svs,TAILORx_1
2,2292,2292,20,47823,75696,0.5026,PACCT1_3019140_AperioUUID21278.svs,TAILORx_1
3,2512,2512,20,46531,45816,0.5026,PACCT1_1340169-1_AperioUUID12948.svs,TAILORx_1
4,2419,2419,20,40025,61752,0.5026,PACCT1_1274621_AperioUUID26572.svs,TAILORx_1
...,...,...,...,...,...,...,...,...
9613,2414,2414,20,46772,55776,0.5026,PACCT1_4855283_AperioUUID17094.svs,TAILORx_1
9614,3052,3052,20,49938,65736,0.5026,PACCT1_3942733_AperioUUID26037.svs,TAILORx_1
9615,1138,1138,20,48883,59760,0.5026,PACCT1_3465713_AperioUUID24546.svs,TAILORx_1
9616,2628,2628,20,48232,63744,0.5026,PACCT1_2530740_AperioUUID18060.svs,TAILORx_1


In [27]:
merged = pd.merge(pd.merge(slide_data, patient_slide_connection, on="file"), patient_data, on="patient_barcode")
merged

Unnamed: 0,legitimate_tiles,tiles_count,magnification,height,width,mpp,file,id,patient_barcode,rxarm,...,drfiind,rfi,rfiind,survtime,survstat,WithdrawConsent,LostFU,typefdfs,typefrec,cause
0,1613,1613,20,45961,49800,0.5026,PACCT1_6753561-1_AperioUUID11795.svs,TAILORx_1,6753561,B,...,0,3249,0,3249,0,0,0,,,
1,1080,1080,20,39541,51792,0.5026,PACCT1_6753561-2_AperioUUID11797.svs,TAILORx_1,6753561,B,...,0,3249,0,3249,0,0,0,,,
2,2795,2795,20,45634,55776,0.5026,PACCT1_6753561-3_AperioUUID22222.svs,TAILORx_1,6753561,B,...,0,3249,0,3249,0,0,0,,,
3,2466,2466,20,28247,47808,0.5026,PACCT1_1167281_AperioUUID16657.svs,TAILORx_1,1167281,C,...,0,3318,0,3326,0,0,0,,,
4,2292,2292,20,47823,75696,0.5026,PACCT1_3019140_AperioUUID21278.svs,TAILORx_1,3019140,C,...,0,3289,0,3289,0,0,0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9552,2181,2181,20,35396,51792,0.5026,PACCT1_5781358_AperioUUID17886.svs,TAILORx_1,5781358,C,...,0,3277,0,3327,0,0,0,,,
9553,2414,2414,20,46772,55776,0.5026,PACCT1_4855283_AperioUUID17094.svs,TAILORx_1,4855283,B,...,0,2510,0,2527,1,0,0,5.0,,5.0
9554,3052,3052,20,49938,65736,0.5026,PACCT1_3942733_AperioUUID26037.svs,TAILORx_1,3942733,C,...,0,2520,0,2551,0,0,0,,,
9555,1138,1138,20,48883,59760,0.5026,PACCT1_3465713_AperioUUID24546.svs,TAILORx_1,3465713,A,...,0,490,0,1462,0,1,0,,,


In [28]:
import re

def extract_number(s):
    match = re.search(r'PACCT1_(\d+)', s)
    if match:
        return int(match.group(1)) # return the first captured group as int
    else:
        raise Exception('String does not match the expected format')

merged['patient_barcode'] = merged['file'].apply(extract_number).astype('int32')
unique_numbers = merged['patient_barcode'].unique()

# Create a dictionary that maps each extracted number to a random number between 1 and 5
number_to_random_mapping = {number: np.random.randint(1, 5) for number in unique_numbers}
merged['fold'] = merged['patient_barcode'].map(number_to_random_mapping)

merged['mpp'] = 10.0 / merged['magnification']
# print(merged.dtypes)
print(merged.dtypes[merged.dtypes == object])
merged.er_status = merged.er_status.astype('str')
print(merged.er_status.unique())
merged.pr_status = merged.pr_status.astype('str')
merged.file = merged.file.astype('str')
print(merged.dtypes[merged.dtypes == object])

file             object
id               object
rxarm            object
age              object
meno             object
TumorSizeGp      object
Grade            object
NucGrade         object
er_status        object
pr_status        object
PrimSurg         object
ChemRegGp        object
TypeEndocrine    object
dtype: object
['Positive' 'nan' 'Negative']
file             object
id               object
rxarm            object
age              object
meno             object
TumorSizeGp      object
Grade            object
NucGrade         object
er_status        object
pr_status        object
PrimSurg         object
ChemRegGp        object
TypeEndocrine    object
dtype: object


In [29]:
concated = pd.concat([current_meta, merged])
print(concated.er_status.unique())
concated.pr_status[(concated.pr_status == 'nan') | (concated.pr_status.isna())] = pd.NA
concated.er_status[(concated.er_status == 'nan') | (concated.er_status.isna())] = pd.NA
print(concated.er_status.unique())

concated.to_csv("metadata_csvs/largest_with_taylor.csv", index=False)

['3.0' '1.0' '2.0' 'Negative' 'Positive' nan 'Stained but IHC missing'
 'Missing Data Data' 'Equivocal' 'Indeterminate' 'nan']
['3.0' '1.0' '2.0' 'Negative' 'Positive' <NA> 'Stained but IHC missing'
 'Missing Data Data' 'Equivocal' 'Indeterminate']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  concated.pr_status[(concated.pr_status == 'nan') | (concated.pr_status.isna())] = pd.NA
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  concated.er_status[(concated.er_status == 'nan') | (concated.er_status.isna())] = pd.NA


In [2]:
taylor_meta = pd.read_csv("metadata_csvs/largest_with_taylor.csv")

  taylor_meta = pd.read_csv("metadata_csvs/largest_with_taylor.csv")


In [31]:
len(taylor_meta[(taylor_meta["dfs"] > 365*5)])

7048

In [30]:
len(taylor_meta[(taylor_meta["dfs"] <= 365*5) & (taylor_meta["typefdfs"].isin([1,2,3,4,5]))])

585