In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from wsi.core import utils
np.random.seed(0)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
desired_magnification = 10
tile_size = 256

In [3]:
carmel_data = [pd.read_excel(f"/mnt/gipmed_new/Data/Breast/Carmel/{'1-8' if (i >= 1 and i <= 8) else '9-11'}/Batch_{i}/CARMEL{i}/slides_data_CARMEL{i}.xlsx") for i in range(1,12)]
tcga_data = pd.read_excel(f"/mnt/gipmed_new/Data/Breast/TCGA/slides_data_TCGA.xlsx")
abctb_data = pd.read_excel(f"/mnt/gipmed_new/Data/Breast/ABCTB_TIF/slides_data_ABCTB.xlsx")
sheba_data = [pd.read_excel(f"/mnt/gipmed_new/Data/Breast/Sheba/Batch_{i}/SHEBA{i}/slides_data_SHEBA{i}.xlsx") for i in range(2,7)]
haemek_data = pd.read_excel(f"/mnt/gipmed_new/Data/Breast/Haemek/Haemek_cancer_HE/Batch_1/HAEMEK1/slides_data_HAEMEK1.xlsx")


In [4]:
keys_needed = "file,patient_barcode,id,mpp,tiles_count,legitimate_tiles,width,height,magnification,er_status,pr_status,her2_status,grade,tumor_type,ki_67_status,onco_ki_67,onco_score_11,onco_score_18,onco_score_26,onco_score_31,onco_score_all,fold".split(",")

In [5]:
dataset_paths = {f"CARMEL{i}": f"/mnt/gipmed_new/Data/Breast/Carmel/{'1-8' if (i >= 1 and i <= 8) else '9-11'}/Batch_{i}/CARMEL{i}/" for i in range(1,12)}
dataset_paths = dict(dataset_paths, **{f"SHEBA{i}": f"/mnt/gipmed_new/Data/Breast/Sheba/Batch_{i}/SHEBA{i}/" for i in range (2,7)})
dataset_paths["TCGA"] = "/mnt/gipmed_new/Data/Breast/TCGA/"
dataset_paths["ABCTB"] = "/mnt/gipmed_new/Data/Breast/ABCTB_TIF/"
dataset_paths["HAEMEK1"] = "/mnt/gipmed_new/Data/Breast/Haemek/Haemek_cancer_HE/Batch_1/HAEMEK1/"

In [6]:
carmel_data = pd.concat(carmel_data, axis=0)
carmel_data.rename(columns={"MPP": "mpp", 
                            "Width": "width", 
                            "Height": "height", 
                            'Manipulated Objective Power': 'magnification', 
                            "ER status": "er_status",
                            "PR status": "pr_status",
                            "Her2 status": "her2_status",
                            "Ki67 status": "ki67_status",
                            "Grade": 'grade',
                            "test fold idx": "fold"
                            }, inplace=True)
carmel_data = carmel_data.convert_dtypes().replace("Positive", "1")
carmel_data = carmel_data.convert_dtypes().replace("Negative", "0")
carmel_data = carmel_data.convert_dtypes().replace("Missing Data", pd.NA)

In [7]:
keys_needed

['file',
 'patient_barcode',
 'id',
 'mpp',
 'tiles_count',
 'legitimate_tiles',
 'width',
 'height',
 'magnification',
 'er_status',
 'pr_status',
 'her2_status',
 'grade',
 'tumor_type',
 'ki_67_status',
 'onco_ki_67',
 'onco_score_11',
 'onco_score_18',
 'onco_score_26',
 'onco_score_31',
 'onco_score_all',
 'fold']

In [8]:
carmel_data.keys()

Index(['Unnamed: 0', 'file', 'patient barcode', 'id', 'DX', 'mpp', 'width',
       'height', 'magnification', 'Objective Power', 'Scan Date', 'er_status',
       'pr_status', 'her2_status', 'TissueType', 'PatientIndex', 'fold',
       'test fold idx breast', 'slide barcode', 'ki67_status', 'ER score',
       'PR score', 'Her2 score', 'Ki67 score', 'Age', 'grade', 'ER100 status',
       'is_cancer status', 'Unnamed: 18'],
      dtype='object')

In [9]:
carmel_for_df = carmel_data[["file", "patient barcode", "id", "mpp", "width", "height", "magnification", "er_status", "pr_status", "her2_status", "ki67_status", "grade", "fold"]]

In [10]:
tcga_data.keys()


Index(['file', 'DX', 'patient barcode', 'id', 'MPP', 'Width', 'Height',
       'Manipulated Objective Power', 'Objective Power', 'Scan Date',
       'ER status', 'PR status', 'Her2 status', 'test fold idx',
       'test fold idx breast', 'test fold idx breast - original for carmel'],
      dtype='object')

In [11]:
tcga_data.rename(columns={"MPP": "mpp",
                            "Width": "width",
                            "Height": "height",
                            'Manipulated Objective Power': 'magnification',
                            "ER status": "er_status",
                            "PR status": "pr_status",
                            "Her2 status": "her2_status",
                            "Ki67 status": "ki67_status",
                            "Grade": 'grade',
                            "test fold idx": "fold"
                            }, inplace=True)
tcga_data.replace("Positive", 1, inplace=True)
tcga_data.replace("Negative", 0, inplace=True)
tcga_data.replace("Missing Data", pd.NA, inplace=True)
tcga_for_df = tcga_data[["file", "patient barcode", "id", "mpp", "width", "height", "magnification", "er_status", "pr_status", "her2_status", "fold"]]
tcga_for_df.keys()

Index(['file', 'patient barcode', 'id', 'mpp', 'width', 'height',
       'magnification', 'er_status', 'pr_status', 'her2_status', 'fold'],
      dtype='object')

In [12]:
abctb_data.rename(columns={"MPP": "mpp", 
                            "Width": "width", 
                            "Height": "height", 
                            'Manipulated Objective Power': 'magnification', 
                            "ER status": "er_status",
                            "PR status": "pr_status",
                            "Her2 status": "her2_status",
                            "Ki67 status": "ki67_status",
                            "Grade": 'grade',
                            "test fold idx breast": "fold",
                            'survival status': 'survstat',
                            }, inplace=True)
abctb_data.replace("Positive", 1, inplace=True)
abctb_data.replace("Negative", 0, inplace=True)
abctb_data.replace("test", 6, inplace=True)
abctb_data.replace("Missing Data", pd.NA, inplace=True)
abctb_data.mpp = abctb_data.mpp * 4 #source data is inconsistant between magnification and mpp.
abctb_data.keys()

Index(['Unnamed: 0', 'file', 'patient barcode', 'id', 'DX', 'mpp', 'width',
       'height', 'magnification', 'Scan Date', 'er_status', 'pr_status',
       'her2_status', 'test fold idx', 'fold', 'Censored_Omer',
       'survival status old', 'Folder', 'Year of Diagnosis',
       'Follow-up Months Since Diagnosis', 'Year of Last Follow-up',
       'Follow-up Status', 'Breast Events (status)', 'Notes',
       'Exclude for time prediction?', 'Censored', 'survstat'],
      dtype='object')

In [13]:
abctb_for_df = abctb_data[["file", "patient barcode", "id", "mpp", "width", "height", "magnification", "er_status", "pr_status", "her2_status", "fold"]]

In [14]:
sheba_data = pd.concat(sheba_data, axis=0)
sheba_data.rename(columns={"MPP": "mpp",
                            "Width": "width",
                            "Height": "height",
                            'Manipulated Objective Power': 'magnification',
                            "ER ": "er_status",
                            "PR ": "pr_status",
                            "HER-2 IHC ": "her2_status",
                            "Grade": 'grade',
                            "test fold idx": "fold"
                            }, inplace=True)
sheba_data.replace("Positive", 1, inplace=True)
sheba_data.replace("Negative", 0, inplace=True)
sheba_data.replace("test", 6, inplace=True)
sheba_data = sheba_data.applymap(lambda x: pd.NA if x == "X" or x == "Missing Data" or x == "Non Amplification" else x)

In [15]:
sheba_data.keys()

Index(['patient barcode', 'id', 'file', 'DX', 'mpp', 'width', 'height',
       'magnification', 'Age@testing', 'Code', 'er_status', 'grade',
       'HER-2 FISH ', 'her2_status', 'Histology',
       'Lymph node status for OncotypeDX', 'Nodal Status',
       'Oncotype DX Breast Cancer Assay', 'Oncotype ER (RT-PCR)',
       'Oncotype HER2 (RT-PCR)', 'Oncotype PR (RT-PCR)', 'pr_status',
       'Proliferation (Ki-67) Oncotype', 'Tumor Size', 'PatientID',
       'onco_score_11 status', 'onco_score_18 status', 'onco_score_26 status',
       'onco_score_31 status', 'onco_score_all status', 'slide barcode',
       'CodeID', 'TCodeID', 'Gender', 'Neoadjuvant', 'Comments', 'prev folds',
       'fold'],
      dtype='object')

In [16]:
def ER_PR_func(x):
    if pd.isna(x):
        return pd.NA
    else:
        return 1 if x>1 else 0
    
def Her2_func(x):
    if pd.isna(x):
        return pd.NA
    else:
        return 1 if x>2 else 0
    
sheba_data["er_status"] = sheba_data["er_status"].apply(ER_PR_func).astype(pd.UInt8Dtype())
sheba_data["pr_status"] = sheba_data["pr_status"].apply(ER_PR_func).astype(pd.UInt8Dtype())
sheba_data["her2_status"] = sheba_data["her2_status"].apply(Her2_func).astype(pd.UInt8Dtype())


In [17]:
sheba_for_df = sheba_data[["file", "patient barcode", "id", "mpp", "width", "height", "grade", "magnification", "er_status", "pr_status", "her2_status", "fold"]]

In [18]:
haemek_data.rename(columns={"MPP": "mpp",
                            "Width": "width",
                            "Height": "height",
                            'Manipulated Objective Power': 'magnification',
                            "ER status": "er_status",
                            "PR status": "pr_status",
                            "Her2 status": "her2_status",
                            "Ki67 status": "ki67_status",
                            "test fold idx": "fold"
                            }, inplace=True)
haemek_data.replace("Positive", 1, inplace=True)
haemek_data.replace("Negative", 0, inplace=True)
haemek_data = haemek_data.applymap(lambda x: pd.NA if x == "Was not stained" or x == "Inconclusive" or x == "Stained but IHC missing" else x)

In [19]:
haemek_data.keys()

Index(['#', 'Unnamed: 0', 'Unnamed: 0.1', 'patient barcode', 'id', 'file',
       'DX', 'mpp', 'width', 'height', 'Objective Power', 'magnification',
       'Scan Date', 'er_status', 'pr_status', 'her2_status', 'ki67_status',
       'fold', 'slide barcode', 'PatientIndex', 'TissueID', 'BlockID',
       'ER score', 'PR score', 'Her2 score', 'Ki67 score', 'Gender',
       'Birthdate', 'TissueDate', 'TissueType', 'TumorType', 'BreastSide',
       'Comment', 'patient'],
      dtype='object')

In [20]:
haemek_for_df = haemek_data[["file", "patient barcode", "id", "mpp", "width", "height", "magnification", "er_status", "pr_status", "her2_status", "ki67_status", "fold"]]

In [21]:
concated_df = pd.concat([carmel_for_df, tcga_for_df, abctb_for_df, haemek_for_df, sheba_for_df], axis=0).reindex()
concated_df = concated_df.loc[concated_df.file.notna()]
concated_df = concated_df.loc[concated_df.fold.notna()]

In [22]:
curr_csv = pd.read_csv("metadata_csvs/largest_current_metadata.csv").set_index("file")
# concated_df.set_index("file", inplace=True)
# concated_df = pd.concat([curr_csv, concated_df], axis=0).reindex()
# Assume 'df' is your DataFrame
# duplicates = concated_df.index.duplicated(keep=False)

# 'duplicates' is a boolean Series that is True where the index is duplicated
# print(concated_df[duplicates])

In [23]:
concated_df.fold = concated_df.fold.astype(pd.Int8Dtype())

In [24]:
def _get_tiles_count(row: pd.DataFrame) -> int:
    dataset_path = dataset_paths[row["id"]]
    image_file_name_stem = Path(row["file"]).stem
    assert utils.check_segmentation_data_exists(dataset_path=dataset_path, 
                                            desired_magnification=desired_magnification, 
                                            image_file_name_stem=image_file_name_stem, 
                                            tile_size=tile_size), f"{row['file']} from {dataset_path} is nonsegmented file"
    segmentation_data = utils.load_segmentation_data(dataset_path=dataset_path,
                                                    desired_magnification=desired_magnification,
                                                    image_file_name_stem=image_file_name_stem,
                                                    tile_size=tile_size)
    return segmentation_data.shape[0]

def _does_seg_exist(row: pd.DataFrame) -> bool:
    dataset_path = dataset_paths[row["id"]]
    image_file_name_stem = Path(row["file"]).stem
    return utils.check_segmentation_data_exists(dataset_path=dataset_path,
                                                desired_magnification=desired_magnification,
                                                image_file_name_stem=image_file_name_stem,
                                                tile_size=tile_size)

def _get_total_tiles(row: pd.DataFrame) -> int:
    return int(row.height * row.width / ((tile_size * (row.magnification // desired_magnification)) ** 2))

def _add_tiles_count_and_seg(df: pd.DataFrame) -> pd.DataFrame:
    df = df[df.apply(lambda row: _does_seg_exist(row), axis=1)]
    df["total_tiles"] = df.apply(
        lambda row: _get_total_tiles(row=row), axis=1
    )
    df["tiles_count"] = df.apply(
        lambda row: _get_tiles_count(row=row), axis=1
    )
    df["legitimate_tiles"] = df["tiles_count"].astype(int)
    return df

concated_df = _add_tiles_count_and_seg(concated_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["total_tiles"] = df.apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["tiles_count"] = df.apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["legitimate_tiles"] = df["tiles_count"].astype(int)


In [25]:
concated_df[pd.isna(concated_df.fold)]

Unnamed: 0,file,patient barcode,id,mpp,width,height,magnification,er_status,pr_status,her2_status,ki67_status,grade,fold,total_tiles,tiles_count,legitimate_tiles


In [26]:
curr_csv

Unnamed: 0_level_0,Unnamed: 0,patient_barcode,id,mpp,total_tiles,tiles_count,legitimate_tiles,width,height,magnification,...,grade,tumor_type,ki_67_status,onco_ki_67,onco_score_11,onco_score_18,onco_score_26,onco_score_31,onco_score_all,fold
file,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GS0117622202.tiff,0,436.0,SHEBA2,,3605.0,231,231.0,104960.0,35328.0,20.0,...,2,IDC,,5%,Positive,Negative,Negative,Negative,1.0,4
GS133536102.tiff,1,355.0,SHEBA2,,2745.0,690,690.0,62464.0,45568.0,20.0,...,2,IDC,,15%,Positive,Positive,Positive,Negative,3.0,1
GS210801202.tiff,2,435.0,SHEBA2,,2040.0,567,567.0,51712.0,40448.0,20.0,...,3,IDC,,,Positive,Positive,Positive,Positive,4.0,6
GS212441302.tiff,3,400.0,SHEBA2,,3854.0,689,689.0,83968.0,48128.0,20.0,...,2,IDC,,20%,Positive,Positive,Negative,Negative,2.0,4
GS212629102.tiff,4,364.0,SHEBA2,,2940.0,692,692.0,71680.0,42496.0,20.0,...,2,IDC,,20%,Negative,Negative,Negative,Negative,0.0,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA-S3-AA11-01Z-00-DX1.36B83B37-2928-4DB7-A04A-8D511F1183FD.svs,3107,TCGA-S3-AA11,TCGA,0.2527,4824.0,1415,1415.0,73703.0,67987.0,40.0,...,1,IDC,,,,,,,,2
TCGA-S3-AA11-01A-03-TS3.8C92F642-024A-482C-A5D0-A49910FE31C6.svs,3108,TCGA-S3-AA11,TCGA,0.2527,2938.0,1060,1060.0,115535.0,26241.0,40.0,...,1,IDC,,,,,,,,2
TCGA-WT-AB44-01Z-00-DX1.B6ECEA7C-DA26-4B34-88CE-6834631DFA35.svs,3109,TCGA-WT-AB44,TCGA,0.2527,9718.0,1107,1107.0,115535.0,87439.0,40.0,...,1,OTHER,,,,,,,,3
TCGA-WT-AB44-01A-01-TS1.B6C0EEDB-E5B9-4B0D-8599-23879A0419EB.svs,3110,TCGA-WT-AB44,TCGA,0.2527,484.0,66,66.0,21911.0,21753.0,40.0,...,1,OTHER,,,,,,,,3


In [27]:
set(curr_csv.reset_index().file) - set(concated_df.file)

{'TCGA-OL-A5RY-01Z-00-DX1.AE4E9D74-FC1C-4C1E-AE6D-5DF38899BBA6.svs',
 'TCGA-OL-A5RZ-01Z-00-DX1.6394C05E-1C34-4F4B-8859-F5E961E7EFF9.svs',
 'TCGA-OL-A5S0-01Z-00-DX1.49A7AC9D-C186-406C-BA67-2D73DE82E13B.svs'}

In [28]:
set(concated_df.file) - set(curr_csv.reset_index().file)

{'01-06-008.399.EX.C1.tif',
 '01-06-010.200.LB.E3.tif',
 '01-06-024.222.EX.B1.tif',
 '01-06-085.489.LB.A4.tif',
 '01-07-002.233.EX.C1.tif',
 '01-07-019.799.EX.A1.tif',
 '01-07-052.765.EX.G1.tif',
 '01-07-092.658.LB.D3.tif',
 '01-07-112.171.LB.B3.tif',
 '01-07-117.868.EX.D1.tif',
 '01-08-009.893.EX.C1.tif',
 '01-08-022.497.EX.C1.tif',
 '01-08-043.605.EX.B1.tif',
 '01-08-112.173.EX.A1.tif',
 '01-09-126.366.EX.A1.tif',
 '02-07-016.002.EX.C1.tif',
 '02-10-089.008.EX.4N.tif',
 '02-11-034.539.EX.5T.tif',
 '03-07-002.882.EX.2B.tif',
 '03-07-020.335.EX.C.tif',
 '03-09-001.588.EX.3M.tif',
 '03-09-064.084.EX.1D.tif',
 '03-09-080.347.EX.2B.tif',
 '03-09-082.672.EX.2J.tif',
 '03-09-098.355.EX.2L.tif',
 '03-09-101.062.EX.2W.tif',
 '03-09-118.268.EX.2J.tif',
 '03-09-145.383.EX.2T.tif',
 '03-09-146.661.EX.2M.tif',
 '04-07-071.2B.tif',
 '04-07-107.988.EX.1A.tif',
 '04-09-188.536.EX.1A.tif',
 '04-11-119.818.EX.1A.tif',
 '05-07-028.420.EX.4B.tif',
 '05-07-033.218.EX.1B.tif',
 '05-07-037.431.EX.1B.tif',


In [29]:
error_openning_slides = pd.Series(["19-2066_1_4_j.mrxs"])
concated_df = concated_df[~concated_df.file.isin(error_openning_slides)]

In [31]:
concated_df.to_csv("metadata_csvs/current.csv")