# Set-up

## Modules

In [None]:
import os
import typing as tp
from functools import partial
from pathlib import Path
from copy import deepcopy

import numpy as np
import pandas as pd

def get_default_na_values(exclude=None):
    """Return the strings to treat as missing values, minus those in ``exclude``.

    The base list follows the default NaN markers described in the
    ``pandas.read_csv`` documentation:
    https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html

    Parameters
    ----------
    exclude : container of str, optional
        Markers to leave out of the returned list. A falsy value
        (``None`` or an empty container) returns the full list.

    Returns
    -------
    list of str
        The remaining markers, in their original order.
    """
    na_markers = ["", "#N/A", "#N/A N/A", "#NA", "-1.#IND", "-1.#QNAN", "-NaN", "-nan", "1.#IND", "1.#QNAN", "<NA>", "N/A", "NA", "NULL", "NaN", "None", "n/a", "nan", "null ", "null"]
    if not exclude:
        return na_markers
    return [marker for marker in na_markers if marker not in exclude]

# Specify the values to exclude from the default NaN values list:
# the literal string 'N/A' is then kept as data instead of being parsed as missing
EXCLUDE = ['N/A']

# Get the modified list of NaN values
# (used below as `na_values=` together with `keep_default_na=False` when reading the TCIA-CPTAC cohort file)
CUSTOM_NA_VALUES = get_default_na_values(exclude=EXCLUDE)

# Seed passed as `random_state` to the patient-level train/test split
RANDOM_STATE = 42
# In the visible part of the notebook this flag is only referenced by the commented-out slide-list generation cell
OVERWRITE_FILE_LISTS = False

In [None]:
import sys

sys.path.insert(0, '..')

from source.stratified_patient_split import group_split, split_dataset_by_patient

sys.path.insert(0, './labels_creation_code')

## Directories

In [None]:
# Root directory of all label-related files (relative to the notebook location)
labels_dir = '../labels'

# dummy_label_files_dir = f'{labels_dir}/dummy-label-files'
# print(os.listdir(dummy_label_files_dir))

# Per-dataset slide lists; the directory is created if missing and its contents are printed
slide_lists_dir = f'{labels_dir}/slide-lists'
os.makedirs(slide_lists_dir, exist_ok=True)
print(sorted(os.listdir(slide_lists_dir)))

# Output directory for the label files written by this notebook
experiment_label_files_dir = f'{labels_dir}/experiment-label-files'
os.makedirs(experiment_label_files_dir, exist_ok=True)
print(sorted(os.listdir(experiment_label_files_dir)))

# Input directory with the source files the labels are derived from.
# It is NOT created here: os.listdir raises if the directory does not exist.
source_files_for_labels_dir = f'{labels_dir}/source_copies_for_label_files'
print(sorted(os.listdir(source_files_for_labels_dir)))

### Make `slide_lists_dir`

In [None]:
# wsi_dir = "../WSI"

# for dataset in os.listdir(wsi_dir):
#     if not os.path.isdir(f"{wsi_dir}/{dataset}/all_classes"):
#         continue
#     output_path = f"{slide_lists_dir}/{dataset}.csv"
#     if os.path.exists(output_path) and not OVERWRITE_FILE_LISTS:
#         continue

#     sorted_files = sorted(os.listdir(f"{wsi_dir}/{dataset}/all_classes"))
#     sorted_wsi_ids = [Path(file_name).stem for file_name in sorted_files]
#     print(dataset, sorted_wsi_ids)

#     # make a csv file with the list of slides, record 1) dataset column and wsi_id column
#     dummy_df = pd.DataFrame(sorted_wsi_ids, columns=["wsi_id"])
#     dummy_df["dataset"] = dataset
#     dummy_df = dummy_df[["dataset", "wsi_id"]]
#     dummy_df.to_csv(output_path, index=False)

## Constants

In [None]:
# Slide-level classes. The order matters: `extract_cancer_subtype_presence` returns one flag
# per entry in this order and writes the fallback flag into the LAST position ("OtherCancer").
LUNG_CANCER_MAIN_TYPES = [
    "LUAD",
    "LUSC",
    "Benign",
    # "TC",
    "OtherCancer",
]
# Classes kept when "OtherCancer" is excluded
LUNG_CANCER_LUAD_LUSC_BENIGN = ["LUAD", "LUSC", "Benign"]
# Lower-case substrings whose presence in a cancer-subtype description marks the class as present
LUNG_CANCER_MAIN_TYPES_2_IDENTIFYING_STRINGS = {
    "LUAD": "adenoca",
    "LUSC": "squamous cell",
    # "TC": "typical carcinoid",
    "Benign": "benign",
}
# "OtherCancer" is in LUNG_CANCER_MAIN_TYPES but has no identifying string above:
# it is assigned when none of the identifying strings match.
# NOTE(review): AIDA_NOT_ENOUGH_INFO_STRING is not used in the visible part of the notebook — confirm it is still needed
AIDA_NOT_ENOUGH_INFO_STRING = "not enough information on the slide"

# Adenocarcinoma growth patterns (alphabetical) and the label-file column name used for each
ADENOCARCINOMA_MAIN_PATTERNS = sorted(["acinar", "lepidic", "micropapillary", "papillary", "solid"])
ADENOCARCINOMA_MAIN_PATTERNS_2_LABEL_NAMES =  {
    pattern: f"LUAD_{pattern}"
    for pattern in ADENOCARCINOMA_MAIN_PATTERNS
}


# PRESENT_CONSTANT = 1
# ABSENT_CONSTANT = 0
# Value written into a pattern column when presence/absence of the pattern is not known
UNKNOWN_CONSTANT = -1

# Identifier columns placed first in every label file
FIRST_COLUMNS = ["source", "dataset", "patient_id", "wsi_id"]

## Functions

In [None]:
def predominant_pattern_is_in_all_patterns(row, predominant_pattern_col_name='adenocarcinomaPredominantPattern', all_patterns_col_name='adenocarcinoma-All-Patterns-Including-Predominant', ignore_values=['unknown', 'NotApplicable']):
    """
    Check that a row's predominant pattern is listed among all of its patterns.

    The "all patterns" cell is a '-'-separated string. The check passes when the
    predominant pattern is one of `ignore_values` or is one of the listed patterns.

    Meant to be applied row-wise (axis=1 hands each row to the function as a whole):
        ouh_slides_with_subtyping_df.apply(
            partial(predominant_pattern_is_in_all_patterns, predominant_pattern_col_name='adenocarcinomaPredominantPattern', all_patterns_col_name='adenocarcinoma-All-Patterns-Including-Predominant'),
            axis=1
        )

    Returns:
        bool: True when the row is consistent, False otherwise.
    """
    listed_patterns = row[all_patterns_col_name].split('-')
    candidate = row[predominant_pattern_col_name]
    # placeholder values are accepted without looking at the listed patterns
    if candidate in ignore_values:
        return True
    return candidate in listed_patterns

def extract_cancer_subtype_presence(s: str):
    """Turn a free-text cancer subtype description into 0/1 flags per main type.

    The result has one entry per item of LUNG_CANCER_MAIN_TYPES, in that order.
    A type is flagged when its identifying string (see
    LUNG_CANCER_MAIN_TYPES_2_IDENTIFYING_STRINGS) occurs in the lower-cased text.
    When nothing matches, the last entry is flagged instead, so "OtherCancer"
    must stay the last item of LUNG_CANCER_MAIN_TYPES.
    """
    presence_flags = [0] * len(LUNG_CANCER_MAIN_TYPES)
    lowered_text = s.lower()

    matched_any = False
    for subtype, marker in LUNG_CANCER_MAIN_TYPES_2_IDENTIFYING_STRINGS.items():
        if marker in lowered_text:
            presence_flags[LUNG_CANCER_MAIN_TYPES.index(subtype)] = 1
            matched_any = True

    # fallback class: relies on OtherCancer being the last main type
    if not matched_any:
        presence_flags[-1] = 1

    return presence_flags

def extract_adenocarcinoma_pattern_presence(s: str):
    """Turn a '-'-separated pattern string into 0/1 flags per main pattern.

    The result has one entry per item of ADENOCARCINOMA_MAIN_PATTERNS, in that
    order; an entry is 1 when the pattern is one of the '-'-separated tokens of
    `s` (exact token match) and 0 otherwise.
    """
    tokens = s.split('-')
    return [1 if pattern in tokens else 0 for pattern in ADENOCARCINOMA_MAIN_PATTERNS]

# DHMC

## DHMC Adenocarcinoma patterns

In [None]:
# Show which slide-list files are available (the next cell reads DHMC_20x.csv and DHMC_40x.csv from here)
os.listdir(slide_lists_dir)

In [None]:
# Slide lists of the two DHMC magnifications
dhmc_20x = pd.read_csv(f"{slide_lists_dir}/DHMC_20x.csv")
dhmc_40x = pd.read_csv(f"{slide_lists_dir}/DHMC_40x.csv")

# Stack both lists and give every slide the constant label 0
dhmc_all = pd.concat([dhmc_20x, dhmc_40x]).assign(label=0)

display(dhmc_all)
dhmc_all.to_csv(
    f"{experiment_label_files_dir}/DHMC_LUAD_LUSC_all.csv",
    index=False,
)

In [None]:
# DHMC metadata release; `wsi_id` is the file name without its extension
dhmc_metadata_df = pd.read_csv(
    f'{source_files_for_labels_dir}/DHMC_MetaData_Release_1.0.csv'
).assign(
    wsi_id=lambda frame: frame['File Name'].map(lambda file_name: Path(file_name).stem)
)
dhmc_metadata_df


In [None]:
# Attach the metadata `Class` to every listed DHMC slide. The inner join keeps only
# slides that appear in both the slide lists and the metadata file.
temp_df = deepcopy(dhmc_all)
dhmc_detailed_labels = temp_df.merge(dhmc_metadata_df, on="wsi_id", how="inner")[
    ["dataset", "wsi_id", "Class"]
]
dhmc_detailed_labels["source"] = "DHMC"

# Every slide in this table is labelled LUAD
dhmc_detailed_labels["LUAD"] = 1
dhmc_detailed_labels["LUSC"] = 0
dhmc_detailed_labels["Benign"] = 0
# dhmc_detailed_labels["TC"] = 0
# dhmc_detailed_labels['OtherCancer'] = 0

# One-hot encode `Class` into columns named "LUAD_<class value>"
dhmc_detailed_labels = pd.get_dummies(
    dhmc_detailed_labels, columns=["Class"], prefix="LUAD", dtype=int
)

# Re-code the one-hot pattern columns:
#   1 -> 2                (same code the TCIA-CPTAC cells use for a predominant pattern)
#   0 -> UNKNOWN_CONSTANT (presence of the other patterns is not known)
# The re-coding is restricted to the pattern columns and done with `replace`, instead of
# assigning through a boolean DataFrame mask on the whole mixed-dtype frame, which older
# pandas versions reject for non-NaN values.
# NOTE(review): this requires the `Class` values to match ADENOCARCINOMA_MAIN_PATTERNS exactly
# (lower-case), otherwise the column selection raises KeyError — confirm against the metadata file.
dhmc_pattern_label_columns = list(ADENOCARCINOMA_MAIN_PATTERNS_2_LABEL_NAMES.values())
dhmc_detailed_labels[dhmc_pattern_label_columns] = dhmc_detailed_labels[
    dhmc_pattern_label_columns
].replace({1: 2, 0: UNKNOWN_CONSTANT})

# No separate patient identifier is attached here: each slide is treated as its own patient
dhmc_detailed_labels["patient_id"] = dhmc_detailed_labels["wsi_id"]

# put FIRST_COLUMNS at the beginning; `columns.difference` returns the remaining columns sorted
dhmc_detailed_labels = dhmc_detailed_labels[[*FIRST_COLUMNS, *dhmc_detailed_labels.columns.difference(FIRST_COLUMNS)]]

display(dhmc_detailed_labels)

In [None]:
# Per-class totals of the LUAD / LUSC / Benign flags
print("Number of slides with cancer pattern:")
display(dhmc_detailed_labels.loc[:, LUNG_CANCER_LUAD_LUSC_BENIGN].sum(axis=0))

# The flags must add up to the number of slides
assert dhmc_detailed_labels[LUNG_CANCER_LUAD_LUSC_BENIGN].sum().sum() == len(dhmc_detailed_labels), "There might be slides with 2 or more cancer types"

# A class counts as present when its value is 1 or 2
print("Number of slides with each class present:")
display(
    dhmc_detailed_labels[
        [*LUNG_CANCER_LUAD_LUSC_BENIGN, *ADENOCARCINOMA_MAIN_PATTERNS_2_LABEL_NAMES.values()]
    ].ge(1).sum()
)

# Pattern columns equal to 2
print("Number of slides with predominant patterns")
display(
    dhmc_detailed_labels[list(ADENOCARCINOMA_MAIN_PATTERNS_2_LABEL_NAMES.values())].eq(2).sum()
)

In [None]:
# Write the detailed DHMC label file (the row index is not written)
dhmc_detailed_labels.to_csv(
    f"{experiment_label_files_dir}/DETAILED_DHMC_LUAD_LUSC_BENIGN.csv", index=False
)

# TCGA

## Make All TCGA labels

In [None]:
# tcga_info_file_df = pd.read_csv("/well/rittscher-dart/shared/datasets/lung/TCGA/data/classes_extended_info.csv")
tcga_info_file_df = pd.read_csv(f"{source_files_for_labels_dir}/tcga_classes_extended_info.csv")
# full slide id = slide file name without its extension
tcga_info_file_df["slide_id"] = tcga_info_file_df["slide_file_name"].apply(lambda x: Path(x).stem)
print("tcga_info_file_df after making slide_id column")
display(tcga_info_file_df)

# tcga_tiatoolbox_excluded_slides
tcga_tiatoolbox_excluded_slides_df = pd.read_csv(f"{source_files_for_labels_dir}/tcga_tiatoolbox_excluded_slides.csv")
print("tcga_tiatoolbox_excluded_slides_df after reading")
display(tcga_tiatoolbox_excluded_slides_df)

# list of slides that were used in the dsmil project
tcga_dsmil_used_slides_df = pd.read_csv(f"{source_files_for_labels_dir}/tcga_dsmil_all_ids.csv")
print("tcga_dsmil_used_slides_df after reading")
display(tcga_dsmil_used_slides_df)

# subset tcga_info_file_df to only include slides that were used in the dsmil project
# tcga_info_file_df column: slide_id_short
# tcga_dsmil_used_slides_df column: wsi_id
tcga_info_file_df = tcga_info_file_df[
    tcga_info_file_df["slide_id_short"].isin(tcga_dsmil_used_slides_df["wsi_id"])
]
print("tcga_info_file_df after subsetting to only include slides used in the dsmil project")
display(tcga_info_file_df)

# subset tcga_info_file_df to only include slides that were not excluded by the tiatoolbox
# tcga_info_file_df column: slide_id_short
# tcga_tiatoolbox_excluded_slides_df column: slide_id_short
tcga_info_file_df = tcga_info_file_df[
    ~tcga_info_file_df["slide_id_short"].isin(
        tcga_tiatoolbox_excluded_slides_df["slide_id_short"]
    )
]

# list of slides present in the WSI/TCGA-lung directory
tcga_dsmil_labels_df = pd.read_csv(f"{slide_lists_dir}/TCGA-lung.csv")
tcga_dsmil_labels_df["source"] = "TCGA"
# LUAD / LUSC are placeholders here; they are filled from "cancer_type" after the merge below
tcga_dsmil_labels_df["LUAD"] = None
tcga_dsmil_labels_df["LUSC"] = None
tcga_dsmil_labels_df["Benign"] = 0
# tcga_dsmil_labels_df["TC"] = 0
# tcga_dsmil_labels_df['OtherCancer'] = 0
print("tcga_dsmil_labels_df after reading and adding source, LUAD, LUSC, Benign columns")
display(tcga_dsmil_labels_df)

# tcga_info_file_df has 2 columns of interest here, "slide_id" and "cancer_type"; the rest we are not interested in
# tcga_dsmil_labels_df has a column "wsi_id", which is matched against the full "slide_id" in tcga_info_file_df
# we need to populate the "LUAD" and "LUSC" columns in tcga_dsmil_labels_df based on the "cancer_type" column in tcga_info_file_df
# The inner join drops listed slides that are missing from the (already filtered) info file.
# NOTE(review): a slide_id occurring more than once in the info file would duplicate rows here — confirm ids are unique
tcga_dsmil_labels_df = tcga_dsmil_labels_df.merge(
    tcga_info_file_df[["slide_id", "cancer_type"]],
    left_on="wsi_id",
    right_on="slide_id",
    how="inner",
)
# drop the "slide_id" merge key, it duplicates "wsi_id" after the join
tcga_dsmil_labels_df.drop(columns=["slide_id"], inplace=True)
# populate the "LUAD" and "LUSC" columns based on the "cancer_type" column
tcga_dsmil_labels_df["LUAD"] = tcga_dsmil_labels_df["cancer_type"].apply(
    lambda x: 1 if x == "LUAD" else 0
)
tcga_dsmil_labels_df["LUSC"] = tcga_dsmil_labels_df["cancer_type"].apply(
    lambda x: 1 if x == "LUSC" else 0
)
# drop the "cancer_type" column as it is not needed anymore
tcga_dsmil_labels_df.drop(columns=["cancer_type"], inplace=True)
print("tcga_dsmil_labels_df after merging with tcga_info_file_df and populating LUAD and LUSC columns")
display(tcga_dsmil_labels_df)

assert tcga_dsmil_labels_df.isna().sum().sum() == 0, "There are missing values"
tcga_detailed_labels_df = deepcopy(tcga_dsmil_labels_df)


# We know that all TCGA slides are LUAD or LUSC, so we can initialise all adenocarcinoma patterns to 0 for all slides
# We do not know which patterns of adenocarcinoma are present on LUAD slides - we will set them to UNKNOWN_CONSTANT
for pattern_label_name in ADENOCARCINOMA_MAIN_PATTERNS_2_LABEL_NAMES.values():
    tcga_detailed_labels_df[pattern_label_name] = 0
    tcga_detailed_labels_df.loc[
        (tcga_detailed_labels_df["LUAD"] == 1), pattern_label_name
    ] = UNKNOWN_CONSTANT


def extract_tcga_patient_id_from_wsi_id(s: str):
    """Return the first 12 characters of the final path component of `s`.

    For a TCGA slide id such as "TCGA-05-4244-01Z-00-DX1..." this is "TCGA-05-4244".
    Shorter names are returned unchanged.
    """
    return Path(s).name[:12]


# Derive the patient id from the slide id (first 12 characters of the file-name component)
tcga_detailed_labels_df["patient_id"] = tcga_detailed_labels_df["wsi_id"].apply(
    extract_tcga_patient_id_from_wsi_id
)
# Use the same columns, in the same order, as the DHMC label file.
# NOTE(review): this requires the DHMC cells to have run first and raises KeyError if the DHMC frame
# has a column that is missing here (e.g. an extra one-hot column) — confirm this coupling is intended
tcga_detailed_labels_df = tcga_detailed_labels_df[dhmc_detailed_labels.columns]
tcga_detailed_labels_df

In [None]:
# Number of LUAD slides and number of LUSC slides
tcga_detailed_labels_df.LUAD.sum(), tcga_detailed_labels_df.LUSC.sum()

In [None]:
# Number of distinct patients with at least one LUAD slide
tcga_detailed_labels_df[tcga_detailed_labels_df["LUAD"] == 1]["patient_id"].nunique()

In [None]:
# Per-class totals of the LUAD / LUSC / Benign flags
print("Number of slides with cancer pattern:")
display(tcga_detailed_labels_df.loc[:, LUNG_CANCER_LUAD_LUSC_BENIGN].sum(axis=0))

# The flags must add up to the number of slides
assert tcga_detailed_labels_df[LUNG_CANCER_LUAD_LUSC_BENIGN].sum().sum() == len(tcga_detailed_labels_df), "There might be slides with 2 or more cancer types"

# A class counts as present when its value is 1 or 2
print("Number of slides with each class present:")
display(
    tcga_detailed_labels_df[
        [*LUNG_CANCER_LUAD_LUSC_BENIGN, *ADENOCARCINOMA_MAIN_PATTERNS_2_LABEL_NAMES.values()]
    ].ge(1).sum()
)

# Pattern columns equal to 2
print("Number of slides with predominant patterns")
display(
    tcga_detailed_labels_df[list(ADENOCARCINOMA_MAIN_PATTERNS_2_LABEL_NAMES.values())].eq(2).sum()
)

In [None]:
# Write the detailed TCGA label file (the row index is not written)
tcga_detailed_labels_df.to_csv(f'{experiment_label_files_dir}/DETAILED_TCGA_LUAD_LUSC_BENIGN.csv', index=False)

## Split into Train and Test

In [None]:
def extract_tcga_slide_id_from_file_path_str(s: str):
    """Return the first 23 characters of the final path component of `s`.

    For a TCGA slide file such as ".../TCGA-05-4244-01Z-00-DX1.<uuid>.svs" this is
    "TCGA-05-4244-01Z-00-DX1". Shorter names are returned unchanged.
    """
    return Path(s).name[:23]

# Reload the detailed TCGA labels from the file written above, so the split works from the saved table
tcga_detailed_labels_df = pd.read_csv(f'{experiment_label_files_dir}/DETAILED_TCGA_LUAD_LUSC_BENIGN.csv')
display(tcga_detailed_labels_df)

In [None]:
# Slides of the DSMIL test split, identified by their short slide id
tcga_dsmil_test_set = pd.read_csv(f"{source_files_for_labels_dir}/tcga_dsmil_test_ids.csv")
tcga_dsmil_test_set.columns = ["slide_id_short", "label"]

# exclude the slides that were excluded by the tiatoolbox;
# `.copy()` makes the filtered frame independent, so adding a column below does not
# write to a slice of the original frame (SettingWithCopyWarning)
tcga_dsmil_test_set = tcga_dsmil_test_set[
    ~tcga_dsmil_test_set["slide_id_short"].isin(
        tcga_tiatoolbox_excluded_slides_df["slide_id_short"]
    )
].copy()

# get the full slide_id from the slide_id_short using tcga_info_file_df.
# A single lookup Series replaces one full-frame filter per test slide; keeping the first
# occurrence of a short id matches the previous `.values[0]` behaviour.
tcga_short_to_full_slide_id = (
    tcga_info_file_df
    .drop_duplicates(subset="slide_id_short", keep="first")
    .set_index("slide_id_short")["slide_id"]
)
tcga_dsmil_test_set["wsi_id"] = tcga_dsmil_test_set["slide_id_short"].map(
    tcga_short_to_full_slide_id
)

# fail with the offending ids instead of an opaque IndexError when a test slide has no info entry
tcga_unmapped_test_slide_ids = tcga_dsmil_test_set.loc[
    tcga_dsmil_test_set["wsi_id"].isna(), "slide_id_short"
].tolist()
assert not tcga_unmapped_test_slide_ids, (
    f"Test slides missing from tcga_info_file_df: {tcga_unmapped_test_slide_ids}"
)

# check that all of these slide ids from the test set file are present as slide ids in the tcga_detailed_labels_df
assert tcga_dsmil_test_set["wsi_id"].isin(tcga_detailed_labels_df["wsi_id"]).all()

In [None]:
# create a bool mask for test slide ids
in_test_condition = tcga_detailed_labels_df["wsi_id"].isin(tcga_dsmil_test_set["wsi_id"])

# make test df
tcga_detailed_labels_test_df = tcga_detailed_labels_df[in_test_condition].reset_index(drop=True)
display(tcga_detailed_labels_test_df.head())

print("Number of slides with cancer pattern:")
display(tcga_detailed_labels_test_df[LUNG_CANCER_LUAD_LUSC_BENIGN].sum(axis=0))

assert len(tcga_detailed_labels_test_df) == tcga_detailed_labels_test_df[LUNG_CANCER_LUAD_LUSC_BENIGN].sum().sum(), "There might be slides with 2 or more cancer types"

print("Number of slides with each class present:")
display((tcga_detailed_labels_test_df[LUNG_CANCER_LUAD_LUSC_BENIGN + list(ADENOCARCINOMA_MAIN_PATTERNS_2_LABEL_NAMES.values())] >= 1).sum())

print("Number of slides with predominant patterns")
display((tcga_detailed_labels_test_df[ADENOCARCINOMA_MAIN_PATTERNS_2_LABEL_NAMES.values()] == 2).sum())


# make train df
tcga_detailed_labels_trainval_df = tcga_detailed_labels_df[~in_test_condition].reset_index(drop=True)
display(tcga_detailed_labels_trainval_df.head())

print("Number of slides with cancer pattern:")
display(tcga_detailed_labels_trainval_df[LUNG_CANCER_LUAD_LUSC_BENIGN].sum(axis=0))

assert len(tcga_detailed_labels_trainval_df) == tcga_detailed_labels_trainval_df[LUNG_CANCER_LUAD_LUSC_BENIGN].sum().sum(), "There might be slides with 2 or more cancer types"

print("Number of slides with each class present:")
display((tcga_detailed_labels_trainval_df[LUNG_CANCER_LUAD_LUSC_BENIGN + list(ADENOCARCINOMA_MAIN_PATTERNS_2_LABEL_NAMES.values())] >= 1).sum())

print("Number of slides with predominant patterns")
display((tcga_detailed_labels_trainval_df[ADENOCARCINOMA_MAIN_PATTERNS_2_LABEL_NAMES.values()] == 2).sum())

In [None]:
# check the sets of patients are different
# check the sets of patients are different.
# Both sides must be sets of patient ids: iterating a DataFrame yields its column NAMES,
# so `set(tcga_detailed_labels_test_df)` compared patient ids with column names and the
# check could never fail.
tcga_trainval_patient_ids = set(tcga_detailed_labels_trainval_df["patient_id"])
tcga_test_patient_ids = set(tcga_detailed_labels_test_df["patient_id"])
tcga_shared_patient_ids = tcga_trainval_patient_ids & tcga_test_patient_ids
assert not tcga_shared_patient_ids, (
    f"{len(tcga_shared_patient_ids)} patients appear in both TCGA trainval and test: "
    f"{sorted(tcga_shared_patient_ids)[:10]}"
)

# save files - only after assert passes
tcga_detailed_labels_test_df.to_csv(f'{experiment_label_files_dir}/DETAILED_TCGA_TEST_LUAD_LUSC_BENIGN.csv', index=False)
tcga_detailed_labels_trainval_df.to_csv(f'{experiment_label_files_dir}/DETAILED_TCGA_TRAINVAL_LUAD_LUSC_BENIGN.csv', index=False)

# TCIA-CPTAC

## Files List

In [None]:
# Read the "<hash>  <file path>" listing (two spaces between the fields).
# A separator longer than one character is treated by pandas as a regular expression and is
# only supported by the python parsing engine; requesting that engine explicitly avoids the
# silent fallback and its ParserWarning.
tcia_cptac_files_list = pd.read_csv(
    f"{source_files_for_labels_dir}/tcia_cptac_md5sum_hashes.txt",
    sep="  ",
    header=None,
    engine="python",
)
tcia_cptac_files_list.columns = ["md5sum", "file_path"]

def split_path_into_class_and_slide_id(s: str):
    """Split a '/'-separated file path into (class name, slide id).

    The class name is the directory directly containing the file; the slide id is
    the file name without its last extension. A path without any '/' raises IndexError.
    """
    parts = s.split('/')
    return parts[-2], Path(parts[-1]).stem

# Derive (class_name, Slide_ID) for every file path; zip(*...) turns the Series of
# 2-tuples into two parallel tuples, one per new column
tcia_cptac_files_list["class_name"], tcia_cptac_files_list["Slide_ID"]= zip(*tcia_cptac_files_list["file_path"].apply(split_path_into_class_and_slide_id))
display(tcia_cptac_files_list)

## Cohort File

In [None]:
# Load the TCIA-CPTAC cohort table. Only CUSTOM_NA_VALUES are parsed as missing
# (keep_default_na=False), so the literal string 'N/A' stays a regular value.
tcia_cptac_luad_lusc_cohort_df = pd.read_csv(
    f"{source_files_for_labels_dir}/tcia_cptac_luad_lusc_cohort.csv",
    na_values=CUSTOM_NA_VALUES,
    keep_default_na=False,
)
print(tcia_cptac_luad_lusc_cohort_df.columns)

display(tcia_cptac_luad_lusc_cohort_df.head())
display(tcia_cptac_luad_lusc_cohort_df.describe())
# DataFrame.info() prints its report itself and returns None; wrapping it in display() only adds a stray "None"
tcia_cptac_luad_lusc_cohort_df.info()

# Slides marked as normal tissue vs. slides confirmed free of tumor
normal_tissue_slide_ids = set(
    tcia_cptac_luad_lusc_cohort_df.loc[
        tcia_cptac_luad_lusc_cohort_df["Specimen_Type"] == "normal_tissue", "Slide_ID"
    ]
)
tumor_free_slide_ids = set(
    tcia_cptac_luad_lusc_cohort_df.loc[
        tcia_cptac_luad_lusc_cohort_df["Normal_Free_of_Tumor"] == "Yes", "Slide_ID"
    ]
)
assert tumor_free_slide_ids.issubset(normal_tissue_slide_ids), (
    "Some slides are marked tumor-free but are not normal-tissue specimens"
)

# drop normal-tissue slides that are not confirmed tumor-free
unconfirmed_normal_slide_ids = normal_tissue_slide_ids - tumor_free_slide_ids
print(f"drop {len(unconfirmed_normal_slide_ids)} slides with 'Specimen_Type' == 'normal_tissue' but with 'Normal_Free_of_Tumor' != 'Yes'")
tcia_cptac_luad_lusc_cohort_df = tcia_cptac_luad_lusc_cohort_df[
    ~tcia_cptac_luad_lusc_cohort_df["Slide_ID"].isin(unconfirmed_normal_slide_ids)
].reset_index(drop=True)

# we should drop unacceptable slides with Tumor_Segment_Acceptable value "No"
print(f"drop {(tcia_cptac_luad_lusc_cohort_df['Tumor_Segment_Acceptable'] == 'No').sum()} slides with 'Tumor_Segment_Acceptable' == 'No'")
tcia_cptac_luad_lusc_cohort_df = tcia_cptac_luad_lusc_cohort_df[
    tcia_cptac_luad_lusc_cohort_df["Tumor_Segment_Acceptable"] != "No"
].reset_index(drop=True)

# we should separate between the case and the slide level reported histological type:
# the reported type is kept (stripped of surrounding spaces) as the case-level type
tcia_cptac_luad_lusc_cohort_df["Case_Tumor_Histological_Type"] = tcia_cptac_luad_lusc_cohort_df["Tumor_Histological_Type"].str.strip(' ')
tcia_cptac_luad_lusc_cohort_df = tcia_cptac_luad_lusc_cohort_df.drop(columns=["Tumor_Histological_Type"])

def get_slide_tumor_histological_type(
    row,
    case_tumor_histological_type_col_name="Case_Tumor_Histological_Type",
    specimen_type_col_name="Specimen_Type",
    benign_specimen_type="normal_tissue"
    ):
    """Return the slide-level histological type of one cohort row.

    A row whose specimen type equals `benign_specimen_type` is "Benign";
    every other row inherits the case-level histological type.
    """
    is_benign_specimen = row[specimen_type_col_name] == benign_specimen_type
    return "Benign" if is_benign_specimen else row[case_tumor_histological_type_col_name]

# Slide-level histological type: "Benign" for normal-tissue slides, the case-level type otherwise
tcia_cptac_luad_lusc_cohort_df["Slide_Tumor_Histological_Type"] = tcia_cptac_luad_lusc_cohort_df.apply(
    partial(get_slide_tumor_histological_type, case_tumor_histological_type_col_name='Case_Tumor_Histological_Type', specimen_type_col_name="Specimen_Type", benign_specimen_type="normal_tissue"),
    axis=1
    )

display(tcia_cptac_luad_lusc_cohort_df.head())
display(tcia_cptac_luad_lusc_cohort_df.describe())
# DataFrame.info() prints its report itself and returns None; wrapping it in display() only adds a stray "None"
tcia_cptac_luad_lusc_cohort_df.info()

## Manual Mapping

In [None]:
# Manually curated mapping from histological-type strings to label columns.
# NOTE(review): unlike the cohort file, this file is read with the default NA markers,
# so a literal 'N/A' would be parsed as missing here — confirm this is intended.
tcia_cptac_luad_lusc_mapping_df = pd.read_csv(
    f"{source_files_for_labels_dir}/tcia_cptac_string_2_ouh_labels.csv"
)
display(tcia_cptac_luad_lusc_mapping_df.head())

# missing values per column, before and after replacing them with empty strings
print(tcia_cptac_luad_lusc_mapping_df.isna().sum())
tcia_cptac_luad_lusc_mapping_df = tcia_cptac_luad_lusc_mapping_df.fillna("")
print(tcia_cptac_luad_lusc_mapping_df.isna().sum())

# Consistency check: the predominant and the second predominant pattern of every row must
# be listed among all patterns of that row (or be one of the ignored placeholder values)
for predominant_col_name in ('adenocarcinomaPredominantPattern', 'secondPredominantPattern'):
    assert tcia_cptac_luad_lusc_mapping_df.apply(
        partial(
            predominant_pattern_is_in_all_patterns,
            predominant_pattern_col_name=predominant_col_name,
            all_patterns_col_name='adenocarcinoma-All-Patterns-Including-Predominant',
        ),
        axis=1,
    ).all()


## Merge: Cohort File + Manual Mapping

In [None]:
# The cohort and the mapping file must cover exactly the same set of histological-type strings.
# On failure the message lists the types present in the cohort but absent from the mapping
# ("In the mapping file") and the types present in the mapping but absent from the cohort ("In the cohort file").
assert set(tcia_cptac_luad_lusc_cohort_df["Slide_Tumor_Histological_Type"]) == set(tcia_cptac_luad_lusc_mapping_df["Tumor_Histological_Type"]), f"There are missing histological types.  \
    In the mapping file: {set(tcia_cptac_luad_lusc_cohort_df.Slide_Tumor_Histological_Type) - set(tcia_cptac_luad_lusc_mapping_df.Tumor_Histological_Type)}. \
    In the cohort file: {set(tcia_cptac_luad_lusc_mapping_df.Tumor_Histological_Type) - set(tcia_cptac_luad_lusc_cohort_df.Slide_Tumor_Histological_Type)}"
# Attach the mapped label columns to every cohort slide
merged_tcia_cptac_cohort_labels_df = tcia_cptac_luad_lusc_cohort_df.merge(
    tcia_cptac_luad_lusc_mapping_df,
    left_on="Slide_Tumor_Histological_Type",
    right_on="Tumor_Histological_Type",
    how="inner",
)
# the merge must neither drop nor duplicate cohort rows
assert len(merged_tcia_cptac_cohort_labels_df) == len(tcia_cptac_luad_lusc_cohort_df)

print(merged_tcia_cptac_cohort_labels_df.columns)
# Drop cohort columns that are not needed for the label file.
# NOTE(review): both 'Patholgy' and 'Pathology' are listed; drop() raises KeyError if either is
# missing, so presumably the source file contains both spellings — confirm
merged_tcia_cptac_cohort_labels_df.drop(
    columns=[
        'Radiology',
        'Weight',
        'Tumor_Site',
        'Tumor_Segment_Acceptable', 'Tumor_Percent_Tumor_Nuclei',
        'Tumor_Percent_Total_Cellularity', 'Tumor_Percent_Necrosis',
        'Genomics',
        'Proteomics',
        'Genomics_Available',
        'GDC_Link',
        'Proteomics_Available',
        'PDC_Link',
        'Gender', 'Age_at_Diagnosis', 'Ethnicity', 'Race', 'Vital_Status',
        'Patholgy', 'HasRadiology', 'Pathology',
        ],
    inplace=True)
display(merged_tcia_cptac_cohort_labels_df.head())
print(merged_tcia_cptac_cohort_labels_df.shape)

In [None]:
# Missing values per column of the merged table
merged_tcia_cptac_cohort_labels_df.isna().sum()

## Subset (merged cohort + manual mapping) with files in files list

In [None]:
display(tcia_cptac_files_list.head())

# slides in cohort that are not present as files
# (these are dropped by the inner join with the files list in the next cell)
print(set(merged_tcia_cptac_cohort_labels_df["Slide_ID"]) - set(tcia_cptac_files_list["Slide_ID"]))

In [None]:
# tcia_cptac_detailed_labels_df = deepcopy(merged_tcia_cptac_cohort_labels_df[
#     merged_tcia_cptac_cohort_labels_df["Slide_ID"].isin(tcia_cptac_files_list["Slide_ID"])
# ].reset_index(drop=True))

# Keep only cohort slides that exist as files; the inner join also brings in the
# file-derived columns (md5sum, file_path, class_name)
tcia_cptac_detailed_labels_df = merged_tcia_cptac_cohort_labels_df.merge(
    tcia_cptac_files_list,
    on="Slide_ID",
    how="inner",
)
# the cohort's "Tumor" value must agree with the directory the slide file is stored in
assert (
    tcia_cptac_detailed_labels_df["Tumor"]
    == tcia_cptac_detailed_labels_df["class_name"]
).all()
tcia_cptac_detailed_labels_df.drop(columns=["class_name"], inplace=True)

# record the dataset name and use the cohort Slide_ID as the slide identifier
tcia_cptac_detailed_labels_df["dataset"] = "TCIA-CPTAC"
tcia_cptac_detailed_labels_df["wsi_id"] = tcia_cptac_detailed_labels_df["Slide_ID"]

# record the source of the data
tcia_cptac_detailed_labels_df["source"] = "TCIA-CPTAC"

# apply the function extract_cancer_subtype_presence to the column cancerSubtype
# (one 0/1 column per entry of LUNG_CANCER_MAIN_TYPES)
tcia_cptac_detailed_labels_df[LUNG_CANCER_MAIN_TYPES] = tcia_cptac_detailed_labels_df[
    "cancerSubtype"
].apply(lambda x: pd.Series(extract_cancer_subtype_presence(x)))

# apply the function extract_adenocarcinoma_pattern_presence to the column adenocarcinoma-All-Patterns-Including-Predominant
# (one 0/1 column per entry of ADENOCARCINOMA_MAIN_PATTERNS)
tcia_cptac_detailed_labels_df[ADENOCARCINOMA_MAIN_PATTERNS] = (
    tcia_cptac_detailed_labels_df[
        "adenocarcinoma-All-Patterns-Including-Predominant"
    ].apply(lambda x: pd.Series(extract_adenocarcinoma_pattern_presence(x)))
)

# apply the function extract_adenocarcinoma_pattern_presence to the column adenocarcinomaPredominantPattern
# (temporary helper columns, dropped again below)
tcia_cptac_detailed_labels_df[
    [
        f"adenocarcinomaPredominantPattern_{pattern}"
        for pattern in ADENOCARCINOMA_MAIN_PATTERNS
    ]
] = tcia_cptac_detailed_labels_df["adenocarcinomaPredominantPattern"].apply(
    lambda x: pd.Series(extract_adenocarcinoma_pattern_presence(x))
)

# apply the function extract_adenocarcinoma_pattern_presence to the column secondPredominantPattern
# (temporary helper columns, dropped again below)
tcia_cptac_detailed_labels_df[
    [f"secondPredominantPattern_{pattern}" for pattern in ADENOCARCINOMA_MAIN_PATTERNS]
] = tcia_cptac_detailed_labels_df["secondPredominantPattern"].apply(
    lambda x: pd.Series(extract_adenocarcinoma_pattern_presence(x))
)

# add the values of the f"adenocarcinomaPredominantPattern_{pattern}" and f"secondPredominantPattern_{pattern}"
# columns to the corresponding f"{pattern}" columns:
# a pattern that is present (1) and predominant OR second predominant (+1) ends up with the value 2.
# NOTE(review): a pattern that is both predominant and second predominant would reach 3 and fail
# the value check further below — presumably that cannot happen in the mapping file; confirm
for pattern in ADENOCARCINOMA_MAIN_PATTERNS:
    tcia_cptac_detailed_labels_df[pattern] += tcia_cptac_detailed_labels_df[
        f"adenocarcinomaPredominantPattern_{pattern}"
    ]
    tcia_cptac_detailed_labels_df[pattern] += tcia_cptac_detailed_labels_df[
        f"secondPredominantPattern_{pattern}"
    ]

# drop the f"adenocarcinomaPredominantPattern_{pattern}" columns as they are not needed anymore
tcia_cptac_detailed_labels_df.drop(
    columns=[
        f"adenocarcinomaPredominantPattern_{pattern}"
        for pattern in ADENOCARCINOMA_MAIN_PATTERNS
    ],
    inplace=True,
)
# drop the f"secondPredominantPattern_{pattern}" columns as well
tcia_cptac_detailed_labels_df.drop(
    columns=[
        f"secondPredominantPattern_{pattern}"
        for pattern in ADENOCARCINOMA_MAIN_PATTERNS
    ],
    inplace=True,
)

# for TCIA-CPTAC we only know which patterns are present on LUAD slides, BUT we do not know which patterns are not present on LUAD slides
# for all LUAD slides, for all LUAD patterns: substitute all 0 values with UNKNOWN_CONSTANT
# (non-LUAD slides keep 0 in the pattern columns)
for pattern in ADENOCARCINOMA_MAIN_PATTERNS:
    condition = (tcia_cptac_detailed_labels_df["LUAD"] == 1) & (
        tcia_cptac_detailed_labels_df[pattern] == 0
    )
    tcia_cptac_detailed_labels_df.loc[condition, pattern] = UNKNOWN_CONSTANT

# rename the pattern columns to their label-file names ("acinar" -> "LUAD_acinar", ...)
tcia_cptac_detailed_labels_df.rename(
    mapper=ADENOCARCINOMA_MAIN_PATTERNS_2_LABEL_NAMES, axis=1, inplace=True
)
display(tcia_cptac_detailed_labels_df.head())
print(tcia_cptac_detailed_labels_df.shape)

# Per-class totals of the main-type flags
print("Number of slides with cancer pattern:")
display(tcia_cptac_detailed_labels_df.loc[:, LUNG_CANCER_MAIN_TYPES].sum(axis=0))

# The flags must add up to the number of slides
assert tcia_cptac_detailed_labels_df[LUNG_CANCER_MAIN_TYPES].sum().sum() == len(
    tcia_cptac_detailed_labels_df
), "There might be slides with 2 or more cancer types"

# A class counts as present when its value is 1 or 2
print("Number of slides with each class present:")
display(
    tcia_cptac_detailed_labels_df[
        [*LUNG_CANCER_MAIN_TYPES, *ADENOCARCINOMA_MAIN_PATTERNS_2_LABEL_NAMES.values()]
    ].ge(1).sum()
)

# Pattern columns equal to 2
print("Number of slides with predominant patterns")
display(
    tcia_cptac_detailed_labels_df[
        list(ADENOCARCINOMA_MAIN_PATTERNS_2_LABEL_NAMES.values())
    ].eq(2).sum()
)

# check that only -1 (unknown), 0 (absent), 1 (present) and 2 (present and predominant)
# occur in the columns with adenocarcinoma patterns
assert (
    tcia_cptac_detailed_labels_df[list(ADENOCARCINOMA_MAIN_PATTERNS_2_LABEL_NAMES.values())]
    .isin([-1, 0, 1, 2])
    .all()
    .all()
)

# drop the rest of the columns: keep only identifiers, main-type flags and pattern labels
tcia_cptac_detailed_labels_df = tcia_cptac_detailed_labels_df[
    ["dataset", "wsi_id", "Case_ID", "source"]
    + LUNG_CANCER_MAIN_TYPES
    + list(ADENOCARCINOMA_MAIN_PATTERNS_2_LABEL_NAMES.values())
]
# the cohort's Case_ID is used as the patient identifier
tcia_cptac_detailed_labels_df = tcia_cptac_detailed_labels_df.rename(
    columns={"Case_ID": "patient_id"}
)
tcia_cptac_detailed_labels_df = tcia_cptac_detailed_labels_df.sort_values(
    by="patient_id"
).reset_index(drop=True)
display(tcia_cptac_detailed_labels_df.head())


# put the FIRST_COLUMNS at the front; `columns.difference` returns the remaining columns sorted
tcia_cptac_detailed_labels_df = tcia_cptac_detailed_labels_df[
    [*FIRST_COLUMNS, *tcia_cptac_detailed_labels_df.columns.difference(FIRST_COLUMNS)]
]

# save the result (the row index is not written)
tcia_cptac_detailed_labels_df.to_csv(
    f"{experiment_label_files_dir}/DETAILED_TCIA-CPTAC_LUAD_LUSC_BENIGN_OTHERCANCER.csv",
    index=False,
)

## Subset the TCIA-CPTAC dataframe: Do not include "OtherCancer"

In [None]:
# Keep only slides that are not flagged as OtherCancer, then remove the now-constant column
# (a "TC" flag / column is intentionally not handled here)
condition = tcia_cptac_detailed_labels_df["OtherCancer"] == 0
subset_tcia_cptac_detailed_labels_df = (
    tcia_cptac_detailed_labels_df[condition]
    .drop(columns=['OtherCancer'])
    .reset_index(drop=True)
)

# The remaining LUAD / LUSC / Benign flags must add up to the number of slides
print(subset_tcia_cptac_detailed_labels_df[LUNG_CANCER_LUAD_LUSC_BENIGN].sum().sum())
assert subset_tcia_cptac_detailed_labels_df[LUNG_CANCER_LUAD_LUSC_BENIGN].sum().sum() == len(
    subset_tcia_cptac_detailed_labels_df
), "There might be slides with 2 or more cancer types"

# save the file
subset_tcia_cptac_detailed_labels_df.to_csv(
    f'{experiment_label_files_dir}/DETAILED_TCIA-CPTAC_LUAD_LUSC_BENIGN.csv',
    index=False,
)

## Train Test Split: 50/50 stratified by (LUAD-LUSC-Benign)

In [None]:
# NOTE(review): pandas is already imported at the top of the notebook; this repeat
# is presumably here so the section can be re-run on its own — confirm before removing.
import pandas as pd

# Reload the LUAD/LUSC/Benign subset from the CSV written by the previous section.
subset_tcia_cptac_detailed_labels_df = pd.read_csv(
    f'{experiment_label_files_dir}/DETAILED_TCIA-CPTAC_LUAD_LUSC_BENIGN.csv'
)
display(subset_tcia_cptac_detailed_labels_df.head())

In [None]:
# Earlier alternative based on `group_split`, kept for reference:
# patient_groups = subset_tcia_cptac_detailed_labels_df["patient_id"]
# subset_tcia_cptac_detailed_labels_trainval_df, subset_tcia_cptac_detailed_labels_test_df \
#     = group_split(subset_tcia_cptac_detailed_labels_df, patient_groups, test_size=0.5, random_state=RANDOM_STATE, shuffle=True)

# Split configuration: grouping column, label columns passed as stratification
# features, and the fraction requested for the test split.
GROUPS_COL_NAME = 'patient_id'
FEATURES_TO_STRATIFY = ['LUAD', 'LUSC', 'Benign']
TCIA_CPTAC_TEST_SIZE = 0.5

# NOTE(review): `split_dataset_by_patient` lives in source.stratified_patient_split;
# presumably it keeps all slides of one patient in the same split — the assertion
# below verifies exactly that property.
(
    subset_tcia_cptac_detailed_labels_trainval_df,
    subset_tcia_cptac_detailed_labels_test_df,
) = split_dataset_by_patient(
    subset_tcia_cptac_detailed_labels_df,
    groups_col_name=GROUPS_COL_NAME,
    feature_col_names=FEATURES_TO_STRATIFY,
    test_size=TCIA_CPTAC_TEST_SIZE,
    random_state=RANDOM_STATE,
)

# No patient id may appear in both splits.
assert set(subset_tcia_cptac_detailed_labels_test_df["patient_id"]).isdisjoint(
    set(subset_tcia_cptac_detailed_labels_trainval_df["patient_id"])
)


In [None]:
# Label statistics for the TCIA-CPTAC train/val split.
display(subset_tcia_cptac_detailed_labels_trainval_df.head())

print("Number of slides with cancer pattern:")
main_type_counts = subset_tcia_cptac_detailed_labels_trainval_df[LUNG_CANCER_LUAD_LUSC_BENIGN].sum()
display(main_type_counts)

# The LUAD/LUSC/Benign labels must add up to exactly one per slide in total.
assert main_type_counts.sum() == len(subset_tcia_cptac_detailed_labels_trainval_df), "There might be slides with 2 or more cancer types"

pattern_columns = list(ADENOCARCINOMA_MAIN_PATTERNS_2_LABEL_NAMES.values())

# A label >= 1 is counted as "present".
print("Number of slides with each class present:")
display(
    subset_tcia_cptac_detailed_labels_trainval_df[LUNG_CANCER_LUAD_LUSC_BENIGN + pattern_columns]
    .ge(1)
    .sum()
)

# A pattern label equal to 2 is counted as "predominant".
print("Number of slides with predominant patterns")
display(subset_tcia_cptac_detailed_labels_trainval_df[pattern_columns].eq(2).sum())

In [None]:
# Label statistics for the TCIA-CPTAC test split.
display(subset_tcia_cptac_detailed_labels_test_df.head())

print("Number of slides with cancer pattern:")
main_type_counts = subset_tcia_cptac_detailed_labels_test_df[LUNG_CANCER_LUAD_LUSC_BENIGN].sum()
display(main_type_counts)

# The LUAD/LUSC/Benign labels must add up to exactly one per slide in total.
assert main_type_counts.sum() == len(subset_tcia_cptac_detailed_labels_test_df), "There might be slides with 2 or more cancer types"

pattern_columns = list(ADENOCARCINOMA_MAIN_PATTERNS_2_LABEL_NAMES.values())

# A label >= 1 is counted as "present".
print("Number of slides with each class present:")
display(
    subset_tcia_cptac_detailed_labels_test_df[LUNG_CANCER_LUAD_LUSC_BENIGN + pattern_columns]
    .ge(1)
    .sum()
)

# A pattern label equal to 2 is counted as "predominant".
print("Number of slides with predominant patterns")
display(subset_tcia_cptac_detailed_labels_test_df[pattern_columns].eq(2).sum())

In [None]:
# Guard against patient leakage before writing anything: the two splits must not
# share a single patient id.
assert set(subset_tcia_cptac_detailed_labels_test_df["patient_id"]).isdisjoint(
    set(subset_tcia_cptac_detailed_labels_trainval_df["patient_id"])
)

# Write the train/val and test label tables for TCIA-CPTAC.
subset_tcia_cptac_detailed_labels_trainval_df.to_csv(
    f'{experiment_label_files_dir}/DETAILED_TCIA-CPTAC_TRAINVAL_LUAD_LUSC_BENIGN.csv',
    index=False,
)
subset_tcia_cptac_detailed_labels_test_df.to_csv(
    f'{experiment_label_files_dir}/DETAILED_TCIA-CPTAC_TEST_LUAD_LUSC_BENIGN.csv',
    index=False,
)

# Combined: DHMC + TCGA + TCIA-CPTAC

## All together

In [None]:
# Stack the DHMC, TCGA and TCIA-CPTAC (without "OtherCancer") label tables into a
# single frame; `ignore_index=True` gives the result a fresh 0..n-1 row index.
combined_detailed_df = pd.concat(
    [dhmc_detailed_labels, tcga_detailed_labels_df, subset_tcia_cptac_detailed_labels_df],
    ignore_index=True,
)
combined_detailed_df

In [None]:
# Label statistics for the combined (DHMC + TCGA + TCIA-CPTAC) table.
print("Number of slides from each site:")
display(combined_detailed_df["source"].value_counts())

print("Number of slides with cancer pattern:")
main_type_counts = combined_detailed_df[LUNG_CANCER_LUAD_LUSC_BENIGN].sum()
display(main_type_counts)

# The main-type label total may exceed the slide count here (it is only reported),
# but it must never fall below it.
total_main_type_labels = main_type_counts.sum()
assert len(combined_detailed_df) <= total_main_type_labels
if len(combined_detailed_df) < total_main_type_labels:
    print(f"There are {total_main_type_labels - len(combined_detailed_df)} slides with 2 or more cancer types")

pattern_columns = list(ADENOCARCINOMA_MAIN_PATTERNS_2_LABEL_NAMES.values())

# A label >= 1 is counted as "present".
print("Number of slides with each class present:")
display(combined_detailed_df[LUNG_CANCER_LUAD_LUSC_BENIGN + pattern_columns].ge(1).sum())

# A pattern label equal to 2 is counted as "predominant".
print("Number of slides with predominant patterns")
display(combined_detailed_df[pattern_columns].eq(2).sum())

In [None]:
# Persist the combined label table (all sources together, no train/test split).
combined_detailed_df.to_csv(
    f'{experiment_label_files_dir}/DETAILED_COMBINED_LUAD_LUSC_BENIGN.csv',
    index=False,
)

## Train (80% TCGA, 50% TCIA-CPTAC); Test (20% TCGA, 50% TCIA-CPTAC, full DHMC)

In [None]:
# "Hard" split: train/val is built from the TCGA and TCIA-CPTAC train/val parts,
# while the test set takes all of DHMC plus the TCGA and TCIA-CPTAC test parts.
# NOTE: the index is not reset here, so row labels from the inputs may repeat.
combined_trainval_hard_df = pd.concat(
    [tcga_detailed_labels_trainval_df, subset_tcia_cptac_detailed_labels_trainval_df]
)
combined_test_hard_df = pd.concat(
    [dhmc_detailed_labels, tcga_detailed_labels_test_df, subset_tcia_cptac_detailed_labels_test_df]
)

# Displayed as the cell output (not asserted): patient ids present in both sets.
# An empty set means there is no patient overlap between train/val and test.
set(combined_trainval_hard_df["patient_id"]) & set(combined_test_hard_df["patient_id"])

In [None]:
# Label statistics for the "hard" train/val set, overall and per source.
print("Total number of slides in train-val set:")
display(len(combined_trainval_hard_df))

print("Number of slides from each site:")
display(combined_trainval_hard_df["source"].value_counts())

print("Number of slides with cancer pattern:")
main_type_counts = combined_trainval_hard_df[LUNG_CANCER_LUAD_LUSC_BENIGN].sum()
display(main_type_counts)

# The main-type label total may exceed the slide count (it is only reported),
# but it must never fall below it.
total_main_type_labels = main_type_counts.sum()
assert len(combined_trainval_hard_df) <= total_main_type_labels
if len(combined_trainval_hard_df) < total_main_type_labels:
    print(f"There are {total_main_type_labels - len(combined_trainval_hard_df)} slides with 2 or more cancer types")

pattern_columns = list(ADENOCARCINOMA_MAIN_PATTERNS_2_LABEL_NAMES.values())

# A label >= 1 is counted as "present".
print("Number of slides with each class present:")
display(combined_trainval_hard_df[LUNG_CANCER_LUAD_LUSC_BENIGN + pattern_columns].ge(1).sum())

# A pattern label equal to 2 is counted as "predominant".
print("Number of slides with predominant patterns")
display(combined_trainval_hard_df[pattern_columns].eq(2).sum())

# --------------------------------------------------------------------------------------------
# Per-source tables. Each entry is (heading, columns dropped before grouping,
# per-column counting function applied within every "source" group).
id_columns = ["wsi_id", "patient_id", "dataset"]
main_type_columns = ["LUAD", "LUSC", "Benign"]  # "TC" / "OtherCancer" are not dropped here
per_source_tables = (
    (
        "presence, but not predominant pattern is coded with 1",
        id_columns,
        lambda x: (x == 1).sum(),
    ),
    (
        "predominant pattern is coded with 2",
        id_columns + main_type_columns,
        lambda x: (x == 2).sum(),
    ),
    (
        "presence is codes with 1, predominance with 2, so `>0` gets both at the same time",
        id_columns,
        lambda x: (x > 0).sum(),
    ),
    (
        "presence is codes with 1, predominance with 2, so `>0` gets both at the same time",
        id_columns + main_type_columns,
        lambda x: (x > 0).sum(),
    ),
)
for heading, dropped_columns, count_matching in per_source_tables:
    print(heading)
    display(
        combined_trainval_hard_df.drop(columns=dropped_columns)
        .groupby("source")
        .agg(count_matching)
    )
    print("=" * 80)

In [None]:
# Label statistics for the "hard" test set, overall and per source.
print("Total number of slides in the test set:")
display(len(combined_test_hard_df))

print("Number of slides from each site:")
display(combined_test_hard_df["source"].value_counts())

print("Number of slides with cancer pattern:")
main_type_counts = combined_test_hard_df[LUNG_CANCER_LUAD_LUSC_BENIGN].sum()
display(main_type_counts)

# The main-type label total may exceed the slide count (it is only reported),
# but it must never fall below it.
total_main_type_labels = main_type_counts.sum()
assert len(combined_test_hard_df) <= total_main_type_labels
if len(combined_test_hard_df) < total_main_type_labels:
    print(f"There are {total_main_type_labels - len(combined_test_hard_df)} slides with 2 or more cancer types")

pattern_columns = list(ADENOCARCINOMA_MAIN_PATTERNS_2_LABEL_NAMES.values())

# A label >= 1 is counted as "present".
print("Number of slides with each class present:")
display(combined_test_hard_df[LUNG_CANCER_LUAD_LUSC_BENIGN + pattern_columns].ge(1).sum())

# A pattern label equal to 2 is counted as "predominant".
print("Number of slides with predominant patterns")
display(combined_test_hard_df[pattern_columns].eq(2).sum())

# --------------------------------------------------------------------------------------------
# Per-source tables. Each entry is (heading, columns dropped before grouping,
# per-column counting function applied within every "source" group).
id_columns = ["wsi_id", "patient_id", "dataset"]
main_type_columns = ["LUAD", "LUSC", "Benign"]  # "TC" / "OtherCancer" are not dropped here
per_source_tables = (
    (
        "presence, but not predominant pattern is coded with 1",
        id_columns,
        lambda x: (x == 1).sum(),
    ),
    (
        "predominant pattern is coded with 2",
        id_columns + main_type_columns,
        lambda x: (x == 2).sum(),
    ),
    (
        "presence is codes with 1, predominance with 2, so `>0` gets both at the same time",
        id_columns,
        lambda x: (x > 0).sum(),
    ),
    (
        "presence is codes with 1, predominance with 2, so `>0` gets both at the same time",
        id_columns + main_type_columns,
        lambda x: (x > 0).sum(),
    ),
)
for heading, dropped_columns, count_matching in per_source_tables:
    print(heading)
    display(
        combined_test_hard_df.drop(columns=dropped_columns)
        .groupby("source")
        .agg(count_matching)
    )
    print("=" * 80)

In [None]:
# Persist the "hard" train/val and test label tables.
combined_trainval_hard_df.to_csv(
    f'{experiment_label_files_dir}/DETAILED_COMBINED_HARD_TRAINVAL_LUAD_LUSC_BENIGN.csv',
    index=False,
)
combined_test_hard_df.to_csv(
    f'{experiment_label_files_dir}/DETAILED_COMBINED_HARD_TEST_LUAD_LUSC_BENIGN.csv',
    index=False,
)

### Make a version by dropping rows where all patterns are -1 (i.e. unknown pattern presence)

In [None]:
# Reload the "hard" split tables from their CSV files, so the filtering below
# starts from the on-disk versions.
combined_trainval_hard_df = pd.read_csv(
    f'{experiment_label_files_dir}/DETAILED_COMBINED_HARD_TRAINVAL_LUAD_LUSC_BENIGN.csv'
)
combined_test_hard_df = pd.read_csv(
    f'{experiment_label_files_dir}/DETAILED_COMBINED_HARD_TEST_LUAD_LUSC_BENIGN.csv'
)

In [None]:
def _has_known_pattern(labels_df, pattern_columns, unknown_value=-1):
    """Return a boolean row mask that is True where at least one pattern is known.

    A row counts as "no pattern info" only when *every* column in
    ``pattern_columns`` equals ``unknown_value``. Missing values (NaN) are not
    equal to ``unknown_value``, so such rows are kept.

    Args:
        labels_df: label table containing all ``pattern_columns``.
        pattern_columns: names of the pattern label columns to inspect.
        unknown_value: code used for "unknown pattern presence" (-1 by default).

    Returns:
        Boolean Series aligned with ``labels_df.index``.
    """
    all_unknown = labels_df[list(pattern_columns)].eq(unknown_value).all(axis=1)
    return ~all_unknown


# Only the pattern columns decide whether a row is dropped. The previous version
# counted -1 across ALL columns and compared against a hard-coded 5, which
# silently assumed exactly five pattern columns and no -1 anywhere else.
pattern_label_columns = list(ADENOCARCINOMA_MAIN_PATTERNS_2_LABEL_NAMES.values())

combined_trainval_hard_at_least_one_known_pattern_df = combined_trainval_hard_df[
    _has_known_pattern(combined_trainval_hard_df, pattern_label_columns)
].reset_index(drop=True)
combined_test_hard_at_least_one_known_pattern_df = combined_test_hard_df[
    _has_known_pattern(combined_test_hard_df, pattern_label_columns)
].reset_index(drop=True)

In [None]:
# Report how many rows the filter removed: first train/val, then test.
n_dropped_trainval = len(combined_trainval_hard_df) - len(combined_trainval_hard_at_least_one_known_pattern_df)
n_dropped_test = len(combined_test_hard_df) - len(combined_test_hard_at_least_one_known_pattern_df)
print("Dropped rows with no patterns info:", n_dropped_trainval)
print("Dropped rows with no patterns info:", n_dropped_test)

In [None]:
# Persist the filtered "hard" splits (rows with at least one known pattern).
combined_trainval_hard_at_least_one_known_pattern_df.to_csv(
    f'{experiment_label_files_dir}/DETAILED_COMBINED_HARD_TRAINVAL_LUAD_LUSC_BENIGN_AT_LEAST_ONE_KNOWN_PATTERN.csv',
    index=False,
)
combined_test_hard_at_least_one_known_pattern_df.to_csv(
    f'{experiment_label_files_dir}/DETAILED_COMBINED_HARD_TEST_LUAD_LUSC_BENIGN_AT_LEAST_ONE_KNOWN_PATTERN.csv',
    index=False,
)

In [None]:
# Label statistics for the filtered "hard" train/val set.
print("Number of slides from each site:")
display(combined_trainval_hard_at_least_one_known_pattern_df["source"].value_counts())

print("Number of slides with cancer pattern:")
main_type_counts = combined_trainval_hard_at_least_one_known_pattern_df[LUNG_CANCER_LUAD_LUSC_BENIGN].sum()
display(main_type_counts)

# The main-type label total may exceed the slide count (it is only reported),
# but it must never fall below it.
total_main_type_labels = main_type_counts.sum()
assert len(combined_trainval_hard_at_least_one_known_pattern_df) <= total_main_type_labels
if len(combined_trainval_hard_at_least_one_known_pattern_df) < total_main_type_labels:
    print(f"There are {total_main_type_labels - len(combined_trainval_hard_at_least_one_known_pattern_df)} slides with 2 or more cancer types")

pattern_columns = list(ADENOCARCINOMA_MAIN_PATTERNS_2_LABEL_NAMES.values())

# A label >= 1 is counted as "present".
print("Number of slides with each class present:")
display(
    combined_trainval_hard_at_least_one_known_pattern_df[LUNG_CANCER_LUAD_LUSC_BENIGN + pattern_columns]
    .ge(1)
    .sum()
)

# A pattern label equal to 2 is counted as "predominant".
print("Number of slides with predominant patterns")
display(combined_trainval_hard_at_least_one_known_pattern_df[pattern_columns].eq(2).sum())

In [None]:
# Label statistics for the filtered "hard" test set.
print("Number of slides from each site:")
display(combined_test_hard_at_least_one_known_pattern_df["source"].value_counts())

print("Number of slides with cancer pattern:")
main_type_counts = combined_test_hard_at_least_one_known_pattern_df[LUNG_CANCER_LUAD_LUSC_BENIGN].sum()
display(main_type_counts)

# The main-type label total may exceed the slide count (it is only reported),
# but it must never fall below it.
total_main_type_labels = main_type_counts.sum()
assert len(combined_test_hard_at_least_one_known_pattern_df) <= total_main_type_labels
if len(combined_test_hard_at_least_one_known_pattern_df) < total_main_type_labels:
    print(f"There are {total_main_type_labels - len(combined_test_hard_at_least_one_known_pattern_df)} slides with 2 or more cancer types")

pattern_columns = list(ADENOCARCINOMA_MAIN_PATTERNS_2_LABEL_NAMES.values())

# A label >= 1 is counted as "present".
print("Number of slides with each class present:")
display(
    combined_test_hard_at_least_one_known_pattern_df[LUNG_CANCER_LUAD_LUSC_BENIGN + pattern_columns]
    .ge(1)
    .sum()
)

# A pattern label equal to 2 is counted as "predominant".
print("Number of slides with predominant patterns")
display(combined_test_hard_at_least_one_known_pattern_df[pattern_columns].eq(2).sum())