In [None]:
import os
import sys

import glob

In [None]:
import numpy as np
import pandas as pd

import tqdm

from PIL import Image

In [None]:
import openslide

In [None]:
def extract_short_slide_id_from_long_slide_id_file_name(long_slide_id_file_name: str):
    """Return the part of a slide file name that precedes its first dot.

    A name without any dot is returned unchanged.
    """
    short_slide_id, _, _ = long_slide_id_file_name.partition(".")
    return short_slide_id

def extract_slide_id_from_csv_path(path_to_csv: str):
    """Return the file name of ``path_to_csv`` up to (not including) its first dot.

    Directory components are dropped before the name is cut.
    """
    csv_file_name = os.path.basename(path_to_csv)
    slide_id, _, _ = csv_file_name.partition(".")
    return slide_id

def extract_patient_id(slide_id: str):
    """Return the leading 12 characters of ``slide_id``, used as the patient id.

    Ids shorter than 12 characters are returned whole.
    """
    patient_id_length = 12
    return slide_id[:patient_id_length]

def extract_case_id(slide_id: str):
    """Return the leading 15 characters of ``slide_id``, used as the case id.

    Ids shorter than 15 characters are returned whole.
    """
    case_id_length = 15
    return slide_id[:case_id_length]

def get_label_name(label_int: int):
    """Translate an integer label into a cancer-type name.

    ``0`` maps to ``"LUAD"``; every other value maps to ``"LUSC"``.
    """
    if label_int == 0:
        return "LUAD"
    return "LUSC"

In [None]:
# --- Slides -----------------------------------------------------------------
# Expected layout: TCGA_LUNG_CASES_FOLDER/<case id>/<slide id>.svs
# (one folder per case, each holding the .svs slide files of that case).
TCGA_LUNG_CASES_FOLDER = '../../cases'
tcga_slides_paths = glob.glob(TCGA_LUNG_CASES_FOLDER + "/*/*.svs")

# --- Thumbnails -------------------------------------------------------------
# Length in pixels of the smaller thumbnail side; the aspect ratio is preserved.
THUMBNAIL_SMALLER_SIDE = 512
# When False, the thumbnail cell leaves existing thumbnails alone and only reports.
RECOMPUTE_THUMBNAILS = False
# Output folder for the thumbnails; created right away if it is missing.
THUMBNAILS_FOLDER = '../thumbnails'
os.makedirs(THUMBNAILS_FOLDER, exist_ok=True)

# Downloaded from TCGA

## Check md5sum of downloaded slides against manifest files

In [None]:
# Map each cancer type to its most recent GDC manifest file.
# "Most recent" is the lexicographically last match, which works because the
# file names embed the date as YYYY-MM-DD (see the examples below).
cancer_type_2_manifest_path = {}
for manifest_cancer_type in ['LUAD', 'LUSC']:
    manifest_pattern = f"./tcga-download/gdc_manifest.*-TCGA-{manifest_cancer_type}.txt"
    manifest_candidates = sorted(glob.glob(manifest_pattern))
    if not manifest_candidates:
        # Fail with the offending pattern instead of a bare IndexError from `[-1]`.
        raise FileNotFoundError(f"No manifest file matches {manifest_pattern!r}")
    cancer_type_2_manifest_path[manifest_cancer_type] = manifest_candidates[-1]

# format of the path to the manifest files
# luad_manifest_path = "tcga-download/gdc_manifest.2021-11-03-TCGA-LUAD.txt"
# lusc_manifest_path = "tcga-download/gdc_manifest.2021-11-03-TCGA-LUSC.txt"

# File with one "<md5sum> <slide path>" line per downloaded slide (parsed in a later cell).
slides_md5_path = './downloaded_md5sum_hashes.txt'
cancer_type_2_manifest_path

In [None]:
# Parsed manifest rows, kept as parallel lists: one entry per slide listed in any manifest.
manifest_filename_md5sum = {
    "slide_id_short": [],
    "slide_md5sum": [],
    "cancer_type": [],
}

for cancer_type, manifest_path in cancer_type_2_manifest_path.items():

    with open(manifest_path, 'r') as f:
        next(f, None)  # the first line is skipped (assumed to be the column header)
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines instead of failing in the unpacking below
            # Every row must hold exactly five whitespace-separated fields.
            # The names deliberately avoid `id`, which would shadow the builtin
            # for the rest of the notebook session.
            _file_id, file_name, md5, _size, _state = line.split()
            short_slide_id = extract_short_slide_id_from_long_slide_id_file_name(file_name)

            manifest_filename_md5sum["slide_id_short"].append(short_slide_id)
            manifest_filename_md5sum["slide_md5sum"].append(md5)
            manifest_filename_md5sum["cancer_type"].append(cancer_type)


# Sorted by slide id so that it can be compared row by row with the downloaded hashes.
manifest_md5_df = pd.DataFrame(manifest_filename_md5sum).sort_values("slide_id_short").reset_index(drop=True)
manifest_md5_df

In [None]:
# md5 hashes computed on the downloaded slides, kept as parallel lists.
downloaded_filename_md5sum = {
    "slide_id_short": [],
    "slide_md5sum": []
}

with open(slides_md5_path, 'r') as f:
    for line in f:
        line = line.strip()
        if not line:
            continue  # tolerate blank lines
        # Split only once: the hash comes first and the rest of the line is the
        # path, which may itself contain whitespace.
        md5sum, slide_path = line.split(maxsplit=1)
        short_slide_id = extract_short_slide_id_from_long_slide_id_file_name(os.path.basename(slide_path))

        downloaded_filename_md5sum["slide_id_short"].append(short_slide_id)
        downloaded_filename_md5sum["slide_md5sum"].append(md5sum)

# Sorted by slide id so that it can be compared row by row with the manifest hashes.
slides_md5_df = pd.DataFrame(downloaded_filename_md5sum).sort_values("slide_id_short").reset_index(drop=True)
slides_md5_df

In [None]:
# Both frames are sorted by slide id with a fresh 0..n-1 index, so they can be
# compared row by row once they are known to have the same length.
md5_compared_columns = ["slide_id_short", "slide_md5sum"]
assert len(manifest_md5_df) == len(slides_md5_df), (
    "manifest lists {} slides but {} hashes were computed on the download".format(
        len(manifest_md5_df), len(slides_md5_df)
    )
)
md5_row_mismatch = (manifest_md5_df[md5_compared_columns] != slides_md5_df[md5_compared_columns]).any(axis=1)
assert not md5_row_mismatch.any(), (
    "slide id or md5sum differs from the manifest for: {}".format(
        manifest_md5_df.loc[md5_row_mismatch, "slide_id_short"].tolist()
    )
)

## Downloaded slides - extract and save info

In [None]:
# Every entry directly inside the cases folder (expected: one folder per case).
# glob returns matches in arbitrary order; sorting keeps the output reproducible.
tcga_cases_paths = sorted(glob.glob(f"{TCGA_LUNG_CASES_FOLDER}/*"))

len(tcga_cases_paths), tcga_cases_paths

In [None]:
# All .svs slide files, one folder level below the cases folder.
# glob returns matches in arbitrary order; sorting makes every later cell that
# iterates over this list process the slides in a reproducible order.
tcga_slides_paths = sorted(glob.glob(f"{TCGA_LUNG_CASES_FOLDER}/*/*.svs"))

len(tcga_slides_paths), tcga_slides_paths

In [None]:
# One row per slide file found on disk.
saved_slides_df = pd.DataFrame(map(os.path.basename, tcga_slides_paths))
saved_slides_df = saved_slides_df.rename(columns={0: "slide_file_name"})

# The first two dot-separated parts of the file name become the short slide id and the hash.
saved_slides_df[["slide_id_short", "slide_id_hash"]] = saved_slides_df["slide_file_name"].str.split(".", expand=True)[[0, 1]]
saved_slides_df["case_id"] = saved_slides_df["slide_id_short"].apply(extract_case_id)
saved_slides_df["patient_id"] = saved_slides_df["slide_id_short"].apply(extract_patient_id)

# checks before merge: the slides on disk must be exactly the slides that were hashed
assert set(saved_slides_df["slide_id_short"]) == set(slides_md5_df["slide_id_short"]), \
    "slides on disk and slides in the md5 file differ: {}".format(
        set(saved_slides_df["slide_id_short"]) ^ set(slides_md5_df["slide_id_short"])
    )
assert len(saved_slides_df["slide_id_short"]) == len(slides_md5_df["slide_id_short"]), \
    "number of slides on disk differs from the number of hashed slides"
# validate="one_to_one" makes pandas raise instead of silently duplicating rows
# when a slide id is repeated on either side of the merge.
saved_slides_df = saved_slides_df.merge(manifest_md5_df, on=["slide_id_short"], how="inner", validate="one_to_one")

saved_slides_df

In [None]:
# Number of distinct patients / cases / slides per cancer type and overall.
# Note from the original author: same results with slide_id_short, slide_id_long, slide_id_hash
saved_count_levels = (("patients", "patient_id"), ("cases", "case_id"), ("slides", "slide_id_short"))
for level_position, (level_label, id_column) in enumerate(saved_count_levels):
    if level_position > 0:
        print()  # blank line between the three groups
    for cancer_type_name in ("LUAD", "LUSC"):
        rows_of_type = saved_slides_df[saved_slides_df["cancer_type"] == cancer_type_name]
        # The label is built from the level name, which also fixes the old "LUSC sildes:" typo.
        print(f"{cancer_type_name} {level_label}:", rows_of_type[id_column].nunique())
    print(f"Total {level_label}:", saved_slides_df[id_column].nunique())

In [None]:
# Order rows by cancer type, then slide id, and renumber the index from 0.
saved_slides_df = saved_slides_df.sort_values(["cancer_type", "slide_id_short"])
saved_slides_df = saved_slides_df.reset_index(drop=True)

# The same table is written next to the notebook and one folder up.
for output_csv_path in ("./classes_extended_info.csv", "../classes_extended_info.csv"):
    saved_slides_df.to_csv(output_csv_path, index=False)

## Make thumbnails of slides

In [None]:
if RECOMPUTE_THUMBNAILS:
    # `import tqdm` alone does not guarantee that the `tqdm.notebook` submodule is
    # loaded, so import the notebook progress bar explicitly.
    from tqdm.notebook import tqdm as notebook_tqdm

    for slide_path in notebook_tqdm(tcga_slides_paths):
        print(slide_path)
        slide_id = os.path.basename(slide_path).split(".")[0]
        thumbnail_path = os.path.join(THUMBNAILS_FOLDER, f"{slide_id}.jpeg")

        # The context manager closes the slide even if thumbnail extraction fails.
        with openslide.OpenSlide(slide_path) as slide:
            # OpenSlide reports dimensions as (width, height) of level 0.
            slide_width, slide_height = slide.dimensions
            smaller_side = min(slide_width, slide_height)

            # Scale so that the smaller side becomes THUMBNAIL_SMALLER_SIDE pixels
            # while keeping the aspect ratio.
            thumbnail_size = (
                int(slide_width / smaller_side * THUMBNAIL_SMALLER_SIDE),
                int(slide_height / smaller_side * THUMBNAIL_SMALLER_SIDE),
            )
            thumbnail = slide.get_thumbnail(thumbnail_size)

        thumbnail.save(thumbnail_path)
else:
    print("Skipping thumbnails recomputation")

print("\nThumbnails folder has {} files".format(len(os.listdir(THUMBNAILS_FOLDER))))

# DSMIL Google Drive and DSMIL GitHub

## Google Drive

In [None]:
# Load the Google Drive test-set ids and give the two columns explicit names.
# NOTE(review): the following cell repeats these two statements before adding a
# 'subset' column, so this cell looks redundant — confirm before removing it.
gdrive_test_df = pd.read_csv("dsmil-split/google-drive/TEST_ID.csv")
gdrive_test_df.columns = ["slide_id_short", "label"]

In [None]:
def _load_gdrive_subset(csv_path, subset_name):
    """Read an id list CSV and tag every row with ``subset_name``.

    The CSV is expected to hold a single column: after the 'subset' column is
    added, the frame must have exactly two columns for the renaming to succeed.
    """
    subset_df = pd.read_csv(csv_path)
    subset_df['subset'] = subset_name
    subset_df.columns = ["slide_id_short", "subset"]
    return subset_df


# Test split: two columns in the file (slide id and label), plus a subset tag.
gdrive_test_df = pd.read_csv("dsmil-split/google-drive/TEST_ID.csv")
gdrive_test_df.columns = ["slide_id_short", "label"]
gdrive_test_df['subset'] = "TEST"

gdrive_excluded_df = _load_gdrive_subset("dsmil-split/google-drive/EX_ID.csv", "EXCLUDED")
gdrive_luad_df = _load_gdrive_subset("dsmil-split/google-drive/LUAD_ID.csv", "LUAD")
gdrive_lusc_df = _load_gdrive_subset("dsmil-split/google-drive/LUSC_ID.csv", "LUSC")


# combination of luad, lusc and excluded
gdrive_all_df = pd.concat([gdrive_excluded_df, gdrive_luad_df, gdrive_lusc_df]).reset_index(drop=True)
gdrive_all_df.columns = ["slide_id_short", "subset"]
for derived_column, extract_id in (("case_id", extract_case_id), ("patient_id", extract_patient_id)):
    gdrive_all_df[derived_column] = gdrive_all_df["slide_id_short"].apply(extract_id)
is_included_row = gdrive_all_df["subset"].isin(["LUAD", "LUSC"])
gdrive_included_df = gdrive_all_df[is_included_row].reset_index(drop=True)
display(gdrive_all_df)
print(len(gdrive_all_df))

# assert luad, lusc and excluded are pairwise disjoint
for first_df, second_df in (
    (gdrive_luad_df, gdrive_lusc_df),
    (gdrive_luad_df, gdrive_excluded_df),
    (gdrive_lusc_df, gdrive_excluded_df),
):
    assert len(set(first_df["slide_id_short"]).intersection(set(second_df["slide_id_short"]))) == 0

# assert that all 4 dataframes in google-drive are part of the all dataframe
for part_df in (gdrive_luad_df, gdrive_lusc_df, gdrive_excluded_df, gdrive_test_df):
    assert part_df["slide_id_short"].isin(gdrive_all_df["slide_id_short"]).all()

# Disjointness of the test and excluded dataframes is intentionally NOT asserted
# here (the assertion was disabled); the next cell prints their intersection instead.

In [None]:
# Slide ids that appear both in the test list and in the excluded list.
_test_slide_ids = set(gdrive_test_df["slide_id_short"])
_excluded_slide_ids = set(gdrive_excluded_df["slide_id_short"])
print("test and excluded are not disjoint: {}".format(_test_slide_ids & _excluded_slide_ids))

In [None]:
# Show the combined Google Drive table (excluded + LUAD + LUSC ids with their case and patient ids).
gdrive_all_df

In [None]:
# Number of distinct patients / cases / slides per Google Drive subset and overall.
gdrive_count_levels = (("patients", "patient_id"), ("cases", "case_id"), ("slides", "slide_id_short"))
gdrive_subset_labels = (("LUAD", "LUAD"), ("LUSC", "LUSC"), ("Excluded", "EXCLUDED"))  # (printed label, subset value)

for level_position, (level_label, id_column) in enumerate(gdrive_count_levels):
    if level_position > 0:
        print()  # blank line between the three groups
    for printed_label, subset_value in gdrive_subset_labels:
        rows_of_subset = gdrive_all_df[gdrive_all_df["subset"] == subset_value]
        print(f"{printed_label} {level_label}:", rows_of_subset[id_column].nunique())
    print(f"Total {level_label}:", gdrive_all_df[id_column].nunique())

In [None]:
# Ids of everything tagged EXCLUDED in the Google Drive split, at each identifier level.
gdrive_excluded_rows = gdrive_all_df[gdrive_all_df["subset"] == "EXCLUDED"]
gdrive_excluded_patients = set(gdrive_excluded_rows.patient_id)
gdrive_excluded_cases = set(gdrive_excluded_rows.case_id)
gdrive_excluded_slides = set(gdrive_excluded_rows.slide_id_short)

print("Google Drive Excluded patients:")
print(gdrive_excluded_patients)
print()
print("Google Drive Excluded cases:")
print(gdrive_excluded_cases)
print()
print("Google Drive Excluded slides:")
print(gdrive_excluded_slides)

# The side-by-side table needs three columns of equal length, so it is only built
# when there are as many excluded patients as cases as slides. (The previous
# condition compared the slide count with itself and was therefore always true.)
if len(gdrive_excluded_patients) == len(gdrive_excluded_cases) == len(gdrive_excluded_slides):
    display(
        pd.DataFrame({
        "patient_id": sorted(gdrive_excluded_patients),
        "case_id": sorted(gdrive_excluded_cases),
        "slide_id_short": sorted(gdrive_excluded_slides),
        })
    )

In [None]:
# The downloaded slides and the Google Drive lists must cover exactly the same
# patients, cases and slides before they are joined.
merge_key_columns = ["patient_id", "case_id", "slide_id_short"]
for key_column in merge_key_columns:
    assert set(saved_slides_df[key_column]) == set(gdrive_all_df[key_column])

# merge saved_slides_df and gdrive_all_df on patient, case and slide id
merged_df = saved_slides_df.merge(gdrive_all_df, on=merge_key_columns, how="inner")
merged_df

In [None]:
# Rows whose manifest cancer type differs from the Google Drive subset.
# Rows tagged "EXCLUDED" always show up here, since that tag is never a cancer type.
cancer_type_disagrees = merged_df["cancer_type"].ne(merged_df["subset"])
merged_df[cancer_type_disagrees]

## GitHub

In [None]:
# CSV files taken from the DSMIL GitHub repository download.
_github_split_folder = "dsmil-split/repository-download-TCGA-lung-ms"
github_luad_df = pd.read_csv(f"{_github_split_folder}/LUAD.csv")
github_lusc_df = pd.read_csv(f"{_github_split_folder}/LUSC.csv")
github_all_df = pd.read_csv(f"{_github_split_folder}/TCGA-lung-ms.csv")

# Peek at the first feature-file path to see its format.
github_all_df["features_csv_file_path"].iat[0]

In [None]:
# Derive the id and label-name columns; each entry is (new column, source column, function).
# Order matters: the case and patient ids are cut from the slide id computed first.
github_derived_columns = (
    ("slide_id_short", "features_csv_file_path", extract_slide_id_from_csv_path),
    ("case_id", "slide_id_short", extract_case_id),
    ("patient_id", "slide_id_short", extract_patient_id),
    ("label_name", "label", get_label_name),
)
for new_column, source_column, derive_value in github_derived_columns:
    github_all_df[new_column] = github_all_df[source_column].apply(derive_value)

# Every case and slide referenced on GitHub must be among the downloaded slides.
for checked_column in ("case_id", "slide_id_short"):
    assert github_all_df[checked_column].isin(saved_slides_df[checked_column]).all()

display(github_all_df)

In [None]:
# Number of distinct patients / cases / slides per label and overall in the GitHub split.
github_count_levels = (("patients", "patient_id"), ("cases", "case_id"), ("slides", "slide_id_short"))
for level_position, (level_label, id_column) in enumerate(github_count_levels):
    if level_position > 0:
        print()  # blank line between the three groups
    for github_label_name in ("LUAD", "LUSC"):
        rows_with_label = github_all_df[github_all_df["label_name"] == github_label_name]
        print(f"{github_label_name} {level_label}:", rows_with_label[id_column].nunique())
    print(f"Total {level_label}:", github_all_df[id_column].nunique())

In [None]:
# Whatever was downloaded but is absent from the GitHub table counts as excluded by GitHub.
github_excluded_patients = set(saved_slides_df["patient_id"]).difference(set(github_all_df["patient_id"]))
github_excluded_cases = set(saved_slides_df["case_id"]).difference(set(github_all_df["case_id"]))
github_excluded_slides = set(saved_slides_df["slide_id_short"]).difference(set(github_all_df["slide_id_short"]))

github_excluded_by_level = (
    ("patients", github_excluded_patients),
    ("cases", github_excluded_cases),
    ("slides", github_excluded_slides),
)
for level_position, (level_label, excluded_ids) in enumerate(github_excluded_by_level):
    if level_position > 0:
        print()  # blank line between the three groups
    print(f"GitHub Excluded {level_label}:")
    print(len(excluded_ids), excluded_ids)

# The side-by-side table is only possible when the three sets have the same size.
excluded_set_sizes = {len(excluded_ids) for _, excluded_ids in github_excluded_by_level}
if len(excluded_set_sizes) == 1:
    display(
        pd.DataFrame({
        "patient_id": sorted(github_excluded_patients),
        "case_id": sorted(github_excluded_cases),
        "slide_id_short": sorted(github_excluded_slides),
        })
    )

## Google Drive vs GitHub

In [None]:
# Everything GitHub leaves out is also excluded on Google Drive ...
assert github_excluded_slides <= gdrive_excluded_slides
# ... and every slide Google Drive keeps (LUAD or LUSC) is present on GitHub.
gdrive_included_slide_ids = set(gdrive_included_df.slide_id_short)
github_slide_ids = set(github_all_df.slide_id_short)
assert gdrive_included_slide_ids <= github_slide_ids

In [None]:
# Groups of excluded slides whose thumbnails are shown below.
set_name_2_set = {
    "GitHub and Google Drive Excluded": github_excluded_slides,
    "Google Drive Only Excluded": gdrive_excluded_slides - github_excluded_slides,
}

for set_name, slide_ids in set_name_2_set.items():
    print(set_name)
    # Sets have no defined order; sorting keeps the output stable between runs.
    for slide_id in sorted(slide_ids):
        print(slide_id)
        thumbnail_path = os.path.join(THUMBNAILS_FOLDER, f"{slide_id}.jpeg")
        if os.path.exists(thumbnail_path):
            # Close the image file as soon as it has been rendered.
            with Image.open(thumbnail_path) as thumbnail:
                display(thumbnail)
        else:
            # Thumbnails are only generated when RECOMPUTE_THUMBNAILS is enabled,
            # so report a missing file instead of aborting the whole listing.
            print(f"(no thumbnail found at {thumbnail_path})")
        print("="*100)
    print("#"*100)

## GitHub Included vs Google Drive Test

In [None]:
# No slide of the Google Drive test set may be among the slides missing from the GitHub dataset.
gdrive_test_slide_ids = set(gdrive_test_df["slide_id_short"])
assert gdrive_test_slide_ids.isdisjoint(github_excluded_slides)

In [None]:
# Every slide of the Google Drive test set must be listed in the GitHub dataset.
test_slide_missing_on_github = ~gdrive_test_df["slide_id_short"].isin(github_all_df["slide_id_short"])
assert not test_slide_missing_on_github.any()