In [210]:
import numpy as np
import openslide 
import os
from os.path import join
import pandas as pd
import xml.etree.ElementTree as ET
import pdb

In [211]:
project_root = '/home/fedshyvana/Desktop/projects/lab_clam'
heatmap_dir = join(project_root, 'heatmaps')
pre_cleanup_dir = join(heatmap_dir, 'pre_cleanup_datasheet')
post_cleanup_dir = join(heatmap_dir, 'post_cleanup_datasheet')
production_dir = join(heatmap_dir, 'production_cohorts')

for directory in [project_root, heatmap_dir, pre_cleanup_dir, post_cleanup_dir, production_dir]:
    assert os.path.isdir(directory)


In [212]:
slide_dir = '/data/oncopanel_met_primary_heatmap_slides'
#slide_dir = '/data/tcga_heatmap_slides'
assert os.path.isdir(slide_dir)

#dataset_path = join(project_root, 'dataset_csv','study_v2_all_clean.csv')
dataset_path = join(project_root, 'dataset_csv','cup_final_heatmap.csv')
summary_path = join(project_root, 'summary_files', 'study_v2_mtl_sex_all.csv')

#exp_code = 'init_cohort'
#exp_code = 'tcga_primary'
#exp_code = 'oncopanel_met'
#exp_code = 'primary_met'
exp_code = 'cup_production'
h_dir = join(production_dir, exp_code)
csv_path = join(pre_cleanup_dir, exp_code+'.csv')

for path in [dataset_path, summary_path, csv_path]:
    assert os.path.isfile(path)

assert os.path.isdir(slide_dir)


In [217]:
def fill_missing(h_df, d_df, columns=['site', 'sex', 'oncotree_site', 'diagnosis', 'label']):
    for col in columns:
        if col in h_df.columns:
            mask = h_df[col].isna()
            h_df.loc[mask, col] = d_df.loc[mask, col].copy()
        else:
            h_df.insert(len(h_df.columns), col, d_df[col].values)
    return h_df

def get_site_map_dict(all_sites):
    tcga_map_dict = {'HNSC': 'Head Neck', 'BLCA': 'Bladder', 'ACC': 'Adrenal', 'COAD': 'Colorectal', 
                'TGCT': 'Testis', 'ESCA': 'Esophagogastric', 'STAD': 'Esophagogastric', 'GBM': 'Brain',
                'LGG': 'Brain', 'LUAD': 'Lung', 'LUSC': 'Lung', 'KIRP': 'Kidney', 'KIRC': 'Kidney', 'KICH':'Kidney',
                'OV':'Female Reprod. Tract', 'CESC': 'Female Reprod. Tract', 'THCA': 'Thyroid', 'PRAD': 'Prostate', 
                'READ': 'Colorectal', 'SKCM': 'Skin', 'BRCA': 'Breast', 'LIHC':'Liver', 'PAAD':'Pancreas',
                'UCEC': 'Female Reprod. Tract'}
    site_map_dict = {}
    for site in all_sites:
        if 'TCGA' not in site:
            if site in ['NO_ONCOTREE_NODE_FOUND', 'UNKNOWN', 'OTHER', '']: 
                site_map_dict.update({site: 'Unspecified'})
            elif site.lower() in ['vulva', 'cervix', 'uterus', 'ovary']:
                site_map_dict.update({site: 'Female Reprod. Tract'})
            elif site.lower() in ['stomach']:
                site_map_dict.update({site: 'Esophagogastric'})
            elif site.lower() in ['bowel']:
                site_map_dict.update({site: 'Colorectal'})
            else:
                site_map_dict.update({site:' '.join(site.split('_')).title()})
        else:
            site_map_dict.update({site:tcga_map_dict[site.split('-')[-1]]})
    return site_map_dict

def clean_oncotree_site(h_df, site_map):
    h_df['oncotree_site'] = h_df['oncotree_site'].map(site_map)
    return h_df

def get_meta(slide_path):
    slide = openslide.OpenSlide(slide_path)
    try:
        x = slide.properties['aperio.OriginalWidth']
    except KeyError:
        x = slide.properties['openslide.level[0].width']
        #x = slide.properties['aperio.Originalwidth']
    try:
        y = slide.properties['aperio.OriginalHeight']
    except KeyError:
        y = slide.properties['openslide.level[0].height']
        #y = slide.properties['aperio.Originalheight']
    mpp = float(slide.properties['aperio.MPP'])
    if mpp < 0.3:
        scan_mag = '40X'
    else:
        scan_mag = '20X'
    #scan_mag = slide.properties['aperio.AppMag']+'X'
        
    return x, y, mpp, scan_mag

def calc_area(mpp, bsize):
    return bsize * 256**2 * mpp**2 / (1e3 **2)

import xml.etree.ElementTree as ET
from lxml import etree as ET

def pd2xml(df, save_path):
    root = ET.Element('METADATA');

    for row in df.itertuples(index=False):
        patient = ET.SubElement(root, 'PATIENT')
        for col in df.columns:
            item = ET.SubElement(patient, col)
            value = getattr(row, col)
            if col =='TISSUECONTENT':
                value = '{:.1f}'.format(value)
            elif col in ['X', 'Y', 'WIDTH', 'HEIGHT']:
                value = str(int(value))
            elif isinstance(value, float):
                value = '{:.4f}'.format(value)
            else:
                value = str(value)
            item.text = value

    xml_object = ET.tostring(root,
                            pretty_print=True,
                            xml_declaration=True,
                            encoding='UTF-8')

    with open(save_path, "wb") as writter:
        writter.write(xml_object)

def clean_single_class_df(c_df):
    c_df.insert(0, 'a_id', ['PATIENT_{}'.format(i+1) for i in range(len(c_df))])
    c_df = c_df[['a_id', 'oncotree_site', 'site', 'sex', 'diagnosis', 'x', 'y', 'img_w', 'img_h',
             'tissue_area', 'scan_mag', 'bag_size',
             'label',  'Pred_0',  'Pred_1', 'Pred_2', 'p_0', 'p_1',  'p_2']]
    
    c_df = c_df.rename(columns={'a_id':'SLIDEID', 'oncotree_site': 'TISSUESITE', 'site': 'SAMPLETYPE', 
                       'sex': 'GENDER', 'diagnosis': 'DIAGNOSIS', 'x': 'X', 'y': 'Y', 'img_w': 'WIDTH', 
                       'img_h': 'HEIGHT', 'tissue_area': 'TISSUECONTENT', 'scan_mag':'SCANRESOLUTION',
                       'bag_size': 'NUMOFPATCHES', 'label': 'ORIGIN', 'Pred_0': 'MODELPREDICTION1', 'Pred_1':'MODELPREDICTION2',
                       'Pred_2': 'MODELPREDICTION3', 'p_0': 'MODELCONFIDENCE1', 'p_1': 'MODELCONFIDENCE2',
                       'p_2': 'MODELCONFIDENCE3'})
    return c_df

from PIL import Image
import PIL
PIL.Image.MAX_IMAGE_PIXELS = 933120000

def clean_heatmaps(df, c_h_dir):
    files =  os.listdir(c_h_dir)
    for file in files:
        slide_id = file.split('_')[0].replace('.jpg', '')
        if slide_id not in df.index:
            print(file, 'slide_id not found')
            slide_id = '_'.join(file.split('_')[:2]).replace('.jpg', '')
            patient_id = slide_id

        else:
            patient_id = df.loc[slide_id, 'SLIDEID']
        
        if 'orig' in file:
            new_name = '{}_HE.jpg'.format(patient_id)
        else:
            new_name = '{}.jpg'.format(patient_id)
        
        if not os.path.isfile(join(c_h_dir, new_name)):
            os.system('mv {} {}'.format(join(c_h_dir, file), join(c_h_dir, new_name)))
        #print(c_h_dir)
        #print(file)
        #print(new_name)
        img_path = join(c_h_dir, new_name)
        im = Image.open(img_path)
        w, h = im.size
        df = df.reset_index().set_index('SLIDEID')
        #print(df.loc[patient_id, 'WIDTH'])
        df.loc[patient_id, 'WIDTH'] = w
        df.loc[patient_id, 'HEIGHT'] = h
        #print(df.loc[patient_id, 'WIDTH'])
        df = df.reset_index().set_index('slide_id')
    return df

def reverse_clean_heatmaps(df, c_h_dir):
    files =  os.listdir(c_h_dir)
    df = df.reset_index().set_index('SLIDEID')
    for file in files:
        patient_id = '_'.join(file.split('_')[:2]).replace('.jpg', '')
        
        if patient_id in df.index:
            slide_id = df.loc[patient_id, 'slide_id']
        
        else:
            continue
       
        if 'HE' in file:
            new_name = '{}_orig.jpg'.format(slide_id)
        
        else:
            new_name = '{}.jpg'.format(slide_id)
        
        os.system('mv {} {}'.format(join(c_h_dir, file), join(c_h_dir, new_name)))
    df = df.reset_index().set_index('slide_id')
    return df

In [218]:
all_sites =  pd.read_csv(dataset_path).oncotree_site.unique()
site_map = get_site_map_dict(all_sites)
# Heatmap results
h_df = pd.read_csv(csv_path).set_index('slide_id')
# Dataset
d_df = pd.read_csv(dataset_path).set_index('slide_id').loc[h_df.index]
# Summary of dataset
s_df = pd.read_csv(summary_path).set_index('slide_id').loc[h_df.index]
if 'Y_hat' in h_df.columns:
    h_df.drop(columns=['Y_hat'], inplace=True)

AttributeError: 'float' object has no attribute 'lower'

In [207]:
h_df = fill_missing(h_df, d_df)
h_df = fill_missing(h_df, s_df, ['bag_size'])
h_df = clean_oncotree_site(h_df, site_map)
h_df['site'] = h_df['site'].map({'Metastatic Recurrence': 'Metastatic', 'TCGA Primary Tumor': 'Primary'})

AttributeError: 'DataFrame' object has no attribute 'oncotree_site'

In [208]:
h_df

Unnamed: 0_level_0,Pred_0,p_0,Pred_1,p_1,Pred_2,p_2,site,sex,oncotree_site,diagnosis,label,bag_size,x,y,img_w,img_h,scan_mag,tissue_area
slide_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
TCGA-OR-A5K5-01Z-00-DX5.ECB798BF-5EEC-44A4-830F-C0884D40FD22,Breast,0.936008,Adrenal,4.003184e-02,Lung,1.761593e-02,Primary,F,Adrenal,TCGA-ACC,Adrenal,18518,103632,80898,25908.0,20224.0,40X,77.496929
TCGA-OR-A5LK-01Z-00-DX4.A7EF6459-643B-4735-AD52-97DD44335DF3,Adrenal,0.975163,Skin,2.382900e-02,Renal,9.178295e-04,Primary,M,Adrenal,TCGA-ACC,Adrenal,17831,107696,82861,26924.0,20715.0,40X,74.503795
TCGA-H4-A2HQ-01Z-00-DX1.DFBDDFDD-0C65-4130-96F5-4E3838190D13,Bladder,0.756439,Ovarian,9.073485e-02,Endometrial,7.836767e-02,Primary,F,Bladder,TCGA-BLCA,Bladder,15127,121000,89519,30250.0,22379.0,40X,63.606737
TCGA-A7-A426-01Z-00-DX1.3058E873-9442-4872-80DA-E8A5B35054D2,Breast,0.999999,Bladder,7.065032e-07,Pancreatic,2.438403e-07,Primary,F,Breast,TCGA-BRCA,Breast,6045,135904,39004,33976.0,9751.0,40X,24.130513
TCGA-BH-A1EX-01Z-00-DX1.16B6A817-6729-446E-9FCF-A4A333C5295D,Breast,0.999999,Head Neck,1.984951e-07,Lung,1.579766e-07,Primary,F,Breast,TCGA-BRCA,Breast,14471,142000,83068,35500.0,20767.0,40X,59.510546
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
BL-19-F22360,Lung,0.619714,Ovarian,1.334617e-01,Breast,1.321452e-01,Metastatic,F,Brain,Poorly Differentiated Non-Small Cell Lung Cancer,Lung,21434,73152,36061,36576.0,18030.0,20X,345.717697
BL-15-R37941,Liver,0.391631,Breast,2.798859e-01,Lung,1.300319e-01,,F,Liver,Hepatocellular Carcinoma,Liver,20956,60960,45214,30480.0,22607.0,20X,338.007841
BL-15-T42840,Liver,0.364725,Prostate,3.444593e-01,Bladder,8.213067e-02,,M,Liver,Hepatocellular Carcinoma,Liver,493,14224,25223,7112.0,12611.0,20X,7.951797
BL-14-G07425,Glioma,0.429045,Cervix,3.346865e-01,Endometrial,8.879243e-02,,F,Female Reprod. Tract,Cervical Squamous Cell Carcinoma,Cervix,986,38608,15441,19304.0,7720.0,20X,15.903595


In [202]:
for slide_id in h_df.index:
    slide_path = join(slide_dir, slide_id+'.svs')
    x, y, mpp, scan_mag = get_meta(slide_path)
    if scan_mag == '20X':
        scale = 2
    elif scan_mag == '40X':
        scale = 4
    else:
        raise NotImplementedError
    
    image_width = int(x)//scale
    image_height = int(y)//scale
        
    h_df.loc[slide_id, 'x'] = x
    h_df.loc[slide_id, 'y'] = y
    h_df.loc[slide_id, 'img_w'] = image_width
    h_df.loc[slide_id, 'img_h'] = image_height
    h_df.loc[slide_id, 'scan_mag'] = scan_mag
    bsize = h_df.loc[slide_id, 'bag_size']
    a = calc_area(mpp, bsize)
    try:
        h_df.loc[slide_id, 'tissue_area'] = a
    except:
        pdb.set_trace()

In [203]:
for class_label, counts in h_df.label.value_counts().items():
    c_df = h_df.loc[h_df.label == class_label].copy()
    #pdb.set_trace()
    c_df = clean_single_class_df(c_df)
    class_label = '_'.join(class_label.split())
    c_h_dir = join(h_dir, class_label)
    c_df = clean_heatmaps(c_df, c_h_dir)
    #c_df = reverse_clean_heatmaps(c_df, c_h_dir)
    c_df.to_csv(join(post_cleanup_dir, exp_code+'_{}.csv'.format(class_label)))
    xml_path = join(post_cleanup_dir, exp_code+'_{}.xml'.format(class_label))
    pd2xml(c_df, xml_path)
    

PATIENT_2_HE.jpg slide_id not found
PATIENT_1_HE.jpg slide_id not found
PATIENT_6.jpg slide_id not found
PATIENT_3.jpg slide_id not found
PATIENT_9_HE.jpg slide_id not found
PATIENT_2.jpg slide_id not found
PATIENT_7.jpg slide_id not found
PATIENT_1.jpg slide_id not found
PATIENT_5.jpg slide_id not found
PATIENT_3_HE.jpg slide_id not found
PATIENT_4_HE.jpg slide_id not found
PATIENT_7_HE.jpg slide_id not found
PATIENT_8.jpg slide_id not found
PATIENT_6_HE.jpg slide_id not found
PATIENT_8_HE.jpg slide_id not found
PATIENT_4.jpg slide_id not found
PATIENT_9.jpg slide_id not found
PATIENT_5_HE.jpg slide_id not found
PATIENT_2_HE.jpg slide_id not found
PATIENT_1_HE.jpg slide_id not found
PATIENT_6.jpg slide_id not found
PATIENT_3.jpg slide_id not found
PATIENT_2.jpg slide_id not found
PATIENT_1.jpg slide_id not found
PATIENT_5.jpg slide_id not found
PATIENT_3_HE.jpg slide_id not found
PATIENT_4_HE.jpg slide_id not found
PATIENT_6_HE.jpg slide_id not found
PATIENT_4.jpg slide_id not found
P

In [204]:
c_df

Unnamed: 0_level_0,SLIDEID,TISSUESITE,SAMPLETYPE,GENDER,DIAGNOSIS,X,Y,WIDTH,HEIGHT,TISSUECONTENT,SCANRESOLUTION,NUMOFPATCHES,ORIGIN,MODELPREDICTION1,MODELPREDICTION2,MODELPREDICTION3,MODELCONFIDENCE1,MODELCONFIDENCE2,MODELCONFIDENCE3
slide_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
TCGA-H4-A2HQ-01Z-00-DX1.DFBDDFDD-0C65-4130-96F5-4E3838190D13,PATIENT_1,Bladder,Primary,F,TCGA-BLCA,121000,89519,29021.0,22354.0,63.606737,40X,15127,Bladder,Bladder,Ovarian,Endometrial,0.756439,0.090735,0.078368
