## Find HCA project data on TCGA cancer tissues

In [1]:
import pandas as pd

# Load the TCGA summary table, then collect the distinct values of its
# "Disease" and "Tissue(Primary site)" columns.
a = pd.read_csv("TCGA.csv")
Disease_set = {disease for disease in a["Disease"].values.tolist()}
Tissues = {site for site in a["Tissue(Primary site)"].values.tolist()}



In [2]:
Tissues  # distinct values of the "Tissue(Primary site)" column read from TCGA.csv

{'Adrenal gland',
 'Bile duct',
 'Bladder',
 'Bone',
 'Bone Marrow',
 'Brain',
 'Breast',
 'Cervix',
 'Colorectal',
 'Esophagus',
 'Eye',
 'Head and Neck',
 'Kidney',
 'Liver',
 'Lung',
 'Lymph Nodes',
 'Nervous System',
 'Ovary',
 'Pancreas',
 'Pleura',
 'Prostate',
 'Skin',
 'Soft Tissue',
 'Stomach',
 'Testis',
 'Thymus',
 'Thyroid',
 'Uterus'}

In [3]:
import os
# NOTE(review): absolute, machine-specific Windows path. os.chdir changes the
# working directory of the whole kernel, so relative paths read after this
# point resolve against it until the next os.chdir call.
path = 'E:\\HCA_save\\project metadata'
os.chdir(path)

# Tab-separated HCA project metadata table.
HCA_meta = pd.read_csv("HCA-project-metadata.tsv", sep = '\t') # The project metadata in HCA

In [4]:
# Print the column names, non-null counts and dtypes of the HCA metadata table.
HCA_meta.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 316581 entries, 0 to 316580
Data columns (total 55 columns):
 #   Column                                                           Non-Null Count   Dtype  
---  ------                                                           --------------   -----  
 0   source_id                                                        316581 non-null  object 
 1   source_name                                                      316581 non-null  object 
 2   bundle_uuid                                                      316581 non-null  object 
 3   bundle_version                                                   316581 non-null  object 
 4   file_document_id                                                 316581 non-null  object 
 5   file_type                                                        316581 non-null  object 
 6   file_name                                                        316581 non-null  object 
 7   file_format                  

In [5]:
# Narrow the HCA metadata to the project title plus the two organ annotations,
# then preview the first rows.
project_organ = HCA_meta[
    [
        'project.project_core.project_title',
        'specimen_from_organism.organ',
        'specimen_from_organism.organ_part',
    ]
]
project_organ.head()

Unnamed: 0,project.project_core.project_title,specimen_from_organism.organ,specimen_from_organism.organ_part
0,Melanoma infiltration of stromal and immune cells,tumor,
1,Reconstructing the human first trimester fetal...,decidua,
2,Tabula Muris: Transcriptomic characterization ...,muscle organ,
3,Tabula Muris: Transcriptomic characterization ...,heart,
4,Tabula Muris: Transcriptomic characterization ...,heart,


In [7]:
# Drop only the rows in which BOTH organ and organ_part are missing.
# The check is restricted to those two columns: a threshold counted over all
# three columns would also depend on the project title being non-null, and
# would discard a row that has an organ annotation but no title.
cleaned = project_organ.dropna(
    axis=0,
    how='all',
    subset=['specimen_from_organism.organ', 'specimen_from_organism.organ_part'],
)
cleaned.values

array([['Melanoma infiltration of stromal and immune cells', 'tumor',
        nan],
       ['Reconstructing the human first trimester fetal-maternal interface using single cell transcriptomics',
        'decidua', nan],
       ['Tabula Muris: Transcriptomic characterization of 20 organs and tissues from Mus musculus at single cell resolution',
        'muscle organ', nan],
       ...,
       ['Precursors of human CD4+ cytotoxic T lymphocytes identified by single-cell transcriptome analysis',
        'blood', 'peripheral blood mononuclear cell'],
       ['A single-cell molecular map of mouse gastrulation and early organogenesis',
        'embryo', nan],
       ['Single-Cell RNAseq analysis of diffuse neoplastic infiltrating cells at the migrating front of human glioblastoma',
        'brain', 'cerebral cortex']], dtype=object)

In [8]:
# Collect every distinct (organ, organ_part) combination present in the cleaned
# table; each pair is then matched, where appropriate, to a TCGA primary site.
organ_and_organ_part_pairs = {
    tuple(pair)
    for pair in cleaned[['specimen_from_organism.organ', 'specimen_from_organism.organ_part']].values
}
organ_and_organ_part_pairs

{('Immune system', 'cerebrospinal fluid'),
 ('abdomen', 'dermis'),
 ('adipose tissue || aorta || bladder organ || bone tissue || brain || diaphragm || heart || kidney || large intestine || liver || lung || mammary gland || muscle organ || pancreas || skin of body || spleen || thymus || tongue || trachea',
  nan),
 ('adipose tissue',
  'axillary fat pad || inguinal fat pad || white adipose tissue'),
 ('adipose tissue', 'subcutaneous abdominal adipose tissue'),
 ('adipose tissue', nan),
 ('aorta', nan),
 ('bladder organ', nan),
 ('blastocyst', 'inner cell mass'),
 ('blood vessel', 'ascending aorta'),
 ('blood vessel', nan),
 ('blood || bone marrow || spleen', nan),
 ('blood || cell culture || skin', nan),
 ('blood || decidua || placenta', nan),
 ('blood || hematopoietic system || lung || mediastinal lymph node', nan),
 ('blood || kidney', nan),
 ('blood || liver || spleen', nan),
 ('blood || liver', nan),
 ('blood || mouth', nan),
 ('blood', 'blood'),
 ('blood', 'peripheral blood mononuc

In [9]:
from collections import defaultdict
import numpy as np

# NOTE(review): hard-coded path inside a personal user profile, so this cell
# only runs on that machine. os.chdir also changes the kernel-wide working
# directory, which later relative reads resolve against.
os.chdir("C:\\Users\\Robert Yang F\\Desktop\\granatumx")
# Table pairing TCGA primary sites with HCA organ / organ_part pairs
# (presumably curated by hand from the pairs listed earlier; confirm provenance).
df = pd.read_csv("TCGA-HCA.csv", sep = ',')

# Build the mapping: TCGA primary site -> flat list of raw HCA tokens.
# Column 0 of each row is the primary site. Column 1, when it is a string, is
# split on ',' into tokens that TCGA_to_HCA later reads as alternating
# organ / organ_part entries. A non-string value (e.g. NaN for an empty cell)
# means the site has no matched HCA organ.
pairs = df.values.tolist()
primary_site_to_HCA_organ = defaultdict(list)
for row in pairs:
    site, organ_field = row[0], row[1]
    if isinstance(organ_field, str):
        primary_site_to_HCA_organ[site] += organ_field.split(',')
    else:
        # Register the site without discarding tokens that an earlier row may
        # already have contributed (plain assignment of [] would wipe them).
        primary_site_to_HCA_organ.setdefault(site, [])
        
def clean_str(s):
    """Trim surrounding whitespace from ``s``, then delete every ``(``, ``)`` and ``'``."""
    trimmed = s.strip()
    return ''.join(ch for ch in trimmed if ch not in "()'")

def TCGA_to_HCA(prim_site):
    """Return the HCA projects whose specimens match a TCGA primary site.

    The tokens stored in ``primary_site_to_HCA_organ[prim_site]`` are read as
    alternating organ / organ_part entries. Each (organ, organ_part) pair is
    matched exactly against ``project_organ``; an organ_part token of "nan"
    matches rows whose organ_part is missing.

    Parameters
    ----------
    prim_site : str
        TCGA primary site, used as a key of ``primary_site_to_HCA_organ``.

    Returns
    -------
    set of tuple
        ``(project title, organ)`` pairs. Empty when the site is unknown or
        has no mapped organs.

    Raises
    ------
    IndexError
        If the token list has an odd length, i.e. an organ without its
        organ_part.
    """
    # .get() instead of indexing: indexing a defaultdict with an unknown site
    # would silently add that site to the shared mapping.
    tokens = [clean_str(token) for token in primary_site_to_HCA_organ.get(prim_site, [])]
    if len(tokens) % 2:
        raise IndexError(
            "organ/organ_part tokens for %r are not paired: %r" % (prim_site, tokens)
        )

    organ_col = project_organ['specimen_from_organism.organ']
    part_col = project_organ['specimen_from_organism.organ_part']
    result_cols = ['project.project_core.project_title', 'specimen_from_organism.organ']

    matches = set()
    # Even positions hold organs, odd positions the corresponding organ_part.
    for organ, organ_part in zip(tokens[0::2], tokens[1::2]):
        if organ_part == "nan":
            part_mask = part_col.isnull()
        else:
            part_mask = part_col == organ_part
        hits = project_organ.loc[(organ_col == organ) & part_mask, result_cols]
        matches.update(tuple(row) for row in hits.values)
    return matches
            

In [10]:
# Example lookup: (project title, organ) pairs mapped to the TCGA "Brain" primary site.
TCGA_to_HCA('Brain')

{('1.3 Million Brain Cells from E18 Mice', 'brain'),
 ('A human single cell atlas of the substantia nigra reveals novel cell specific pathways associated with the genetic risk of Parkinson’s disease and neuropsychiatric disorders.',
  'brain'),
 ('A survey of human brain transcriptome diversity at the single cell level',
  'brain'),
 ('Altered human oligodendrocyte heterogeneity in multiple sclerosis',
  'brain'),
 ('Characterization of the transcriptional landscape of human developing hippocampus',
  'brain'),
 ('Dissecting the clonal nature of allelic expression in somatic cells by single-cell RNA-seq',
  'brain'),
 ('Dissecting the clonal nature of allelic expression in somatic cells by single-cell RNA-seq',
  'brain || liver || skin'),
 ('High throughput error corrected Nanopore single cell transcriptome sequencing.',
  'brain'),
 ('Human cerebral organoids recapitulate gene expression programs of fetal neocortex development.',
  'brain'),
 ('Human cerebral organoids recapitulate g

In [11]:
# NOTE(review): relative path, resolved against the directory selected by the most recent os.chdir.
df2 = pd.read_csv("HCA_project_id_name.csv")
df2.set_index('Project_name').T.to_dict('records') 
# Convert the table into {project title: value} dictionaries. to_dict('records')
# returns a list with one dict per remaining column, not a single dict; the
# displayed values look like project IDs (UUIDs). The result is only displayed,
# not stored in a variable.

[{'1.3 Million Brain Cells from E18 Mice': '74b6d569-3b11-42ef-b6b1-a0454522b4a0',
  'A Cellular Atlas of Pitx2-Dependent Cardiac Development.': '7027adc6-c9c9-46f3-84ee-9badc3a4f53b',
  'A Single-Cell Transcriptomic Map of the Human and Mouse Pancreas Reveals Inter- and Intra-cell Population Structure': 'f86f1ab4-1fbb-4510-ae35-3ffd752d4dfc',
  'A cell atlas of human thymic development defines T cell repertoire formation': 'c1810dbc-16d2-45c3-b45e-3e675f88d87b',
  "A human single cell atlas of the substantia nigra reveals novel cell specific pathways associated with the genetic risk of Parkinson's disease and neuropsychiatric disorders.": '996120f9-e84f-409f-a01e-732ab58ca8b9',
  'A revised airway epithelial hierarchy includes CFTR-expressing ionocytes': '6072616c-8794-4b20-8f52-fb15992ea5a4',
  'A single cell atlas of the human proximal epididymis reveals cell-type specific functions and distinct roles for CFTR.': '842605c7-375a-47c5-9e2c-a71c2c00fcad',
  'A single-cell molecular map

## Metadata in TCGA

In [14]:
import os
# NOTE(review): absolute, machine-specific path; os.chdir changes the
# kernel-wide working directory again.
path = 'E:\\HCA_save'
os.chdir(path)

# Tab-separated TCGA file. The file name suggests UCEC HumanMethylation27
# level-3 data for a single sample on hg38 (confirm against the data source).
DNA = pd.read_csv('jhu-usc.edu_UCEC.HumanMethylation27.2.lvl-3.TCGA-A5-A0G2-01A-11D-A039-05.gdc_hg38.txt', sep = '\t')
DNA

Unnamed: 0,Composite Element REF,Beta_value,Chromosome,Start,End,Gene_Symbol,Gene_Type,Transcript_ID,Position_to_TSS,CGI_Coordinate,Feature_Type
0,cg00000292,0.944106,chr16,28878779,28878780,ATP2A1;ATP2A1;ATP2A1;ATP2A1;ATP2A1,protein_coding;protein_coding;protein_coding;p...,ENST00000357084.6;ENST00000395503.7;ENST000005...,373;290;-1275;-465;-83,CGI:chr16:28879633-28880547,N_Shore
1,cg00002426,0.128014,chr3,57757816,57757817,SLMAP;SLMAP;SLMAP;SLMAP;SLMAP;SLMAP,protein_coding;protein_coding;protein_coding;p...,ENST00000295951.6;ENST00000295952.6;ENST000003...,1585;368;261;257;257;514,CGI:chr3:57756198-57757263,S_Shore
2,cg00003994,0.285217,chr7,15686237,15686238,MEOX2,protein_coding,ENST00000262041.5,576,CGI:chr7:16399497-16399700,.
3,cg00005847,0.631192,chr2,176164345,176164346,AC009336.19;HOXD3;HOXD3;HOXD3;RP11-387A1.5,protein_coding;protein_coding;protein_coding;p...,ENST00000468418.4;ENST00000249440.4;ENST000004...,13259;267;3453;27387;1372,CGI:chr2:176164685-176165509,N_Shore
4,cg00006414,,chr7,149125745,149125746,RN7SL521P;ZNF398;ZNF425;ZNF425,misc_RNA;protein_coding;protein_coding;protein...,ENST00000488398.3;ENST00000426851.5;ENST000003...,242;-672;602;562,CGI:chr7:149126122-149127136,N_Shore
...,...,...,...,...,...,...,...,...,...,...,...
27573,cg27657283,0.036363,chr1,42767852,42767853,C1orf50;C1orf50;C1orf50;P3H1;P3H1;P3H1;P3H1;P3...,protein_coding;protein_coding;protein_coding;p...,ENST00000372525.5;ENST00000464081.1;ENST000006...,582;559;545;-839;-828;-849;-767;-862;-834;-829,CGI:chr1:42766543-42767688,S_Shore
27574,cg27661264,0.446497,chr20,58852683,58852684,GNAS;GNAS;GNAS;GNAS;GNAS;GNAS;GNAS;GNAS;GNAS;G...,protein_coding;protein_coding;protein_coding;p...,ENST00000306120.3;ENST00000313949.10;ENST00000...,-774;12964;12934;12934;-581;-32;-581;12178;123...,CGI:chr20:58852636-58852940,Island
27575,cg27662379,0.013272,chr3,128650910,128650911,RPN1;RPN1;RPN1;RPN1;RPN1;RPN1,protein_coding;protein_coding;protein_coding;p...,ENST00000296255.6;ENST00000476931.1;ENST000004...,-60;-89;-88;620;30166;-93,CGI:chr3:128650086-128650962,Island
27576,cg27662877,0.020575,chr18,74292360,74292361,CYB5A;CYB5A;CYB5A;CYB5A;CYB5A,protein_coding;protein_coding;protein_coding;p...,ENST00000299438.12;ENST00000340533.7;ENST00000...,-1318;-343;-386;-424;-402,CGI:chr18:74290906-74292535,Island
