In [231]:
import os
import pandas as pd

#### Load metadata file for all cases for the Vanderbilt dataset

In [232]:
# Participant-level (case) metadata: read the TSV and retain only the
# demographic / status fields that the rest of the notebook relies on.
case_columns = [
    "HTAN Participant ID",
    "Atlas Name",
    "Age at Diagnosis (years)",
    "Year Of Birth",
    "Race",
    "Gender",
    "Ethnicity",
    "Vital Status",
]
caseDF = pd.read_csv("metadata/cases.tsv", sep="\t").loc[:, case_columns]
caseDF.head()

Unnamed: 0,HTAN Participant ID,Atlas Name,Age at Diagnosis (years),Year Of Birth,Race,Gender,Ethnicity,Vital Status
0,HTA11_347,HTAN Vanderbilt,0,1959.0,Other,female,not hispanic or latino,Alive
1,HTA11_78,HTAN Vanderbilt,0,1952.0,white,male,not hispanic or latino,Alive
2,HTA11_696,HTAN Vanderbilt,0,1969.0,white,male,not hispanic or latino,Alive
3,HTA11_83,HTAN Vanderbilt,0,1955.0,black or african american,male,not hispanic or latino,Alive
4,HTA11_104,HTAN Vanderbilt,0,1965.0,white,male,not hispanic or latino,Alive


#### Load metadata file for all biospecimens for the Vanderbilt dataset

In [233]:
# Biospecimen-level metadata: read the TSV and retain the parent (participant)
# link, the biospecimen ID, and the collection / preservation / tissue fields.
biospecimen_columns = [
    "HTAN Parent ID",
    "HTAN Biospecimen ID",
    "Collection Days from Index",
    "Storage Method",
    "Processing Days from Index",
    "Preservation Method",
    "Tumor Tissue Type",
]
bioDF = pd.read_csv("metadata/biospecimens.tsv", sep="\t").loc[:, biospecimen_columns]
bioDF.head()

Unnamed: 0,HTAN Parent ID,HTAN Biospecimen ID,Collection Days from Index,Storage Method,Processing Days from Index,Preservation Method,Tumor Tissue Type
0,HTA11_347,HTA11_347_2000001011,21802,Fresh,21802,Fresh,Premalignant
1,HTA11_78,HTA11_78_2000001011,24370,Fresh,24370,Fresh,Premalignant
2,HTA11_696,HTA11_696_2000002011,18342,Fresh,18342,Fresh,Normal
3,HTA11_696,HTA11_696_2000002021,18342,Fresh,18342,Fresh,Normal
4,HTA11_83,HTA11_83_2000001011,23394,Fresh,23394,Fresh,Atypia - hyperplasia


#### Mapping each biospecimen to a patient

In [234]:
# Pair every participant with its biospecimens. The outer join keeps rows from
# either table even when the other side has no matching ID.
caseBioDF = caseDF.merge(
    bioDF,
    how="outer",
    left_on="HTAN Participant ID",
    right_on="HTAN Parent ID",
)
caseBioDF.head()

Unnamed: 0,HTAN Participant ID,Atlas Name,Age at Diagnosis (years),Year Of Birth,Race,Gender,Ethnicity,Vital Status,HTAN Parent ID,HTAN Biospecimen ID,Collection Days from Index,Storage Method,Processing Days from Index,Preservation Method,Tumor Tissue Type
0,HTA11_347,HTAN Vanderbilt,0,1959.0,Other,female,not hispanic or latino,Alive,HTA11_347,HTA11_347_2000001011,21802,Fresh,21802,Fresh,Premalignant
1,HTA11_347,HTAN Vanderbilt,0,1959.0,Other,female,not hispanic or latino,Alive,HTA11_347,HTA11_347_2000009901,21802,Frozen at -80C,21802,Frozen,Not Otherwise Specified
2,HTA11_78,HTAN Vanderbilt,0,1952.0,white,male,not hispanic or latino,Alive,HTA11_78,HTA11_78_2000001011,24370,Fresh,24370,Fresh,Premalignant
3,HTA11_78,HTAN Vanderbilt,0,1952.0,white,male,not hispanic or latino,Alive,HTA11_78,HTA11_78_2000002011,24370,Fresh,24370,Fresh,Normal
4,HTA11_78,HTAN Vanderbilt,0,1952.0,white,male,not hispanic or latino,Alive,HTA11_78,HTA11_78_2000009901,24370,Frozen at -80C,24370,Frozen,Not Otherwise Specified


#### Load metadata file for the Vanderbilt dataset obtained from bigQuery (provided by Ino). Extract the HTAN_Data_File_ID to match with the biospecimen ID

In [235]:
# Load the BigQuery export and keep only the rows describing Level 3
# scRNA-seq and 10x Visium files.
queryDF = pd.read_csv("metadata/bigquery_vanderbilt_everything.csv")
queryDF = queryDF[
    queryDF["Component"].isin(
        ["ScRNA-seqLevel3", "10xVisiumSpatialTranscriptomics-RNA-seqLevel3"]
    )
]
# Lookup table: data file ID -> assayed biospecimen ID.
# Exact duplicate (file, biospecimen) pairs add no information but would
# repeat rows in the inner merges that use this table, so drop them here.
# A file that genuinely maps to several biospecimens still keeps one row per
# distinct biospecimen.
bioQuery = queryDF[["HTAN_Data_File_ID", "HTAN_Assayed_Biospecimen_ID"]].drop_duplicates()

#### Load the synapse metadata file for scRNA Level3 data

In [251]:
# Synapse manifest for the scRNA-seq Level 3 files.
scDF = pd.read_csv("metadata/syn36266953_ScRNALevel3.csv")

# Participant ID = first two "_"-separated tokens of the data file ID
# (e.g. "HTA11_347_200000101113111" -> "HTA11_347").
# The vectorised .str accessor propagates a missing file ID as NaN instead of
# raising AttributeError the way a per-row `X.split(...)` lambda does.
scDF["HTAN Participant ID"] = (
    scDF["HTAN Data File ID"].str.split("_").str[:2].str.join("_")
)

# Prefix every manifest column with "sc_" so they cannot collide with the
# case / biospecimen columns merged in later.
scDF.columns = [f"sc_{col}" for col in scDF.columns]

# Attach the assayed biospecimen ID to each file; the inner join keeps only
# files that are present in the BigQuery lookup table.
scBioDF = pd.merge(
    scDF,
    bioQuery,
    left_on="sc_HTAN Data File ID",
    right_on="HTAN_Data_File_ID",
    how="inner",
)
scBioDF.head()

Unnamed: 0,sc_Component,sc_Filename,sc_File Format,sc_HTAN Parent Data File ID,sc_HTAN Data File ID,sc_Data Category,sc_Matrix Type,sc_Linked Matrices,sc_Cell Median Number Reads,sc_Cell Median Number Genes,...,sc_scRNAseq Workflow Parameters Description,sc_Workflow Link,sc_Workflow Version,sc_entityId,sc_Id,sc_Workflow End Datetime,sc_Workflow Start Datetime,sc_HTAN Participant ID,HTAN_Data_File_ID,HTAN_Assayed_Biospecimen_ID
0,ScRNA-seqLevel3,single_cell_RNAseq_level_3/3620as1.csv,csv,HTA11_347_200000101112111,HTA11_347_200000101113111,Gene Expression Quantification,Raw Counts,,9176.0,2849.0,...,"dropkick and QCPipe, default parameters",https://github.com/KenLauLab/STAR_Protocol,"1.2.2, 1.0.0",syn23521628,794ac446-4434-40c2-ba37-3c7d1314093e,0/0/0000,0/0/0000,HTA11_347,HTA11_347_200000101113111,HTA11_347_2000001011
1,ScRNA-seqLevel3,single_cell_RNAseq_level_3/3620as2.csv,csv,HTA11_78_200000101112111,HTA11_78_200000101113111,Gene Expression Quantification,Raw Counts,,5054.0,1896.0,...,"dropkick and QCPipe, default parameters",https://github.com/KenLauLab/STAR_Protocol,"1.2.2, 1.0.0",syn23521635,e2064be8-393a-4981-972c-2c73889791a8,0/0/0000,0/0/0000,HTA11_78,HTA11_78_200000101113111,HTA11_78_2000001011
2,ScRNA-seqLevel3,single_cell_RNAseq_level_3/3803as1.csv,csv,HTA11_696_200000201112111,HTA11_696_200000201113111,Gene Expression Quantification,Raw Counts,,6112.0,2293.0,...,"dropkick and QCPipe, default parameters",https://github.com/KenLauLab/STAR_Protocol,"1.2.2, 1.0.0",syn23521725,2b942a2a-945b-4fc7-b7ae-3529068b379c,0/0/0000,0/0/0000,HTA11_696,HTA11_696_200000201113111,HTA11_696_2000002011
3,ScRNA-seqLevel3,single_cell_RNAseq_level_3/3803as2.csv,csv,HTA11_696_200000202112111,HTA11_696_200000202113111,Gene Expression Quantification,Raw Counts,,6444.0,2324.0,...,"dropkick and QCPipe, default parameters",https://github.com/KenLauLab/STAR_Protocol,"1.2.2, 1.0.0",syn23521726,912ea32b-8e17-484d-9775-75a9565212ce,0/0/0000,0/0/0000,HTA11_696,HTA11_696_200000202113111,HTA11_696_2000002021
4,ScRNA-seqLevel3,single_cell_RNAseq_level_3/3882as1.csv,csv,HTA11_83_200000101112111,HTA11_83_200000101113111,Gene Expression Quantification,Raw Counts,,10910.0,2689.0,...,"dropkick and QCPipe, default parameters",https://github.com/KenLauLab/STAR_Protocol,"1.2.2, 1.0.0",syn23521727,034225c5-59e0-4adb-af24-fcfad8aedf81,0/0/0000,0/0/0000,HTA11_83,HTA11_83_200000101113111,HTA11_83_2000001011
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
305,ScRNA-seqLevel3,single_cell_RNAseq_level_3/7306-YX-7_immune.csv,csv,HTA11_13701_200000101112111,HTA11_13701_200000101113112,Gene Expression Quantification,Raw Counts,,1753.0,941.0,...,"dropkick and QCPipe, default parameters",https://github.com/KenLauLab/STAR_Protocol,"1.2.2, 1.0.0",syn51069173,20fe7008-32ae-4225-b371-ce28f3169a0b,,,HTA11_13701,HTA11_13701_200000101113112,HTA11_13701_2000001011
306,ScRNA-seqLevel3,single_cell_RNAseq_level_3/7306-YX-8_epi.csv,csv,HTA11_13701_200000102112111,HTA11_13701_200000102113111,Gene Expression Quantification,Raw Counts,,2654.5,1193.0,...,"dropkick and QCPipe, default parameters",https://github.com/KenLauLab/STAR_Protocol,"1.2.2, 1.0.0",syn51069094,a1b1c62a-93dd-489d-af2a-8e1d72c38d2e,,,HTA11_13701,HTA11_13701_200000102113111,HTA11_13701_2000001021
307,ScRNA-seqLevel3,single_cell_RNAseq_level_3/7306-YX-8_immune.csv,csv,HTA11_13701_200000102112111,HTA11_13701_200000102113112,Gene Expression Quantification,Raw Counts,,1015.0,653.0,...,"dropkick and QCPipe, default parameters",https://github.com/KenLauLab/STAR_Protocol,"1.2.2, 1.0.0",syn51069174,31f506d0-2c24-4366-8fbd-f694fa0a5bf6,,,HTA11_13701,HTA11_13701_200000102113112,HTA11_13701_2000001021
308,ScRNA-seqLevel3,single_cell_RNAseq_level_3/7306-YX-9_epi.csv,csv,HTA11_13794_200000101112111,HTA11_13794_200000101113111,Gene Expression Quantification,Raw Counts,,3695.0,1140.0,...,"dropkick and QCPipe, default parameters",https://github.com/KenLauLab/STAR_Protocol,"1.2.2, 1.0.0",syn51069095,a3c4fbe8-4b35-4887-9a46-c54985b45c2b,,,HTA11_13794,HTA11_13794_200000101113111,HTA11_13794_2000001011


#### merge dataframes to assign biospecimen info to each scRNA file

In [256]:
# Annotate every scRNA file with participant and biospecimen metadata.
# The inner join drops files whose assayed biospecimen ID is absent from
# caseBioDF. The combined table is also written to disk for later use.
scCaseBioDF = scBioDF.merge(
    caseBioDF,
    how="inner",
    left_on="HTAN_Assayed_Biospecimen_ID",
    right_on="HTAN Biospecimen ID",
)
scCaseBioDF.to_csv("metadata/scRNA_metadata.csv", index=False)
scCaseBioDF.head()

Unnamed: 0,sc_Component,sc_Filename,sc_File Format,sc_HTAN Parent Data File ID,sc_HTAN Data File ID,sc_Data Category,sc_Matrix Type,sc_Linked Matrices,sc_Cell Median Number Reads,sc_Cell Median Number Genes,...,Gender,Ethnicity,Vital Status,HTAN Parent ID,HTAN Biospecimen ID,Collection Days from Index,Storage Method,Processing Days from Index,Preservation Method,Tumor Tissue Type
0,ScRNA-seqLevel3,single_cell_RNAseq_level_3/3620as1.csv,csv,HTA11_347_200000101112111,HTA11_347_200000101113111,Gene Expression Quantification,Raw Counts,,9176.0,2849.0,...,female,not hispanic or latino,Alive,HTA11_347,HTA11_347_2000001011,21802,Fresh,21802,Fresh,Premalignant
1,ScRNA-seqLevel3,single_cell_RNAseq_level_3/3906as1.csv,csv,HTA11_347_200000101112211,HTA11_347_200000101113211,Gene Expression Quantification,Raw Counts,,12590.0,3571.0,...,female,not hispanic or latino,Alive,HTA11_347,HTA11_347_2000001011,21802,Fresh,21802,Fresh,Premalignant
2,ScRNA-seqLevel3,single_cell_RNAseq_level_3/3620as2.csv,csv,HTA11_78_200000101112111,HTA11_78_200000101113111,Gene Expression Quantification,Raw Counts,,5054.0,1896.0,...,male,not hispanic or latino,Alive,HTA11_78,HTA11_78_2000001011,24370,Fresh,24370,Fresh,Premalignant
3,ScRNA-seqLevel3,single_cell_RNAseq_level_3/3882as4.csv,csv,HTA11_78_200000101112211,HTA11_78_200000101113211,Gene Expression Quantification,Raw Counts,,6722.0,2312.5,...,male,not hispanic or latino,Alive,HTA11_78,HTA11_78_2000001011,24370,Fresh,24370,Fresh,Premalignant
4,ScRNA-seqLevel3,single_cell_RNAseq_level_3/3803as1.csv,csv,HTA11_696_200000201112111,HTA11_696_200000201113111,Gene Expression Quantification,Raw Counts,,6112.0,2293.0,...,male,not hispanic or latino,Alive,HTA11_696,HTA11_696_2000002011,18342,Fresh,18342,Fresh,Normal


#### obtain participant IDs for which scRNA is available

In [257]:
# One row per distinct (participant, race) pair among the scRNA files.
scCaseRace = scCaseBioDF.loc[:, ["HTAN Participant ID", "Race"]].drop_duplicates()

# Participant ID lists for the two race groups compared further down.
is_aa = scCaseRace["Race"] == "black or african american"
is_white = scCaseRace["Race"] == "white"
aaIDs = scCaseRace.loc[is_aa, "HTAN Participant ID"].tolist()
whiteIDs = scCaseRace.loc[is_white, "HTAN Participant ID"].tolist()

# Number of rows of scCaseRace per reported race.
raceCounts = scCaseRace["Race"].value_counts()
raceCounts

white                        73
Not Reported                 18
black or african american    14
Other                         1
Name: Race, dtype: int64

#### african americans with scRNA and specific tumor type

In [258]:
# scRNA rows belonging to participants in aaIDs, narrowed to the participant
# and tissue-type columns; the display shows each distinct pair once.
tumor_columns = ["HTAN Participant ID", "Tumor Tissue Type"]
in_aa = scCaseBioDF["HTAN Participant ID"].isin(aaIDs)
scAA = scCaseBioDF.loc[in_aa, tumor_columns]
scAA.drop_duplicates()

Unnamed: 0,HTAN Participant ID,Tumor Tissue Type
8,HTA11_83,Atypia - hyperplasia
10,HTA11_83,Normal
38,HTA11_5212,Premalignant
39,HTA11_5212,Normal
40,HTA11_5216,Atypia - hyperplasia
41,HTA11_5216,Normal
44,HTA11_3252,Normal
59,HTA11_2235,Normal
71,HTA11_2951,Premalignant
72,HTA11_2951,Normal


#### load visium metadata

In [259]:
# Synapse manifest for the 10x Visium Level 3 files.
visiumDF = pd.read_csv("metadata/syn51469914_10xLevel3.csv")

# Participant ID = first two "_"-separated tokens of the data file ID
# (e.g. "HTA11_99999971397_8038433111" -> "HTA11_99999971397").
# The vectorised .str accessor propagates a missing file ID as NaN instead of
# raising AttributeError the way a per-row `X.split(...)` lambda does.
visiumDF["HTAN Participant ID"] = (
    visiumDF["HTAN Data File ID"].str.split("_").str[:2].str.join("_")
)

# Prefix every manifest column with "visium_" so they cannot collide with the
# case / biospecimen columns merged in later.
# NOTE(review): the original author annotated this step with "#164",
# presumably the manifest row count. TODO confirm.
visiumDF.columns = [f"visium_{col}" for col in visiumDF.columns]

# Attach the assayed biospecimen ID to each file; the inner join keeps only
# files that are present in the BigQuery lookup table.
# NOTE(review): the original author annotated the result with "#192",
# presumably the row count after the merge. If so, some files match more than
# one lookup row. TODO confirm whether that is expected.
visiumBioDF = pd.merge(
    visiumDF,
    bioQuery,
    left_on="visium_HTAN Data File ID",
    right_on="HTAN_Data_File_ID",
    how="inner",
)
visiumBioDF.head()

Unnamed: 0,visium_Component,visium_Filename,visium_File Format,visium_HTAN Parent Biospecimen ID,visium_HTAN Parent Data File ID,visium_HTAN Data File ID,visium_Run ID,visium_Visium File Type,visium_Workflow Version,visium_Workflow Link,...,visium_Sequencing Saturation,visium_Proportion Reads Mapped,visium_Proportion Reads Mapped to Transcriptome,visium_Median UMI Counts per Spot,visium_entityId,visium_Id,visium_eTag,visium_HTAN Participant ID,HTAN_Data_File_ID,HTAN_Assayed_Biospecimen_ID
0,10xVisiumSpatialTranscriptomics-RNA-seqLevel3,visium_level_3/6723_KL_1_barcodes.tsv.gz,gzip,HTA11_99999971397_80384,HTA11_99999971397_8038432111,HTA11_99999971397_8038433111,6723-KL,barcodes,spaceranger-2.0.0,https://support.10xgenomics.com/spatial-gene-e...,...,0.836251,,0.972046,11356.0,syn51425743,5ef03a71-6669-42e4-b628-426ffc5461fb,ee09f07c-fea5-4646-aa17-f982e8c68009,HTA11_99999971397,HTA11_99999971397_8038433111,HTA11_99999971397_80384
1,10xVisiumSpatialTranscriptomics-RNA-seqLevel3,visium_level_3/6723_KL_1_features.tsv.gz,gzip,HTA11_99999971397_80384,HTA11_99999971397_8038432111,HTA11_99999971397_8038433121,6723-KL,features,spaceranger-2.0.0,https://support.10xgenomics.com/spatial-gene-e...,...,0.836251,,0.972046,11356.0,syn51425744,df5c3ac8-3e93-49be-912b-06a87505e81f,4608acf5-ee99-44b0-b792-1cfd91792835,HTA11_99999971397,HTA11_99999971397_8038433121,HTA11_99999971397_80384
2,10xVisiumSpatialTranscriptomics-RNA-seqLevel3,visium_level_3/6723_KL_1_matrix.mtx.gz,gzip,HTA11_99999971397_80384,HTA11_99999971397_8038432111,HTA11_99999971397_8038433131,6723-KL,filtered mex,spaceranger-2.0.0,https://support.10xgenomics.com/spatial-gene-e...,...,0.836251,,0.972046,11356.0,syn51425745,8c8c2fb7-430d-4e49-bed6-4756d75f82a4,29460827-339b-46e7-904e-615d97a4496c,HTA11_99999971397,HTA11_99999971397_8038433131,HTA11_99999971397_80384
3,10xVisiumSpatialTranscriptomics-RNA-seqLevel3,visium_level_3/6723_KL_1_tissue_positions.csv,csv,HTA11_99999971397_80384,HTA11_99999971397_8038432111,HTA11_99999971397_8038433141,6723-KL,tissue_positions,spaceranger-2.0.0,https://support.10xgenomics.com/spatial-gene-e...,...,0.836251,,0.972046,11356.0,syn51425746,350f0a49-5ab4-4bef-8c0b-b44ad9777f9d,a6d5d102-4e53-40b8-bd51-8114a7e31cef,HTA11_99999971397,HTA11_99999971397_8038433141,HTA11_99999971397_80384
4,10xVisiumSpatialTranscriptomics-RNA-seqLevel3,visium_level_3/6723_KL_2_barcodes.tsv.gz,gzip,HTA11_99999971397_80384,HTA11_99999971397_8038432211,HTA11_99999971397_8038433211,6723-KL,barcodes,spaceranger-2.0.0,https://support.10xgenomics.com/spatial-gene-e...,...,0.606543,,0.986709,11038.0,syn51425747,f8909dc9-1245-49de-861d-8fa63f39d915,07d3ca4d-b177-4caa-859e-dca729b1e54e,HTA11_99999971397,HTA11_99999971397_8038433211,HTA11_99999971397_80384


#### Add biospecimen information

In [261]:
# Annotate every Visium file with participant and biospecimen metadata.
# The inner join drops files whose assayed biospecimen ID is absent from
# caseBioDF.
visiumCaseDF = pd.merge(
    visiumBioDF,
    caseBioDF,
    left_on="HTAN_Assayed_Biospecimen_ID",
    right_on="HTAN Biospecimen ID",
    how="inner",
)
# index=False matches the scRNA export and keeps the meaningless RangeIndex
# from being written as an extra unnamed first column.
visiumCaseDF.to_csv("metadata/visium_metadata.csv", index=False)
visiumCaseDF.head()

Unnamed: 0,visium_Component,visium_Filename,visium_File Format,visium_HTAN Parent Biospecimen ID,visium_HTAN Parent Data File ID,visium_HTAN Data File ID,visium_Run ID,visium_Visium File Type,visium_Workflow Version,visium_Workflow Link,...,Gender,Ethnicity,Vital Status,HTAN Parent ID,HTAN Biospecimen ID,Collection Days from Index,Storage Method,Processing Days from Index,Preservation Method,Tumor Tissue Type
0,10xVisiumSpatialTranscriptomics-RNA-seqLevel3,visium_level_3/6723_KL_1_barcodes.tsv.gz,gzip,HTA11_99999971397_80384,HTA11_99999971397_8038432111,HTA11_99999971397_8038433111,6723-KL,barcodes,spaceranger-2.0.0,https://support.10xgenomics.com/spatial-gene-e...,...,Not Reported,Not Reported,Not Reported,HTA11_99999971397,HTA11_99999971397_80384,0,unknown,0,unknown,Not Otherwise Specified
1,10xVisiumSpatialTranscriptomics-RNA-seqLevel3,visium_level_3/6723_KL_1_features.tsv.gz,gzip,HTA11_99999971397_80384,HTA11_99999971397_8038432111,HTA11_99999971397_8038433121,6723-KL,features,spaceranger-2.0.0,https://support.10xgenomics.com/spatial-gene-e...,...,Not Reported,Not Reported,Not Reported,HTA11_99999971397,HTA11_99999971397_80384,0,unknown,0,unknown,Not Otherwise Specified
2,10xVisiumSpatialTranscriptomics-RNA-seqLevel3,visium_level_3/6723_KL_1_matrix.mtx.gz,gzip,HTA11_99999971397_80384,HTA11_99999971397_8038432111,HTA11_99999971397_8038433131,6723-KL,filtered mex,spaceranger-2.0.0,https://support.10xgenomics.com/spatial-gene-e...,...,Not Reported,Not Reported,Not Reported,HTA11_99999971397,HTA11_99999971397_80384,0,unknown,0,unknown,Not Otherwise Specified
3,10xVisiumSpatialTranscriptomics-RNA-seqLevel3,visium_level_3/6723_KL_1_tissue_positions.csv,csv,HTA11_99999971397_80384,HTA11_99999971397_8038432111,HTA11_99999971397_8038433141,6723-KL,tissue_positions,spaceranger-2.0.0,https://support.10xgenomics.com/spatial-gene-e...,...,Not Reported,Not Reported,Not Reported,HTA11_99999971397,HTA11_99999971397_80384,0,unknown,0,unknown,Not Otherwise Specified
4,10xVisiumSpatialTranscriptomics-RNA-seqLevel3,visium_level_3/6723_KL_2_barcodes.tsv.gz,gzip,HTA11_99999971397_80384,HTA11_99999971397_8038432211,HTA11_99999971397_8038433211,6723-KL,barcodes,spaceranger-2.0.0,https://support.10xgenomics.com/spatial-gene-e...,...,Not Reported,Not Reported,Not Reported,HTA11_99999971397,HTA11_99999971397_80384,0,unknown,0,unknown,Not Otherwise Specified


In [262]:
# Visium rows for participants listed in aaIDs, shown as distinct
# (participant, tissue type) pairs.
# NOTE(review): aaIDs is derived from the scRNA table, so participants that
# only have Visium data cannot appear here. Confirm that is intended.
aa_rows = visiumCaseDF["HTAN Participant ID"].isin(aaIDs)
visiumAA = visiumCaseDF.loc[aa_rows]
visiumAA.loc[:, ["HTAN Participant ID", "Tumor Tissue Type"]].drop_duplicates()

Unnamed: 0,HTAN Participant ID,Tumor Tissue Type
104,HTA11_7862,Premalignant
112,HTA11_7663,Premalignant


In [263]:
# Visium rows for participants listed in whiteIDs, shown as distinct
# (participant, tissue type) pairs.
white_rows = visiumCaseDF["HTAN Participant ID"].isin(whiteIDs)
visiumWhite = visiumCaseDF.loc[white_rows]
visiumWhite.loc[:, ["HTAN Participant ID", "Tumor Tissue Type"]].drop_duplicates()

Unnamed: 0,HTAN Participant ID,Tumor Tissue Type
48,HTA11_8622,Premalignant
56,HTA11_1938,Premalignant
108,HTA11_10711,Premalignant
116,HTA11_6134,Atypia - hyperplasia


In [None]:
# B-cell: CD20, CD21, CD19
# Fibroblasts: VIM, FAP, FSP1, SMA
# T-cell: CD8, CD4, CD3, CD28
# MYE: CD68, CD80, CD86, CD206, CD163
# PLA: CD38, MUM1
# MAS: CD117
# END: CD31, CD34, VEGF

In [265]:
# Disabled draft (kept for reference): for each marker name below, query the
# Ensembl REST `lookup/symbol` endpoint for homo_sapiens and print the decoded
# JSON. The trailing `break` stops after the first name.
# The recorded run of this cell failed with "400 Bad Request" on the first
# name, "CD20".
# NOTE(review): presumably the endpoint expects official gene symbols and
# several entries here are protein / CD-antigen aliases (e.g. CD20 is commonly
# listed under the gene symbol MS4A1). TODO confirm and map aliases to symbols
# before re-enabling.
# NOTE(review): `r.raise_for_status()` raises whenever `r.ok` is false, so the
# `sys.exit()` after it looks unreachable.
# import requests, sys

# genes = ["CD20", "CD21", "CD19", "VIM", "FAP", "FSP1", "SMA", "CD8", "CD4", "CD3", "CD28",
#          "CD68", "CD80", "CD86", "CD206", "CD163", "CD38", "MUM1", "CD117", "CD31", "CD34", "VEGF"]
 
# for gene in genes:
#   server = "https://rest.ensembl.org"
#   ext = f"/lookup/symbol/homo_sapiens/{gene}?expand=1"
  
#   r = requests.get(server+ext, headers={ "Content-Type" : "application/json"})
  
#   if not r.ok:
#     r.raise_for_status()
#     sys.exit()
  
#   decoded = r.json()
#   print(repr(decoded))
#   break

HTTPError: 400 Client Error: Bad Request for url: https://rest.ensembl.org/lookup/symbol/homo_sapiens/CD20?expand=1