In [2]:
import synapseclient
from synapseclient import Project, File, Folder
from synapseclient import Schema, Column, Table, Row, RowSet, as_table_columns
import itertools
import pandas as pd
import numpy as np

In [3]:
# Create the Synapse client used by every later cell and authenticate it.
# login() is called without arguments, so no credentials are stored in the
# notebook (presumably they come from a local config/cache -- confirm).
syn = synapseclient.Synapse()
syn.login()

Welcome, Victor Baham!



In [49]:
# Pull every row of the WGS FileView into a DataFrame.
# The Synapse ID is interpolated directly: resolving it through
# syn.get(...).id only returns the same ID after an extra request.
WGS_VIEW_ID = 'syn60943813'
lbp_wgs_query = syn.tableQuery(f"SELECT * FROM {WGS_VIEW_ID}")
# The query result is exposed as a CSV file on disk; load it with pandas.
lbp_wgs = pd.read_csv(lbp_wgs_query.filepath)

In [50]:
# Pull every row of the scRNAseq FileView into a DataFrame.
# The Synapse ID is interpolated directly: resolving it through
# syn.get(...).id only returns the same ID after an extra request.
SCRNA_VIEW_ID = 'syn60943834'
lbp_scrna_query = syn.tableQuery(f"SELECT * FROM {SCRNA_VIEW_ID}")
# The query result is exposed as a CSV file on disk; load it with pandas.
lbp_scrna = pd.read_csv(lbp_scrna_query.filepath)

### Check that all previous individuals or specimens are present in the most recently uploaded metadata files:

In [9]:
# Previously uploaded biospecimen and individual metadata CSVs, fetched from
# Synapse and read from their local download paths.
old_bio_entity = syn.get('syn28545761')
old_lbp_bio = pd.read_csv(old_bio_entity.path)

old_ind_entity = syn.get('syn28545760')
old_lbp_ind = pd.read_csv(old_ind_entity.path)

In [10]:
# Most recently uploaded biospecimen and individual metadata CSVs, fetched
# from Synapse and read from their local download paths.
new_bio_entity = syn.get('syn58890594')
new_lbp_bio = pd.read_csv(new_bio_entity.path)

new_ind_entity = syn.get('syn58829961')
new_lbp_ind = pd.read_csv(new_ind_entity.path)

In [14]:
# specimenIDs from the OLD biospecimen metadata that are absent from the NEW
# file. Despite the variable name, a non-empty result means specimens were
# dropped, not added; an empty list means nothing was lost.
# IDs are compared as strings so numeric and text IDs match consistently.
# The lookup set is built once instead of rebuilding a list (and scanning it
# linearly) for every element of the old file.
current_specimen_ids = set(new_lbp_bio['specimenID'].astype(str).tolist())
new_specs = [x for x in old_lbp_bio['specimenID'].astype(str).tolist()
             if x not in current_specimen_ids]
new_specs

[]

In [15]:
# individualIDs from the OLD individual metadata that are absent from the NEW
# file. Despite the variable name, a non-empty result means individuals were
# dropped, not added; an empty list means nothing was lost.
# IDs are compared as strings so numeric and text IDs match consistently.
# The lookup set is built once instead of rebuilding a list (and scanning it
# linearly) for every element of the old file.
current_individual_ids = set(new_lbp_ind['individualID'].astype(str).tolist())
new_inds = [x for x in old_lbp_ind['individualID'].astype(str).tolist()
            if x not in current_individual_ids]
new_inds

[]

### Get assay metadata for scRNAseq and WGS

In [40]:
# Assay-level metadata CSVs for the two assays, fetched from Synapse.
wgs_meta_entity = syn.get('syn58890612')
lbp_wgs_meta = pd.read_csv(wgs_meta_entity.path)

scrna_meta_entity = syn.get('syn58849847')
lbp_scrna_meta = pd.read_csv(scrna_meta_entity.path)

# Stack both assay tables (WGS rows first) under a fresh 0..n-1 index so
# later cells can build lookups from a single frame.
lbp_assay_meta = pd.concat([lbp_wgs_meta, lbp_scrna_meta], ignore_index=True)

### Check that no specimens are common between scRNAseq and WGS assay metadata files

In [44]:
# specimenIDs that appear in BOTH the WGS and the scRNAseq assay metadata.
# The lookup dicts built later are keyed on specimenID from the combined
# assay frame, so an overlap would let one assay's values overwrite the
# other's; an empty list means the two files are disjoint.
# The scRNAseq IDs are collected into a set once rather than rebuilding and
# scanning a list for every WGS row.
scrna_specimen_ids = set(lbp_scrna_meta['specimenID'].astype(str).tolist())
mis = [x for x in lbp_wgs_meta['specimenID'].astype(str).tolist()
       if x in scrna_specimen_ids]
mis

[]

### specimenIDs present in the WGS metadata but not the biospecimen metadata

In [71]:
# specimenIDs listed in the WGS assay metadata that have no row in the new
# biospecimen metadata. Files for these specimens cannot receive the
# biospecimen-derived annotations (organ, tissue, isPostMortem,
# nucleicAcidSource).
# The biospecimen IDs are collected into a set once rather than rebuilding
# and scanning a list for every WGS row.
biospecimen_ids = set(new_lbp_bio['specimenID'].astype(str).tolist())
miss = [x for x in lbp_wgs_meta['specimenID'].astype(str).tolist()
        if x not in biospecimen_ids]

In [70]:
# Show the WGS specimenIDs lacking biospecimen metadata. Any ID listed here
# will end up with blank organ/tissue/isPostMortem/nucleicAcidSource values
# when the FileViews are annotated below.
miss

['5721-EC-501', '5721-EC-502', '5721-EC-503']

### Check column names in golden assay metadata file:

In [46]:
# List the columns of the combined assay metadata to confirm that the fields
# used for the lookups below (specimenID, runType, platform, readLength) exist.
lbp_assay_meta.columns

Index(['Component', 'specimenID', 'libraryID', 'assay', 'platform',
       'referenceSet', 'dnaBatch', 'sequencingBatch', 'libraryPrep',
       'libraryPreparationMethod', 'readLength', 'runType', 'totalReads',
       'numberCells', 'ratio260over280', 'ratio260over230', 'GQN', 'Id',
       'entityId', 'sampleBarcode', 'RIN', 'rnaBatch', 'libraryBatch',
       'libraryVersion', 'libraryType', 'isStranded', 'readStrandOrigin',
       'validBarcodeReads', 'medianGenes', 'medianUMIs'],
      dtype='object')

### Create dicts for the following annotations: 

* `sex`, `organ`, `tissue`, `runType`, `platform`, `readLength`, `isPostMortem`, and `nucleicAcidSource`

In [48]:
def _column_lookup(frame, key_column, value_column):
    """Return a {key: value} dict pairing two columns of `frame` row by row.

    If a key occurs more than once, the value from the last row wins.
    """
    return dict(zip(frame[key_column], frame[value_column]))


# Individual-level annotation.
ind_to_sex = _column_lookup(new_lbp_ind, 'individualID', 'sex')

# Biospecimen-level annotations (keyed on specimenID).
spec_to_organ = _column_lookup(new_lbp_bio, 'specimenID', 'organ')
spec_to_tissue = _column_lookup(new_lbp_bio, 'specimenID', 'tissue')

# Assay-level annotations, taken from the combined WGS + scRNAseq metadata.
spec_to_runtype = _column_lookup(lbp_assay_meta, 'specimenID', 'runType')
spec_to_platform = _column_lookup(lbp_assay_meta, 'specimenID', 'platform')
spec_to_readlength = _column_lookup(lbp_assay_meta, 'specimenID', 'readLength')

# Remaining biospecimen-level annotations.
spec_to_ispostmortem = _column_lookup(new_lbp_bio, 'specimenID', 'isPostMortem')
spec_to_nucleicacidsource = _column_lookup(new_lbp_bio, 'specimenID', 'nucleicAcidSource')

### Annotate LBP WGS files:

In [72]:
# (annotation column, ID column to look up, lookup dict), applied in order.
# IDs with no entry in a lookup are left as NaN by Series.map.
wgs_annotation_plan = (
    ('sex', 'individualID', ind_to_sex),
    ('organ', 'specimenID', spec_to_organ),
    ('tissue', 'specimenID', spec_to_tissue),
    ('runType', 'specimenID', spec_to_runtype),
    ('platform', 'specimenID', spec_to_platform),
    ('readLength', 'specimenID', spec_to_readlength),
    ('isPostMortem', 'specimenID', spec_to_ispostmortem),
    ('nucleicAcidSource', 'specimenID', spec_to_nucleicacidsource),
)
for wgs_target, wgs_id_column, wgs_lookup in wgs_annotation_plan:
    lbp_wgs[wgs_target] = lbp_wgs[wgs_id_column].map(wgs_lookup)

### Annotate LBP scRNAseq files:

In [74]:
# (annotation column, ID column to look up, lookup dict), applied in order.
# IDs with no entry in a lookup are left as NaN by Series.map.
scrna_annotation_plan = (
    ('sex', 'individualID', ind_to_sex),
    ('organ', 'specimenID', spec_to_organ),
    ('tissue', 'specimenID', spec_to_tissue),
    ('runType', 'specimenID', spec_to_runtype),
    ('platform', 'specimenID', spec_to_platform),
    ('readLength', 'specimenID', spec_to_readlength),
    ('isPostMortem', 'specimenID', spec_to_ispostmortem),
    ('nucleicAcidSource', 'specimenID', spec_to_nucleicacidsource),
)
for scrna_target, scrna_id_column, scrna_lookup in scrna_annotation_plan:
    lbp_scrna[scrna_target] = lbp_scrna[scrna_id_column].map(scrna_lookup)

### Clear the `Id`, `chromosome`, and `Component` annotations (set them to empty strings) in both FileViews

In [75]:
# Blank out (rather than drop) these annotation columns in both views by
# overwriting every row with an empty string.
columns_to_blank = ['Id', 'chromosome', 'Component']
for file_view in (lbp_wgs, lbp_scrna):
    file_view[columns_to_blank] = ''

### Check FileViews before storing to Synapse

In [76]:
# Eyeball the annotated WGS view before upload; rows whose specimenID is not
# in the biospecimen metadata show blank organ/tissue/isPostMortem values.
lbp_wgs

Unnamed: 0,ROW_ID,ROW_VERSION,ROW_ETAG,id,name,Id,analysisType,assay,chromosome,consortium,...,study,Component,sex,organ,tissue,runType,platform,readLength,isPostMortem,nucleicAcidSource
0,58623367,66,1b38ce85-54c6-41c8-a20e-69e0589c700e,syn58623367,5721-EC-10_S1_L005_R1_001.fastq.gz,,expression quantitative trait loci detection,wholeGenomeSeq,,CDCP,...,LBP,,female,brain,left cerebral hemisphere,pairedEnd,Illumina NovaSeq 6000,297.13,True,bulk cell
1,58623368,66,2458d5a9-44a3-43f9-86fe-14cfc82c1304,syn58623368,5721-EC-10_S1_L005_R2_001.fastq.gz,,expression quantitative trait loci detection,wholeGenomeSeq,,CDCP,...,LBP,,female,brain,left cerebral hemisphere,pairedEnd,Illumina NovaSeq 6000,297.13,True,bulk cell
2,58623369,66,e531d46c-7de3-4844-84e4-ccae46f9d818,syn58623369,5721-EC-11_S1_L005_R1_001.fastq.gz,,expression quantitative trait loci detection,wholeGenomeSeq,,CDCP,...,LBP,,male,brain,left cerebral hemisphere,pairedEnd,Illumina NovaSeq 6000,286.75,True,bulk cell
3,58623370,66,ecf65890-2c91-49a7-99fb-1e20ce96cd3f,syn58623370,5721-EC-11_S1_L005_R2_001.fastq.gz,,expression quantitative trait loci detection,wholeGenomeSeq,,CDCP,...,LBP,,male,brain,left cerebral hemisphere,pairedEnd,Illumina NovaSeq 6000,286.75,True,bulk cell
4,58623371,66,0b2a2014-8f06-4a75-ae71-8103c8a7787d,syn58623371,5721-EC-112_S1_L005_R2_001.fastq.gz,,expression quantitative trait loci detection,wholeGenomeSeq,,CDCP,...,LBP,,male,brain,unspecified,pairedEnd,Illumina NovaSeq 6000,276.49,True,bulk cell
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
992,58810615,1,76f60bb4-3a86-4a63-8da3-7828d18acda9,syn58810615,5721-EC-485_S1_L005_R1_001.fastq.gz,,expression quantitative trait loci detection,wholeGenomeSeq,,CDCP,...,LBP,,male,blood,blood,pairedEnd,Illumina NovaSeq 6000,308.98,False,bulk cell
993,58810654,1,7f33e78f-9c94-4bc7-83d0-8d39fd703e90,syn58810654,5721-EC-502_S1_L005_R1_001.fastq.gz,,expression quantitative trait loci detection,wholeGenomeSeq,,CDCP,...,LBP,,male,,,pairedEnd,Illumina NovaSeq 6000,290.20,,
994,58810886,1,0be40556-29ce-4476-bd6a-0dbca4416754,syn58810886,5721-EC-500_S1_L005_R1_001.fastq.gz,,expression quantitative trait loci detection,wholeGenomeSeq,,CDCP,...,LBP,,female,blood,blood,pairedEnd,Illumina NovaSeq 6000,298.99,False,bulk cell
995,58811198,1,3b8ec3c1-ee05-4fb1-8507-aa4e917617b6,syn58811198,5721-EC-485_S1_L005_R2_001.fastq.gz,,expression quantitative trait loci detection,wholeGenomeSeq,,CDCP,...,LBP,,male,blood,blood,pairedEnd,Illumina NovaSeq 6000,308.98,False,bulk cell


In [77]:
# Eyeball the annotated scRNAseq view before upload.
lbp_scrna

Unnamed: 0,ROW_ID,ROW_VERSION,ROW_ETAG,id,name,Id,analysisType,assay,chromosome,consortium,...,study,Component,sex,organ,tissue,runType,platform,readLength,isPostMortem,nucleicAcidSource
0,58577328,1,484eaa40-044c-4d8b-8fb8-4e6ab36258c3,syn58577328,PT-0236R_S2_L004_I1_001.fastq.gz,,Gene expression comparison,scrnaSeq,,CDCP,...,LBP,,male,brain,right cerebral hemisphere,singleEnd,Illumina NovaSeq 6000,91.0,False,bulk cell
1,58577329,1,4537364e-1a12-4242-ab9d-a8343b63e80b,syn58577329,PT-0236R_S2_L002_I1_001.fastq.gz,,Gene expression comparison,scrnaSeq,,CDCP,...,LBP,,male,brain,right cerebral hemisphere,singleEnd,Illumina NovaSeq 6000,91.0,False,bulk cell
2,58577330,1,6695d6c0-ac6e-4a7e-bed4-a43a45a7bace,syn58577330,PT-0236R_S2_L001_I1_001.fastq.gz,,Gene expression comparison,scrnaSeq,,CDCP,...,LBP,,male,brain,right cerebral hemisphere,singleEnd,Illumina NovaSeq 6000,91.0,False,bulk cell
3,58577331,1,22abc633-62e3-4e99-9bdb-01932ec7e687,syn58577331,PT-0236R_S2_L003_I1_001.fastq.gz,,Gene expression comparison,scrnaSeq,,CDCP,...,LBP,,male,brain,right cerebral hemisphere,singleEnd,Illumina NovaSeq 6000,91.0,False,bulk cell
4,58577332,1,cd9c8129-088d-4c4a-a37f-b4265c782a2b,syn58577332,PT-0237R_S1_L003_I1_001.fastq.gz,,Gene expression comparison,scrnaSeq,,CDCP,...,LBP,,male,brain,right cerebral hemisphere,singleEnd,Illumina NovaSeq 6000,91.0,False,bulk cell
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
812,58589850,1,89916e8d-a8ec-4ce7-a5db-dbfa1c8aa040,syn58589850,T-563_S19_L001_R1_001.fastq.gz,,Gene expression comparison,scrnaSeq,,CDCP,...,LBP,,male,brain,left cerebral hemisphere,singleEnd,Illumina NovaSeq 6000,91.0,True,bulk cell
813,58589928,1,12923b96-f4a1-40ad-aa37-4f8280a08691,syn58589928,T-563_S19_L002_R2_001.fastq.gz,,Gene expression comparison,scrnaSeq,,CDCP,...,LBP,,male,brain,left cerebral hemisphere,singleEnd,Illumina NovaSeq 6000,91.0,True,bulk cell
814,58590189,1,16750604-97a8-46be-9a76-aa457a29852c,syn58590189,T-563_S19_L004_R1_001.fastq.gz,,Gene expression comparison,scrnaSeq,,CDCP,...,LBP,,male,brain,left cerebral hemisphere,singleEnd,Illumina NovaSeq 6000,91.0,True,bulk cell
815,58590191,1,4824ac44-0565-4d56-90f3-642cae3f7321,syn58590191,T-563_S19_L003_R1_001.fastq.gz,,Gene expression comparison,scrnaSeq,,CDCP,...,LBP,,male,brain,left cerebral hemisphere,singleEnd,Illumina NovaSeq 6000,91.0,True,bulk cell


### Store annotations to Synapse

In [79]:
# Push the edited frames back to the FileViews they were queried from
# (same Synapse IDs as the tableQuery cells). Each call uploads the frame as
# a CSV; only the result of the last call is echoed as the cell output.
# NOTE(review): the frames still carry the row etags read at query time, so
# re-query before re-running this cell -- confirm against Synapse behaviour.
syn.store(Table('syn60943813', lbp_wgs))
syn.store(Table('syn60943834', lbp_scrna))

Uploading: 100%|███████████████████| 371k/371k [00:00<00:00, 385kB/s, table.csv]
Uploading: 100%|███████████████████| 299k/299k [00:00<00:00, 343kB/s, table.csv]


<synapseclient.table.CsvFileTable at 0x7fc8890b00d0>