In [1]:
import synapseclient
from synapseclient import Project, File, Folder
from synapseclient import Schema, Column, Table, Row, RowSet, as_table_columns
import itertools
import pandas as pd
import numpy as np

In [2]:
# Create the Synapse client and authenticate. login() is called without
# arguments, so no credentials are hardcoded here (presumably they come from
# the local Synapse config/cache -- confirm for your environment).
syn = synapseclient.Synapse()
syn.login()

Welcome, Victor Baham!



#### Issue: [syn52368902](https://www.synapse.org/Synapse:syn52368902) does not contain `libraryID` data
#### BUT! Hansruedi provided a manifest with libraryIDs, which I applied as annotations to all files besides the `snRNAseq - raw Smart-seq2` files
#### Solution: fill column `libraryID` in [syn52368902](https://www.synapse.org/Synapse:syn52368902) with `libraryID` annotations on the `snRNAseq - raw Smart-seq2` files

#### Update: renaming a column in a FileView schema apparently keeps the previously titled annotation and adds the new one on all files!

#### Get all MIT_ROSMAP_Multiomics multi-region snRNAseq - raw 10x genomics files

In [37]:
# syn52420188: MIT_ROSMAP_Multiomics multi-region snRNAseq, raw 10x genomics files.
# Fetch the entity, query every row, then load the query's CSV into a DataFrame.
schema_mrm_mr_snrna_raw_10x = syn.get('syn52420188')
results_mrm_mr_snrna_raw_10x = syn.tableQuery(
    f"SELECT * FROM {schema_mrm_mr_snrna_raw_10x.id}"
)
mrm_mr_snrna_raw_10x = pd.read_csv(results_mrm_mr_snrna_raw_10x.filepath)

Downloading files: 100%|████████| 976k/976k [00:00<00:00, 3.10MB/s, syn52420188]

Downloaded syn52420188 to /Users/vbaham/.synapseCache/842/145038842/SYNAPSE_TABLE_QUERY_145038842.csv


Downloading files: 100%|████████| 976k/976k [00:00<00:00, 2.99MB/s, syn52420188]


#### Get all MIT_ROSMAP_Multiomics snRNAseq - raw 10x genomics files

In [38]:
# syn52420143: MIT_ROSMAP_Multiomics snRNAseq, raw 10x genomics files.
# Fetch the entity, query every row, then load the query's CSV into a DataFrame.
schema_mrm_snrna_raw_10x = syn.get('syn52420143')
results_mrm_snrna_raw_10x = syn.tableQuery(
    f"SELECT * FROM {schema_mrm_snrna_raw_10x.id}"
)
mrm_snrna_raw_10x = pd.read_csv(results_mrm_snrna_raw_10x.filepath)

Downloading files: 100%|██████| 1.85M/1.85M [00:00<00:00, 10.4MB/s, syn52420143]

Downloaded syn52420143 to /Users/vbaham/.synapseCache/845/145038845/SYNAPSE_TABLE_QUERY_145038845.csv


Downloading files: 100%|██████| 1.85M/1.85M [00:00<00:00, 9.92MB/s, syn52420143]


#### Get all MIT_ROSMAP_Multiomics snRNAseq - raw Smart-seq2 files

In [39]:
# syn52420161: MIT_ROSMAP_Multiomics snRNAseq, raw Smart-seq2 files.
# Fetch the entity, query every row, then load the query's CSV into a DataFrame.
schema_mrm_snrna_raw_ss2 = syn.get('syn52420161')
results_mrm_snrna_raw_ss2 = syn.tableQuery(
    f"SELECT * FROM {schema_mrm_snrna_raw_ss2.id}"
)
mrm_snrna_raw_ss2 = pd.read_csv(results_mrm_snrna_raw_ss2.filepath)

Downloading files:  92%|█████▌| 2.10M/2.29M [00:00<00:00, 10.6MB/s, syn52420161]

Downloaded syn52420161 to /Users/vbaham/.synapseCache/853/145038853/SYNAPSE_TABLE_QUERY_145038853.csv


Downloading files: 100%|██████| 2.29M/2.29M [00:00<00:00, 10.8MB/s, syn52420161]


#### Functions to get `libraryID` from file names [customized to each use case of filename formats due to time constraints]

`get_lib_id_S` is for:
* MIT_ROSMAP_Multiomics multi-region snRNAseq - raw 10x genomics files
* MIT_ROSMAP_Multiomics snRNAseq - raw 10x genomics files

`get_lib_id_N` is for:
* MIT_ROSMAP_Multiomics snRNAseq - raw Smart-seq2 files

In [40]:
def get_lib_id_S(name):
    """Return the libraryID prefix of a 10x FASTQ file name.

    Expected names look like ``<libraryID>_S<n>_L001_R1_001.fastq.gz``, e.g.
    ``D19-8387_S1_L001_I1_001.fastq.gz`` -> ``D19-8387`` and
    ``D19-10914_1_S1_L001_I1_001.fastq.gz`` -> ``D19-10914_1``.

    The libraryID is everything before the first underscore-delimited
    ``S<digits>`` field. Matching a whole field (rather than the first ``S``
    character anywhere) keeps the result correct when the libraryID itself
    contains or starts with an ``S``.

    Args:
        name: file name (str).

    Returns:
        The libraryID portion of ``name`` (str).

    Raises:
        ValueError: if ``name`` has no ``_S<digits>`` field.
    """
    fields = name.split('_')
    # Start at 1: the first field always belongs to the libraryID, so an ID
    # that happens to look like "S12" is not mistaken for the sample field.
    for pos in range(1, len(fields)):
        field = fields[pos]
        if field[:1] == 'S' and field[1:].isdigit():
            return '_'.join(fields[:pos])
    raise ValueError(f"no '_S<number>' sample field found in file name: {name!r}")

def get_lib_id_N(name):
    """Return the libraryID embedded in a Smart-seq2 FASTQ file name.

    Expected names look like ``<run>_<libraryID>_NA_sequence.fastq.gz``, e.g.
    ``170725Tsa_D17-155463_NA_sequence.fastq.gz`` -> ``D17-155463``.

    The libraryID spans from the first underscore-delimited field that starts
    with ``D`` up to (not including) the next field that starts with ``N``.
    Working on whole fields keeps the result correct when the run prefix
    happens to contain a ``D`` or an ``N``.

    Args:
        name: file name (str).

    Returns:
        The libraryID portion of ``name`` (str).

    Raises:
        ValueError: if no field starts with ``D``, or no later field starts
            with ``N``.
    """
    fields = name.split('_')
    start = next((i for i, field in enumerate(fields) if field.startswith('D')), None)
    if start is None:
        raise ValueError(f"no field starting with 'D' in file name: {name!r}")
    stop = next(
        (j for j in range(start + 1, len(fields)) if fields[j].startswith('N')),
        None,
    )
    if stop is None:
        raise ValueError(f"no field starting with 'N' after the libraryID in file name: {name!r}")
    return '_'.join(fields[start:stop])

# Derive libraryID from each file name: the two 10x views use the
# S-field parser, the Smart-seq2 view uses the N-field parser.
mrm_mr_snrna_raw_10x['libraryID'] = [
    get_lib_id_S(fname) for fname in mrm_mr_snrna_raw_10x['name']
]
mrm_snrna_raw_10x['libraryID'] = [
    get_lib_id_S(fname) for fname in mrm_snrna_raw_10x['name']
]
mrm_snrna_raw_ss2['libraryID'] = [
    get_lib_id_N(fname) for fname in mrm_snrna_raw_ss2['name']
]

In [41]:
# Display the multi-region 10x view to spot-check libraryID against `name`.
mrm_mr_snrna_raw_10x

Unnamed: 0,ROW_ID,ROW_VERSION,ROW_ETAG,id,name,assay,consortium,dataSubtype,dataType,fileFormat,...,tissue,nucleicAcidSource,referenceSet,libraryPrep,runType,organ,platform,species,libraryID,libraryBatch
0,52409373,1,b7878e0b-68af-4a9b-809a-eacb8829b3aa,syn52409373,D19-8387_S1_L001_I1_001.fastq.gz,rnaSeq,CDCP,raw,geneExpression,fastq,...,angular gyrus,single nucleus,GRCh38,10x,pairedEnd,brain,IlluminaNovaseq6000,Human,D19-8387,D19-8387
1,52409374,1,1ebffa32-b4bb-48ed-a6f1-746d65bec4ca,syn52409374,D19-8387_S1_L002_I1_001.fastq.gz,rnaSeq,CDCP,raw,geneExpression,fastq,...,angular gyrus,single nucleus,GRCh38,10x,pairedEnd,brain,IlluminaNovaseq6000,Human,D19-8387,D19-8387
2,52409375,1,2dc8e946-d8b6-469c-b3e6-8e9247010dde,syn52409375,D19-8388_S2_L002_I1_001.fastq.gz,rnaSeq,CDCP,raw,geneExpression,fastq,...,angular gyrus,single nucleus,GRCh38,10x,pairedEnd,brain,IlluminaNovaseq6000,Human,D19-8388,D19-8388
3,52409376,1,87aac540-daa2-40e3-a570-068100bbb9f6,syn52409376,D19-8388_S2_L001_I1_001.fastq.gz,rnaSeq,CDCP,raw,geneExpression,fastq,...,angular gyrus,single nucleus,GRCh38,10x,pairedEnd,brain,IlluminaNovaseq6000,Human,D19-8388,D19-8388
4,52409377,1,b0ede12c-a6f2-4678-80f6-22a1dff7ed04,syn52409377,D19-8387_S1_L001_R1_001.fastq.gz,rnaSeq,CDCP,raw,geneExpression,fastq,...,angular gyrus,single nucleus,GRCh38,10x,pairedEnd,brain,IlluminaNovaseq6000,Human,D19-8387,D19-8387
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2593,52416533,1,32f0e8d6-b1a0-4272-bd10-6c7cbdcb83ea,syn52416533,D19-4645_S29_L001_R2_001.fastq.gz,rnaSeq,CDCP,raw,geneExpression,fastq,...,hippocampus,single nucleus,GRCh38,10x,,brain,IlluminaNovaseq6000,Human,D19-4645,D19-4645
2594,52416534,1,fe7f4e3f-07db-46f0-974d-6c059b257418,syn52416534,D19-4645_S29_L002_R2_001.fastq.gz,rnaSeq,CDCP,raw,geneExpression,fastq,...,hippocampus,single nucleus,GRCh38,10x,,brain,IlluminaNovaseq6000,Human,D19-4645,D19-4645
2595,52416535,1,9754001e-1d9e-4445-bcf9-a8a0659f15ff,syn52416535,D19-4648_S32_L001_R2_001.fastq.gz,rnaSeq,CDCP,raw,geneExpression,fastq,...,hippocampus,single nucleus,GRCh38,10x,pairedEnd,brain,IlluminaNovaseq6000,Human,D19-4648,D19-4648
2596,52416537,1,60086ddd-e4d9-40c2-ae2e-32c04e5d40db,syn52416537,D19-4647_S31_L002_R2_001.fastq.gz,rnaSeq,CDCP,raw,geneExpression,fastq,...,hippocampus,single nucleus,GRCh38,10x,,brain,IlluminaNovaseq6000,Human,D19-4647,D19-4647


In [42]:
# Display the 10x view to spot-check libraryID against `name`.
mrm_snrna_raw_10x

Unnamed: 0,ROW_ID,ROW_VERSION,ROW_ETAG,id,name,assay,consortium,dataSubtype,dataType,fileFormat,...,tissue,nucleicAcidSource,referenceSet,libraryPrep,runType,organ,platform,species,libraryID,libraryBatch
0,52368872,2,da05e848-97ad-4341-8623-0aa2928052cd,syn52368872,D19-10914_1_S1_L001_I1_001.fastq.gz,rnaSeq,CDCP,raw,geneExpression,fastq,...,prefrontal cortex,single nucleus,GRCh38,10x,pairedEnd,brain,IlluminaNovaseq6000,Human,D19-10914_1,D19-10914_1
1,52368873,2,5c19c26e-c1b1-4316-9cbe-9fb08de84948,syn52368873,D19-10914_1_S1_L002_I1_001.fastq.gz,rnaSeq,CDCP,raw,geneExpression,fastq,...,prefrontal cortex,single nucleus,GRCh38,10x,pairedEnd,brain,IlluminaNovaseq6000,Human,D19-10914_1,D19-10914_1
2,52368874,2,77c59728-4a23-4664-8aaf-78413cee76b6,syn52368874,D19-10914_2_S2_L001_I1_001.fastq.gz,rnaSeq,CDCP,raw,geneExpression,fastq,...,prefrontal cortex,single nucleus,GRCh38,10x,pairedEnd,brain,IlluminaNovaseq6000,Human,D19-10914_2,D19-10914_2
3,52368875,2,7cdeedad-51b8-4087-8c24-b4d58303e0ff,syn52368875,D19-10914_2_S2_L002_I1_001.fastq.gz,rnaSeq,CDCP,raw,geneExpression,fastq,...,prefrontal cortex,single nucleus,GRCh38,10x,pairedEnd,brain,IlluminaNovaseq6000,Human,D19-10914_2,D19-10914_2
4,52368876,2,6d2d6d22-8a89-4de2-b9f8-e45de827193d,syn52368876,D19-10914_1_S1_L001_R1_001.fastq.gz,rnaSeq,CDCP,raw,geneExpression,fastq,...,prefrontal cortex,single nucleus,GRCh38,10x,pairedEnd,brain,IlluminaNovaseq6000,Human,D19-10914_1,D19-10914_1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4873,52404503,1,64d7978f-b543-4e38-9e3e-9efdcfe0eeeb,syn52404503,D19-2488_S4_L004_R1_001.fastq.gz,rnaSeq,CDCP,raw,geneExpression,fastq,...,prefrontal cortex,single nucleus,GRCh38,10x,pairedEnd,brain,IlluminaNovaseq6000,Human,D19-2488,D19-2488
4874,52404504,1,4033f68d-dbcf-43cd-b5a3-b3447dc09a36,syn52404504,D19-2488_S4_L002_R1_001.fastq.gz,rnaSeq,CDCP,raw,geneExpression,fastq,...,prefrontal cortex,single nucleus,GRCh38,10x,pairedEnd,brain,IlluminaNovaseq6000,Human,D19-2488,D19-2488
4875,52404505,1,288d26ee-0ce9-42f6-9f7c-be92ad4917e0,syn52404505,D19-2488_S4_L003_R2_001.fastq.gz,rnaSeq,CDCP,raw,geneExpression,fastq,...,prefrontal cortex,single nucleus,GRCh38,10x,pairedEnd,brain,IlluminaNovaseq6000,Human,D19-2488,D19-2488
4876,52404506,1,87ea3a39-39d5-44ee-987f-c0025ed566dd,syn52404506,D19-2488_S4_L004_R2_001.fastq.gz,rnaSeq,CDCP,raw,geneExpression,fastq,...,prefrontal cortex,single nucleus,GRCh38,10x,pairedEnd,brain,IlluminaNovaseq6000,Human,D19-2488,D19-2488


In [43]:
# Display the Smart-seq2 view to spot-check libraryID against `name`.
mrm_snrna_raw_ss2

Unnamed: 0,ROW_ID,ROW_VERSION,ROW_ETAG,id,name,assay,consortium,dataSubtype,dataType,fileFormat,...,tissue,nucleicAcidSource,referenceSet,libraryPrep,runType,organ,platform,species,libraryID,libraryBatch
0,52400111,8,832ccf4e-7327-42f3-84c1-a2a633605f2f,syn52400111,170725Tsa_D17-155463_NA_sequence.fastq.gz,snrnaSeq,AMP-AD,raw,geneExpression,fastq,...,prefrontal cortex,single nucleus,GRCh38,Smart-seq2,singleEnd,brain,IlluminaNovaseq6000,Human,D17-155463,D17-155463
1,52400112,8,41521fc6-c63a-4b6c-9d37-61b496e0dd17,syn52400112,170725Tsa_D17-155523_NA_sequence.fastq.gz,snrnaSeq,AMP-AD,raw,geneExpression,fastq,...,prefrontal cortex,single nucleus,GRCh38,Smart-seq2,singleEnd,brain,IlluminaNovaseq6000,Human,D17-155523,D17-155523
2,52400113,8,40c32629-ee4c-42d8-a1f9-576e00058bf1,syn52400113,170725Tsa_D17-155528_NA_sequence.fastq.gz,snrnaSeq,AMP-AD,raw,geneExpression,fastq,...,prefrontal cortex,single nucleus,GRCh38,Smart-seq2,singleEnd,brain,IlluminaNovaseq6000,Human,D17-155528,D17-155528
3,52400115,8,38d8b4b6-e23a-43ee-8d6f-bb41fe5a054f,syn52400115,170725Tsa_D17-155492_NA_sequence.fastq.gz,snrnaSeq,AMP-AD,raw,geneExpression,fastq,...,prefrontal cortex,single nucleus,GRCh38,Smart-seq2,singleEnd,brain,IlluminaNovaseq6000,Human,D17-155492,D17-155492
4,52400116,8,b2023690-2b99-4437-942b-20e520dd9da1,syn52400116,170725Tsa_D17-155515_NA_sequence.fastq.gz,snrnaSeq,AMP-AD,raw,geneExpression,fastq,...,prefrontal cortex,single nucleus,GRCh38,Smart-seq2,singleEnd,brain,IlluminaNovaseq6000,Human,D17-155515,D17-155515
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5816,52406887,1,b867213c-8bdf-4597-a2c3-4f4a8b434085,syn52406887,170725Tsa_D17-154365_NA_sequence.fastq.gz,snrnaSeq,AMP-AD,raw,geneExpression,fastq,...,prefrontal cortex,single nucleus,GRCh38,Smart-seq2,singleEnd,brain,IlluminaNovaseq6000,Human,D17-154365,D17-154365
5817,52406888,1,634d8727-dc22-4c98-9117-2923194f5ec6,syn52406888,170725Tsa_D17-154321_NA_sequence.fastq.gz,snrnaSeq,AMP-AD,raw,geneExpression,fastq,...,prefrontal cortex,single nucleus,GRCh38,Smart-seq2,singleEnd,brain,IlluminaNovaseq6000,Human,D17-154321,D17-154321
5818,52406889,1,37c76535-fd89-42af-9eaf-6f9248cf2235,syn52406889,170725Tsa_D17-154332_NA_sequence.fastq.gz,snrnaSeq,AMP-AD,raw,geneExpression,fastq,...,prefrontal cortex,single nucleus,GRCh38,Smart-seq2,singleEnd,brain,IlluminaNovaseq6000,Human,D17-154332,D17-154332
5819,52406890,1,fc66b5d9-c315-4556-aff4-124b205a359b,syn52406890,170725Tsa_D17-154301_NA_sequence.fastq.gz,snrnaSeq,AMP-AD,raw,geneExpression,fastq,...,prefrontal cortex,single nucleus,GRCh38,Smart-seq2,singleEnd,brain,IlluminaNovaseq6000,Human,D17-154301,D17-154301


#### Delete incorrect `libraryBatch` annotation that is inconsistent with assay metadata file

In [44]:
# Blank out libraryBatch in every view so the incorrect values are cleared
# when the views are stored.
for view_df in (mrm_mr_snrna_raw_10x, mrm_snrna_raw_10x, mrm_snrna_raw_ss2):
    view_df['libraryBatch'] = ''

#### Store changes in Synapse

In [45]:
# Write the edited DataFrames (new libraryID, blanked libraryBatch) back to
# Synapse. Each target ID must be the one its DataFrame was queried from.
syn.store(Table('syn52420188', mrm_mr_snrna_raw_10x))  # multi-region 10x
syn.store(Table('syn52420143', mrm_snrna_raw_10x))  # 10x
syn.store(Table('syn52420161', mrm_snrna_raw_ss2))  # Smart-seq2

Uploading: 100%|██████████████████| 809k/809k [00:00<00:00, 1.06MB/s, table.csv]
Uploading: 100%|████████████████| 1.54M/1.54M [00:00<00:00, 1.73MB/s, table.csv]
Uploading: 100%|████████████████| 1.97M/1.97M [00:01<00:00, 1.37MB/s, table.csv]


<synapseclient.table.CsvFileTable at 0x7f828b7ae1c0>

### Fill in `libraryID` column in MIT ROSMAP Multiomics snRNAseq metadata file

#### First, concatenate all dfs to map specimenID to libraryID:

In [92]:
# Stack the three views row-wise so every annotated file is in one frame.
mrm_all = pd.concat([mrm_mr_snrna_raw_10x, mrm_snrna_raw_10x, mrm_snrna_raw_ss2],
                    axis=0)

# Reduce to distinct (specimenID, libraryID) pairs. Rows without a specimenID
# cannot be looked up, so they are dropped instead of becoming NaN dict keys.
# keep='last' preserves the "last file wins" result of a plain dict(zip(...)).
spec_lib_pairs = (
    mrm_all[['specimenID', 'libraryID']]
    .dropna(subset=['specimenID'])
    .drop_duplicates(keep='last')
)

# A dict holds one libraryID per specimen; flag specimens that have several so
# the collapse to a single value is visible rather than silent.
libs_per_specimen = spec_lib_pairs.groupby('specimenID')['libraryID'].nunique()
ambiguous_specimens = libs_per_specimen[libs_per_specimen > 1]
if not ambiguous_specimens.empty:
    print(f"WARNING: {len(ambiguous_specimens)} specimenIDs map to more than one libraryID; "
          "spec_to_lib keeps the last one seen for each")

spec_to_lib = dict(zip(spec_lib_pairs['specimenID'], spec_lib_pairs['libraryID']))

#### Get df representation of [syn52368902](https://www.synapse.org/Synapse:syn52368902):

In [93]:
# Download the assay metadata file (syn52368902) and read it as a DataFrame.
mrm_snrna_meta_entity = syn.get('syn52368902')
mrm_snrna_meta = pd.read_csv(mrm_snrna_meta_entity.path)

In [94]:
# Look up each specimen's libraryID in the file-annotation mapping.
# dict.get returns None for specimens with no annotated file, so a specimenID
# is never written into the libraryID column as a stand-in; na_action='ignore'
# passes missing specimenIDs through untouched.
mapped_library_ids = mrm_snrna_meta['specimenID'].map(spec_to_lib.get, na_action='ignore')
if 'libraryID' in mrm_snrna_meta.columns:
    # For unmatched specimens keep whatever libraryID the file already had.
    mapped_library_ids = mapped_library_ids.fillna(mrm_snrna_meta['libraryID'])
mrm_snrna_meta['libraryID'] = mapped_library_ids

# Report the gap so unmatched specimens get followed up.
n_missing_library_id = int(mrm_snrna_meta['libraryID'].isna().sum())
print(f"{n_missing_library_id} of {len(mrm_snrna_meta)} specimens have no libraryID")

In [95]:
# Display the metadata table to check the filled-in libraryID column.
mrm_snrna_meta

Unnamed: 0,specimenID,platform,libraryPrep,libraryPreparationMethod,isStranded,readStrandOrigin,runType,readLength,assay,RIN,...,sampleBarcode,totalReads,validBarcodeReads,numberCells,medianGenes,medianUMIs,libraryID,referenceSet,libraryVersion,DV200
0,R2626559.1,Illumina NovaSeq 6000,polyAselection,10x,1.0,reverse,pairedEnd,91,,,...,,,,,,,D19-2488,,,
1,R9936070.1,Illumina NovaSeq 6000,polyAselection,10x,1.0,reverse,pairedEnd,91,,,...,,,,,,,D19-2467,,,
2,R2367199.1,Illumina NovaSeq 6000,polyAselection,10x,1.0,reverse,pairedEnd,91,,,...,,,,,,,D19-5948,,,
3,R9891381.1,Illumina NovaSeq 6000,polyAselection,10x,1.0,reverse,pairedEnd,91,,,...,,,,,,,D19-4792,,,
4,R9033345.1,Illumina NovaSeq 6000,polyAselection,10x,1.0,reverse,pairedEnd,91,,,...,,,,,,,D19-4159,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6243,R9551808.83,HiSeq2000,,Smart-seq2,,,singleEnd,40,snrnaSeq,,...,,,,,,,D17-154365,,,
6244,R9551808.84,HiSeq2000,,Smart-seq2,,,singleEnd,40,snrnaSeq,,...,,,,,,,D17-154301,,,
6245,R9551808.85,HiSeq2000,,Smart-seq2,,,singleEnd,40,snrnaSeq,,...,,,,,,,D17-154349,,,
6246,R9551808.86,HiSeq2000,,Smart-seq2,,,singleEnd,40,snrnaSeq,,...,,,,,,,D17-154332,,,


#### Write this to a file with the same name as the current snrnaSeq assay metadata file, then upload this new file to Synapse but leave the `libraryBatch` column as is

In [97]:
# index=False: the DataFrame index is only pandas' default row counter; writing
# it would add an unnamed extra first column to the metadata file.
mrm_snrna_meta.to_csv('MIT_ROSMAP_Multiomics_assay_snRNAseq_metadata.csv', index=False)