In [1]:
import os

import pandas as pd
import synapseclient
from synapseclient import Project, File, Folder
from synapseclient import Schema, Column, Table, Row, RowSet, as_table_columns

In [2]:
# Create the Synapse client and authenticate.
# The personal access token is read from the SYNAPSE_AUTH_TOKEN environment
# variable so that it is never saved in the notebook or its outputs.
# NOTE(review): when the variable is unset, authToken is None and login() is
# expected to fall back to the client's own credential lookup
# (e.g. ~/.synapseConfig) — confirm for the installed synapseclient version.
syn = synapseclient.Synapse()
syn.login(authToken=os.environ.get("SYNAPSE_AUTH_TOKEN"))

Welcome, Victor Baham!



#### Query FreshMicro staging FileViews

In [3]:
# Query every row of each FreshMicro staging FileView.
# The Synapse IDs are written directly into the query: resolving them through
# syn.get(...).id first only hands back the same ID at the cost of an extra
# request per view.
fm_iso_query = syn.tableQuery("SELECT * FROM syn64363483")
fm_rna_query = syn.tableQuery("SELECT * FROM syn64367073")
fm_scrna_query = syn.tableQuery("SELECT * FROM syn64367074")

# Each query result is downloaded as a local CSV; `.filepath` points at it.
fm_iso = pd.read_csv(fm_iso_query.filepath)
fm_rna = pd.read_csv(fm_rna_query.filepath)
fm_scrna = pd.read_csv(fm_scrna_query.filepath)

Downloading files:   0%|          | 0.00/21.4k [00:00<?, ?B/s, syn64363483]

Downloaded syn64363483 to /home/jovyan/.synapseCache/105/150708105/SYNAPSE_TABLE_QUERY_150708105.csv


Downloading files: 100%|██████████| 21.4k/21.4k [00:00<00:00, 290kB/s, syn64363483]
Downloading files:   0%|          | 0.00/80.5k [00:00<?, ?B/s, syn64367073]

Downloaded syn64367073 to /home/jovyan/.synapseCache/106/150708106/SYNAPSE_TABLE_QUERY_150708106.csv


Downloading files: 100%|██████████| 80.5k/80.5k [00:00<00:00, 1.28MB/s, syn64367073]
Downloading files:   0%|          | 0.00/233k [00:00<?, ?B/s, syn64367074]

Downloaded syn64367074 to /home/jovyan/.synapseCache/108/150708108/SYNAPSE_TABLE_QUERY_150708108.csv


Downloading files: 100%|██████████| 233k/233k [00:00<00:00, 4.21MB/s, syn64367074]


#### Get the columns we will need to keep from the joined DataFrames (the columns are the same in each one, so we can arbitrarily select one of them)

In [4]:
# Display the FileView's column names; these are the annotation fields that the
# later cells fill in.
fm_iso.columns

Index(['ROW_ID', 'ROW_VERSION', 'ROW_ETAG', 'id', 'name', 'sex', 'assay',
       'grant', 'organ', 'study', 'tissue', 'runType', 'species', 'cellType',
       'dataType', 'platform', 'consortium', 'fileFormat', 'readLength',
       'specimenID', 'dataSubtype', 'libraryPrep', 'individualID',
       'resourceType', 'isModelSystem', 'isMultiSpecimen',
       'nucleicAcidSource'],
      dtype='object')

#### Load metadata files into DataFrames

In [5]:
def _read_synapse_csv(synapse_id):
    """Fetch the Synapse file `synapse_id` and parse its local copy as CSV."""
    return pd.read_csv(syn.get(synapse_id).path)


# NOTE(review): the suffixes presumably stand for individual, biospecimen,
# per-assay metadata and manifest — confirm against the Synapse entities.
fm_indm = _read_synapse_csv('syn64363368')
fm_biom = _read_synapse_csv('syn64364077')
fm_rnam = _read_synapse_csv('syn54090267')
fm_scrnam = _read_synapse_csv('syn64330569')
fm_isom = _read_synapse_csv('syn64367034')
fm_mani = _read_synapse_csv('syn64369817')

#### Map `synID` and `Filename` to `individualID` and `specimenID`

In [6]:
def _manifest_lookup(key_column, value_column):
    """Return a dict from one manifest column to another.

    When a key occurs more than once, the value from the last row wins.
    """
    return dict(zip(fm_mani[key_column], fm_mani[value_column]))


# Synapse ID -> sample identifiers
syn_to_ind = _manifest_lookup('synID', 'individualID')
syn_to_spec = _manifest_lookup('synID', 'specimenID')

# File name -> sample identifiers
file_to_ind = _manifest_lookup('Filename', 'individualID')
file_to_spec = _manifest_lookup('Filename', 'specimenID')

# Specimen -> the individual it belongs to
spec_to_ind = _manifest_lookup('specimenID', 'individualID')

#### Fill in `specimenID` column for each df

In [7]:
# Derive each row's specimenID from its Synapse ID (the `id` column) using the
# manifest lookup; IDs that are not in the lookup become NaN.
for view in (fm_iso, fm_rna, fm_scrna):
    view['specimenID'] = view['id'].map(syn_to_spec)

#### Fill in `individualID` column for each df

In [8]:
# Derive each row's individualID from its specimenID; specimens that are not in
# the lookup become NaN.
for view in (fm_iso, fm_rna, fm_scrna):
    view['individualID'] = view['specimenID'].map(spec_to_ind)

#### Match attributes to `individualID` and `specimenID` (these are independent of assay metadata)

In [9]:
# Individual-level attribute, keyed by individualID.
individual_ids, individual_sex = fm_indm['individualID'], fm_indm['sex']
ind_to_sex = dict(zip(individual_ids, individual_sex))

# Specimen-level attributes, all keyed by specimenID.
biospecimen_ids = fm_biom['specimenID']
spec_to_assay, spec_to_tissue, spec_to_celltype = (
    dict(zip(biospecimen_ids, fm_biom[attribute]))
    for attribute in ('assay', 'tissue', 'cellType')
)

In [10]:
# Re-display the column names before populating the remaining annotation columns.
fm_iso.columns

Index(['ROW_ID', 'ROW_VERSION', 'ROW_ETAG', 'id', 'name', 'sex', 'assay',
       'grant', 'organ', 'study', 'tissue', 'runType', 'species', 'cellType',
       'dataType', 'platform', 'consortium', 'fileFormat', 'readLength',
       'specimenID', 'dataSubtype', 'libraryPrep', 'individualID',
       'resourceType', 'isModelSystem', 'isMultiSpecimen',
       'nucleicAcidSource'],
      dtype='object')

#### Fill in the rest of the columns in `fm_iso` (assay-specific lookup dicts are built inline; static values are assigned directly)

In [None]:
# Individual-level annotation.
fm_iso['sex'] = fm_iso['individualID'].map(ind_to_sex)

iso_specimen = fm_iso['specimenID']

# Specimen-level annotations taken from the biospecimen metadata.
iso_biospecimen_lookups = {
    'assay': spec_to_assay,
    'tissue': spec_to_tissue,
    'cellType': spec_to_celltype,
    'nucleicAcidSource': dict(zip(fm_biom['specimenID'], fm_biom['nucleicAcidSource'])),
}
for column, lookup in iso_biospecimen_lookups.items():
    fm_iso[column] = iso_specimen.map(lookup)

# Assay-level annotations: the assay metadata uses the same column names, so
# each lookup is built from the matching `fm_isom` column.
for column in ('runType', 'platform', 'readLength', 'libraryPrep'):
    fm_iso[column] = iso_specimen.map(dict(zip(fm_isom['specimenID'], fm_isom[column])))

# Values that are the same for every row of this view.
iso_static_annotations = {
    'grant': 'R01AG065582',
    'organ': 'brain',
    'study': 'FreshMicro',
    'species': 'Human',
    'dataType': 'gene expression',
    'consortium': 'AMP-AD',
    'fileFormat': 'bam',
    'dataSubtype': 'raw',
    'resourceType': 'experimentalData',
    'isModelSystem': False,
    'isMultiSpecimen': False,
}
for column, value in iso_static_annotations.items():
    fm_iso[column] = value

#### Store `fm_iso` to `syn64363483`

In [None]:
# NOTE(review): the upload is commented out, so this cell is a no-op as written;
# uncomment the line below to write `fm_iso` back to syn64363483.
#syn.store(Table('syn64363483', fm_iso))

#### Fill in the rest of the columns in `fm_rna` (assay-specific lookup dicts are built inline; static values are assigned directly)

In [11]:
# Individual-level annotation.
fm_rna['sex'] = fm_rna['individualID'].map(ind_to_sex)

# Every other annotation is a fixed value shared by all rows of this view.
# NOTE(review): readLength is the string '10', while the scrnaSeq view uses
# '100' — confirm the intended value against the RNA-seq assay metadata.
rna_static_annotations = {
    'assay': 'rnaSeq',
    'grant': 'R01AG065582',
    'organ': 'brain',
    'study': 'FreshMicro',
    'tissue': 'prefrontal cortex',
    'runType': 'pairedEnd',
    'species': 'Human',
    'cellType': 'microglia',
    'dataType': 'gene expression',
    'platform': 'IlluminaNovaseq6000',
    'consortium': 'AMP-AD',
    'fileFormat': 'fastq',
    'readLength': '10',
    'dataSubtype': 'raw',
    'libraryPrep': 'totalRNA',
    'resourceType': 'experimentalData',
    'isModelSystem': False,
    'isMultiSpecimen': False,
    'nucleicAcidSource': 'sorted cells',
}
for column, value in rna_static_annotations.items():
    fm_rna[column] = value

#### Store `fm_rna` to `syn64367073` (RUN THIS AGAIN THE MORNING OF 12/11/2024)

In [12]:
# Wrap the annotated DataFrame as a table bound to syn64367073 and upload it.
rna_table = Table('syn64367073', fm_rna)
syn.store(rna_table)

Uploading: 100%|██████████| 80.0k/80.0k [00:00<00:00, 298kB/s, table.csv]


<synapseclient.table.CsvFileTable at 0x7fae5d9501d0>

#### Fill in the rest of the columns in `fm_scrna` (assay-specific lookup dicts are built inline; static values are assigned directly)

In [None]:
# Individual-level annotation.
fm_scrna['sex'] = fm_scrna['individualID'].map(ind_to_sex)

scrna_specimen = fm_scrna['specimenID']

# Specimen-level annotations taken from the biospecimen lookups.
for column, lookup in (('tissue', spec_to_tissue), ('cellType', spec_to_celltype)):
    fm_scrna[column] = scrna_specimen.map(lookup)

# Assay-level annotations: the assay metadata uses the same column names, so
# each lookup is built from the matching `fm_scrnam` column.
for column in ('runType', 'platform', 'libraryPrep'):
    fm_scrna[column] = scrna_specimen.map(dict(zip(fm_scrnam['specimenID'], fm_scrnam[column])))

# Values that are the same for every row of this view.
scrna_static_annotations = {
    'assay': 'scrnaSeq',
    'grant': 'R01AG065582',
    'organ': 'brain',
    'study': 'FreshMicro',
    'species': 'Human',
    'dataType': 'gene expression',
    'consortium': 'AMP-AD',
    'fileFormat': 'fastq',
    'readLength': '100',
    'dataSubtype': 'raw',
    'resourceType': 'experimentalData',
    'isModelSystem': False,
    'isMultiSpecimen': False,
    'nucleicAcidSource': 'single cell',
}
for column, value in scrna_static_annotations.items():
    fm_scrna[column] = value

#### Store `fm_scrna` to `syn64367074`

In [None]:
# NOTE(review): the upload is commented out, so this cell is a no-op as written;
# uncomment the line below to write `fm_scrna` back to syn64367074.
#syn.store(Table('syn64367074', fm_scrna))