In [83]:
import synapseclient
from synapseclient import Project, File, Folder
from synapseclient import Schema, Column, Table, Row, RowSet, as_table_columns
import itertools
import pandas as pd
import numpy as np

In [84]:
syn = synapseclient.Synapse()
syn.login()

Welcome, Victor Baham!



### Provide breakdown of AMP-AD 1.0 and AMP-AD 2.0 stats combined

#### AMP-AD 1.0: ROSMAP, MSBB, MayoRNAseq







In [85]:
rosmap_bio = pd.read_csv(syn.get('syn21323366').path)
msbb_bio = pd.read_csv(syn.get('syn21893059').path)
mayo_bio = pd.read_csv(syn.get('syn20827192').path)

#### AMP-AD 2.0: Diverse Cohorts, NPS-AD where cohort != "HBCC"

In [86]:
divco_bio = pd.read_csv(syn.get('syn51757645').path)
nps_bio = pd.read_csv(syn.get('syn55251032').path)
nps_ind = pd.read_csv(syn.get('syn55251012').path)

#### nps_bio and nps_ind are joined to obtain the gold NPS metadata set so that the specimens where cohort != "HBCC" can be filtered out

#### also, it is helpful to define the columns that are present in the NPS biospecimen metadata set to drop any extraneous columns in the gold NPS metadata set

In [87]:
nps_gold = pd.merge(nps_ind, nps_bio, on='individualID')
nps_cols_keep = nps_bio.columns.tolist()
nps_cols_keep = ['individualID',
 'specimenID',
 'specimenIdSource',
 'organ',
 'tissue',
 'isPostMortem',
 'BrodmannArea',
 'nucleicAcidSource',
 'cellType',
 'samplingAge',
 'samplingAgeUnits',
 'sampleStatus',
 'fastingState',
 'visitNumber',
 'assay',
 'Id',
 'entityId']

In [88]:
nps_filter_out_hbcc = (nps_gold['cohort'] != 'HBCC')
nps_bio_by_cohort = nps_gold[nps_filter_out_hbcc]
nps_bio_by_cohort = nps_bio_by_cohort[nps_bio_by_cohort.columns[nps_bio_by_cohort.columns.isin(['individualID',
 'specimenID',
 'specimenIdSource',
 'organ',
 'tissue',
 'isPostMortem',
 'BrodmannArea',
 'nucleicAcidSource',
 'cellType',
 'samplingAge',
 'samplingAgeUnits',
 'sampleStatus',
 'fastingState',
 'visitNumber',
 'assay',
 'Id',
 'entityId'])]].reset_index(drop=True)

### AMP-AD_1.0 golden biospecimen metadata:

In [89]:
amp_ad_1_gold = pd.concat([rosmap_bio, msbb_bio, mayo_bio]).reset_index(drop=True)

#### Add column `AMP-AD_phase` to denote this data belonging to AMP-AD_1.0

In [90]:
amp_ad_1_gold['AMP-AD_phase'] = 'AMP-AD_1.0'

#### Inspect for rows where `specimenID` is null:

In [91]:
amp_ad_1_gold[amp_ad_1_gold['specimenID'].isna()]

Unnamed: 0,individualID,specimenID,specimenIdSource,organ,tissue,BrodmannArea,sampleStatus,tissueWeight,tissueVolume,nucleicAcidSource,...,isPostMortem,samplingAge,samplingAgeUnits,visitNumber,assay,exclude,excludeReason,samplingDate,Exclusion_Category,AMP-AD_phase
16879,,,,,,,,,,,...,,,,,,,,,,AMP-AD_1.0
16880,,,,,,,,,,,...,,,,,,,,,,AMP-AD_1.0
16881,,,,,,,,,,,...,,,,,,,,,,AMP-AD_1.0
16882,,,,,,,,,,,...,,,,,,,,,,AMP-AD_1.0
16883,,,,,,,,,,,...,,,,,,,,,,AMP-AD_1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19656,,,,,,,,,,,...,,,,,,,,,,AMP-AD_1.0
19657,,,,,,,,,,,...,,,,,,,,,,AMP-AD_1.0
19658,,,,,,,,,,,...,,,,,,,,,,AMP-AD_1.0
19659,,,,,,,,,,,...,,,,,,,sample swap,,,AMP-AD_1.0


#### Drop these rows:

In [92]:
filter_amp_ad_1_nans = (~amp_ad_1_gold['specimenID'].isna())
amp_ad_1_gold = amp_ad_1_gold[filter_amp_ad_1_nans].reset_index(drop=True)

### AMP-AD_2.0 golden biospecimen metadata:

In [93]:
amp_ad_2_gold = pd.concat([divco_bio, nps_bio_by_cohort]).reset_index(drop=True)

#### Add column `AMP-AD_phase` to denote this data belonging to AMP-AD_2.0

In [94]:
amp_ad_2_gold['AMP-AD_phase'] = 'AMP-AD_2.0'

#### Inspect for rows where `specimenID` is null:

In [95]:
amp_ad_2_gold[amp_ad_2_gold['specimenID'].isna()]

Unnamed: 0,individualID,specimenID,dataGenerationSite,organ,tissue,BrodmannArea,isPostMortem,sampleStatus,nucleicAcidSource,assay,specimenMetadataSource,specimenIdSource,cellType,samplingAge,samplingAgeUnits,fastingState,visitNumber,Id,entityId,AMP-AD_phase


### AMP_AD_1.0_2.0 golden biospecimen metadata:

In [96]:
amp_ad_1_2_gold = pd.concat([amp_ad_1_gold, amp_ad_2_gold]).reset_index(drop=True)

#### Inspecting for rows where `specimenID` is null as a sanity check:

In [97]:
amp_ad_1_2_gold[amp_ad_1_2_gold['specimenID'].isna()]

Unnamed: 0,individualID,specimenID,specimenIdSource,organ,tissue,BrodmannArea,sampleStatus,tissueWeight,tissueVolume,nucleicAcidSource,...,assay,exclude,excludeReason,samplingDate,Exclusion_Category,AMP-AD_phase,dataGenerationSite,specimenMetadataSource,Id,entityId


In [98]:
amp_ad_1_2_gold

Unnamed: 0,individualID,specimenID,specimenIdSource,organ,tissue,BrodmannArea,sampleStatus,tissueWeight,tissueVolume,nucleicAcidSource,...,assay,exclude,excludeReason,samplingDate,Exclusion_Category,AMP-AD_phase,dataGenerationSite,specimenMetadataSource,Id,entityId
0,R1743384,190403-B4-A_R1743384,,brain,dorsolateral prefrontal cortex,,frozen,,,single nucleus,...,scrnaSeq,False,,,,AMP-AD_1.0,,,,
1,R2670295,190403-B4-A_R2670295,,brain,dorsolateral prefrontal cortex,,frozen,,,single nucleus,...,scrnaSeq,False,,,,AMP-AD_1.0,,,,
2,R4119160,190403-B4-A_R4119160,,brain,dorsolateral prefrontal cortex,,frozen,,,single nucleus,...,scrnaSeq,True,RNA genotype discordant with WGS,,,AMP-AD_1.0,,,,
3,R4641987,190403-B4-A_R4641987,,brain,dorsolateral prefrontal cortex,,frozen,,,single nucleus,...,scrnaSeq,False,,,,AMP-AD_1.0,,,,
4,R5693901,190403-B4-A_R5693901,,brain,dorsolateral prefrontal cortex,,frozen,,,single nucleus,...,scrnaSeq,True,Duplicated donor,,,AMP-AD_1.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24570,R5508487,R5508487,,brain,dorsolateral prefrontal cortex,,frozen,,,single nucleus,...,snrnaSeq,,,,,AMP-AD_2.0,,,2b02ebb9-a0e9-4883-9e2e-7ba7350499f7,
24571,R9380629,R9380629,,brain,dorsolateral prefrontal cortex,,frozen,,,single nucleus,...,snrnaSeq,,,,,AMP-AD_2.0,,,b9ad1423-925f-40b5-8429-943386b99ef2,
24572,R5909439,R5909439,,brain,dorsolateral prefrontal cortex,,frozen,,,single nucleus,...,snrnaSeq,,,,,AMP-AD_2.0,,,10407afc-e3e9-4fea-8b37-044eab939981,
24573,R8594936,R8594936,,brain,dorsolateral prefrontal cortex,,frozen,,,single nucleus,...,snrnaSeq,,,,,AMP-AD_2.0,,,a8ddcdd4-4be0-4200-bd3c-9a84228bfb6f,


In [108]:
counts_by_specimen_and_region = amp_ad_1_2_gold.groupby(['specimenID', 'AMP-AD_phase', 'tissue']).size().unstack('tissue', fill_value=0.0)

In [112]:
counts_by_specimen_and_region

Unnamed: 0_level_0,tissue,Head of caudate nucleus,blood,caudate nucleus,cerebellar cortex,cerebellum,dorsolateral prefrontal cortex,frontal cortex,frontal pole,inferior frontal gyrus,occipital visual cortex,parahippocampal gyrus,posterior cingulate cortex,prefrontal cortex,serum,superior temporal gyrus,temporal cortex,temporal pole,unspecified
specimenID,AMP-AD_phase,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
01_120405,AMP-AD_1.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
02_120405,AMP-AD_1.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
03_120405,AMP-AD_1.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
04_120405,AMP-AD_1.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
05_120405,AMP-AD_1.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
sd_b25.131N,AMP-AD_2.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
sd_b25.132C,AMP-AD_2.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
sd_b25.132N,AMP-AD_2.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
sd_b25.133C,AMP-AD_2.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [119]:
counts_by_specimen_and_region.to_csv('AMP-AD_1.0_2.0_counts_by_specimen__phase_and_region_May_7_24.csv')

In [120]:
summary_by_counts = amp_ad_1_2_gold.groupby(["tissue", "AMP-AD_phase"]).size()

In [121]:
summary_by_counts.to_csv('AMP-AD_1.0_2.0_counts_by_phase_and_region_summary_May_7_24.csv')