Context: [[ADEL-312] fix mismatched identifiers in ROSMAP snpArray data](https://sagebionetworks.jira.com/browse/ADEL-312):
> Jaro Bendl at MSSM recently brought up that we’ve had a persistent issue where the specimenIDs in the old ROSMAP snpArray data files do not match the specimenIDs in the metadata – the metadata uses an “R” prefix and the data files use “ROS” or “MAP” prefixes. 

> If we have a minute to adjust our specimenIDs in the snpArray assay metadata and the biospecimen metadata files and make sure they match the data, that would be very helpful to contributors!

> Forum discussion here: Project ID for GWAS Illumina chopped #syn7769936 

In [5]:
import synapseclient
from synapseclient import Project, File, Folder
from synapseclient import Schema, Column, Table, Row, RowSet, as_table_columns
import pandas as pd
from pandas_plink import read_plink1_bin

In [2]:
syn = synapseclient.Synapse()
syn.login(authToken=[redacted])

Welcome, Victor Baham!



#### Exploring the bed/bim/fam files:

In [3]:
r_bed = syn.get('syn3221153').path
r_bim = syn.get('syn3221155').path
r_fam = syn.get('syn3221157').path

G = read_plink1_bin(r_bed, r_bim, r_fam, verbose=False)
print(G)

<xarray.DataArray 'genotype' (sample: 1708, variant: 750173)> Size: 5GB
dask.array<transpose, shape=(1708, 750173), dtype=float32, chunksize=(1024, 1024), chunktype=numpy.ndarray>
Coordinates: (12/14)
  * sample   (sample) <U11 75kB 'ROS20275399' 'ROS10442701' ... 'ROS20701008'
  * variant  (variant) <U13 39MB 'variant0' 'variant1' ... 'variant750172'
    fid      (sample) object 14kB 'KronosII_P01_6.0_A01_ROS20275399.CEL' ... ...
    iid      (sample) object 14kB 'ROS20275399' 'ROS10442701' ... 'ROS20701008'
    father   (sample) object 14kB '0' '0' '0' '0' '0' ... '0' '0' '0' '0' '0'
    mother   (sample) object 14kB '0' '0' '0' '0' '0' ... '0' '0' '0' '0' '0'
    ...       ...
    chrom    (variant) object 6MB '0' '0' '0' '0' '0' ... '26' '26' '26' '26'
    snp      (variant) object 6MB 'SNP_A-8282312' ... 'rs3134562'
    cm       (variant) float64 6MB 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0
    pos      (variant) int32 3MB 0 0 0 0 0 0 ... 15608 15671 15785 15925 16141
    a0   

In [6]:
ros_clin = pd.read_csv(syn.get('syn3191087').path)
ros_clin

Downloading files: 100%|██████████| 335k/335k [00:00<00:00, 1.58MB/s, syn3191087]

Downloaded syn3191087 to /home/jovyan/.synapseCache/791/75431791/ROSMAP_clinical.csv


Downloading files: 100%|██████████| 335k/335k [00:00<00:00, 1.54MB/s, syn3191087]


Unnamed: 0,projid,Study,msex,educ,race,spanish,apoe_genotype,age_at_visit_max,age_first_ad_dx,age_death,cts_mmse30_first_ad_dx,cts_mmse30_lv,pmi,braaksc,ceradsc,cogdx,dcfdx_lv,individualID
0,10101589,ROS,1.0,20.0,1.0,2.0,34.0,90+,90+,90+,18.0,5.0,9.916667,4.0,2.0,4.0,4.0,R6939144
1,86767530,MAP,0.0,10.0,1.0,2.0,33.0,90+,90+,90+,18.0,10.0,6.500000,4.0,2.0,4.0,4.0,R3893503
2,9650662,MAP,0.0,15.0,1.0,2.0,23.0,90+,90+,90+,0.0,0.0,3.850000,3.0,2.0,4.0,4.0,R8937093
3,50402855,MAP,0.0,21.0,1.0,2.0,33.0,90+,,,,27.0,,,,,1.0,R7139444
4,20544321,ROS,0.0,16.0,1.0,2.0,23.0,90+,90+,,13.0,14.0,,,,,4.0,R4971237
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3579,22207815,ROS,0.0,18.0,2.0,2.0,23.0,57.653661875427787,,,,29.0,,,,,1.0,R5306025
3580,22207941,ROS,0.0,16.0,2.0,2.0,34.0,56.651608487337441,,,,27.0,,,,,1.0,R6142763
3581,49333806,MAP,0.0,12.0,2.0,2.0,,56.599589322381931,,,,30.0,,,,,1.0,R4468842
3582,59720188,MAP,0.0,13.0,1.0,1.0,,54.622861054072551,,,,29.0,,,,,1.0,R9446033


Downloading files: 100%|██████████| 106k/106k [00:00<00:00, 605kB/s, syn21314550]66]

#### Use ROSMAP_clinical.csv to establish relationship between existing `individualID` and `specimenID` within the bed/bim/fam files:

In [11]:
ind_to_spec_update = dict(zip(ros_clin['individualID'], ros_clin['Study'] + ros_clin['projid'].astype(str)))

#### Goal: change the `specimenID` for all `snpArray` specimens unless the specimenID starts with `11AD`

In [57]:
ros_bio = pd.read_csv(syn.get('syn21323366').path)
ros_bio                                           

Unnamed: 0,individualID,specimenID,specimenIdSource,organ,tissue,BrodmannArea,sampleStatus,tissueWeight,tissueVolume,nucleicAcidSource,cellType,fastingState,isPostMortem,samplingAge,samplingAgeUnits,visitNumber,assay,exclude,excludeReason,samplingDate
0,R1743384,190403-B4-A_R1743384,,brain,dorsolateral prefrontal cortex,,frozen,,,single nucleus,,,True,,,,scrnaSeq,False,,
1,R2670295,190403-B4-A_R2670295,,brain,dorsolateral prefrontal cortex,,frozen,,,single nucleus,,,True,,,,scrnaSeq,False,,
2,R4119160,190403-B4-A_R4119160,,brain,dorsolateral prefrontal cortex,,frozen,,,single nucleus,,,True,,,,scrnaSeq,True,RNA genotype discordant with WGS,
3,R4641987,190403-B4-A_R4641987,,brain,dorsolateral prefrontal cortex,,frozen,,,single nucleus,,,True,,,,scrnaSeq,False,,
4,R5693901,190403-B4-A_R5693901,,brain,dorsolateral prefrontal cortex,,frozen,,,single nucleus,,,True,,,,scrnaSeq,True,Duplicated donor,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13340,R4244951,ROSMAP210.b14.132N.R4244951,,brain,dorsolateral prefrontal cortex,9,,,,,,,True,,,,TMT quantitation,,,
13341,R2039807,ROSMAP210.b14.132C.R2039807,,brain,dorsolateral prefrontal cortex,9,,,,,,,True,,,,TMT quantitation,,,
13342,R5938989,ROSMAP210.b14.133N.R5938989,,brain,dorsolateral prefrontal cortex,9,,,,,,,True,,,,TMT quantitation,,,
13343,R6622577,ROSMAP210.b14.133C.R6622577,,brain,dorsolateral prefrontal cortex,9,,,,,,,True,,,,TMT quantitation,,,


In [48]:
change_bio = ros_bio[ros_bio['assay'] == 'snpArray']
change_bio

Unnamed: 0,individualID,specimenID,specimenIdSource,organ,tissue,BrodmannArea,sampleStatus,tissueWeight,tissueVolume,nucleicAcidSource,cellType,fastingState,isPostMortem,samplingAge,samplingAgeUnits,visitNumber,assay,exclude,excludeReason,samplingDate
6820,R9976232,R9976232,,,,,,,,,,,,,,,snpArray,,,
6821,R9974729,R9974729,,,,,,,,,,,,,,,snpArray,,,
6822,R9940994,R9940994,,,,,,,,,,,,,,,snpArray,,,
6823,R9936070,R9936070,,,,,,,,,,,,,,,snpArray,,,
6824,R9907075,R9907075,,,,,,,,,,,,,,,snpArray,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8905,R1020037,11AD39718,,,,,,,,,,,,,,,snpArray,,,
8906,R1017692,R1017692,,,,,,,,,,,,,,,snpArray,,,
8907,R1004922,R1004922,,,,,,,,,,,,,,,snpArray,,,
8908,,11AD39812,,,,,,,,,,,,,,,snpArray,,,


In [50]:
change_bio = change_bio[change_bio['specimenID'].str.startswith('11AD') == False]

In [53]:
change_bio['specimenID'] = change_bio['specimenID'].map(lambda x: ind_to_spec_update.get(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  change_bio['specimenID'] = change_bio['specimenID'].map(lambda x: ind_to_spec_update.get(x))



In [54]:
change_bio

Unnamed: 0,individualID,specimenID,specimenIdSource,organ,tissue,BrodmannArea,sampleStatus,tissueWeight,tissueVolume,nucleicAcidSource,cellType,fastingState,isPostMortem,samplingAge,samplingAgeUnits,visitNumber,assay,exclude,excludeReason,samplingDate
6820,R9976232,ROS21183160,,,,,,,,,,,,,,,snpArray,,,
6821,R9974729,ROS20594669,,,,,,,,,,,,,,,snpArray,,,
6822,R9940994,MAP27560777,,,,,,,,,,,,,,,snpArray,,,
6823,R9936070,MAP2899847,,,,,,,,,,,,,,,snpArray,,,
6824,R9907075,ROS11616707,,,,,,,,,,,,,,,snpArray,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8902,R1039896,ROS10479506,,,,,,,,,,,,,,,snpArray,,,
8903,R1034846,MAP47652086,,,,,,,,,,,,,,,snpArray,,,
8906,R1017692,ROS20787136,,,,,,,,,,,,,,,snpArray,,,
8907,R1004922,MAP18301541,,,,,,,,,,,,,,,snpArray,,,


In [55]:
fin_bio = pd.concat([ros_bio, change_bio])

In [56]:
fin_bio

Unnamed: 0,individualID,specimenID,specimenIdSource,organ,tissue,BrodmannArea,sampleStatus,tissueWeight,tissueVolume,nucleicAcidSource,cellType,fastingState,isPostMortem,samplingAge,samplingAgeUnits,visitNumber,assay,exclude,excludeReason,samplingDate
0,R1743384,190403-B4-A_R1743384,,brain,dorsolateral prefrontal cortex,,frozen,,,single nucleus,,,True,,,,scrnaSeq,False,,
1,R2670295,190403-B4-A_R2670295,,brain,dorsolateral prefrontal cortex,,frozen,,,single nucleus,,,True,,,,scrnaSeq,False,,
2,R4119160,190403-B4-A_R4119160,,brain,dorsolateral prefrontal cortex,,frozen,,,single nucleus,,,True,,,,scrnaSeq,True,RNA genotype discordant with WGS,
3,R4641987,190403-B4-A_R4641987,,brain,dorsolateral prefrontal cortex,,frozen,,,single nucleus,,,True,,,,scrnaSeq,False,,
4,R5693901,190403-B4-A_R5693901,,brain,dorsolateral prefrontal cortex,,frozen,,,single nucleus,,,True,,,,scrnaSeq,True,Duplicated donor,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8902,R1039896,ROS10479506,,,,,,,,,,,,,,,snpArray,,,
8903,R1034846,MAP47652086,,,,,,,,,,,,,,,snpArray,,,
8906,R1017692,ROS20787136,,,,,,,,,,,,,,,snpArray,,,
8907,R1004922,MAP18301541,,,,,,,,,,,,,,,snpArray,,,


In [58]:
fin_bio = fin_bio[~fin_bio.index.duplicated(keep='last')]

In [60]:
fin_bio.sort_index(axis=0)
fin_bio

Unnamed: 0,individualID,specimenID,specimenIdSource,organ,tissue,BrodmannArea,sampleStatus,tissueWeight,tissueVolume,nucleicAcidSource,cellType,fastingState,isPostMortem,samplingAge,samplingAgeUnits,visitNumber,assay,exclude,excludeReason,samplingDate
0,R1743384,190403-B4-A_R1743384,,brain,dorsolateral prefrontal cortex,,frozen,,,single nucleus,,,True,,,,scrnaSeq,False,,
1,R2670295,190403-B4-A_R2670295,,brain,dorsolateral prefrontal cortex,,frozen,,,single nucleus,,,True,,,,scrnaSeq,False,,
2,R4119160,190403-B4-A_R4119160,,brain,dorsolateral prefrontal cortex,,frozen,,,single nucleus,,,True,,,,scrnaSeq,True,RNA genotype discordant with WGS,
3,R4641987,190403-B4-A_R4641987,,brain,dorsolateral prefrontal cortex,,frozen,,,single nucleus,,,True,,,,scrnaSeq,False,,
4,R5693901,190403-B4-A_R5693901,,brain,dorsolateral prefrontal cortex,,frozen,,,single nucleus,,,True,,,,scrnaSeq,True,Duplicated donor,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13340,R4244951,ROSMAP210.b14.132N.R4244951,,brain,dorsolateral prefrontal cortex,9,,,,,,,True,,,,TMT quantitation,,,
13341,R2039807,ROSMAP210.b14.132C.R2039807,,brain,dorsolateral prefrontal cortex,9,,,,,,,True,,,,TMT quantitation,,,
13342,R5938989,ROSMAP210.b14.133N.R5938989,,brain,dorsolateral prefrontal cortex,9,,,,,,,True,,,,TMT quantitation,,,
13343,R6622577,ROSMAP210.b14.133C.R6622577,,brain,dorsolateral prefrontal cortex,9,,,,,,,True,,,,TMT quantitation,,,


In [61]:
fin_bio.to_csv('ROSMAP_biospecimen_metadata.csv')

In [78]:
ros_snp = pd.read_csv(syn.get('syn21314550').path)
ros_snp

Unnamed: 0,assay,specimenID,platform,dnaBatch,arrayBatch,260/280,260/230,GQN
0,snpArray,11AD39713,Illumina_HumanOmniExpress,,,,,
1,snpArray,11AD39714,Illumina_HumanOmniExpress,,,,,
2,snpArray,11AD39715,Illumina_HumanOmniExpress,,,,,
3,snpArray,11AD39716,Illumina_HumanOmniExpress,,,,,
4,snpArray,11AD39717,Illumina_HumanOmniExpress,,,,,
...,...,...,...,...,...,...,...,...
2085,snpArray,R3743746,Affy6.0,,,,,
2086,snpArray,R9958949,Affy6.0,,,,,
2087,snpArray,R3438797,Affy6.0,,,,,
2088,snpArray,R1177802,Affy6.0,,,,,


In [79]:
change_snp = ros_snp[ros_snp['specimenID'].str.startswith('11AD') == False]

In [80]:
change_snp

Unnamed: 0,assay,specimenID,platform,dnaBatch,arrayBatch,260/280,260/230,GQN
382,snpArray,R1571846,Affy6.0,,,,,
383,snpArray,R3978789,Affy6.0,,,,,
384,snpArray,R2274528,Affy6.0,,,,,
385,snpArray,R8140052,Affy6.0,,,,,
386,snpArray,R6211392,Affy6.0,,,,,
...,...,...,...,...,...,...,...,...
2085,snpArray,R3743746,Affy6.0,,,,,
2086,snpArray,R9958949,Affy6.0,,,,,
2087,snpArray,R3438797,Affy6.0,,,,,
2088,snpArray,R1177802,Affy6.0,,,,,


In [81]:
change_snp['specimenID'] = change_snp['specimenID'].map(lambda x: ind_to_spec_update.get(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  change_snp['specimenID'] = change_snp['specimenID'].map(lambda x: ind_to_spec_update.get(x))



In [82]:
change_snp

Unnamed: 0,assay,specimenID,platform,dnaBatch,arrayBatch,260/280,260/230,GQN
382,snpArray,MAP285563,Affy6.0,,,,,
383,snpArray,MAP482428,Affy6.0,,,,,
384,snpArray,MAP617643,Affy6.0,,,,,
385,snpArray,MAP668310,Affy6.0,,,,,
386,snpArray,MAP696418,Affy6.0,,,,,
...,...,...,...,...,...,...,...,...
2085,snpArray,ROS95392352,Affy6.0,,,,,
2086,snpArray,ROS95939725,Affy6.0,,,,,
2087,snpArray,ROS97411796,Affy6.0,,,,,
2088,snpArray,ROS97929179,Affy6.0,,,,,


In [83]:
fin_snp = pd.concat([ros_snp, change_snp])

In [84]:
fin_snp

Unnamed: 0,assay,specimenID,platform,dnaBatch,arrayBatch,260/280,260/230,GQN
0,snpArray,11AD39713,Illumina_HumanOmniExpress,,,,,
1,snpArray,11AD39714,Illumina_HumanOmniExpress,,,,,
2,snpArray,11AD39715,Illumina_HumanOmniExpress,,,,,
3,snpArray,11AD39716,Illumina_HumanOmniExpress,,,,,
4,snpArray,11AD39717,Illumina_HumanOmniExpress,,,,,
...,...,...,...,...,...,...,...,...
2085,snpArray,ROS95392352,Affy6.0,,,,,
2086,snpArray,ROS95939725,Affy6.0,,,,,
2087,snpArray,ROS97411796,Affy6.0,,,,,
2088,snpArray,ROS97929179,Affy6.0,,,,,


In [85]:
fin_snp = fin_snp[~fin_snp.index.duplicated(keep='last')]

In [86]:
fin_snp.sort_index(axis=0)
fin_snp

Unnamed: 0,assay,specimenID,platform,dnaBatch,arrayBatch,260/280,260/230,GQN
0,snpArray,11AD39713,Illumina_HumanOmniExpress,,,,,
1,snpArray,11AD39714,Illumina_HumanOmniExpress,,,,,
2,snpArray,11AD39715,Illumina_HumanOmniExpress,,,,,
3,snpArray,11AD39716,Illumina_HumanOmniExpress,,,,,
4,snpArray,11AD39717,Illumina_HumanOmniExpress,,,,,
...,...,...,...,...,...,...,...,...
2085,snpArray,ROS95392352,Affy6.0,,,,,
2086,snpArray,ROS95939725,Affy6.0,,,,,
2087,snpArray,ROS97411796,Affy6.0,,,,,
2088,snpArray,ROS97929179,Affy6.0,,,,,


In [87]:
fin_snp.to_csv('ROSMAP_assay_snpArray_metadata.csv')