# Basic sample count

This is an initial pipeline that retrieves the IDs of participants with complete phenotype and covariate data, and keeps those that are also present in the metabolomics datasets (Biocrates p180 and Nightingale).

In [1]:
#### Import libraries
import pandas as pd
import numpy as np

In [2]:
#### Read databases
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, so a column cannot end up with mixed types.
# NOTE(review): the warning recorded in this cell's output looks like a pandas
# DtypeWarning (mixed-type columns) -- confirm it is gone after re-running.
qt_pad = pd.read_csv(
    '~/mah546/default/datasets/ADNI/Test_Data/Data_for_Challenges/ADNI_QT-PAD/ADNI_adnimerge_20170629_QT-freeze.csv',
    low_memory=False,
)
qt_pad

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0,RID,VISCODE,COLPROT,ORIGPROT,EXAMDATE,DX.bl,AGE,PTGENDER,PTEDUCAT,PTETHCAT,...,ABETA.bl,PTAU.bl,TAU.bl,FDG.bl,PIB.bl,AV45.bl,Years.bl,Month.bl,Month,M
0,2,bl,ADNI1,ADNI1,2005-09-08,CN,74.3,Male,16,Not Hisp/Latino,...,,,,1.369264,,,0.000000,0.000000,0,0
1,2,m06,ADNI1,ADNI1,2006-03-06,CN,74.3,Male,16,Not Hisp/Latino,...,,,,1.369264,,,0.490075,5.868852,6,6
2,2,m36,ADNI1,ADNI1,2008-08-27,CN,74.3,Male,16,Not Hisp/Latino,...,,,,1.369264,,,2.967830,35.540984,36,36
3,2,m60,ADNIGO,ADNI1,2010-09-22,CN,74.3,Male,16,Not Hisp/Latino,...,,,,1.369264,,,5.037645,60.327869,60,60
4,2,m66,ADNIGO,ADNI1,2011-03-04,CN,74.3,Male,16,Not Hisp/Latino,...,,,,1.369264,,,5.483915,65.672131,66,66
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12744,5295,bl,ADNI2,ADNI2,2013-12-11,SMC,75.5,Female,15,Not Hisp/Latino,...,416.8,8.53,88.69,1.163174,,1.319026,0.000000,0.000000,0,0
12745,5295,m03,ADNI2,ADNI2,2014-02-14,SMC,75.5,Female,15,Not Hisp/Latino,...,416.8,8.53,88.69,1.163174,,1.319026,0.177960,2.131148,3,3
12746,5295,m24,ADNI2,ADNI2,2015-12-08,SMC,75.5,Female,15,Not Hisp/Latino,...,416.8,8.53,88.69,1.163174,,1.319026,1.990418,23.836066,24,24
12747,5296,bl,ADNI2,ADNI2,2013-12-18,SMC,69.3,Male,14,Not Hisp/Latino,...,1460,20.9,224.9,1.429264,,1.138379,0.000000,0.000000,0,0


In [3]:
print(f'Initially, there are {len(qt_pad)} rows')

#### Restrict the table to the baseline visit
qt_pad = qt_pad.loc[qt_pad['VISCODE'] == 'bl']
print(f'After keeping only baseline, there are {len(qt_pad)} rows')

#### Drop rows missing any phenotype or APOE4
#### (per the original note, APOE4 is the only covariate with extra missing data)
remove = qt_pad[['Hippocampus', 'Entorhinal', 'Fusiform', 'APOE4']].isna().any(axis=1)
qt_pad = qt_pad.loc[~remove]
print(f'After removing those with missing phenotypes, there are {len(qt_pad)} rows')



Initially, there are 12749 rows
After keeping only baseline, there are 1737 rows
After removing those with missing phenotypes, there are 1393 rows


In [4]:
# Non-null counts of each remaining column per gender and baseline diagnosis
(
    qt_pad[['RID', 'PTGENDER', 'DX.bl', 'AGE', 'PTEDUCAT', 'APOE4']]
    .groupby(['PTGENDER', 'DX.bl'])
    .count()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,RID,AGE,PTEDUCAT,APOE4
PTGENDER,DX.bl,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Female,AD,113,113,113,113
Female,CN,179,179,179,179
Female,EMCI,115,115,115,115
Female,LMCI,175,175,175,175
Female,SMC,52,52,52,52
Male,AD,134,134,134,134
Male,CN,184,184,184,184
Male,EMCI,139,139,139,139
Male,LMCI,268,268,268,268
Male,SMC,34,34,34,34


## Checking samples in metabolomics

### Biocrates

In [5]:
# Load the Biocrates p180 FIA tables for ADNI1 and ADNI2/GO
p180_adni1_fia = pd.read_csv(
    '~/mah546/default/datasets/ADNI/Biospecimen/Biospecimen_Results/ADMC/Biocrates_p180/ADMCDUKEP180FIA_01_15_16.csv'
)
p180_adni2go_fia = pd.read_csv(
    '~/mah546/default/datasets/ADNI/Biospecimen/Biospecimen_Results/ADMC/Biocrates_p180/ADMCDUKEP180FIAADNI2GO.csv'
)
# The UPLC files are not read: per the original note, the FIA and UPLC methods contain the same samples

# Remove control samples
def remove_controls(dat, control_rid=999999):
    """Return the rows of ``dat`` that are not control samples.

    Parameters
    ----------
    dat : pandas.DataFrame
        Table with an ``'RID'`` column.
    control_rid : int, optional
        ``'RID'`` value that marks a control sample. Defaults to 999999,
        the value this notebook filters out.

    Returns
    -------
    pandas.DataFrame
        The rows of ``dat`` whose ``'RID'`` differs from ``control_rid``.
        ``dat`` itself is not modified.
    """
    return dat[dat['RID'] != control_rid]

p180_adni1_fia = remove_controls(p180_adni1_fia)
p180_adni2go_fia = remove_controls(p180_adni2go_fia)

# Pool the RIDs of ADNI 1, 2 and GO, keeping each one once
p180_IDs = pd.Series(
    pd.concat([p180_adni1_fia['RID'], p180_adni2go_fia['RID']]).unique()
)

# Keep the Biocrates IDs that also appear in the phenotype table
final_IDs = p180_IDs[p180_IDs.isin(qt_pad['RID'])]
print(f'There are {len(final_IDs)} intersected IDs in Biocrates p180 and Phenotypes')

There are 1353 intersected IDs in Biocrates p180 and Phenotypes


### Nightingale

In [6]:
# Load the Nightingale table and list each RID once
nigth2 = pd.read_csv(
    '~/mah546/default/datasets/ADNI/Biospecimen/Biospecimen_Results/ADMC/Nightingale/ADNINIGHTINGALE2.csv'
)
nigth2_IDs = pd.Series(nigth2['RID'].unique())

# Narrow final_IDs down to the IDs that are also present in Nightingale
final_IDs = nigth2_IDs.loc[nigth2_IDs.isin(final_IDs)]
print(f'There are {len(final_IDs)} intersected IDs in all datasets')


There are 1320 intersected IDs in all datasets


In [7]:
# Write the final IDs to disk without a header row or an index column
final_IDs.to_csv(
    '../results/FinalIDs.csv',
    index=False,
    header=False,
)

## Summary of the final IDs

In [9]:
# Phenotype rows whose RID made it into the final ID list
final_qt_pad = qt_pad.loc[qt_pad['RID'].isin(final_IDs)]

# Non-null counts per gender and baseline diagnosis, printed as plain text
print(
    final_qt_pad[['RID', 'PTGENDER', 'DX.bl', 'AGE', 'PTEDUCAT', 'APOE4']]
    .groupby(['PTGENDER', 'DX.bl'])
    .count()
)

                RID  AGE  PTEDUCAT  APOE4
PTGENDER DX.bl                           
Female   AD     106  106       106    106
         CN     172  172       172    172
         EMCI   112  112       112    112
         LMCI   165  165       165    165
         SMC     50   50        50     50
Male     AD     124  124       124    124
         CN     177  177       177    177
         EMCI   130  130       130    130
         LMCI   250  250       250    250
         SMC     34   34        34     34
