In [2]:
import os
import numpy as np
import pandas as pd

import seaborn as sn

% matplotlib inline



In [6]:
# Absolute path of the processing directory, resolved relative to the
# notebook's working directory.
data_dir = os.path.abspath('../primary-processing/agp_processing/')

# Input files inside the processing directory:
#   map_fp -- tab-delimited sample metadata
#   otu_fp -- BIOM-format OTU table
map_fp, otu_fp = (
    os.path.join(data_dir, relative_fp)
    for relative_fp in ('01-raw/raw-metadata.txt',
                        '03-otus/100nt/gg-13_8-97-percent/otu_table_r1_21.biom')
)


In [7]:
# Placeholder strings that should be read as missing values (in addition to
# the pandas defaults).
missing_tokens = ['NA', 'unknown', '', 'no_data', 'None', 'Unknown']

# Read every metadata column as text so that nothing is coerced to a number,
# then index the table by sample ID.
md = pd.read_csv(map_fp, sep='\t', dtype=str, na_values=missing_tokens)
md = md.set_index('#SampleID')

In [8]:
count_summary = !biom summarize-table -i $otu_fp

count_summary = pd.DataFrame([s.split(': ') for s in count_summary[15:]], columns=['#SampleID', 'counts'])
count_summary.set_index('#SampleID', inplace=True)
count_summary['counts'] = count_summary['counts'].astype(float)

md = md.join(count_summary)

In [19]:
# remap the variety of responses in the fields, and reduce to simpler responses. 

from functools import partial
def mapper(mapping, value):
    """Translate ``value`` through ``mapping``, leaving unknown values alone.

    Parameters
    ----------
    mapping : dict
        Lookup table from original responses to their replacements.
    value : object
        The response to translate.

    Returns
    -------
    object
        ``mapping[value]`` when ``value`` is a key of ``mapping``; otherwise
        ``value`` itself, unchanged.
    """
    if value in mapping:
        return mapping[value]
    return value

# Lookup tables that collapse the free-form survey responses into two answers
# per field. Each table is built by listing every response that counts as
# "has the condition / smokes" and then adding the negative responses.

diabetes_values_fix = dict.fromkeys(
    ['Diagnosed by a medical professional (doctor, physician assistant)',
     'Diagnosed by an alternative medicine practitioner',
     'Type I',
     'Type II',
     'Self-diagnosed'],
    'I have diabetes')
diabetes_values_fix['I do not have this condition'] = 'I do not have diabetes'

ibd_values_fix = dict.fromkeys(
    ["Crohn's disease",
     "Diagnosed by a medical professional (doctor, physician assistant)",
     "Diagnosed by an alternative medicine practitioner",
     "Ulcerative colitis",
     "Self-diagnosed"],
    "I have an IBD")
ibd_values_fix.update(dict.fromkeys(
    ["I do not have this condition",
     "I do not have IBD"],
    "I do not have an IBD"))

smoking_values_fix = dict.fromkeys(
    ['Daily',
     'Occasionally (1-2 times/week)',
     'Rarely (a few times/month)',
     'Rarely (few times/month)',
     'Regularly (3-5 times/week)'],
    'I smoke')
smoking_values_fix['Never'] = 'I do not smoke'

# Bind each lookup table to `mapper`, giving one single-argument cleaner per field.
diabetes_map, ibd_map, smoke_map = [
    partial(mapper, fix_table)
    for fix_table in (diabetes_values_fix, ibd_values_fix, smoking_values_fix)
]

# Recode each metadata column with its cleaner; responses that are not listed
# in the lookup table are passed through unchanged.
for column, cleaner in [('DIABETES', diabetes_map),
                        ('IBD', ibd_map),
                        ('SMOKING_FREQUENCY', smoke_map)]:
    md[column] = md[column].apply(cleaner)

We're going to look at participants who submitted at least one sample.

In [20]:
def first_sample_per_subject(frame):
    """Return one sample ID per participant.

    Parameters
    ----------
    frame : pd.DataFrame
        Metadata indexed by sample ID, with a ``HOST_SUBJECT_ID`` column.

    Returns
    -------
    list
        For every ``HOST_SUBJECT_ID`` group, the first index label (sample ID)
        belonging to that group. Rows with a null ``HOST_SUBJECT_ID`` are not
        part of any group and are therefore not represented.
    """
    subject_groups = frame.groupby('HOST_SUBJECT_ID').groups
    return [sample_ids[0] for sample_ids in subject_groups.values()]


# Keep only the samples that have a recorded body site
md = md.loc[md['BODY_SITE'].notnull()]

# Depth-filtered subsets: samples with more than 1,000 / more than 10,000 counts
md_1k = md.loc[md['counts'] > 1000]
md_10k = md.loc[md['counts'] > 10000]

# Gets only a single sample for each person
hsi_ = []  # not used in this cell; kept in case a later cell refers to it
single_ids = first_sample_per_subject(md)
single_md = md.loc[single_ids]

# The same selection, restricted to each depth-filtered subset
single_1k = first_sample_per_subject(md_1k)
single_10k = first_sample_per_subject(md_10k)

Let's check the total number of participants and total number of samples.

In [21]:
# Number of participants (one sample each) and number of samples at each
# filtering level: everything, more than 1,000 counts, more than 10,000 counts.
pd.DataFrame(
    {'Participants': [len(single_ids), len(single_1k), len(single_10k)],
     'Samples': [len(md), len(md_1k), len(md_10k)]},
    index=['All', '1k', '10k'],
    columns=['Participants', 'Samples']
)

Unnamed: 0,Participants,Samples
All,5508,6789
1k,5300,6468
10k,4093,4894


Now, let's check the nationality of our participants, and look at the distribution of individuals.

First, we'll look at the number of countries and sovereign states where participants live.

In [22]:
# Number of distinct (non-null) COUNTRY values among the one-sample-per-participant rows.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(single_md['COUNTRY'].nunique())

28


Next, let's look at the actual number of participants living in each nation or territory.

In [23]:
def participants_per_country(frame):
    """Count the rows of ``frame`` for each non-null ``COUNTRY`` value.

    The frames passed in below hold one row per participant, so the group
    size is the number of participants living in that country.
    """
    return frame.groupby('COUNTRY').size()


# One column per filtering level, one row per country, ordered by the number
# of participants remaining at the strictest level.
nationalities = pd.DataFrame(
    [participants_per_country(single_md),
     participants_per_country(md.loc[single_1k]),
     participants_per_country(md.loc[single_10k])],
    index=['All', 'Returned', 'Analyzed']
).transpose().sort_values('Analyzed', ascending=False)

# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(nationalities)

                       All  Returned  Analyzed
USA                   4589      4440      3418
United Kingdom         557       508       439
Australia              149       148        93
Canada                  70        65        50
Switzerland             12        12         7
France                   8         8         6
Germany                 11        10         6
Thailand                 9         9         6
Belgium                  9         8         6
Netherlands              6         6         5
Norway                   5         5         5
Czech Republic           7         7         4
New Zealand              5         5         3
Spain                    3         3         2
Italy                    4         4         2
Sweden                   2         2         2
United Arab Emirates     1         1         1
Brazil                   1         1         1
Jersey                   1         1         1
Poland                   1         1         1
China        

We're going to focus the first demographic analysis on participants from the US. 

In [24]:
# Keep only the participants whose COUNTRY is recorded as 'USA'
is_usa = single_md['COUNTRY'] == 'USA'
us_md = single_md.loc[is_usa]

In [25]:
# Maps "<CATEGORY> - <value>" to (subject count, percentage within the category)
res_table = {}
n_samples = float(len(us_md))  # not used in this cell; kept in case a later cell refers to it
cats = ['SEX', 'RACE', 'SMOKING_FREQUENCY', 'DIABETES', 'IBD', 'BMI_CAT']

for cat in cats:
    # drop out any null values
    cat_tab = us_md[us_md[cat].notnull()]

    # determine how many unique subjects are represented; this is the
    # denominator for the within-category percentages
    n_subjects = float(len(cat_tab.HOST_SUBJECT_ID.unique()))

    # number of unique subjects for each value (e.g., for SEX: Male, Female,
    # Other), computed once per category rather than once per value
    subjects_per_val = cat_tab.groupby(cat).HOST_SUBJECT_ID.nunique()

    for val in subjects_per_val.index:
        count = subjects_per_val[val]

        # store the count of subjects and the percentage of the subjects represented
        res_table["%s - %s" % (cat, val)] = (count, (count / n_subjects) * 100)

# One row per "<CATEGORY> - <value>" key, one column per tuple element
res = pd.DataFrame.from_dict(res_table, orient='index')
res.columns = ['Count', 'Within group percentage']

In [26]:
# Category/value : percent in US population
# Keys use the same "<CATEGORY> - <value>" format as the row labels of `res`.
census_data = {
               # from http://quickfacts.census.gov/qfd/states/00000.html
               'SEX - female': 50.8,
               'SEX - male': 49.2,  # this is an over estimate as only the % of females is described in the above URL
               'SEX - other': np.nan,  # does not appear to be tracked

               # from http://www.census.gov/prod/cen2010/briefs/c2010br-02.pdf
               # doesn't sum to 100% as the fields don't map exactly, so there may be some overlap represented below
               'RACE - African American': 12.6,
               'RACE - Asian or Pacific Islander': 5.0,
               'RACE - Caucasian': 63.7,
               'RACE - Hispanic': 16.3,
               'RACE - Other': 6.2,

               # from http://www.census.gov/compendia/statab/2012/tables/12s0211.pdf
               # using total, non age adjusted values
               # TODO: we probably want to filter to > 18yo for these values in the metadata
               'BMI_CAT - Normal': 31.2,
               'BMI_CAT - Obese': 33.0,
               'BMI_CAT - Overweight': 34.0,
               'BMI_CAT - Underweight': 1.8,

               # from http://www.cdc.gov/diabetes/data/statistics/2014statisticsreport.html
               'DIABETES - I do not have diabetes': 90.7,
               # This uses 21 million
               # NOTE(review): 9.3% looks like the report's total (diagnosed +
               # undiagnosed) prevalence rather than a figure derived from
               # 21 million diagnosed cases -- confirm which one is intended.
               'DIABETES - I have diabetes': 9.3,

               # from http://www.cdc.gov/ibd/ibd-epidemiology.htm
               # using 1.3 million people as the estimate, and US population size for 2014 from
               # http://quickfacts.census.gov/qfd/states/00000.html
               'IBD - I do not have an IBD': 99.6,
               'IBD - I have an IBD': 0.4,

               # from http://www.cdc.gov/tobacco/data_statistics/fact_sheets/adult_data/cig_smoking/
               'SMOKING_FREQUENCY - I do not smoke': 82.6,
               'SMOKING_FREQUENCY - I smoke': 17.4
}

# Add the reference percentages as a new column. A Series is aligned to `res`
# by row label, so rows of `res` without a matching key receive NaN.
# The column name is left exactly as-is because other cells may refer to it.
res['US Census/CDC/NHANES data precentages'] = pd.Series(census_data)

In [27]:
# Display the comparison table with its rows sorted by label
# ("<CATEGORY> - <value>"); `res` itself is not modified.
res.sort_index(axis=0)

Unnamed: 0,Count,Within group percentage,US Census/CDC/NHANES data precentages
BMI_CAT - Normal,2483,58.894687,31.2
BMI_CAT - Obese,463,10.981973,33.0
BMI_CAT - Overweight,878,20.825427,34.0
BMI_CAT - Underweight,392,9.297913,1.8
DIABETES - I do not have diabetes,4346,98.817644,90.7
DIABETES - I have diabetes,52,1.182356,9.3
IBD - I do not have an IBD,4145,97.414806,99.6
IBD - I have an IBD,110,2.585194,0.4
RACE - African American,52,1.14689,12.6
RACE - Asian or Pacific Islander,185,4.080282,5.0
