In [35]:
import os
import numpy as np
import pandas as pd
from skbio.stats.power import *
import copy

In [36]:
def _check_strs(x):
    r"""Returns False if x is a nan and True is x is a string or number"""

    if isinstance(x, str):
        return True
    elif isinstance(x, (float, int)):
        return not np.isnan(x)
    else:
        raise TypeError('input must be a string, float or a nan')
        
def paired_subsamples(meta, cat, control_cats, order=None, strict_match=True):
    r"""Gets sample ids from each group matched on a set of control categories

    The group in `order` with the fewest samples in `meta` is used as the
    control group. Its samples are grouped by their combination of values in
    `control_cats`. For each combination, the samples of every remaining
    group that share the same combination are collected. A combination only
    contributes ids when every remaining group has at least one sample with
    that combination.

    Every matching id is returned; no random subsampling is performed here.

    Parameters
    ----------
    meta : pandas.DataFrame
        Sample metadata indexed by sample id, with `cat` and every entry of
        `control_cats` as columns.
    cat : str
        Column in `meta` holding the groups to be compared.
    control_cats : list
        Columns in `meta` on which samples must match. A single column name
        given as a string is treated as a one-element list.
    order : list, optional
        The values of `cat` to compare. Defaults to every value in `cat`,
        sorted.
    strict_match : bool, optional
        When True, combinations of control values in which `_check_strs`
        reports an undefined (nan) value are skipped.

    Returns
    -------
    list of numpy.ndarray
        One array of sample ids per group in `order`: the control group
        first, followed by the remaining groups in the order they were
        given. The arrays are empty when no combination could be matched.
    """
    if isinstance(control_cats, str):
        control_cats = [control_cats]
    control_cats = list(control_cats)

    # Groups meta by category
    cat_groups = meta.groupby(cat).groups

    # Handles the order argument
    if order is None:
        order = sorted(cat_groups.keys())
    order = np.array(order)
    num_groups = len(order)

    # The smallest group (first one on ties) is the control group; the other
    # groups keep their relative order.
    group_size = np.array([len(cat_groups[o]) for o in order])
    ctrl_name = order[group_size == group_size.min()][0]
    order = order[order != ctrl_name]

    # Groups all samples, and the control samples alone, by control values
    ctrl_match_groups = meta.groupby(control_cats).groups
    ctrl_group = meta.loc[cat_groups[ctrl_name]].groupby(control_cats).groups

    # Ids are accumulated in plain lists so their type is left untouched
    # (stacking onto an empty float array would turn integer ids into floats).
    ids = [[] for _ in range(num_groups)]

    # Loops through the control combinations to find matched samples
    for check_group, ctrl_ids in ctrl_group.items():
        # A single control column may give a scalar key instead of a tuple
        if isinstance(check_group, tuple):
            key_values = check_group
        else:
            key_values = (check_group,)

        # Checks the categories have been defined
        defined = [_check_strs(p) for p in key_values]
        if strict_match and not all(defined):
            continue

        # Candidate matches are the samples sharing this combination that are
        # not controls themselves. A new list is built so the groupby result
        # is never modified.
        ctrl_id_set = set(ctrl_ids)
        matched_ids = [id_ for id_ in ctrl_match_groups[check_group]
                       if id_ not in ctrl_id_set]

        # Splits the candidates by group
        exp_group = meta.loc[matched_ids].groupby(cat).groups

        exp_ids = [ctrl_ids]
        for grp in order:
            # Stops as soon as one group has no sample for this combination
            if grp not in exp_group:
                break
            exp_ids.append(exp_group[grp])

        # Keeps the combination only when every group is represented
        if len(exp_ids) == num_groups:
            for idx, group_ids in enumerate(exp_ids):
                ids[idx].extend(group_ids)

    return [np.array(group_ids) for group_ids in ids]

In [37]:
# The three fecal tables live in sibling analysis directories under one root
# and share the same relative file path. Keeping the root in a single place
# means only one value has to change to run this cell on another machine.
AGP_BASE_DIR = '/Users/jwdebelius/Desktop'
FECAL_MAP_RELPATH = os.path.join('sample_data', 'fecal',
                                 'all_participants_one_sample',
                                 'AGP_100nt_even10k_fecal.txt')


def _read_fecal_map(analysis_dir):
    """Reads the fecal table found under `analysis_dir` in AGP_BASE_DIR.

    The file is tab-delimited; empty fields are read as missing values and
    the '#SampleID' column is kept as a string column (it is not used as the
    index here).
    """
    return pd.read_csv(os.path.join(AGP_BASE_DIR, analysis_dir,
                                    FECAL_MAP_RELPATH),
                       sep='\t',
                       na_values=[''],
                       index_col=False,
                       dtype={'#SampleID': str},
                       low_memory=False)


data14 = _read_fecal_map('agp_analysis1_14')
data15 = _read_fecal_map('agp_analysis_1_15')
data16 = _read_fecal_map('agp_analysis')

In [38]:
# Uses the sample id column as the row index of each table, in place
for _table in (data14, data15, data16):
    _table.set_index('#SampleID', inplace=True)

In [39]:
# Each entry pairs a metadata column with the two groups of that column
# which will be compared.
fecal_cats = [
    ('IBD', ['I do not have IBD', 'IBD']),
    ('ANTIBIOTIC_SELECT', ['In the past month', 'Not in the last year']),
    ('TYPES_OF_PLANTS', ['Less than 5', 'More than 30']),
    ('AGE_CAT', ['20s', '60s']),
    ('BMI_CAT', ['Normal', 'Obese']),
    ('COLLECTION_SEASON', ['Winter', 'Summer']),
    ('ALCOHOL_FREQUENCY', ['Never', 'Daily']),
    ('EXERCISE_FREQUENCY', ['Rarely', 'Daily']),
    ('SLEEP_DURATION', ['Less than 6 hours', '8 or more hours']),
]

# Metadata columns on which samples are matched when building the groups
fecal_control_cats = [
    'IBD',
    'BMI_CAT',
    'TYPES_OF_PLANTS',
    'DIABETES',
    'ANTIBIOTIC_SELECT',
    'AGE_CAT',
    'COLLECTION_SEASON',
    'SLEEP_DURATION',
]

In [40]:
# Each entry is (column, value to replace, replacement). The same recoding
# is applied to all three tables.
_recodes = [
    # Combines individuals with Ulcerative Colitis and Crohn's disease into
    # a single category
    ('IBD', "Crohn's disease", 'IBD'),
    ('IBD', "Ulcerative colitis", 'IBD'),
    # Combines the never exercises category with the rarely exercises
    # category
    ('EXERCISE_FREQUENCY', 'Never', 'Rarely'),
    ('EXERCISE_FREQUENCY', 'Rarely (few times/month)', 'Rarely'),
    # Combines people who took antibiotics in the past week and in the past
    # month
    ('ANTIBIOTIC_SELECT', 'In the past week', 'In the past month'),
]

for _table in (data14, data15, data16):
    for _column, _old_value, _new_value in _recodes:
        _table.loc[_table[_column] == _old_value, _column] = _new_value

In [41]:
# Collects the matched sample ids for every comparison. Each list gets one
# entry per category in fecal_cats, in the same order.
ids14 = []
ids15 = []
ids16 = []
for cat, order in fecal_cats:
    # Matches on every control category except the one being compared. The
    # list is rebuilt on each pass, so a category that is not among the
    # control categories is matched on the full set instead of reusing
    # whatever list the previous iteration left behind.
    ctrl_cats = [c for c in fecal_control_cats if c != cat]

    ids14.append(paired_subsamples(data14, cat, ctrl_cats, order))
    ids15.append(paired_subsamples(data15, cat, ctrl_cats, order))
    ids16.append(paired_subsamples(data16, cat, ctrl_cats, order))

In [42]:
# Shows the comparison list; the entries of ids14/ids15/ids16 follow the same
# order, so position 2 corresponds to TYPES_OF_PLANTS.
fecal_cats

[('IBD', ['I do not have IBD', 'IBD']),
 ('ANTIBIOTIC_SELECT', ['In the past month', 'Not in the last year']),
 ('TYPES_OF_PLANTS', ['Less than 5', 'More than 30']),
 ('AGE_CAT', ['20s', '60s']),
 ('BMI_CAT', ['Normal', 'Obese']),
 ('COLLECTION_SEASON', ['Winter', 'Summer']),
 ('ALCOHOL_FREQUENCY', ['Never', 'Daily']),
 ('EXERCISE_FREQUENCY', ['Rarely', 'Daily']),
 ('SLEEP_DURATION', ['Less than 6 hours', '8 or more hours'])]

In [43]:
# Size of the first id array returned for comparison 2, for each table
for _matched in (ids14, ids15, ids16):
    print(len(_matched[2][0]))

# Keeps those ids as sets so the tables can be compared with set operations
p5_14, p5_15, p5_16 = (set(_matched[2][0])
                       for _matched in (ids14, ids15, ids16))

41
41
41


In [44]:
# Size of the second id array returned for comparison 2, for each table
for _matched in (ids14, ids15, ids16):
    print(len(_matched[2][1]))

# Keeps the id arrays as returned (they are not converted to sets here)
p3_14, p3_15, p3_16 = (_matched[2][1] for _matched in (ids14, ids15, ids16))

86
87
87


In [45]:
# Ids present in the first table of each pair but missing from the second
print(p5_14 - p5_15)
print(p5_14 - p5_16)
print(p5_15 - p5_16)

set(['000015224', '000003390', '000015123', '000017632', '000012072', '000016369', '000015568', '000014369', '000015264'])
set(['000015224', '000014594.1259618', '000003390', '000015123', '000017632', '000012072', '000016369', '000015568', '000014369', '000015264'])
set(['000014594.1259618', '000015261.fixed171', '000007750.fixed964'])


In [46]:
# Ids present in the first table of each pair but missing from the second
print(set(p3_14) - set(p3_15))
print(set(p3_14) - set(p3_16))
print(set(p3_15) - set(p3_16))

set(['000015634', '000015865', '000015624', '000002944.1130012', '000015881', '000017760', '000015385', '000001820', '000003475.1076147', '000016716', '000017335', '000015850', '000012976'])
set(['000015634', '000009661.1257123', '000015865', '000005034.1076350', '000015624', '000002944.1130012', '000015881', '000017760', '000015385', '000001820', '000003475.1076147', '000016716', '000017335', '000015850', '000012976'])
set(['000009661.1257123', '000005034.1076350', '000016722.fixed480'])


In [53]:
# Reads the table stored under sample_data/all and indexes it by sample id.
# Only all_16 is bound here: binding data16 as well would replace the fecal
# table loaded earlier, and every later use of data16 would then read from
# this table instead.
all_16 = pd.read_csv('/Users/jwdebelius/Desktop/agp_analysis/sample_data/all/AGP_100nt_even10k.txt',
                     sep='\t',
                     na_values=[''],
                     index_col=False,
                     dtype={'#SampleID': str},
                     low_memory=False)
all_16.set_index('#SampleID', inplace=True)

In [60]:
# Pools the p3 ids found in any of the three tables
all_ids = set(p3_14).union(p3_15).union(p3_16)
# .loc is given a sorted list rather than the set itself: a set is not a
# valid indexer in current pandas, and a fixed row order makes the sign of
# the differences printed below reproducible.
sub1_16 = all_16.loc[sorted(all_ids)]
# For every subject with more than one pooled sample, prints the difference
# in PD_whole_tree_mean between the first two of their samples.
for indv, ids in sub1_16.groupby('HOST_SUBJECT_ID').groups.items():
    if len(ids) > 1:
        print(sub1_16.loc[ids[0], 'PD_whole_tree_mean'] -
              sub1_16.loc[ids[1], 'PD_whole_tree_mean'])

2.862647
-0.111211
-3.719389
-5.095132


In [63]:
# Pools the p5 ids found in any of the three tables, then selects those rows
_p5_14_15 = set(p5_14).union(set(p5_15))
five_ids = list(_p5_14_15.union(set(p5_16)))
sub2_16 = all_16.loc[five_ids]

In [67]:
# Mean and standard deviation of PD_whole_tree_mean over the pooled p5 rows
_sub2_pd = sub2_16['PD_whole_tree_mean']
print(_sub2_pd.mean())
print(_sub2_pd.std())

29.6177711818
5.1408939365


In [68]:
# Mean and standard deviation of PD_whole_tree_mean over the pooled p3 rows
_sub1_pd = sub1_16['PD_whole_tree_mean']
print(_sub1_pd.mean())
print(_sub1_pd.std())

31.6570856304
5.30139673982


In [69]:
# Mean and standard deviation of PD_whole_tree_mean for the p5 ids of each
# table, printed as "mean std". The p5 ids are sets, which .loc does not
# accept as an indexer in current pandas, so they are passed as sorted lists.
for _table, _id_set in ((data14, p5_14), (data15, p5_15), (data16, p5_16)):
    _pd_values = _table.loc[sorted(_id_set), 'PD_whole_tree_mean']
    print('%s %s' % (_pd_values.mean(), _pd_values.std()))

29.1975437561 5.06826289287
29.275380878 5.04782552173
29.4344786341 5.18446395279


In [70]:
# Mean and standard deviation of PD_whole_tree_mean for the p3 ids of each
# table, printed as "mean std"
for _table, _sample_ids in ((data14, p3_14), (data15, p3_15), (data16, p3_16)):
    _pd_values = _table.loc[_sample_ids, 'PD_whole_tree_mean']
    print('%s %s' % (_pd_values.mean(), _pd_values.std()))

31.5187642326 5.25027704863
31.7410535402 5.31949189011
31.751947954 5.29233649098
