In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the training inputs: feature table, class labels, the sw_train table
# (presumably sample weights -- confirm) and the covariate table.
x_train = pd.read_csv(
    '../../data/04/post_hfe_binary_class/crp_train_binary_no_sf.csv'
).rename(columns={'subject_id': 'SEQN'})  # the subject key is called SEQN from here on
y_train = pd.read_csv(
    '../../data/04/e_adjust/y_train_class.txt',
    sep='\t',
    usecols=['SEQN', 'crp_class'],
)
sw_train = pd.read_csv('../../data/04/e_adjust/sw_train_class.csv')
cov = pd.read_csv('../../data/00/wweia_covariates.csv')

In [3]:
# One-hot encode the categorical covariate columns; numeric columns pass through.
cov = pd.get_dummies(data=cov)

In [4]:
# Discard every covariate row that has at least one missing value.
cov = cov.dropna(axis=0, how='any')

In [6]:
# The "unknown" dummy of each categorical covariate is not wanted as a
# feature, so those columns are removed.
unknown_dummies = [
    'education_unknown',
    'ever_smoker_unknown',
    'diabetes_unknown',
    'hypertension_unknown',
]
cov = cov.drop(columns=unknown_dummies)

In [7]:
# Store the subject identifier as integers; SEQN is the merge key below.
cov['SEQN'] = cov['SEQN'].astype(int)

In [8]:
# Attach the covariates to the training features. The left join keeps every
# x_train row even when no covariate row shares its SEQN.
crp_train_out = pd.merge(x_train, cov, how='left', on='SEQN')

In [10]:
# Remove the 'feature_of_interest' column, then write the training matrix
# to disk without the row index.
crp_train_out = crp_train_out.drop(columns=['feature_of_interest'])
crp_train_out.to_csv(
    '../../data/04/processed_for_ml/binary_class/x_train_taxahfe_no_sf.csv',
    index=None,
)

In [11]:
# Save the diet-only view of the training matrix: SEQN plus the diet
# feature columns, in the order listed here.
train_diet_columns = [
    'SEQN',
    'l3_milk_cow',
    'l1_meat_poultry_fish_and_mixtures',
    'l1_eggs',
    'l5_soy_flour',
    'l2_seeds_and_seed_mixtures',
    'l2_pastas_cooked_cereals',
    'l2_dried_fruits',
    'l3_white_potatoes_baked_and_boiled',
    'l2_darkgreen_vegetables',
    'l5_carrots_raw',
    'l2_tomatoes_and_tomato_mixtures',
    'l5_garlic',
    'l3_other_vegetables_cooked',
    'l2_fats',
    'l4_canola_oil',
    'l2_nonalcoholic_beverages',
    'l2_alcoholic_beverages',
]
crp_train_out[train_diet_columns].to_csv(
    '../../data/04/processed_for_ml/binary_class/x_train_taxahfe_no_sf_diet.csv',
    index=None,
)

In [12]:
# Keep only the labels whose SEQN appears in the training feature table.
y_in_train = y_train['SEQN'].isin(crp_train_out['SEQN'])
y_train = y_train[y_in_train]

In [13]:
# Keep only the sw_train rows whose SEQN appears in the training feature table.
sw_in_train = sw_train['SEQN'].isin(crp_train_out['SEQN'])
sw_train = sw_train[sw_in_train]

In [14]:
# Write the filtered sw_train table to disk without the row index.
sw_train.to_csv(
    '../../data/04/processed_for_ml/binary_class/sw_train_taxahfe.csv',
    index=None,
)

In [15]:
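# Split the retained diet features by taxonomy level (the l1-l5 name prefix) so the full data can be aggregated to the same levels for the test set.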

In [16]:
# Show the training column names in alphabetical order.
crp_train_out.columns.sort_values(ascending=True)

Index(['Age', 'BMI', 'Ethnicity_Mexican_American',
       'Ethnicity_Non-Hispanic_Black', 'Ethnicity_Non-Hispanic_White',
       'Ethnicity_Other_Hispanic', 'Ethnicity_Other_Multi-Racial', 'SEQN',
       'Sex_Female', 'Sex_Male', 'diabetes_no', 'diabetes_yes',
       'education_college graduate',
       'education_high school graduate or equivalent',
       'education_less than high school graduate', 'education_some college',
       'ever_smoker_no', 'ever_smoker_yes', 'family_pir', 'hypertension_no',
       'hypertension_yes', 'l1_eggs', 'l1_meat_poultry_fish_and_mixtures',
       'l2_alcoholic_beverages', 'l2_darkgreen_vegetables', 'l2_dried_fruits',
       'l2_fats', 'l2_nonalcoholic_beverages', 'l2_pastas_cooked_cereals',
       'l2_seeds_and_seed_mixtures', 'l2_tomatoes_and_tomato_mixtures',
       'l3_milk_cow', 'l3_other_vegetables_cooked',
       'l3_white_potatoes_baked_and_boiled', 'l4_canola_oil', 'l5_carrots_raw',
       'l5_garlic', 'l5_soy_flour'],
      dtype='object')

In [17]:
# Selected diet features, grouped by the taxonomy level encoded in their
# name prefix (l1 ... l5).
L1_list = [
    'l1_eggs',
    'l1_meat_poultry_fish_and_mixtures',
]

L2_list = [
    'l2_alcoholic_beverages',
    'l2_darkgreen_vegetables',
    'l2_dried_fruits',
    'l2_fats',
    'l2_nonalcoholic_beverages',
    'l2_pastas_cooked_cereals',
    'l2_seeds_and_seed_mixtures',
    'l2_tomatoes_and_tomato_mixtures',
]

L3_list = [
    'l3_milk_cow',
    'l3_other_vegetables_cooked',
    'l3_white_potatoes_baked_and_boiled',
]

L4_list = [
    'l4_canola_oil',
]

L5_list = [
    'l5_carrots_raw',
    'l5_garlic',
    'l5_soy_flour',
]

In [18]:
# Load the full feature table used to generate the test set.
# Only x_all is read here. The original cell also read y_all / sw_all from
# the *training* label and weight files, but both names are reassigned from
# the full tables further down before anything uses them, so those reads
# were dead and their names misleading.
x_all = pd.read_csv(
    '../../data/03/ingred_otu_e_adjust_drywt_no_salt_water.txt',
    sep='\t',
).rename(columns={'subject_id': 'SEQN'})

In [20]:
# Training subject IDs rendered as strings, so they can be compared against
# x_all's column labels in the next step.
train_set = y_train.loc[:, 'SEQN'].astype(str)

In [21]:
# Keep every x_all column whose label is not one of the training subject IDs.
is_train_column = x_all.columns.isin(train_set)
x_test = x_all[x_all.columns[~is_train_column]]

In [22]:
# Break each '|'-delimited clade_name into one column per piece.
levels = x_test['clade_name'].str.split(pat='|', expand=True)

In [23]:
# Lower-case every level column. Values are cast to str first, so missing
# entries are stringified too rather than left as nulls.
levels = levels.apply(lambda level_col: level_col.astype(str).str.lower())

In [24]:
# Put the parsed level columns side by side with the test columns.
otu_test_levels = pd.concat([levels, x_test], axis='columns')

In [26]:
# Give the positional level columns (0..5) readable names.
level_names = {0: 'L1', 1: 'L2', 2: 'L3', 3: 'L4', 4: 'L5', 5: 'ingredient'}
otu_test_levels = otu_test_levels.rename(columns=level_names)

In [27]:
# Aggregate the test columns to the selected level-1 features.
# Rows whose L1 value is one of the selected features are kept, then summed
# per L1 value.
L1_group = otu_test_levels[otu_test_levels['L1'].isin(L1_list)]

# numeric_only=True keeps the text columns (the other level columns and
# clade_name) out of the sum. pandas >= 2.0 would otherwise concatenate those
# strings, and the rename below would then create a second 'clade_name'
# column. Older pandas dropped them by default, so the result is unchanged.
L1 = (
    L1_group
    .groupby('L1')
    .sum(numeric_only=True)
    .reset_index()
    .rename(columns={'L1': 'clade_name'})
)

In [28]:
# Aggregate the test columns to the selected level-2 features.
# Rows whose L2 value is one of the selected features are kept, then summed
# per L2 value.
L2_group = otu_test_levels[otu_test_levels['L2'].isin(L2_list)]

# numeric_only=True keeps the text columns (the other level columns and
# clade_name) out of the sum. pandas >= 2.0 would otherwise concatenate those
# strings, and the rename below would then create a second 'clade_name'
# column. Older pandas dropped them by default, so the result is unchanged.
L2 = (
    L2_group
    .groupby('L2')
    .sum(numeric_only=True)
    .reset_index()
    .rename(columns={'L2': 'clade_name'})
)

In [29]:
# Aggregate the test columns to the selected level-3 features.
# Rows whose L3 value is one of the selected features are kept, then summed
# per L3 value.
L3_group = otu_test_levels[otu_test_levels['L3'].isin(L3_list)]

# numeric_only=True keeps the text columns (the other level columns and
# clade_name) out of the sum. pandas >= 2.0 would otherwise concatenate those
# strings, and the rename below would then create a second 'clade_name'
# column. Older pandas dropped them by default, so the result is unchanged.
L3 = (
    L3_group
    .groupby('L3')
    .sum(numeric_only=True)
    .reset_index()
    .rename(columns={'L3': 'clade_name'})
)

In [30]:
# Aggregate the test columns to the selected level-4 features.
# Rows whose L4 value is one of the selected features are kept, then summed
# per L4 value.
L4_group = otu_test_levels[otu_test_levels['L4'].isin(L4_list)]

# numeric_only=True keeps the text columns (the other level columns and
# clade_name) out of the sum. pandas >= 2.0 would otherwise concatenate those
# strings, and the rename below would then create a second 'clade_name'
# column. Older pandas dropped them by default, so the result is unchanged.
L4 = (
    L4_group
    .groupby('L4')
    .sum(numeric_only=True)
    .reset_index()
    .rename(columns={'L4': 'clade_name'})
)

In [31]:
# Aggregate the test columns to the selected level-5 features.
# Rows whose L5 value is one of the selected features are kept, then summed
# per L5 value.
L5_group = otu_test_levels[otu_test_levels['L5'].isin(L5_list)]

# numeric_only=True keeps the text columns (the other level columns and
# clade_name) out of the sum. pandas >= 2.0 would otherwise concatenate those
# strings, and the rename below would then create a second 'clade_name'
# column. Older pandas dropped them by default, so the result is unchanged.
L5 = (
    L5_group
    .groupby('L5')
    .sum(numeric_only=True)
    .reset_index()
    .rename(columns={'L5': 'clade_name'})
)

In [32]:
# Stack the per-level aggregates into one table, one row per selected feature.
x = pd.concat([L1, L2, L3, L4, L5], axis=0)

In [33]:
# Use the feature name as the row index so only value columns remain.
x = x.set_index('clade_name')

In [34]:
# Keep only the features whose row total is positive.
has_positive_total = x.sum(axis=1).gt(0)
x = x[has_positive_total]

In [35]:
# Turn clade_name back into an ordinary column ahead of the transpose.
x = x.reset_index()

In [36]:
# Flip the table: the former columns become rows and the features become columns.
x = x.T

In [37]:
# After the transpose the first row holds the feature names:
# use it as the column header and drop it from the data.
new_header = x.iloc[0]
x = x.iloc[1:]
x.columns = new_header

In [38]:
# Move the row labels into a regular column (named 'index').
x = x.reset_index()

In [39]:
# The former row labels are the subject identifiers; call that column SEQN.
x = x.rename(columns={'index': 'SEQN'})

In [40]:
# Store SEQN as integers; it is the key for the covariate merge below.
x['SEQN'] = x['SEQN'].astype(int)

In [42]:
# Attach the covariates to the test features. The left join keeps every
# row of x even when no covariate row shares its SEQN.
x = pd.merge(x, cov, how='left', on='SEQN')

In [43]:
# Read the class-target table and the wweia_wts table
# (presumably survey weights -- confirm).
y_all = pd.read_csv(filepath_or_buffer='../../data/00/crp_class_target.csv')
sw_all = pd.read_csv(filepath_or_buffer='../../data/00/wweia_wts.csv')

In [44]:
# Store SEQN as integers so it compares equal to the integer SEQN in x.
y_all['SEQN'] = y_all['SEQN'].astype(int)

In [45]:
# Test labels: the rows of y_all whose SEQN is present in x.
# Rows keep y_all's order, not x's.
label_in_x = y_all['SEQN'].isin(x['SEQN'])
y_test = y_all[label_in_x]

In [46]:
# Drop the test subjects that have no label.
has_label = x['SEQN'].isin(y_test['SEQN'])
x = x[has_label]

In [47]:
# Training labels: every y_all row whose SEQN is NOT in the test matrix.
# NOTE(review): this is the complement of the test set, not a lookup against
# crp_train_out. The shapes printed at the end agree, but confirm the SEQN
# sets and row order match if X and y are paired by position downstream.
is_test_label = y_all['SEQN'].isin(x['SEQN'])
y_train = y_all[~is_test_label]

In [48]:
# Write the test and training label tables to disk without the row index.
y_test.to_csv(
    '../../data/04/processed_for_ml/binary_class/y_test_class.csv',
    index=None,
)
y_train.to_csv(
    '../../data/04/processed_for_ml/binary_class/y_train_class.csv',
    index=None,
)

In [49]:
# Write the full test matrix (diet features plus covariates) without the row index.
# NOTE(review): the column order comes from the per-level concat above and is
# not aligned with the training file's order -- confirm downstream code
# selects columns by name.
x.to_csv(
    '../../data/04/processed_for_ml/binary_class/x_test_no_sf.csv',
    index=None,
)

In [50]:
# Save the diet-only view of the test matrix.
# The columns are listed in the same order as the training diet file
# (x_train_taxahfe_no_sf_diet.csv). The original cell used an alphabetical
# order for the same 18 columns, so the train and test diet matrices did not
# line up column by column.
test_diet_columns = [
    'SEQN',
    'l3_milk_cow',
    'l1_meat_poultry_fish_and_mixtures',
    'l1_eggs',
    'l5_soy_flour',
    'l2_seeds_and_seed_mixtures',
    'l2_pastas_cooked_cereals',
    'l2_dried_fruits',
    'l3_white_potatoes_baked_and_boiled',
    'l2_darkgreen_vegetables',
    'l5_carrots_raw',
    'l2_tomatoes_and_tomato_mixtures',
    'l5_garlic',
    'l3_other_vegetables_cooked',
    'l2_fats',
    'l4_canola_oil',
    'l2_nonalcoholic_beverages',
    'l2_alcoholic_beverages',
]
x[test_diet_columns].to_csv(
    '../../data/04/processed_for_ml/binary_class/x_test_no_sf_diet.csv',
    index=None,
)

In [51]:
# Report the final (rows, columns) shape of each of the four tables.
for label, table in (
    ('x_test', x),
    ('x_train', crp_train_out),
    ('y_test', y_test),
    ('y_train', y_train),
):
    print(label, table.shape)

x_test (2582, 38)
x_train (10327, 38)
y_test (2582, 2)
y_train (10327, 2)
