In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd

In [3]:
# read the four input tables; wts, crp and cov all carry a SEQN column,
# otu is a tab-separated table
wts = pd.read_csv('../../data/00/wweia_wts.csv')
crp = pd.read_csv('../../data/00/crp_target.csv')
cov = pd.read_csv('../../data/00/wweia_covariates.csv')
otu = pd.read_csv(
    '../../data/03/ingred_otu_e_adjust_drywt_no_salt_water.txt',
    delimiter='\t',
)

In [7]:
# attach the covariates to the crp table; inner join, so only SEQNs present in both survive
crp = crp.merge(right=cov, how='inner', on='SEQN')

In [8]:
# keep only rows where none of the three status covariates is coded 'unknown'
for status_col in ('ever_smoker', 'hypertension', 'diabetes'):
    crp = crp[crp[status_col] != 'unknown']

In [10]:
# create feature for tertiles of crp
# bin edges: 0, the 1/3 and 2/3 quantiles of crp, +inf
tertiles = [0, *crp['crp'].quantile([1/3, 2/3]).tolist(), float('inf')]
# pd.cut bins are left-open by default, so without include_lowest=True a crp of
# exactly 0 (the first edge) would fall outside every bin and get a NaN class.
# assign() returns a new frame instead of writing into the filtered slice that
# crp is at this point, which avoids pandas' SettingWithCopyWarning.
crp = crp.assign(
    crp_class=pd.cut(
        crp['crp'],
        bins=tertiles,
        labels=[0, 9, 1],  # mid tertile = 9, remove
        include_lowest=True,
    )
)

In [6]:
# drop the continuous crp column from the frame
crp = crp.drop(columns=['crp'])

In [7]:
# remove middle tertile; also drop rows that were never assigned a class
# (crp_class is NaN) -- NaN != 9 evaluates to True, so the comparison alone
# would let unlabelled rows through into the target file and the stratified split
crp = crp[crp['crp_class'].notna() & (crp['crp_class'] != 9)]

In [8]:
# persist the SEQN / crp_class pairs; the pandas row index is not written
crp.loc[:, ['SEQN', 'crp_class']].to_csv('../../data/00/crp_class_target.csv', index=False)

In [9]:
# restrict the weights to SEQNs that are still present in crp
wts = wts.loc[wts['SEQN'].isin(crp['SEQN'])]

In [10]:
# move SEQN from a column to the row index on both frames
crp = crp.set_index('SEQN')
wts = wts.set_index('SEQN')

In [11]:
# fixed seed so the train/test partition (and every file written from it) is
# the same on each run
SPLIT_SEED = 42
# train_test_split pairs the rows of its inputs by position, not by label, so
# put the weights in the same SEQN order as crp first; .loc raises KeyError if
# a SEQN in crp has no weight row
wts_aligned = wts.loc[crp.index]
# 80/20 split, stratified on crp_class
y_train, y_test, sw_train, sw_test = train_test_split(
    crp,
    wts_aligned,
    stratify=crp['crp_class'],
    shuffle=True,
    test_size=0.2,
    random_state=SPLIT_SEED,
)

In [12]:
# copy of the training labels with SEQN turned back into a regular column
train_set = y_train.reset_index(drop=False)

In [13]:
# keep only the SEQN values, cast to integer
train_set = train_set.SEQN.astype(dtype=int)

In [14]:
# SEQNs as strings, the form in which they are compared against the otu column headers
train_set = train_set.astype(dtype=str)

In [15]:
# keep the otu columns whose header matches a training SEQN string
otu_train = otu.loc[:, otu.columns.isin(train_set)]

In [17]:
# put clade_name in front of the training columns and write a tab-separated
# file without the pandas row index
otu_train_out = pd.concat([otu['clade_name'], otu_train], axis=1)
otu_train_out.to_csv('../../data/04/for_hfe/otu_train_class.txt', sep='\t', index=False)

In [18]:
# SEQN becomes a regular column of y_train again
y_train = y_train.reset_index()

In [19]:
# store SEQN as integers
y_train['SEQN'] = y_train['SEQN'].astype(int)

In [21]:
# rows x columns of the training labels; SEQN is a regular column here after reset_index
y_train.shape

(10327, 11)

In [22]:
# rows x columns of the held-out labels; SEQN is still the index here, hence one column fewer than y_train
y_test.shape

(2582, 10)

In [24]:
# training labels: tab-separated, no pandas row index (SEQN is a column) -- input for taxaHFE
y_train.to_csv('../../data/04/for_hfe/y_train_class.txt', sep='\t', index=False)
# remaining splits keep their index in the csv -- inputs for ML
for ml_frame, ml_path in (
    (sw_train, '../../data/04/processed_for_ml/sw_train_class.csv'),
    (y_test, '../../data/04/processed_for_ml/y_test_class.csv'),
    (sw_test, '../../data/04/processed_for_ml/sw_test_class.csv'),
):
    ml_frame.to_csv(ml_path, index=True)