In [1]:
import pandas as pd
from gemelli.rpca import rpca, joint_rpca, feature_correlation_table
from biom import Table
from biom.util import biom_open
import numpy as np
from sklearn.model_selection import train_test_split
# Seed NumPy's global RNG. Note that the train/test splits below pass their
# own explicit `random_state` to `train_test_split`.
np.random.seed(42)


In [2]:
# import metadata (first CSV column becomes the index)
metadata = pd.read_csv('../../network/metadata_v0.csv', index_col=0)
# `tables` collects one DataFrame per data layer, keyed by a short id
tables = {}

# import/match metabolomics
metabolomics = pd.read_csv('../../network/GNPS-raw-metabolomics-quant-table.csv', index_col=0)
# drop the two non-sample annotation columns so only per-sample columns remain
metabolomics = metabolomics.drop(['row m/z', 'row retention time'], axis=1)
# mapping read from the sample-name file: its index -> `orig_sample_name`
map_names = pd.read_csv('../../network/metabolite_sample_names.txt', sep='\t', index_col=0).orig_sample_name.to_dict()
metabolomics.columns = [c.replace(' Peak area','') for c in metabolomics.columns]
# prefix the row labels with 'metab_'
metabolomics.index = ['metab_' + str(i) for i in metabolomics.index]
# Keep only the columns that have an entry in `map_names`. A boolean mask is
# used instead of indexing with a set: pandas >= 2.0 raises TypeError for set
# indexers, and a set would also scramble the column order between runs.
has_mapping = metabolomics.columns.isin(list(map_names))
metabolomics = metabolomics.loc[:, has_mapping]
metabolomics.columns = [map_names[c] for c in metabolomics.columns]
tables['metabolomics'] = metabolomics.copy()

# import the rest: every remaining layer is a tab-separated file whose first
# column is used as the row index
all_tables = {
    'mag': '../../network/MAG_bact_table.tsv',
    '18S': '../../network/18S_ASV_table.tsv',
    '16S': '../../network/16S_ASV_table.tsv',
    'gene': '../../network/MAG_bact_genes_table.tsv',
    'gene_module': '../../network/MAG_bact_gene_modules_table.tsv',
}
for tbl_id in all_tables:
    tbl_path = all_tables[tbl_id]
    # store each table under its short id, next to the metabolomics table
    tables[tbl_id] = pd.read_csv(tbl_path, sep='\t', index_col=0)
    
# match and filter the tables
def filter_table(df, use_ids):
    """Select the `use_ids` columns of `df` and drop rows that sum to zero.

    Parameters
    ----------
    df : pd.DataFrame
        Table to filter; rows are summed across the selected columns.
    use_ids : list-like or set of column labels
        Columns to keep. Lists / Index objects are used as given (their order
        is the output column order). Sets are accepted too and are converted
        to a list following the column order of `df`, because pandas does not
        accept a set as an indexer and a set has no stable order.

    Returns
    -------
    pd.DataFrame
        The selected columns, restricted to rows whose row sum is > 0.

    Raises
    ------
    KeyError
        If `use_ids` contains a label that is not a column of `df`.
    """
    if isinstance(use_ids, (set, frozenset)):
        # Preserve the strictness of label-based selection: unknown ids are
        # an error rather than being silently ignored.
        missing = use_ids - set(df.columns)
        if missing:
            raise KeyError("%s not in columns" % sorted(missing, key=str))
        # Boolean mask keeps the table's own column order (deterministic).
        df = df.loc[:, df.columns.isin(list(use_ids))]
    else:
        df = df[use_ids]
    # drop features (rows) that are all-zero across the retained columns
    df = df[df.sum(axis=1) > 0]
    return df
# Sample ids present in the metadata index AND in the columns of every table.
all_shared_samples = set(metadata.index) & set.intersection(*[set(t.columns) for t in tables.values()])
# Fix an explicit order (metadata row order) before indexing. Iterating the
# set directly gives an order that can change between interpreter runs for
# string ids, which would make the seeded train/test split below
# irreproducible; pandas >= 2.0 also rejects sets as column indexers.
shared_sample_order = [sample_id for sample_id in metadata.index if sample_id in all_shared_samples]
tables = {tblid:filter_table(tbl, shared_sample_order)  for tblid, tbl in tables.items()}
metadata = metadata.reindex(shared_sample_order)

# Subset to subjects whose rows cover exactly four distinct `add_0c_group`
# values; every row of such a subject is kept.
# Disabled alternative (last 'active' time point per subject), kept for reference:
#metadata_subset = pd.concat([df[df.add_0c_group == 'active'].sort_values('timepoint').iloc[[-1], :]
#                             for _, df in metadata.groupby('subjects') if 'active' in list(df.add_0c_group)])
complete_subject_frames = []
for _subject, subject_rows in metadata.groupby('subjects'):
    if len(set(subject_rows.add_0c_group)) == 4:
        complete_subject_frames.append(subject_rows)
metadata_subset = pd.concat(complete_subject_frames)

# Restrict every table to the retained samples (re-dropping rows that became
# all-zero) and wrap the result as a biom Table built from
# (values, row labels, column labels).
tables_subset = {}
for tblid, tbl in tables.items():
    matched = filter_table(tbl.copy(), metadata_subset.index)
    tables_subset[tblid] = Table(matched.values, matched.index, matched.columns)

# train-tests: ten 75/25 splits stratified by `facility`, stored as the
# columns traintest_0 ... traintest_9 with values 'train' / 'test'.
# Each repetition gets its own seed (42 + i_). With a constant random_state
# every iteration returns the same partition, so all ten columns would be
# identical; traintest_0 still uses seed 42.
for i_ in range(10):
    train, test = train_test_split(metadata_subset, test_size=0.25, shuffle=True,
                                   random_state=42 + i_,
                                   stratify=metadata_subset[['facility']])
    # default every sample to 'train', then flag the held-out rows
    metadata_subset.loc[:, 'traintest_%i' % i_] = 'train'
    metadata_subset.loc[test.index, 'traintest_%i' % i_] = 'test'
    
# save data: metadata (including the traintest_* columns) as TSV, and each
# matched table as its own .biom file named after its table id
metadata_subset.to_csv('../../network/split-matched-data/metadata.tsv', sep='\t')
for tblid in tables_subset:
    tbl = tables_subset[tblid]
    biom_path = '../../network/split-matched-data/%s.biom' % (tblid)
    with biom_open(biom_path, 'w') as f:
        tbl.to_hdf5(f, "filtered-table-cm")

# last expression of the cell: display the per-table summaries
tables_subset



{'metabolomics': 2333 x 374 <class 'biom.table.Table'> with 104825 nonzero entries (12% dense),
 'mag': 257 x 374 <class 'biom.table.Table'> with 58498 nonzero entries (60% dense),
 '18S': 5473 x 374 <class 'biom.table.Table'> with 114755 nonzero entries (5% dense),
 '16S': 14237 x 374 <class 'biom.table.Table'> with 318875 nonzero entries (5% dense),
 'gene': 2457 x 374 <class 'biom.table.Table'> with 901890 nonzero entries (98% dense),
 'gene_module': 377 x 374 <class 'biom.table.Table'> with 117093 nonzero entries (83% dense)}