In [1]:
import numpy as np
import pandas as pd
import requests
import time

from statsmodels.stats.multitest import multipletests as multipletests
from scipy.stats import ttest_ind as ttest_ind

# Notebook-wide display setting: let pandas show up to 500 rows before truncating a frame.
pd.set_option("display.max_rows", 500)


def get_metab_ids(metabs_ls, verbose=False):
    """POSTs a list of metab names to the Metaboanalyst API and returns their IDs as a dataframe.

    The names are joined with ";" (a trailing ";" is kept) and sent as the
    "queryList" field of a JSON body whose "inputType" is "name". The body is
    serialized by `requests` itself, so names containing quotes, backslashes
    or non-ASCII characters are escaped correctly.

    PARAMS
    ------
    metabs_ls: list of string; input list of metab common names
    verbose: bool; verbosity flag. When True, prints the number of names sent
        and the number of records received.

    RETURNS
    -------
    d_id: pandas DataFrame with the columns query, hit, hmdb_id, kegg_id,
        pubchem_id, chebi_id, metlin_id and smiles; one row per record of the
        API response. Missing values and the placeholders "-", "" and "NA"
        are replaced with "undef". An empty input list yields an empty frame
        with the same columns and no request is made.

    RAISES
    ------
    requests.HTTPError: if the API answers with an HTTP error status.
    KeyError: if a response record lacks one of the expected keys.
    """

    colnames_ls = ["query", "hit", "hmdb_id", "kegg_id", "pubchem_id", "chebi_id", "metlin_id", "smiles"]

    # Materialize once so generators and other iterables are accepted too.
    metabs_ls = list(metabs_ls)
    if not metabs_ls:
        # Nothing to map: skip the network round trip entirely.
        return pd.DataFrame(columns=colnames_ls)

    url = "http://api.xialab.ca/mapcompounds"
    payload = {
        "queryList": ";".join(metabs_ls) + ";",
        "inputType": "name",
    }
    headers = {
        'Content-Type': "application/json",
        'cache-control': "no-cache",
        }

    if verbose:
        print("Querying {} for {} metabolite name(s)".format(url, len(metabs_ls)))

    # `json=` lets requests do the JSON encoding instead of hand-built string concatenation.
    r = requests.request("POST", url, json=payload, headers=headers)
    # Surface HTTP failures here rather than as an obscure parsing error further down.
    r.raise_for_status()

    # The loop below expects a JSON list of dicts keyed by the names in colnames_ls.
    response_obj = r.json()

    if verbose:
        print("Received {} record(s)".format(len(response_obj)))

    contents = [[row_dict[colname] for colname in colnames_ls] for row_dict in response_obj]

    # Normalize every flavour of "missing" to the single marker "undef".
    d_id = (
        pd.DataFrame(data=contents, columns=colnames_ls)
        .fillna("undef")
        .replace({"-": "undef", "": "undef", "NA": "undef"})
    )

    return d_id

  import pandas.util.testing as tm


In [2]:
# Read the input table. NOTE(review): this is an absolute, machine-specific path;
# the notebook will not run elsewhere until it is made configurable.
d0 = pd.read_csv(
    "/Users/don/Documents/flask_boxplot_reports_v3/sample_data/sample_gcms_normalized_n171.csv"
)

# Group pairs to compare, each written as [first_group, second_group].
pairs_ls = [
    ["group1", "group2"],
    ["group2", "group3"],
    ["group3", "group4"],
]

# Column layout used by the rest of the notebook: column 0 is taken as the
# sample name, column 1 as the group label, and every remaining column as a metabolite.
sample_colname, group_colname = d0.columns[0], d0.columns[1]
metabs_ls = d0.columns[2:].tolist()

In [5]:
# Map every metabolite column name to database IDs (get_metab_ids performs an HTTP POST).
d_id = get_metab_ids(metabs_ls)

In [6]:
# Preview the first rows of the ID mapping table.
d_id.head()

Unnamed: 0,query,hit,hmdb_id,kegg_id,pubchem_id,chebi_id,metlin_id,smiles
0,Cadaverine,Cadaverine,HMDB0002322,C01672,273,18127,3236,NCCCCCN
1,Citramalic acid,Citramalic acid,undef,undef,441696,undef,undef,undef
2,Isobutyrylglycine,Isobutyrylglycine,HMDB0000730,undef,10855600,70979,5698,CC(C)C(=O)NCC(O)=O
3,Guanosine,Guanosine,undef,undef,135398635,undef,undef,undef
4,L-Leucine,L-Leucine,HMDB0000687,C00123,6106,15603,24,CC(C)C[C@H](N)C(O)=O


In [7]:
# For every group pair, compute per-metabolite statistics and store one table
# per pair in comparisons_df_dict under the key "<first>|<second>".
# Each table has: t_stat / p_val from ttest_ind with equal_var=False,
# fc = mean(first group) / mean(second group), and BH_q_val from the
# "fdr_bh" correction applied across all metabolites of that pair.
comparisons_df_dict = {}
for pair in pairs_ls:
    k = "|".join(pair)
    d_t = d0.loc[d0[group_colname].isin(pair)]

    # Split the two groups once per pair; the row filter does not depend on the metabolite.
    grp0 = d_t.loc[d_t[group_colname] == pair[0]]
    grp1 = d_t.loc[d_t[group_colname] == pair[1]]

    contents = []
    for mtb in metabs_ls:
        arr0 = grp0[mtb]
        arr1 = grp1[mtb]

        t_test_result = ttest_ind(arr0, arr1, equal_var=False)
        # NOTE(review): fc is a plain ratio of means, so it turns negative when
        # the two means have opposite signs -- confirm this is meaningful for the
        # normalization applied to the input data.
        new_row = [mtb, t_test_result.statistic, t_test_result.pvalue, np.average(arr0) / np.average(arr1)]
        contents.append(new_row)

    d_stats = pd.DataFrame(data=contents, columns=["metab", "t_stat", "p_val", "fc"])

    # Multiple hypothesis testing correction
    multtest_result = multipletests(list(d_stats["p_val"]), alpha=0.05, method="fdr_bh", is_sorted=False, returnsorted=False)
    # Index 1 of the returned tuple holds the corrected p-values.
    d_stats["BH_q_val"] = multtest_result[1]

    comparisons_df_dict[k] = d_stats

In [8]:
# Display the full statistics table for the group1 vs group2 comparison.
comparisons_df_dict["group1|group2"]

Unnamed: 0,metab,t_stat,p_val,fc,BH_q_val
0,Cadaverine,-1.034681,0.3449388,0.728321,0.487475
1,Citramalic acid,-0.901196,0.4058333,0.80024,0.559657
2,Isobutyrylglycine,-3.287575,0.01300324,-6.266324,0.061738
3,Guanosine,-0.686278,0.5094865,0.896083,0.63132
4,L-Leucine,-1.732399,0.1139132,0.910952,0.205044
5,Spermidine,-3.000093,0.01575082,1.464879,0.064128
6,Ethanolamine,-0.05168,0.9601497,0.992341,0.965798
7,L-Threonine,-2.107476,0.06134969,0.680731,0.138506
8,4-Hydroxybenzoic acid,2.442771,0.03981626,1.099612,0.107737
9,Glycolic acid,2.485995,0.03468378,1.231573,0.100768
