# Machine Learning for metabolomes of microbial communities

In [2]:
# imports
import sys
sys.path.append( '../FIA' )
sys.path.append( '../ML' )
sys.path.append( '..' )

from helpers import *
from FIA import *
from ML4com import *



## Loading in

In [3]:
# Load matrices
strains = pd.read_csv("../../data/comm8_0/strains.tsv", sep="\t")
comm8 = pd.read_csv("../../data/comm8_0/comm8.tsv", sep="\t")
metData = pd.read_csv("../../data/comm8_0/metData.tsv", sep="\t")
metData.index = pd.read_csv("../../data/comm8_0/metName.tsv", sep="\t")

met_raw_pos = pd.read_excel("../../data/comm8_0/FIA-Data Com8_20230717_P0024_msAV206-312.xlsx", sheet_name="pos")
met_raw_neg = pd.read_excel("../../data/comm8_0/FIA-Data Com8_20230717_P0024_msAV206-312.xlsx", sheet_name="neg")

In [4]:
def join_df_metNames(df):
    cols = ["metNames"] + [f"MS{i+1}" for i in range(len(df.columns) - 6)]
    comb = pd.DataFrame(columns=cols)
    for pid in df["peakID"].unique():
        comb_met_name = ""
        for met_name in df.loc[df["peakID"] == pid]["MetName"]:
            comb_met_name += met_name + "\n"
        comb.loc[len(comb.index)] = [comb_met_name[:-2]] + list(df.loc[df["peakID"] == pid].iloc[0, 6:])
    comb = comb.set_index('metNames')
    return comb

## Normalization

In [5]:
met_raw_comb = pd.concat( [total_ion_count_normalization( join_df_metNames(met_raw_pos) ), total_ion_count_normalization( join_df_metNames(met_raw_neg) )] )

In [6]:
X = met_raw_comb.transpose()
ys = comm8
targets = strains.values.flatten()

## Learning and tuning

In [7]:
run_dir = "../../runs/ML/annot"

In [7]:
break

SyntaxError: 'break' outside loop (668683560.py, line 1)

### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
param_grid = [
    {"random_state": [42],
     "ccp_alpha": [0.01]}
]

results = train_cv_model(DecisionTreeClassifier, param_grid, X, ys, targets, os.path.join(run_dir, "decision_tree"), suffix="", n_fold=5)

Parameter combinations 1:


100%|██████████| 1/1 [00:01<00:00,  1.07s/it]


In [None]:
from sklearn.tree import DecisionTreeClassifier
param_grid = [
    {"random_state": [42],
     "ccp_alpha": [0.01]}
]

grids = grid_search_params_cv_model(DecisionTreeClassifier, param_grid, X, ys.values, targets, n_splits=5, n_jobs=1)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END ....................ccp_alpha=0.01, random_state=42; total time=   0.0s
[CV] END ....................ccp_alpha=0.01, random_state=42; total time=   0.0s
[CV] END ....................ccp_alpha=0.01, random_state=42; total time=   0.0s
[CV] END ....................ccp_alpha=0.01, random_state=42; total time=   0.0s
[CV] END ....................ccp_alpha=0.01, random_state=42; total time=   0.0s
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END ....................ccp_alpha=0.01, random_state=42; total time=   0.0s
[CV] END ....................ccp_alpha=0.01, random_state=42; total time=   0.0s
[CV] END ....................ccp_alpha=0.01, random_state=42; total time=   0.0s
[CV] END ....................ccp_alpha=0.01, random_state=42; total time=   0.0s
[CV] END ....................ccp_alpha=0.01, random_state=42; total time=   0.0s
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END ....

## Random Forest

### Standard

In [None]:
from sklearn.ensemble import RandomForestClassifier
param_grid = [
    {"random_state": [42],
     "ccp_alpha": [0, 0.5, 0.025],
     "n_estimators": [10, 100, 400],
     "max_depth": [None, 5, 20]}
]

results = train_cv_model(RandomForestClassifier, param_grid, X, ys, targets, os.path.join(run_dir, "random_forest"), suffix="", n_fold=5)

Parameter combinations 1:


100%|██████████| 27/27 [03:07<00:00,  6.96s/it]


### BaggingClassifier

In [None]:
from sklearn.ensemble import BaggingClassifier
param_grid = [
    {"random_state": [42],
     "bootstrap": [True],
     "bootstrap_features": [True],
     "oob_score": [True, False],
     "n_estimators": [10, 100, 400],
     "n_jobs": [4]}
]

results = train_cv_model(BaggingClassifier, param_grid, X, ys, targets, os.path.join(run_dir, "bagging"), suffix="", n_fold=5)
res_df = pd.concat([results, pd.json_normalize(results["parameters"])], axis=1)
res_df.to_csv(os.path.join(run_dir, "gradient_boost.tsv"), sep="\t")

Parameter combinations 1:


  0%|          | 0/6 [00:00<?, ?it/s]


ValueError: Out of bag estimation only available if bootstrap=True

### Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
param_grid = [
    {"random_state": [42],
     "ccp_alpha": [0, 0.5, 0.025],
     "loss": ["exponential", "log_loss"],
     "learning_rate": [0.1, 0.2 , 0.5],
     "n_estimators": [10, 100, 200],
     "max_depth": [None, 1, 2, 3, 5],
     "n_iter_no_change": [20]}
]

results = train_cv_model(GradientBoostingClassifier, param_grid, X, ys, targets, os.path.join(run_dir, "gradient_boost"), suffix="", n_fold=5)
res_df = pd.concat([results, pd.json_normalize(results["parameters"])], axis=1)
res_df.to_csv(os.path.join(run_dir, "gradient_boost_2.tsv"), sep="\t")

Parameter combinations 1:


100%|██████████| 108/108 [47:46<00:00, 26.54s/it] 


##### Extreme Gradiant Boosting

In [164]:
from xgboost import XGBClassifier

param_grid = [
    {"seed": [42],
     "objective": ['binary:logistic'],  
     "min_split_loss": [0],                       # = gamma
     "min_child_weight": [1],                        # larger -> more conservative (gives up exploration if weight is under..)
     "max_delta_step": [0],                              # larger -> more conservative, for imbalanced datasets
     "lambda": [1],                                     # L2 regularization -""-
     "alpha": [0],                                     # L1 regularization -""-
     "num_parallel_tree": [5],                       # Allows boosted RF
     "subsample": [1.0],
     "learning_rate": [0.25, 0.5],
     "n_estimators": [500],
     "max_depth": [None]}
]

results = train_cv_model(XGBClassifier, param_grid, X, ys, targets, os.path.join(run_dir, "xgboost"), suffix="2", n_fold=5)
res_df = pd.concat([results, pd.json_normalize(results["parameters"])], axis=1)
res_df.to_csv(os.path.join(run_dir,  "xgboost", "xgboost_2.tsv"), sep="\t")


Parameter combinations 1:


100%|██████████| 4/4 [08:46<00:00, 131.54s/it]


### SVM

In [None]:
from sklearn.svm import SVC
param_grid = [
    {"random_state": [42],
     "kernel": ["linear"],
     "cache_size": [1000],
     "decision_function_shape": ["ovr"]},

     {"random_state": [42],
     "kernel": ["poly"],
     "degree": [2, 3],
     "cache_size": [1000],
     "decision_function_shape": ["ovr"]},

    {"random_state": [42],
     "kernel": ["sigmoid"],             # tanh
     "coef0" : np.logspace(-2, 3, 6),
     "cache_size": [1000],
     "decision_function_shape": ["ovr"]},

    {"random_state": [42],
     "kernel": ["rbf",],
     "gamma": np.logspace(-4, 2, 7),
     "cache_size": [1000],
     "decision_function_shape": ["ovr"]}
]

results = train_cv_model(SVC, param_grid, X, ys, targets, os.path.join(run_dir, "svm"), suffix="", n_fold=5)

Parameter combinations 1:


100%|██████████| 1/1 [00:00<00:00,  1.17it/s]


Parameter combinations 2:


100%|██████████| 2/2 [00:01<00:00,  1.11it/s]


Parameter combinations 3:


100%|██████████| 6/6 [00:06<00:00,  1.01s/it]


Parameter combinations 4:


100%|██████████| 7/7 [00:06<00:00,  1.08it/s]


### Multi-Layer-Perceptron

In [None]:
from sklearn.neural_network import MLPClassifier
param_grid = [
    {"random_state": [42],
     "activation": ["identity", "relu"],
     "solver": ['adam'],
     "alpha": [1e-4, 1e-3, 1e-2, 1e-1, 1],
     "hidden_layer_sizes": [(500, 100), (100, 100), (20, 100), (100, 20), (500, 20)],
     "max_iter": [1000]}
]

results = train_cv_model(MLPClassifier, param_grid, X, ys, targets, os.path.join(run_dir, "multi_layer_perceptron"), suffix="", n_fold=5)

Parameter combinations 1:


  0%|          | 0/60 [00:00<?, ?it/s]

  2%|▏         | 1/60 [05:19<5:13:59, 319.32s/it]

## Analysis

In [165]:
expanded_df = pd.concat([results, pd.json_normalize(results["parameters"])], axis=1)
expanded_df["accuracy"] = expanded_df["accuracy"].apply(np.mean)
expanded_df

Unnamed: 0,model_nr,parameters,target,accuracy,seed,objective,min_split_loss,min_child_weight,max_delta_step,lambda,alpha,num_parallel_tree,subsample,learning_rate,n_estimators,max_depth
0,1,"{'seed': 42, 'objective': 'binary:logistic', '...",C. ramosum,0.97033,42,binary:logistic,0,1,0,1,0,5,1.0,0.25,500,3
1,1,"{'seed': 42, 'objective': 'binary:logistic', '...",R. gnavus,0.984615,42,binary:logistic,0,1,0,1,0,5,1.0,0.25,500,3
2,1,"{'seed': 42, 'objective': 'binary:logistic', '...",C. aerofaciens,0.984615,42,binary:logistic,0,1,0,1,0,5,1.0,0.25,500,3
3,1,"{'seed': 42, 'objective': 'binary:logistic', '...",B. thetaiotaomicron,0.956044,42,binary:logistic,0,1,0,1,0,5,1.0,0.25,500,3
4,1,"{'seed': 42, 'objective': 'binary:logistic', '...",B. uniformis,0.678022,42,binary:logistic,0,1,0,1,0,5,1.0,0.25,500,3
5,1,"{'seed': 42, 'objective': 'binary:logistic', '...",B. vulgatus,0.985714,42,binary:logistic,0,1,0,1,0,5,1.0,0.25,500,3
6,1,"{'seed': 42, 'objective': 'binary:logistic', '...",F. nucleatum,0.941758,42,binary:logistic,0,1,0,1,0,5,1.0,0.25,500,3
7,1,"{'seed': 42, 'objective': 'binary:logistic', '...",R. intestinalis,0.969231,42,binary:logistic,0,1,0,1,0,5,1.0,0.25,500,3
8,2,"{'seed': 42, 'objective': 'binary:logistic', '...",C. ramosum,0.97033,42,binary:logistic,0,1,0,1,0,5,1.0,0.25,500,8
9,2,"{'seed': 42, 'objective': 'binary:logistic', '...",R. gnavus,0.984615,42,binary:logistic,0,1,0,1,0,5,1.0,0.25,500,8


In [166]:
for col in results["parameters"][0].keys():
    print(expanded_df.fillna(value=-1).groupby(col).mean(numeric_only=True)["accuracy"])     # Hyperparameter importance
    print()

seed
42    0.931044
Name: accuracy, dtype: float64

objective
binary:logistic    0.931044
Name: accuracy, dtype: float64

min_split_loss
0    0.931044
Name: accuracy, dtype: float64

min_child_weight
1    0.931044
Name: accuracy, dtype: float64

max_delta_step
0    0.931044
Name: accuracy, dtype: float64

lambda
1    0.931044
Name: accuracy, dtype: float64

alpha
0    0.931044
Name: accuracy, dtype: float64

num_parallel_tree
5    0.931044
Name: accuracy, dtype: float64

subsample
1.0    0.931044
Name: accuracy, dtype: float64

learning_rate
0.25    0.933791
0.50    0.928297
Name: accuracy, dtype: float64

n_estimators
500    0.931044
Name: accuracy, dtype: float64

max_depth
3    0.931044
8    0.931044
Name: accuracy, dtype: float64



In [167]:
target_mean_df = expanded_df.groupby("model_nr").mean(numeric_only=True)
cutoff = np.max(target_mean_df["accuracy"]) - np.var(target_mean_df["accuracy"]) * 1e-1
best_df = target_mean_df.loc[target_mean_df["accuracy"] > cutoff].sort_values("accuracy", ascending=False)
best_df

Unnamed: 0_level_0,accuracy,seed,min_split_loss,min_child_weight,max_delta_step,lambda,alpha,num_parallel_tree,subsample,learning_rate,n_estimators,max_depth
model_nr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,0.933791,42.0,0.0,1.0,0.0,1.0,0.0,5.0,1.0,0.25,500.0,3.0
2,0.933791,42.0,0.0,1.0,0.0,1.0,0.0,5.0,1.0,0.25,500.0,8.0


#### Plotting

In [168]:
for col in results["parameters"][0].keys():
    fig = sns.lineplot(expanded_df, y="accuracy", x=col, hue="target").figure.savefig(f"../../reports/ML/XGBoost/hp_{col}_2.png")
    plt.close()

## In-depth analysis

### DT depiction

In [169]:
model = DecisionTreeClassifier(random_state=42)

# for i, estimator in enumerate(model.estimators_):
for i, s in enumerate(strains):
    model.fit(X.transpose(),ys.transpose()[i])
    plot_decision_trees(model=model, feature_names=X.index, class_names=["absent", s], outdir=outdir, name=f"decisiontree/tree_{s}{suffix}")

NameError: name 'DecisionTreeClassifier' is not defined

### Feature importance

In [224]:
feature_importances = {}
for i, s in enumerate(strains["0"]):
    model = XGBClassifier(**results.loc[results["model_nr"] == best_df.iloc[0].name]["parameters"][0])
    model.fit(X.values, ys[f"{i}"])
    feature_importances[s] = model.feature_importances_

    imp_feat = feature_importances.get(s) > 0.01

    plt.bar(X.columns[imp_feat], feature_importances.get(s)[imp_feat])
    plt.savefig(f"../../reports/ML/XGBoost/important_features_{s}_2.png")
    plt.close()

In [225]:
feat_imp_df = pd.DataFrame(feature_importances, index=X.columns)
feat_imp_df.to_csv("../../reports/ML/XGBoost/feature_importance.tsv", sep="\t")

## Mess

In [8]:
feat_imp_df = pd.read_csv("../../reports/ML/XGBoost/feature_importance.tsv", sep="\t")

In [11]:
feat_imp_df["metNames"] = [met.replace("\n", ";") for met in feat_imp_df["metNames"].values]

In [13]:
feat_imp_df.to_csv("../../reports/ML/XGBoost/feature_importance_dense.tsv", sep="\t")

In [226]:
acc = results.loc[results["model_nr"] == best_df.iloc[0].name]["accuracy"]
acc = pd.DataFrame(dict(zip(acc.index, acc.values)))

In [227]:
acc.columns = strains["0"].values
acc

Unnamed: 0,C. ramosum,R. gnavus,C. aerofaciens,B. thetaiotaomicron,B. uniformis,B. vulgatus,F. nucleatum,R. intestinalis
0,1.0,1.0,1.0,0.857143,0.428571,1.0,0.928571,1.0
1,0.928571,1.0,1.0,1.0,0.857143,0.928571,0.928571,1.0
2,1.0,1.0,1.0,1.0,0.642857,1.0,0.928571,1.0
3,1.0,1.0,1.0,1.0,0.615385,1.0,0.923077,0.923077
4,0.923077,0.923077,0.923077,0.923077,0.846154,1.0,1.0,0.923077


In [228]:
acc.to_csv("../../reports/ML/XGBoost/accuracies.tsv", sep="\t")

In [223]:
results.loc[results["model_nr"] == best_df.iloc[0].name]["parameters"][0]

{'seed': 42,
 'objective': 'binary:logistic',
 'min_split_loss': 0,
 'min_child_weight': 1,
 'max_delta_step': 0,
 'lambda': 1,
 'alpha': 0,
 'num_parallel_tree': 5,
 'subsample': 1.0,
 'learning_rate': 0.25,
 'n_estimators': 500,
 'max_depth': 3}