# Machine Learning for metabolomes of microbial communities

In [1]:
# imports
import sys
sys.path.append( '../FIA' )
sys.path.append( '../ML' )
sys.path.append( '..' )

from helpers import *
from FIA import *
from ML4com import *
from DL4com import *

## Loading in

In [2]:
# Load matrices
strains = pd.read_csv("../../data/comm8_0/strains.tsv", sep="\t")
comm8 = pd.read_csv("../../data/comm8_0/comm8.tsv", sep="\t")
metData = pd.read_csv("../../data/comm8_0/metData.tsv", sep="\t")
metData.index = pd.read_csv("../../data/comm8_0/metName.tsv", sep="\t")

met_raw_pos = pd.read_excel("../../data/comm8_0/FIA-Data Com8_20230717_P0024_msAV206-312.xlsx", sheet_name="pos")
met_raw_neg = pd.read_excel("../../data/comm8_0/FIA-Data Com8_20230717_P0024_msAV206-312.xlsx", sheet_name="neg")
met_raw_comb = pd.concat( [total_ion_count_normalization( join_df_metNames(met_raw_pos) ),
                           total_ion_count_normalization( join_df_metNames(met_raw_neg) )] )

## Normalization

In [3]:
X = met_raw_comb.transpose()
ys = comm8
targets = strains.values.flatten()

## Learning and tuning

In [4]:
from sklearn.model_selection import cross_val_score

In [5]:
run_dir = "../../runs/ML/annot"

In [6]:
break

SyntaxError: 'break' outside loop (668683560.py, line 1)

### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

configuration_space = ConfigurationSpace()
ccp_alpha   = Float("ccp_alpha", (1e-3, 1e-1), log=True, default=0.01)
configuration_space.add_hyperparameters([ccp_alpha])

classifier = SKL_Classifier(X, ys, cv=5, configuration_space=configuration_space, classifier=DecisionTreeClassifier)

scenario = Scenario(classifier.configuration_space, deterministic=True, n_workers=4, n_trials=200,
                    walltime_limit=0.5*60*60, cputime_limit=np.inf, trial_memory_limit=int(6e10),
                    output_directory=Path(os.path.normpath("/mnt/d/runs/ML/NN/smac_dt")))

smac = HyperparameterOptimizationFacade(scenario, classifier.train)
incumbent = smac.optimize()

[INFO][abstract_initial_design.py:147] Using 10 initial design configurations and 0 additional configurations.
[INFO][abstract_intensifier.py:305] Using only one seed for deterministic scenario.
[INFO][abstract_intensifier.py:515] Added config 748752 as new incumbent because there are no incumbents yet.
[INFO][abstract_intensifier.py:590] Added config 7474ab and rejected config 748752 as incumbent because it is not better than the incumbents on 1 instances:
[INFO][smbo.py:319] Finished 50 trials.
[INFO][abstract_intensifier.py:590] Added config 8ce50c and rejected config 7474ab as incumbent because it is not better than the incumbents on 1 instances:
[INFO][smbo.py:319] Finished 100 trials.
[INFO][smbo.py:319] Finished 150 trials.
[INFO][smbo.py:319] Finished 200 trials.
[INFO][smbo.py:327] Configuration budget is exhausted:
[INFO][smbo.py:328] --- Remaining wallclock time: inf
[INFO][smbo.py:329] --- Remaining cpu time: inf
[INFO][smbo.py:330] --- Remaining trials: 0


In [None]:
if isinstance(incumbent, list):
    best_hp = incumbent[0]
else: 
    best_hp = incumbent

metrics_df = cross_validate_model_sklearn(DecisionTreeClassifier, X, ys, strains["0"], config=best_hp,
								          fold=StratifiedKFold(n_splits=5), verbosity=0)

  0%|          | 0/8 [00:00<?, ?it/s]

100%|██████████| 8/8 [00:02<00:00,  3.72it/s]


## Random Forest

### Standard

In [None]:
from sklearn.ensemble import RandomForestClassifier

configuration_space = ConfigurationSpace()
ccp_alpha       = Float("ccp_alpha", (1e-3, 1e-1), log=True, default=0.01)
n_estimators    = Integer("n_estimators", (10,1000), log=True, default=100)
max_depth       = Integer("max_depth", (5, 100), default=20)
configuration_space.add_hyperparameters([ccp_alpha, n_estimators, max_depth])

classifier = SKL_Classifier(X, ys, cv=5, configuration_space=configuration_space, classifier=RandomForestClassifier)

scenario = Scenario(classifier.configuration_space, deterministic=True, n_workers=4, n_trials=200,
                    walltime_limit=np.inf, cputime_limit=np.inf, trial_memory_limit=None,
                    output_directory=Path(os.path.normpath("/mnt/d/runs/ML/NN/smac_dt")))

smac = HyperparameterOptimizationFacade(scenario, classifier.train)
incumbent = smac.optimize()

Perhaps you already have a cluster running?
Hosting the HTTP server on port 34283 instead


[INFO][abstract_initial_design.py:147] Using 30 initial design configurations and 0 additional configurations.
[INFO][abstract_intensifier.py:305] Using only one seed for deterministic scenario.
[INFO][smbo.py:319] Finished 0 trials.
[INFO][smbo.py:319] Finished 0 trials.
[INFO][smbo.py:319] Finished 0 trials.
[INFO][smbo.py:319] Finished 0 trials.
[INFO][abstract_intensifier.py:515] Added config fe7927 as new incumbent because there are no incumbents yet.
[INFO][abstract_intensifier.py:590] Added config d1f329 and rejected config fe7927 as incumbent because it is not better than the incumbents on 1 instances:
[INFO][smbo.py:319] Finished 50 trials.
[INFO][smbo.py:319] Finished 100 trials.
[INFO][abstract_intensifier.py:590] Added config 803917 and rejected config d1f329 as incumbent because it is not better than the incumbents on 1 instances:
[INFO][smbo.py:319] Finished 150 trials.
[INFO][smbo.py:327] Configuration budget is exhausted:
[INFO][smbo.py:328] --- Remaining wallclock time

In [None]:
if isinstance(incumbent, list):
    best_hp = incumbent[0]
else: 
    best_hp = incumbent

metrics_df = cross_validate_model_sklearn(RandomForestClassifier, X, ys, targets, config=best_hp,
								          fold=StratifiedKFold(n_splits=5), verbosity=0)

NameError: name 'cross_validate_model_sklearn' is not defined

### Gradient Boosting

##### Extreme Gradiant Boosting

In [10]:
from xgboost import XGBClassifier

configuration_space = ConfigurationSpace()
objective           = Constant("objective", "binary:logistic")
num_parallel_tree   = Constant("num_parallel_tree", 4)
n_estimators        = Integer("n_estimators", (10,1000), log=True, default=100)
max_depth           = Integer("max_depth", (1, 100), default=20)
subsample           = Float("subsample", (1e-1, 1e0), default=1e0)
learning_rate       = Float("learning_rate", (1e-2, 5e-1), default=1e-1)
configuration_space.add_hyperparameters([objective, num_parallel_tree, n_estimators, max_depth, subsample, learning_rate])

classifier = SKL_Classifier(np.array(X), np.array(ys), cv=5, configuration_space=configuration_space, classifier=XGBClassifier)

scenario = Scenario(classifier.configuration_space, deterministic=True, n_workers=4, n_trials=200,
                    walltime_limit=np.inf, cputime_limit=np.inf, trial_memory_limit=None,
                    output_directory=Path(os.path.normpath("/mnt/d/runs/ML/NN/smac_dt")))

smac = HyperparameterOptimizationFacade(scenario, classifier.train)
incumbent = smac.optimize()

[INFO][abstract_initial_design.py:95] Reducing the number of initial configurations from 60 to 50 (max_ratio == 0.25).


Perhaps you already have a cluster running?
Hosting the HTTP server on port 46275 instead


[INFO][abstract_initial_design.py:147] Using 50 initial design configurations and 0 additional configurations.
[INFO][smbo.py:497] Continuing from previous run.
[INFO][abstract_intensifier.py:287] Added existing seed 209652396 from runhistory to the intensifier.
[INFO][abstract_intensifier.py:305] Using only one seed for deterministic scenario.
[INFO][abstract_intensifier.py:590] Added config 6212ba and rejected config 829f77 as incumbent because it is not better than the incumbents on 1 instances:
[INFO][abstract_intensifier.py:590] Added config 711802 and rejected config 6212ba as incumbent because it is not better than the incumbents on 1 instances:
[INFO][abstract_intensifier.py:590] Added config 07f952 and rejected config 711802 as incumbent because it is not better than the incumbents on 1 instances:
[INFO][abstract_intensifier.py:590] Added config f5b665 and rejected config 07f952 as incumbent because it is not better than the incumbents on 1 instances:


## Analysis

### Plotting

In [None]:
# Accuracy CV matrix
ax = sns.heatmap(metrics_df.pivot(index="Organism", columns="Cross-Validation run", values="Accuracy"),
                 vmin=0, vmax=1.0, annot=True, cmap=sns.color_palette("crest", as_cmap=True))
plt.savefig('../../runs/ML/annot/multi_layer_perceptron/heatmap_accuracies_1.png')  

In [None]:
plot_cv_confmat(ys=ys, target_labels=metrics_df.pivot(index="Organism", columns="Cross-Validation run", values="Accuracy").index,
                accuracies=np.mean(metrics_df.pivot(index="Organism", columns="Cross-Validation run", values="Accuracy").values, axis=1),
                confusion_matrices=np.sum(metrics_df.pivot(index="Organism", columns="Cross-Validation run", values="Conf_Mat").values, axis=1),
				outdir="../../runs/ML/annot/multi_layer_perceptron", name="class_annot_test_smac")

### DT depiction

In [None]:
model = DecisionTreeClassifier(random_state=42)

# for i, estimator in enumerate(model.estimators_):
for i, s in enumerate(strains):
    model.fit(X.transpose(),ys.transpose()[i])
    plot_decision_trees(model=model, feature_names=X.index, class_names=["absent", s], outdir=outdir, name=f"decisiontree/tree_{s}{suffix}")

NameError: name 'DecisionTreeClassifier' is not defined

### Feature importance

In [None]:
feature_importances = {}
for i, s in enumerate(strains["0"]):
    model = XGBClassifier(**results.loc[results["model_nr"] == best_df.iloc[0].name]["parameters"][0])
    model.fit(X.values, ys[f"{i}"])
    feature_importances[s] = model.feature_importances_

    imp_feat = feature_importances.get(s) > 0.01

    plt.bar(X.columns[imp_feat], feature_importances.get(s)[imp_feat])
    plt.savefig(f"../../reports/ML/XGBoost/important_features_{s}_2.png")
    plt.close()

In [None]:
feat_imp_df = pd.DataFrame(feature_importances, index=X.columns)
feat_imp_df.to_csv("../../reports/ML/XGBoost/feature_importance.tsv", sep="\t")