In [1]:
# imports
import sys
sys.path.append( '../FIA' )
sys.path.append( '../ML' )


from FIA import *
from ML4com import *

# TensorFlow and tf.keras
import tensorflow as tf
import keras
import keras_tuner
from keras import layers, activations


# Helper libraries
import numpy as np
import matplotlib.pyplot as plt

tf.compat.v1.disable_eager_execution()





In [2]:
info_dir = "../../data/comm8_self"
data_dir = "../../runs/FIA/comm8/oms"
run_dir = "../../runs/ML/try"

info_dir = os.path.normpath(os.path.join(os.getcwd(), info_dir))
data_dir = os.path.normpath(os.path.join(os.getcwd(), data_dir))
run_dir = os.path.normpath(os.path.join(os.getcwd(), run_dir))

strains = pd.read_csv(os.path.join(info_dir, "strains.tsv"), sep="\t")
comm8 = pd.read_csv(os.path.join(info_dir, "comm8.tsv"), sep="\t")

fia_df = load_fia_df(data_dir, file_ending=".mzML", separator="\t")

Loading names:


100%|██████████| 72/72 [00:00<00:00, 72039.57it/s]


Loading experiments:


100%|██████████| 68/68 [00:00<00:00, 99.97it/s] 


In [3]:
# binned_dfs = bin_df_stepwise_batch(fia_df, binning_var="mz", binned_var="inty", statistic="sum", start=50.0, stop=1700.0, step=0.002)
# binned_dfs.to_csv(os.path.join(run_dir, "data_matrix.tsv"), sep="\t")
binned_dfs = pd.read_csv(os.path.join(run_dir, "data_matrix_oms.tsv"), sep="\t", index_col="mz", engine="pyarrow")

In [4]:
scaler = MaxAbsScaler()
binned_dfs[:] =  scaler.fit_transform(binned_dfs)

In [5]:
print(binned_dfs.shape)
print(comm8.shape)
print(strains.shape)

(825000, 68)
(68, 8)
(8, 1)


## Model definition

## Splitting dataset

In [6]:
X = binned_dfs.transpose()
ys = comm8
kf = KFold(n_splits = 5, shuffle=True)     # stratified: skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

### Classifier

In [7]:
def build_model(hyperparameters):
    model = keras.Sequential(name="MS_community_classifier")
    if hyperparameters.Boolean("dropout_in"):
        model.add(keras.layers.Dropout(0.1, noise_shape=None, seed=None))
    model.add(keras.layers.BatchNormalization())
    
    for i in range(hyperparameters.Int("num_layers", 1, 3)):
        model.add(
            layers.Dense(
                # Tune number of units separately.
                units=hyperparameters.Int(f"units_{i}", min_value=32, max_value=256, step=64),
                activation=hyperparameters.Choice("activation", ["relu", "tanh"]),
            )
        )
    model.add(layers.Dense(64,  activation=activations.relu))
    
    if hyperparameters.Boolean("dropout"):
        model.add(keras.layers.Dropout(0.25, noise_shape=None, seed=None))
    model.add(keras.layers.BatchNormalization())

    model.add(layers.Dense(1,  activation=activations.sigmoid))
    
    loss_function = keras.losses.BinaryCrossentropy(from_logits=False, label_smoothing=0.0, axis=-1, reduction="sum_over_batch_size", name="binary_crossentropy")

    learning_rate = hyperparameters.Float("lr", min_value=1e-4, max_value=1e-2, sampling="log")
    optimizer = keras.optimizers.Nadam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss=loss_function, metrics=['accuracy'])
    return model

## Training

In [8]:
hp = keras_tuner.HyperParameters()
hp.Fixed("dropout", True)
hp.Fixed("num_layers", 1)
hp.Fixed("units_1", 64)
hp.Fixed("activation", "relu")
hp.Fixed("lr", 1e-3)
model = build_model(hp)




In [9]:
confusion_matrices = []
accuracies = []

for y in tqdm(ys.columns):
	y = ys[y]
	conf_mats = []
	acc = []
	for train_index, val_index in kf.split(X, y):
		training_data = X.iloc[train_index]
		training_labels = y.iloc[train_index]
		validation_data = X.iloc[val_index]
		validation_labels = y.iloc[val_index]
		
		history = model.fit(training_data.values, training_labels.values, epochs=10, verbose=0)

		print("Evaluation:")
		val_loss, val_acc = model.evaluate(validation_data,  validation_labels, verbose="auto")
		acc.append(val_acc)
		prediction = model.predict(validation_data)
		prediction = [0.0 if pred[0] < 0.5 else 1.0 for pred in prediction]
		conf_mats.append(confusion_matrix(validation_labels,  prediction))
	
	accuracies.append(np.mean(acc))
	confusion_matrices.append(np.sum(conf_mats, axis=0))

  0%|          | 0/8 [00:00<?, ?it/s]



Instructions for updating:
Colocations handled automatically by placer.


  0%|          | 0/8 [00:00<?, ?it/s]


AttributeError: 'Nadam' object has no attribute 'get_updates'

In [None]:
plot_cv_confmat(ys=ys, target_labels=strains.values, accuracies=accuracies,
				confusion_matrices=confusion_matrices, outdir="../../runs/ML/try/NN", name="64(0.5)x1LR0.001,100epochs")

In [None]:
break

## Tuning

In [None]:
for y in tqdm(ys.columns):
	y = ys[y]
	for train_index, val_index in kf.split(X, y):
		training_data = X.iloc[train_index]
		training_labels = y.iloc[train_index]
		validation_data = X.iloc[val_index]
		validation_labels = y.iloc[val_index]
		tuner = keras_tuner.RandomSearch(hypermodel=build_model, objective="val_accuracy", max_trials=3, executions_per_trial=2,
											overwrite=True, directory="../../runs/ML/try/NN", project_name="64(0.5)x1LR0.01")
		tuner.search(training_data, training_labels, epochs=10, validation_data=(validation_data, validation_labels))

In [None]:
tuner.results_summary(3)

In [None]:
break

SyntaxError: 'break' outside loop (668683560.py, line 1)