In [2]:
# imports
import sys
sys.path.append( '../FIA' )
sys.path.append( '../ML' )

from FIA import *
from ML4com import *

# TensorFlow and tf.keras
import tensorflow as tf
import keras
from keras import layers, activations


# Helper libraries
import numpy as np
import matplotlib.pyplot as plt




In [2]:
info_dir = "../../data/comm8_self"
data_dir = "../../runs/FIA/comm8/oms"
run_dir = "../../runs/ML/try"

info_dir = os.path.normpath(os.path.join(os.getcwd(), info_dir))
data_dir = os.path.normpath(os.path.join(os.getcwd(), data_dir))
run_dir = os.path.normpath(os.path.join(os.getcwd(), run_dir))

strains = pd.read_csv(os.path.join(info_dir, "strains.tsv"), sep="\t")
comm8 = pd.read_csv(os.path.join(info_dir, "comm8.tsv"), sep="\t")

fia_df = load_fia_df(data_dir, file_ending=".mzML", separator="\t")

Loading experiments:


100%|██████████| 68/68 [00:01<00:00, 64.25it/s]


Loading names:


100%|██████████| 72/72 [00:00<00:00, 71988.05it/s]


In [3]:
# binned_dfs = bin_df_stepwise_batch(fia_df, binning_var="mz", binned_var="inty", statistic="sum", start=50.0, stop=1700.0, step=0.002)
# binned_dfs.to_csv(os.path.join(run_dir, "data_matrix.tsv"), sep="\t")
binned_dfs = pd.read_csv(os.path.join(run_dir, "data_matrix.tsv"), sep="\t", index_col="mz", engine="pyarrow")

In [4]:
scaler = MaxAbsScaler()
binned_dfs[:] =  scaler.fit_transform(binned_dfs)

In [5]:
print(binned_dfs.shape)
print(comm8.shape)
print(strains.shape)

(825000, 68)
(68, 8)
(8, 1)


## Model definition

In [20]:
model = keras.Sequential(name="MS_community_prediction")
model.add(keras.Input(shape=(8,68,825000)))
model.add(layers.Dense(512, activation=activations.tanh, name="Feature_combination_1"))
model.add(layers.Dense(128, activation=activations.relu, name="Feature_combination_2"))
model.add(layers.Dense(128,  activation=activations.relu, name="Feature_interpretation_1"))
model.add(layers.Dense(128,  activation=activations.relu, name="Feature_interpretation_2"))
model.add(layers.Dense(100,  activation=activations.sigmoid, name="Feature_separation_1"))
model.add(layers.Dense(8,  activation=activations.softmax, name="Decision"))

model.summary()


Model: "MS_community_prediction"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 Feature_combination_1 (Den  (None, 8, 68, 1000)       825001000 
 se)                                                             
                                                                 
 Feature_combination_2 (Den  (None, 8, 68, 100)        100100    
 se)                                                             
                                                                 
 Feature_interpretation_1 (  (None, 8, 68, 100)        10100     
 Dense)                                                          
                                                                 
 Feature_interpretation_2 (  (None, 8, 68, 100)        10100     
 Dense)                                                          
                                                                 
 Feature_separation_1 (Dens  (None, 8, 68, 

## Splitting dataset

In [35]:
binned_dfs.shape[1]

68

In [None]:
X = binned_dfs
y = comm8

test_index = sample_without_replacement(68, math.ceil(68/5))
test_data  = X.iloc[test_index]
test_labels = y.iloc[test_index]

skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)        # non stratified: kf = KFold(n_splits = 5)
for train_index, val_index in skf.split(X, y):
	training_data = X.iloc[train_index]
	training_labels = y.iloc[train_index]
	validation_data = X.iloc[val_index]
	validation_labels = y.iloc[val_index]

	model.fit(training_data, training_labels, epochs=10)
	test_loss, test_acc = model.evaluate(validation_data,  validation_labels, verbose="auto")
	
	probability_model = keras.Sequential([model, keras.layers.Softmax()])
	predictions = probability_model.predict(test_data)

	

In [42]:
from MStoML.skripts.FIA.FIA import quick_plot


def plot_spec_label(i, predictions_array, test_data, test_labels):
  true_label, ms_peaks = test_labels[i], test_data.iloc[i]
  plt.grid(False)
  plt.xticks([])
  plt.yticks([])
  
  spectrum = oms.MSSpectrum()
  spectrum.set_peaks((ms_peaks.columns, ms_peaks.values))
  plt.show(quick_plot(spectrum, plottype="scatter"))
  predicted_label = np.argmax(predictions_array)
  if predicted_label == true_label:
    color = 'blue'
  else:
    color = 'red'
  plt.xlabel("{} {:2.0f}% ({})".format(test_labels[predicted_label],
                                100*np.max(predictions_array),
                                test_labels[true_label]),
                                color=color)

def plot_value_array(i, predictions_array, test_labels):
  true_label = test_labels[i]
  plt.grid(False)
  plt.xticks(range(10))
  plt.yticks([])
  thisplot = plt.bar(range(10), predictions_array, color="#777777")
  plt.ylim([0, 1])
  predicted_label = np.argmax(predictions_array)

  thisplot[predicted_label].set_color('red')
  thisplot[true_label].set_color('blue')



array([61, 55, 67, 62, 66, 27, 51, 21,  0, 50, 17, 36, 12, 59])

In [None]:
# Plot the first X test images, their predicted labels, and the true labels.
# Color correct predictions in blue and incorrect predictions in red.
num_rows = 5
num_cols = 3
num_images = num_rows*num_cols
plt.figure(figsize=(2*2*num_cols, 2*num_rows))
for i in range(min(num_images, len(test_labels))):
  plt.subplot(num_rows, 2*num_cols, 2*i+1)
  plot_spec_label(i, predictions[i], test_data, test_labels)
  plt.subplot(num_rows, 2*num_cols, 2*i+2)
  plot_value_array(i, predictions[i], test_labels)
plt.tight_layout()
plt.show()
