In [None]:
!pip install dnngior

Load dependencies

In [None]:
import numpy as np
import pandas as pd
import os
import sys
from pathlib import Path
from dnngior import NN_Trainer
from dnngior.reaction_class import Reaction as rc
# Working directory the notebook is run from; later cells build file paths from it.
path = Path.cwd()
# sys.path entries must be plain strings: non-string entries (such as Path
# objects) are ignored by the import system, so convert before appending.
sys.path.append(str(path))

### Generating reaction presence dataframe

To prepare the training data we need to determine which reactions are present in your training metabolic models. This means that we need to generate a list of all possible reactions found in your training data, which will serve as the reaction keys. We can then determine, for each draft training model, which of these reactions are present and create a binary list of reaction presences. We will end up with a binary array with the different reactions on one axis and every model in the training data on the other.

We will use the class we built, but you can use any module to load metabolic models or extract the reaction sets in another way; the key is to end up with a binary array of reaction presences. If you already have this, this step can be skipped.

In [None]:
# Path to the directory holding the training (draft) models -- set before running

model_path =  ''

# Output path (CSV file) for the reaction presence table -- set before running

output_path = ''

# File names of the draft models. Sorted because os.listdir returns entries in
# arbitrary order, and restricted to regular files so that sub-directories
# (e.g. .ipynb_checkpoints) are not handed to the model loader.
paths = sorted(
    entry for entry in os.listdir(model_path)
    if os.path.isfile(os.path.join(model_path, entry))
)

# Model id = file name without its extension. splitext copes with any extension
# length, whereas a fixed [:-5] slice is only right for four-letter extensions.
model_ids = [os.path.splitext(filename)[0] for filename in paths]
n_models = len(model_ids)

dic = {}          # model id -> set of reactions present in that model
rxn = []          # every reaction seen so far, in order of first appearance
seen_rxn = set()  # same content as rxn, kept for constant-time membership tests
for file_path, model_id in zip(paths, model_ids):
    print(model_id)
    model = rc(model = os.path.join(model_path, file_path))
    rs = set(model.reactions)
    dic[model_id] = rs

    # extend the list of all possible reactions with those not seen before
    new_rxn = rs - seen_rxn
    rxn.extend(new_rxn)
    seen_rxn |= new_rxn

n_reactions = len(rxn)

# Binary presence table: one row per reaction key, one column per model,
# 1 where the model contains the reaction and 0 otherwise.
reaction_df = pd.DataFrame(
    {
        model_id: [int(reaction in present) for reaction in rxn]
        for model_id, present in dic.items()
    },
    index=rxn,
)

# save the table as CSV
reaction_df.to_csv(output_path)



### Training the Neural Network

The easiest way to train the network is to provide a pandas dataframe in which the index holds the reaction keys and the columns are the different training examples (see above). You can also provide a numpy array and the reaction keys as a separate list. The function returns an NN_predictor object that can be used immediately, and it also saves it at output_path.

In [None]:
# Folder with the example data; the trained network is written here as well
NN_folder = os.path.join(path,'docs', 'NN')

# Destination file for the trained predictor
save_path = os.path.join(NN_folder, 'example.npz')

# Small sample of training data (first CSV column is used as the index)
data_path = os.path.join(NN_folder, 'Sample_reaction_presence.csv')
data = pd.read_csv(data_path, index_col=0)

# Train on the sample and keep the returned predictor
NN_example = NN_Trainer.train(
    data=data,
    modeltype='ModelSEED',
    output_path=save_path,
)

The function returns an object of the predictor class (NN_predictor) containing the network, the reaction keys and the modeltype. By default the network in this object is not a full tensorflow object but rather an array of the weights and biases of the different layers. This is less memory intensive, and it can still be used to make predictions using matrix multiplication:

        a = input
        for layer in self.network:
            a = a.clip(0)
            a = ((a @ layer[0]) + layer[1])
        prediction =  1 / (1 + np.exp(-a))#sigmoid(a)
        
This is built into the NN_Predictor class.

In [None]:
# Inspect what the returned predictor holds: the weights and bias of the first
# layer (only a small corner is printed), the reaction keys and the model type.
print("The weights of the first layer network: \n{}".format(NN_example.network[0][0][:3,:3]))
print("The bias of the first layer network: \n{}".format(NN_example.network[0][1][:3]))
print("The rxn_keys: \n{}".format(NN_example.rxn_keys.values))
print("The Modeltype: {}".format(NN_example.modeltype))

# Use the first three columns of the training data as test input.
# The variable is `test_input`; the previous call passed an undefined name
# `test`, which raised a NameError on a fresh kernel.
test_input = data.iloc[:,:3]
p = NN_example.predict(test_input)
print("Prediction: \n{}".format(np.round(p,3)))

### Changing feature generation parameters

You now know the basics of training a network, but there are many additional changes you may want to make during training.

During training the function will automatically generate the training dataset. You can change several parameters for the generation of the features:

1. You can change the number of times each training model is used (nuplo).
2. You can change the range of deletion percentages (min_for to max_for), which is covered in equal-sized steps based on the number of replicates.
3. You can weigh the deletion of certain reactions (del_p). 
4. You can add false reactions (min_con and max_con) in addition to removing reactions during training*

*Note: we do not currently use this and it will not work with the masking of input reactions as the mask does not differentiate between contamination and real reactions.

In the following example we set nuplo to 5 instead of 30, and we vary deletion between 0.05 and 0.35. We also don't have to keep saving the networks, so we can set save=False.

In [None]:
# Use each training model 5 times, vary the deletion fraction between 0.05
# and 0.35, and skip saving the result.
network = NN_Trainer.train(
    data=data,
    modeltype='ModelSEED',
    nuplo=5,
    min_for=0.05,
    max_for=0.35,
    save=False,
)

By default the network will assume that your input (the data without deletions) is what the network should try to predict. Alternatively, you can provide labels (the full set of reactions) for the network to try and predict.

In [None]:
# Illustrative labels: the rows of `data` in a random order, with the original
# index and columns kept so the labels have the same shape and keys as the input.
# The permuted copy is built explicitly: shuffling np.asarray(frame) in place
# only changes the frame when pandas hands back a writable view, which is not
# guaranteed (the array is read-only under Copy-on-Write).
# A seeded generator keeps the example reproducible.
label_rng = np.random.default_rng(0)
special_labels = pd.DataFrame(
    label_rng.permutation(data.to_numpy()),
    index=data.index,
    columns=data.columns,
)
# Labels must line up with the input one-to-one.
assert special_labels.shape == data.shape

network = NN_Trainer.train(data=data, labels=special_labels, modeltype='ModelSEED',save=False)

Finally, you can rely on the default parameters to define the network, which we optimised for our use case, but for optimal performance on different datasets you might want to change the hyperparameters (dropout, b_size), the architecture (nnodes, nlayers) or the bias of predicted classes (bias_0). You can also disable the masking of input positions during loss calculation (maskI=False). You can also provide a validation split, which will set apart a part of your input data during training and calculate scores afterwards to validate your network.

In [None]:
# Override the default hyperparameters, architecture and class bias, disable
# input masking in the loss, and hold out 20% of the data for validation.
network = NN_Trainer.train(
    data=data,
    modeltype='ModelSEED',
    # hyperparameters
    dropout=0.2,
    b_size=42,
    # architecture
    nnodes = 420,
    nlayers=3,
    # bias of predicted classes
    bias_0=0.42,
    maskI=False,
    validation_split=0.2,
    save=False,
)

### Tensorflow object

By default the function returns a class with the simplified network, but you may instead want the full Tensorflow network. To do this you can set return_full_network = True, which will change the NN_predictor to contain a Tensorflow network instead. If you want to save this different class you can change the file extension to .h5.

If you set return_history = True it will also return the history of training for optimisation purposes.

In [None]:
# Save under an .h5 file name, as used for the full Tensorflow network
save_path = os.path.join(NN_folder, 'example.h5')

# Ask for the full network and for the training history as a second return value
NN_tensorflow, history = NN_Trainer.train(
    data=data,
    modeltype='ModelSEED',
    output_path=save_path,
    return_full_network=True,
    return_history=True,
)

In [None]:
# Show the layers of the full Tensorflow network
NN_tensorflow.network.summary()
# Predict on the test input defined earlier (`test_input`), transposed.
# The previous call used an undefined name `test`, which raised a NameError.
NN_tensorflow.network.predict(test_input.T)

In [None]:
import matplotlib.pyplot as plt

# Plot the recorded training loss so the figure can be read on its own.
# NOTE(review): assumes history.history['loss'] holds one value per epoch
# (Keras-style History object) -- confirm against NN_Trainer.train.
fig, ax = plt.subplots()
ax.plot(history.history['loss'])
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.set_title('Training loss')
plt.show()