### Imports and functions

In [None]:
%%capture
#required modules
!pip install .

import pandas as pd
import numpy as np
import os
import glob
from cobra import io
from jproperties import Properties
from zipfile import ZipFile
from mana import modelling, batchs, results_processing, dars


### Load properties file

In [None]:
# Load the notebook configuration from 'props.properties' in the current
# working directory. Every following cell reads its settings from `props`.
props = Properties()
try:
    with open('props.properties', 'rb') as config_file:
        props.load(config_file)
except FileNotFoundError as e:
    print(e)
    print("\033[91m\033[1m "+"You must provide a props.properties file"+" \033[0m\033[91m")
    # Stop here instead of letting the next cells run against an empty
    # configuration and fail with a less explicit error.
    raise

### Load required datasets

In [None]:
# Load the model, the sample metadata, the compound list and the expression
# data from the locations given in the properties file.

### Load the model
model = io.load_json_model(props.get("modelFile").data)
# An empty modelId keeps the identifier stored in the model file.
model_id = str(props.get("modelId").data)
if model_id:
    model.id = model_id

### Load the metadata file
pheno = pd.read_csv(props.get("pheno").data, sep="\t", index_col=0)

### Get relevant data and process it
# Compounds are given as a '/'-separated list. Splitting an empty value yields
# [''] rather than [], so blank entries are dropped before testing whether the
# list is empty.
cpds = [entry for entry in str(props.get("cpds").data).split('/') if entry.strip()]
#If cpds is empty, ask the user a list of compounds
if not cpds:
    while True:
        input_buf = input('Provide either a molecule name or stop')
        if input_buf.lower() == 'stop':
            break
        # Ignore blank answers so the list never contains an empty name.
        if input_buf.strip():
            cpds.append(input_buf)

#load barcode processed data
gprs = modelling.get_GPR_reactions(model)
hgnc_data = pd.read_csv(props.get("mappingFile").data, sep="\t", dtype='unicode')
data = pd.read_csv(props.get("data").data, sep="\t")

### Prepare datasets and working env

In [None]:
#map gene expression identifiers and build sub dataset
col_to_add = modelling.identify_model_gene_ids(model)
if col_to_add == "model not implemented":
    # NOTE(review): execution continues after this message; confirm whether
    # map_single_column can cope with this sentinel value.
    print("\033[91m\033[1m "+"Unknown model"+" \033[0m\033[91m")
data = modelling.map_single_column(data,hgnc_data,col_to_add)
barcodes_to_keep = {}
anot_cols = ['PROBEID','SYMBOL','HGNC ID','ENTREZID','GENENAME','N_GENES_IDENTICAL_PROBE']

#build the run directory (<working_path><dose>_<time>) and its sub-folders
dirname = str(props.get("working_path").data)+str(props.get("dose").data).lower()+'_'+str(props.get("time").data).replace(' ','_')
os.makedirs(dirname, exist_ok=True)
working_folders = ['batch_renum/batch','csvs','full_rxn_enum_set','log_dir','working_renum']
for name in working_folders:
    # makedirs also creates 'batch_renum/batch' when 'batch_renum' already
    # exists, a case a plain existence test on the parent folder would skip.
    os.makedirs(dirname+"/"+name, exist_ok=True)

#collect, for each compound, the samples matching the requested dose and time
dose_value = props.get("dose").data
time_value = props.get("time").data
# Iterate over `cpds` so compounds entered interactively are honoured too.
for cpd in cpds:
    matching = (pheno['dose_level'] == dose_value) & \
               (pheno['sacri_period'] == time_value) & \
               (pheno['compound_name'] == cpd)
    barcodes_to_keep[cpd] = list(pheno[matching].index)

#annotation columns first, then the sample columns in compound order
selected_cols = anot_cols + [barcode for cpd in cpds for barcode in barcodes_to_keep[cpd]]
subset_data = pd.DataFrame(data.loc[:,selected_cols])
#extract data and binarize with the 75/25 method
modelling.preprocess_data(subset_data,col_to_add,model,csvs_path=dirname+"/csvs/")

### Write reaction enum batch scripts

In [None]:
# Call write_rxn_enum_script once for every entry of csvs/ whose name
# contains ".CEL", using that entry as the weight file.
csv_dir = f"{dirname}/csvs/"
cel_weight_files = [entry for entry in os.listdir(csv_dir) if ".CEL" in entry]
for weight_name in cel_weight_files:
    batchs.write_rxn_enum_script(
        script_path=str(props.get("rxn_enum_script_path").data),
        batch_directory=f"{dirname}/batch_renum/",
        output_directory=f"{dirname}/working_renum",
        modelfile=props.get("modelFile").data,
        weightfile=csv_dir+weight_name,
        reactionFile=props.get("working_path").data+"input_data/recon2_2_reactions.csv",
        para_batchs=True,
    )

### Launch all the batches on the cluster
To run reaction enum on the cluster, you will need the dexom-python package with its dependencies installed (preferably in a conda environment) and CPLEX installed (to install locally on the genotoul cluster)

In [None]:
#for testing purpose only, get modelisation results and unzip them
renum_archive_pattern = props.get("working_path").data+"input_data/test_data/"+\
    str(props.get("dose").data).lower()+"_"+\
    str(props.get("time").data).lower().replace(" ","")+"_renum_sols.zip"
for file in glob.glob(renum_archive_pattern):
    with ZipFile(file, 'r') as zObject:
        # Extract into the run directory the next cells read from. Rebuilding
        # the path with a lower-cased time could point to another folder than
        # `dirname`, which keeps the original case of the time value.
        zObject.extractall(path=dirname+"/working_renum/")
    print(file)

### Check that all the batches are done. If necessary, launch the launch_failed_batch_reaction_enum.sh script

In [None]:
# Run remove_done_batchs on the batch and result folders, then count the
# '*.CEL*.sh' scripts still present; any remaining script aborts the cell.
renum_batch_dir = f"{dirname}/batch_renum/batch/"
results_processing.remove_done_batchs(renum_batch_dir, f"{dirname}/working_renum/", relax_param=True)
pending_scripts = glob.glob(renum_batch_dir+'*.CEL*.sh', recursive=False)
if not pending_scripts:
    print("All batchs have been processed")
else:
    print(len(pending_scripts),' batchs to relaunch')
    raise FileExistsError


### Concatenate solutions in one csv file per biological condition and replicate

In [None]:
# Pass the reaction-enumeration working folder and the target folder
# 'full_rxn_enum_set' to concatenate_solutions (single CPU).
results_processing.concatenate_solutions(
    f"{dirname}/working_renum/",
    f"{dirname}/full_rxn_enum_set",
    col_index="",
    ncpus=1,
)

### Generate batch for diversity enum, starting from full reaction enum results

To launch the diversity enum pipeline, you must have finished the following steps:
* Full enumeration completed
* Probabilities computation on all the solutions

In [None]:
# Create the diversity-enumeration folders under the run directory, then call
# write_div_enum_script once per entry of csvs/.
working_folders = ['batch_dexom/batch','prev_sol_dir','full_div_enum_set','working_divers']
for name in working_folders:
    # makedirs also creates 'batch_dexom/batch' when 'batch_dexom' already
    # exists, a case a plain existence test on the parent folder would skip.
    os.makedirs(dirname+"/"+name, exist_ok=True)
# NOTE(review): unlike the reaction-enum cell, entries are not filtered on
# ".CEL" here - confirm csvs/ only ever contains weight files.
for file in os.listdir(dirname+"/"+'csvs/'):
    # NOTE(review): modelfile is a hard-coded relative path whereas the
    # reaction-enum cell uses props "modelFile" - confirm this is intended.
    batchs.write_div_enum_script(script_path=str(props.get("div_enum_script_path").data),
                                 batch_directory=dirname+"/"+'batch_dexom',
                                 rxn_enum_set_dir=dirname+"/"+'full_rxn_enum_set',
                                 output_directory=dirname+"/"+'working_divers',
                                 modelfile='../input_data/recon2v2_biomass_corrected.json',
                                 weightfile=dirname+"/"+'csvs/'+file,
                                 reactionFile=props.get("working_path").data+'input_data/recon2_2_reactions.csv',
                                 prev_sol_dir=dirname+"/"+'prev_sol_dir/')

### Launch all the batches on the cluster
To run diversity enum on the cluster, you will need the dexom-python package with its dependencies installed (preferably in a conda environment) and CPLEX installed (to install locally on the genotoul cluster)

In [None]:
#for testing purpose only, get modelisation results and unzip them
divers_archive_pattern = props.get("working_path").data+"input_data/test_data/"+\
    str(props.get("dose").data).lower()+"_"+\
    str(props.get("time").data).lower().replace(" ","")+"_divers_sols.zip"
for file in glob.glob(divers_archive_pattern):
    with ZipFile(file, 'r') as zObject:
        # Extract into the run directory the next cells read from. Rebuilding
        # the path with a lower-cased time could point to another folder than
        # `dirname`, which keeps the original case of the time value.
        zObject.extractall(path=dirname+"/working_divers/")

### Check that all the batches are done. If necessary, launch the launch_failed_batch_reaction_enum.sh script

In [None]:
# Run remove_done_batchs on the diversity batch and result folders, then count
# the '*.CEL*.sh' scripts still present; any remaining script aborts the cell.
dexom_batch_dir = f"{dirname}/batch_dexom/batch/"
results_processing.remove_done_batchs(dexom_batch_dir, f"{dirname}/working_divers/", relax_param=False)
pending_scripts = glob.glob(dexom_batch_dir+'*.CEL*.sh', recursive=False)
if not pending_scripts:
    print("All batchs have been processed")
else:
    print(len(pending_scripts),' batchs to relaunch')
    raise FileExistsError

### Concatenate solutions in one csv file per biological condition and replicate

In [None]:
# Pass the diversity-enumeration working folder and the target folder
# 'full_div_enum_set' to concatenate_solutions (single CPU).
results_processing.concatenate_solutions(
    f"{dirname}/working_divers/",
    f"{dirname}/full_div_enum_set",
    col_index="",
    ncpus=1,
)

### Remove zero biomass solutions

In [None]:
# Apply remove_zerobiomass_solutions to the reaction-enum set first, then to
# the diversity-enum set, each time with the same reactions file.
for enum_set in ('full_rxn_enum_set', 'full_div_enum_set'):
    results_processing.remove_zerobiomass_solutions(
        f"{dirname}/{enum_set}",
        props.get("working_path").data+'input_data/recon2_2_reactions.csv',
    )

### Concatenate the solutions

In [None]:
# Make sure the 'full_enum' output folder exists, then hand both enumeration
# sets to concatenate_reaction_div_enum.
run_dir_entries = os.listdir(f"{dirname}/")
if "full_enum" not in run_dir_entries:
    os.mkdir(f"{dirname}/full_enum")
results_processing.concatenate_reaction_div_enum(
    path_concat_rxn_enum=f"{dirname}/full_rxn_enum_set",
    path_concat_div_enum=f"{dirname}/full_div_enum_set",
    out_dir=f"{dirname}/full_enum/",
    col_index="",
    single_csv=False,
    ncpus=1,
)