# Analysis Workflow #

#### Imports

In [None]:
# !pip install pandas
# !pip install jproperties
import os
import glob
import warnings
import itertools
import pandas as pd
import numpy as np
from jproperties import Properties
from zipfile import ZipFile
from mana import dars

### Load properties file

In [None]:
# Parse the workflow settings from ``props.properties`` (looked up in the
# current working directory) into ``props``.
props = Properties()
try:
    with open('props.properties', 'rb') as properties_handle:
        props.load(properties_handle)
except FileNotFoundError as missing_file:
    # Report the problem (second line uses ANSI colour codes) and carry on;
    # ``props`` then stays empty.
    print(missing_file)
    print("\033[91m\033[1m You must provide a props.properties file \033[0m\033[91m")

### Load required datasets

In [None]:
# Read the metadata table referenced by the ``pheno`` property:
# tab-separated, with the first column used as the row index.
pheno = pd.read_csv(
    props.get("pheno").data,
    index_col=0,
    sep="\t",
)

### Compute activation frequencies

In [None]:
# Reactions of interest: first column of the CSV referenced by ``rListFile``.
rList = list(pd.read_csv(props.get("rListFile").data).iloc[:, 0])

# Compute one frequency table per ``full_enum`` directory found (recursively)
# under the working path, then concatenate them in a single call instead of
# growing a DataFrame inside the loop.
enum_pattern = props.get("working_path").data + '**/full_enum'
freq_frames = [
    dars.calculate_frequencies_for_dir(enum_dir, rList=rList)
    for enum_dir in glob.iglob(enum_pattern, recursive=True)
]
if not freq_frames:
    # Fail with an explicit message rather than an IndexError further down.
    raise FileNotFoundError(
        "No 'full_enum' directory found with pattern " + enum_pattern
    )
freq_table = pd.concat(freq_frames)

# Build the condition label of every row from the metadata table:
# <compound_name>_<dose_level>_<sacri_period with " hr" -> "h">_freq.
# The labels are collected first and inserted as one string column, which
# avoids writing strings cell by cell into a float (NaN) column.
condition_labels = []
for barcode in freq_table.index:
    barcode_condition = pheno.loc[barcode, ["compound_name", "sacri_period", "dose_level"]]
    condition_labels.append(
        str(barcode_condition.compound_name) + "_"
        + str(barcode_condition.dose_level) + "_"
        + str(barcode_condition.sacri_period).replace(" hr", "h") + "_freq"
    )
freq_table.insert(1, "Condition", condition_labels)

# Merge replicates: average all rows sharing the same condition label.
freq_table = freq_table.groupby('Condition').mean()

In [None]:
# Load the controls for all the chemicals tested in the study.
# For testing purposes only: unpack the archived modelling results into the
# ``control_24_hr/full_enum/`` directory of the working path.
controls_archive = props.get("working_path").data + "input_data/test_data/all_controls_24hr.zip"
controls_target = props.get("working_path").data + "control_24_hr/full_enum/"
with ZipFile(controls_archive, 'r') as archive:
    archive.extractall(path=controls_target)

### Compute baseline noise (optional)

In [None]:
# Pool the enumerated control solutions per molecule and time, then compute
# their activation frequencies (delegated to ``dars.calculate_freq_ctrls``).
noise_working_path = props.get("working_path").data
noise_time = props.get("time").data
freq_ctrls = dars.calculate_freq_ctrls(
    props.get("rListFile").data,
    props.get("all_cpds").data,
    noise_time,
    pheno,
    noise_working_path,
)

# Metadata rows of the control samples: compound listed in ``all_cpds``,
# sacrifice period equal to ``time`` and dose level "Control".
# The three masks are combined into one so that the boolean key has the same
# index as the frame it filters (no chained ``pheno[a][b]`` indexing).
is_control_row = (
    pheno["compound_name"].isin(str(props.get("all_cpds").data).split('/'))
    & (pheno["sacri_period"] == noise_time)
    & (pheno["dose_level"] == "Control")
)
condition_metadata = pheno[is_control_row]

# Output directory for the noise tables; created if missing.
noise_dir = noise_working_path + "input_data/noise_data"
os.makedirs(noise_dir, exist_ok=True)

# For each vehicle, compare every pair of control conditions sharing it.
for vehicle in condition_metadata["Solvent"].unique():
    tmp_metadata = condition_metadata[condition_metadata['Solvent'] == vehicle]
    vehicle_conditions_list = [
        compound + "_control_" + str(noise_time).replace(" ", "_")
        for compound in tmp_metadata['compound_name'].unique()
    ]

    # One column per pair: squared difference of the two frequency rows.
    pair_noise = []
    for first_ctrl, second_ctrl in itertools.combinations(vehicle_conditions_list, r=2):
        squared_diff = np.square(freq_ctrls.loc[first_ctrl] - freq_ctrls.loc[second_ctrl])
        pair_noise.append(squared_diff.rename(first_ctrl + "vs" + second_ctrl))

    if pair_noise:
        ctrl_noise = pd.concat(pair_noise, axis=1)
    else:
        # A single control for this vehicle yields no pair; the (empty) file is
        # still written, as before, but the situation is now reported.
        warnings.warn(
            "Fewer than two control conditions for vehicle " + str(vehicle)
            + ": the noise table written for it is empty"
        )
        ctrl_noise = pd.DataFrame()

    # Median over all pairs, written as a single row with one column per
    # entry of the frequency rows.
    ctrl_noise.median(axis=1).to_frame().transpose().to_csv(
        noise_dir + "/" + "med_R2_" + vehicle + "_Control_"
        + str(noise_time).replace(" ", "") + "_combinations.tsv",
        index=False,
        sep='\t',
    )


#### Extract a list of differentially activated reactions between control and treated conditions

In [None]:
#TODO INTEGRATE THE FORMULA RMD
# Compute the differentially activated reactions (DARs) between the control
# and the treated condition of every molecule listed in the ``cpds`` property,
# and write them to <working_path>/DARS and <working_path>/DARS_direction.
reaction_prefix = "R_"
dars_working_path = props.get("working_path").data
dars_dir = os.path.join(dars_working_path, "DARS")
direction_dir = os.path.join(dars_working_path, "DARS_direction")
os.makedirs(dars_dir, exist_ok=True)
os.makedirs(direction_dir, exist_ok=True)

# Settings that do not change between molecules are read once.
r2_cutoff = float(props.get("cutoff").data)
dars_time = str(props.get("time").data)
dars_cond = str(props.get("cond").data)
noise_filtering = str(props.get("baseline_noise_filtering").data) == "True"

# Warnings are silenced only while the DARs are computed (original author's
# note: division-by-zero warnings that are handled in the code). The context
# manager restores the previous warning filters even if an error is raised.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    # Iterate to compute DARs between control and treated conditions for each molecule
    for molecule in str(props.get("cpds").data).split('/'):
        # NOTE(review): conditions are matched by substring and only the first
        # two matches are used - confirm this is intended when a molecule has
        # more than two conditions or when one name contains another.
        indices = [s for s in freq_table.index if molecule in s]
        if len(indices) < 2:
            raise ValueError("Less than two conditions for this molecule, we cannot compute DARs. Check previous steps of the workflow")
        # Put the control condition first.
        if "Control" not in indices[0]:
            indices.reverse()
        print(indices)
        comp_freq = pd.DataFrame(freq_table.loc[[indices[0], indices[1]]]).dropna(axis=1).transpose()
        print(comp_freq.columns)
        comp_freq.columns = ['f_ctrl', 'f_treatment']
        comp_freq.insert(loc=0, column='data_id', value=comp_freq.index)
        # rescale and rotate
        comp_freq = dars.rescale_and_rotate(comp_freq)
        # compute scores
        comp_freq = dars.compute_scores(comp_freq)

        # Keep the reactions whose R2 exceeds the cutoff and record whether the
        # treated frequency is above (UP) or below (DOWN) the control one.
        DAR_direction = {}
        for reaction in comp_freq.data_id:
            if comp_freq.loc[reaction].R2 > r2_cutoff:
                f_treatment = comp_freq.loc[reaction, "f_treatment"]
                f_ctrl = comp_freq.loc[reaction, "f_ctrl"]
                if f_treatment > f_ctrl:
                    DAR_direction[reaction_prefix + reaction] = "UP"
                elif f_treatment < f_ctrl:
                    DAR_direction[reaction_prefix + reaction] = "DOWN"
                else:
                    DAR_direction[reaction_prefix + reaction] = "UNDETERMINED"

        if noise_filtering:
            # Filter according to the noise table of the molecule's vehicle and time.
            noise = pd.read_csv(
                dars_working_path + "input_data/noise_data/med_R2_"
                + pheno[pheno["compound_name"] == molecule].Solvent.unique()[0]
                + "_Control_" + dars_time.replace(" ", "")
                + "_combinations.tsv",
                sep="\t",
            ).transpose()
            DAR_list_filtered = {}
            for reaction, direction in DAR_direction.items():
                # Strip only the leading prefix: ``str.replace`` would also
                # remove any "R_" occurring inside the reaction id itself.
                reaction_id = reaction[len(reaction_prefix):]
                if comp_freq.loc[reaction_id].R2 > noise.loc[reaction_id, 0] * 2:
                    DAR_list_filtered[reaction] = direction
            exported_dars = DAR_list_filtered
            # NOTE(review): no separator between <cond> and "noise_filtered";
            # kept as is because other steps may rely on this file name.
            file_suffix = 'noise_filtered.tsv'
        else:
            exported_dars = DAR_direction
            file_suffix = '.tsv'

        file_name = molecule + '_' + dars_time.replace(" ", "_") + '_' + dars_cond + file_suffix
        # One reaction name per line.
        pd.DataFrame(list(exported_dars)).to_csv(
            os.path.join(dars_dir, file_name), sep='\t', index=False, header=False)
        # Reaction name and direction, tab-separated.
        pd.DataFrame([exported_dars]).transpose().to_csv(
            os.path.join(direction_dir, file_name), sep='\t', header=False)