### **Notebook to create a dataframe that maps each GSM (genome-scale model) reaction to its transcript measurements, for all reactions**

##### **Import python packages**

In [1]:
import pandas as pd
import numpy as np
import cobra
from matplotlib import pyplot as plt

from edd_utils import login, export_study, export_metadata

##### **Load Yoneda 2016 data and Henson 2018 data**

In [2]:
# Slugs of the two EDD studies to download; they are referred to below as the
# "glucose" and "phenol" studies.
glucose_study_slug = 'biodesign_yoneda_set3_reprocessed'
phenol_study_slug = 'biodesign_henson2018_reprocessed'
# EDD server hostname and the account name used to log in. The password is not
# stored here; it is prompted for interactively when `login` is called.
edd_server = 'public-edd.jbei.org'
user       = 'garrettroell'

In [3]:
# Open an authenticated session with the EDD server (prompts for the password).
session = login(edd_server=edd_server, user=user)

# Export each study through that session; the results are used as pandas
# DataFrames in the cells below.
glucose_df = export_study(session, glucose_study_slug, edd_server=edd_server)
phenol_df = export_study(session, phenol_study_slug, edd_server=edd_server)

Password for garrettroell:  ················


HBox(children=(FloatProgress(value=0.0, max=192297.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=440493.0), HTML(value='')))




##### **Filter transcriptomics data from all EDD data into different dataframes**

In [4]:
# Keep only the rows whose protocol name mentions transcriptomics.
# `na=False` makes rows with a missing protocol count as "no match", so the mask
# is strictly boolean and the selection cannot fail on NaN entries.
glucose_trans_df = glucose_df[glucose_df['Protocol'].str.contains('Transcriptomics', na=False)]
phenol_trans_df = phenol_df[phenol_df['Protocol'].str.contains('Transcriptomics', na=False)]
glucose_trans_df.head()

Unnamed: 0,Study ID,Study Name,Line ID,Line Name,Line Description,Protocol,Assay ID,Assay Name,Formal Type,Measurement Type,Compartment,Units,Value,Hours
0,14400,Biodesign_Yoneda_set3_reprocessed,15006,EVOL33-LN-G-R1,R. Opacus adaptively evolved strain1 with 1.0 ...,Transcriptomics,15030,EVOL33-LN-G-R1,,WP_000104864_1,0,FPKM,0.0,14.0
1,14400,Biodesign_Yoneda_set3_reprocessed,15007,EVOL33-LN-G-R2,R. Opacus adaptively evolved strain1 with 1.0 ...,Transcriptomics,15031,EVOL33-LN-G-R2,,WP_000104864_1,0,FPKM,0.0,14.0
2,14400,Biodesign_Yoneda_set3_reprocessed,15008,EVOL33-LN-G-R3,R. Opacus adaptively evolved strain1 with 1.0 ...,Transcriptomics,15032,EVOL33-LN-G-R3,,WP_000104864_1,0,FPKM,0.0,14.0
3,14400,Biodesign_Yoneda_set3_reprocessed,14991,EVOL33-LN-LP-R1,R. Opacus adaptively evolved strain1 with 0.75...,Transcriptomics,15015,EVOL33-LN-LP-R1,,WP_000104864_1,0,FPKM,0.0,24.0
4,14400,Biodesign_Yoneda_set3_reprocessed,14992,EVOL33-LN-LP-R2,R. Opacus adaptively evolved strain1 with 0.75...,Transcriptomics,15016,EVOL33-LN-LP-R2,,WP_000104864_1,0,FPKM,0.0,24.0


##### **Load Genome Scale Model**

In [5]:
# Path of the SBML genome-scale model, relative to this notebook.
file_name =  '../GSMs/Ropacus_annotated_curated_with_phenol_custom_biomass.xml'
# Parse the SBML file into a cobrapy model; `model.reactions` is used by the cells below.
model = cobra.io.read_sbml_model(file_name)

#### Define functions to find transcript levels for a reaction given a transcriptomic dataframe ####

In [13]:
def create_gpr_dict(model):
    """
    Parse the gene-reaction rule of every reaction into a set of gene groups.

    The rule string is split on ' or ' into alternatives, and each alternative
    is split on ' and ' into the genes that are required together. Surrounding
    parentheses and spaces are stripped from every piece.

    NOTE(review): the split is purely textual, so a nested rule such as
    '(A or B) and C' is not expanded; only rules written as an "or" of "and"
    groups are parsed as intended.

    Parameters
    ----------
    model : object with a `reactions` iterable whose items expose `id` and
        `gene_reaction_rule` (e.g. a cobrapy model).

    Returns
    -------
    dict mapping reaction id -> set of frozensets of gene ids. Reactions with
    an empty gene-reaction rule are left out.
    """
    def _clean(token):
        # drop the parentheses / spaces left over from the textual split
        return token.strip('() ')

    rules_by_reaction = {}
    for reaction in model.reactions:
        rule = reaction.gene_reaction_rule
        if not rule:
            continue
        rules_by_reaction[reaction.id] = {
            frozenset(_clean(gene) for gene in _clean(alternative).split(' and '))
            for alternative in rule.split(' or ')
        }
    return rules_by_reaction

"""
    Calculates bound value based on transcriptomics data for reactions in gene reaction rule
    
    NOTE: 
    If a reaction R1 has the GPR of 'A and B', it would be parsed to { {A, B} } in gpr_dict['R1']. Then t for R1 would be sum( [ min(A, B) ] ) = min(A, B).
    If a reaction R1 has the GPR of 'A or B', it would be parsed to { {A}, {B} } in gpr_dict['R1']. Then t for R1 would be sum( [ min(A), min(B) ] ) = sum( [A, B] ).
    If a reaction R1 has the GPR of '(A and B) or (C and D)', it would be parsed to { {A, B}, {C, D} } in gpr_dict['R1']. Then t for R1 would be sum( [ min(A, B), min(C, D) ] ).
    
        Parameters
        ----------
        model : cobrapy model.
        Transcriptomics : pandas dataframe with transcriptomics data.Data frame has gene identifiers as index and just one column with transcript values.  
        rxn : cobrapy model reaction
        
        
        Returns
        -------
        transcript bound value: float.
"""

def find_trans_bound_from_gpr(model, Transcriptomics, rxn, gpr_dict, newinf=np.inf):
    """
    Compute the transcript level of a reaction from its gene-reaction rule.

    Each entry of `gpr_dict[rxn.id]` is a group of genes that are required
    together ("and"); the groups are alternatives to one another ("or").
    The level of a group is the minimum over its genes and the level of the
    reaction is the sum over its groups:

      'A and B'                -> { {A, B} }          -> min(A, B)
      'A or B'                 -> { {A}, {B} }        -> A + B
      '(A and B) or (C and D)' -> { {A, B}, {C, D} }  -> min(A, B) + min(C, D)

    A gene that is absent from `Transcriptomics` counts as +inf, i.e. it never
    limits its group. A group in which no gene was measured contributes
    `newinf` instead.

    Parameters
    ----------
    model : unused; kept so existing callers do not have to change.
    Transcriptomics : pandas DataFrame indexed by gene identifier, holding the
        transcript values. If a gene occurs in several rows, its smallest
        value is used.
    rxn : object with an `id` attribute (e.g. a cobrapy reaction) that is a
        key of `gpr_dict`.
    gpr_dict : dict mapping reaction id -> set of frozensets of gene ids, as
        returned by `create_gpr_dict`.
    newinf : value substituted for a group whose minimum is +inf.

    Returns
    -------
    float : transcript bound value (0 if the reaction has no gene groups).

    Raises
    ------
    KeyError : if `rxn.id` is not a key of `gpr_dict`.
    """
    total_level = 0
    for gene_group in gpr_dict[rxn.id]:
        # Reduce every gene to one plain float first. Mixing the array returned
        # by `.loc[...]` with a scalar inf in a single list makes np.min fail
        # on a ragged sequence.
        gene_levels = [
            float(np.min(np.asarray(Transcriptomics.loc[gene], dtype=float)))
            if gene in Transcriptomics.index
            else np.inf
            for gene in gene_group
        ]
        # np.min (not the builtin) so that a NaN measurement still propagates
        group_level = np.min(gene_levels) if gene_levels else np.inf
        if group_level == np.inf:
            group_level = newinf
        total_level = total_level + group_level
    return total_level


#### Define a function to get the input transcriptomic dataframe for find_trans_bound_from_gpr ####

In [7]:
def construct_trans_df(trans_df, line_name):
    """
    Build the gene-indexed transcript table for a single line.

    Parameters
    ----------
    trans_df : pandas DataFrame with at least the columns 'Line Name',
        'Measurement Type' and 'Value'.
    line_name : value of 'Line Name' whose rows are kept.

    Returns
    -------
    New pandas DataFrame with the single column 'Value', indexed by
    'Measurement Type'. `trans_df` itself is not modified.
    """
    rows_for_line = trans_df.loc[trans_df['Line Name'] == line_name]
    return (
        rows_for_line
        .filter(items=['Value', 'Measurement Type'])
        .set_index('Measurement Type')
    )

#### Create the basic structure of the dataframe

In [8]:
# One row per model reaction, indexed by reaction id; the per-condition
# transcript columns are appended to this frame further down.
reaction_df = pd.DataFrame(
    [(rxn.name, rxn.reaction) for rxn in model.reactions],
    index=[rxn.id for rxn in model.reactions],
    columns=['reaction_name', 'reaction_string'],
)
reaction_df.head()

Unnamed: 0,reaction_name,reaction_string
12DGR120tipp,"1,2 diacylglycerol transport via flipping (per...",12dgr120_p --> 12dgr120_c
12DGR140tipp,"1,2 diacylglycerol transport via flipping (per...",12dgr140_p --> 12dgr140_c
12DGR141tipp,"1,2 diacylglycerol transport via flipping (per...",12dgr141_p --> 12dgr141_c
12DGR161tipp,"1,2 diacylglycerol transport via flipping (per...",12dgr161_p --> 12dgr161_c
12DGR180tipp,"1,2 diacylglycerol transport via flipping (per...",12dgr180_p --> 12dgr180_c


#### Fill dataframe with data from glucose conditions

In [14]:
# Parse every gene-reaction rule once; the lookup is shared by all conditions.
gpr_dict = create_gpr_dict(model)

# Line names of the replicates taken from the glucose study (all contain '-G-').
glucose_conditions = [
    'WT-LN-G-R1', 'WT-LN-G-R2', 'WT-LN-G-R3',
    'EVOL33-LN-G-R1', 'EVOL33-LN-G-R2', 'EVOL33-LN-G-R3',
    'EVOL40-LN-G-R1', 'EVOL40-LN-G-R2', 'EVOL40-LN-G-R3',
]

# Line names of the replicates taken from the phenol study.
phenol_conditions = [
    'WT-P-R1', 'WT-P-R2', 'WT-P-R3',
    'PVHG-P-R1', 'PVHG-P-R2', 'PVHG-P-R3',
]

for condition in glucose_conditions + phenol_conditions:
    print(condition)

    # '-G-' in the name selects the glucose study; every other name uses the phenol study
    if '-G-' in condition:
        trans_df = glucose_trans_df
    else:
        trans_df = phenol_trans_df

    # gene-indexed transcript table for this replicate only
    transcriptomics_df = construct_trans_df(trans_df, condition)

    # one entry per row of reaction_df, in the same order
    transcript_measurements = []
    for reaction_id in reaction_df.index:
        rxn = model.reactions.get_by_id(reaction_id)
        # reactions without a gene-reaction rule have no transcript level -> NaN
        level = np.nan
        if rxn.id in gpr_dict:
            level = find_trans_bound_from_gpr(model, transcriptomics_df, rxn, gpr_dict, newinf=np.inf)
        transcript_measurements.append(level)

    reaction_df[f'{condition} trans'] = transcript_measurements

reaction_df

WT-LN-G-R1
WT-LN-G-R2
WT-LN-G-R3
EVOL33-LN-G-R1
EVOL33-LN-G-R2
EVOL33-LN-G-R3
EVOL40-LN-G-R1
EVOL40-LN-G-R2
EVOL40-LN-G-R3
WT-P-R1
WT-P-R2
WT-P-R3
PVHG-P-R1
PVHG-P-R2
PVHG-P-R3


Unnamed: 0,reaction_name,reaction_string,WT-LN-G-R1 trans,WT-LN-G-R2 trans,WT-LN-G-R3 trans,EVOL33-LN-G-R1 trans,EVOL33-LN-G-R2 trans,EVOL33-LN-G-R3 trans,EVOL40-LN-G-R1 trans,EVOL40-LN-G-R2 trans,EVOL40-LN-G-R3 trans,WT-P-R1 trans,WT-P-R2 trans,WT-P-R3 trans,PVHG-P-R1 trans,PVHG-P-R2 trans,PVHG-P-R3 trans
12DGR120tipp,"1,2 diacylglycerol transport via flipping (per...",12dgr120_p --> 12dgr120_c,,,,,,,,,,,,,,,
12DGR140tipp,"1,2 diacylglycerol transport via flipping (per...",12dgr140_p --> 12dgr140_c,,,,,,,,,,,,,,,
12DGR141tipp,"1,2 diacylglycerol transport via flipping (per...",12dgr141_p --> 12dgr141_c,,,,,,,,,,,,,,,
12DGR161tipp,"1,2 diacylglycerol transport via flipping (per...",12dgr161_p --> 12dgr161_c,,,,,,,,,,,,,,,
12DGR180tipp,"1,2 diacylglycerol transport via flipping (per...",12dgr180_p --> 12dgr180_c,,,,,,,,,,,,,,,
12DGR181tipp,"1,2 diacylglycerol transport via flipping (per...",12dgr181_p --> 12dgr181_c,,,,,,,,,,,,,,,
12PPDStpp,"(S)-Propane-1,2-diol facilitated transport (pe...",12ppd__S_p <=> 12ppd__S_c,1.66651,1.74287,1.04483,1.59518,1.61617,1.21061,1.50650,1.20227,1.22692,2.37356,2.41686,2.82043,1.63712,1.43141,2.30775
12PPDt,"S-Propane-1,2-diol facilitated transport",12ppd__S_e <=> 12ppd__S_c,1.66651,1.74287,1.04483,1.59518,1.61617,1.21061,1.50650,1.20227,1.22692,2.37356,2.41686,2.82043,1.63712,1.43141,2.30775
14GLUCANabcpp,"1,4-alpha-D-glucan transport via ABC system (p...",14glucan_p + atp_c + h2o_c --> 14glucan_c + ad...,7.94798,6.75360,7.66209,9.89014,9.62353,10.03080,5.96575,8.13841,9.33820,1.14283,3.18587,1.34889,2.12826,1.43141,1.23080
14GLUCANtexi,"1,4-alpha-D-glucan transport via diffusion (ex...",14glucan_e --> 14glucan_p,,,,,,,,,,,,,,,


#### Save dataframe as a csv to be used in other notebooks

In [9]:
# Replicates (EDD line names) of the glucose study to map onto the 13C-MFA reactions.
glucose_conditions = [
    'EVOL33-LN-G-R1',
    'EVOL33-LN-G-R2',
    'EVOL33-LN-G-R3',
    'EVOL40-LN-G-R1',
    'EVOL40-LN-G-R2',
    'EVOL40-LN-G-R3',
    'WT-LN-G-R1',
    'WT-LN-G-R2',
    'WT-LN-G-R3'
]

# NOTE(review): `glucose_fluxes` is not created anywhere in the visible part of
# this notebook. It must already be a DataFrame with a 'Forward Reactions'
# column (GSM reaction ids joined by ' or ' / ' and ') before this cell runs.

# Build the GPR lookup once: it depends on neither the condition nor the flux row.
gpr_dict = create_gpr_dict(model)


def _reaction_transcript_level(reaction_id, transcriptomics_df):
    """Return the transcript level of one GSM reaction, or None if it has no gene-reaction rule."""
    rxn = model.reactions.get_by_id(reaction_id.strip('() '))
    if rxn.id not in gpr_dict:
        return None
    return find_trans_bound_from_gpr(model, transcriptomics_df, rxn, gpr_dict, newinf=np.inf)


for condition in glucose_conditions:

    # Transcript table for the condition. It is taken explicitly from the glucose
    # study rather than from whatever frame a previous cell left in `trans_df`.
    transcriptomics_df = construct_trans_df(glucose_trans_df, condition)

    # define a list to hold all transcript measurements for a condition
    transcript_measurements = []

    # loop over the 13C MFA reactions
    for _, row in glucose_fluxes.iterrows():

        # reaction ids associated with the flux of this row, with directionality removed
        reaction_id_string = row['Forward Reactions'].replace('reverse_', '')

        # running total over the parallel ('or') alternatives
        transcript_level = 0

        for reaction_id in reaction_id_string.split(' or '):
            # remove spaces and parentheses
            reaction_id = reaction_id.strip('() ')

            if ' and ' in reaction_id:
                # Reactions in series: only the smallest level limits the group.
                # The per-reaction levels go into their own list so the running
                # total is never overwritten. Reactions without a gene-reaction
                # rule are ignored, as in the single-reaction case below.
                series_levels = [
                    _reaction_transcript_level(series_reaction, transcriptomics_df)
                    for series_reaction in reaction_id.split(' and ')
                ]
                series_transcripts = [level for level in series_levels if level is not None]
                if series_transcripts:
                    transcript_level += min(series_transcripts)
            else:
                reaction_transcripts = _reaction_transcript_level(reaction_id, transcriptomics_df)
                if reaction_transcripts is not None:
                    transcript_level += reaction_transcripts

        transcript_measurements.append(transcript_level)

    glucose_fluxes[condition + ' transcripts'] = transcript_measurements

glucose_fluxes.head()

Unnamed: 0,Pathway,Forward Reactions,Reaction,Location on map,Flux,90% Confidence Lower Bound,90% Confidence Upper Bound,EVOL33-LN-G-R1 transcripts,EVOL33-LN-G-R2 transcripts,EVOL33-LN-G-R3 transcripts,EVOL40-LN-G-R1 transcripts,EVOL40-LN-G-R2 transcripts,EVOL40-LN-G-R3 transcripts,WT-LN-G-R1 transcripts,WT-LN-G-R2 transcripts,WT-LN-G-R3 transcripts
0,Glucose Uptake,reverse_EX_glc__D_e,Gluc.ext + ATP -> G6P,"(50, 460)",100.0,100.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,EMP Pathway,PGI,G6P <-> F6P,"(-150, 430)",-1.61,-2.09,1.42,339.77434,333.37109,325.13618,282.25838,282.34734,271.35304,270.55161,242.91212,285.06464
2,EMP Pathway,PFK or reverse_FBP,F6P + ATP -> FBP,"(-220, 195)",0.0,0.0,1.91,642.00867,599.81818,617.06697,667.32051,648.11338,625.11398,704.42129,648.56446,741.30736
3,EMP Pathway,FBA,FBP <-> DHAP + GAP,"(-140, 115)",0.0,0.0,1.91,113.47081,103.43466,116.04595,88.28107,102.10006,85.06621,100.31113,98.25414,109.88136
4,EMP Pathway,TPI,DHAP <-> GAP,"(-270, 150)",0.0,0.0,1.91,100.92202,101.01041,118.81306,79.72413,76.11263,86.29313,79.60794,63.17894,72.44159
