# Notebook L: Mapping Transcriptomics Data to Model Reactions
The purpose of this notebook is to take in csv files of transcript measurements and a model, and to output csv files with the mapped transcript values for all applicable reactions in the genome-scale model.

### Imports

In [1]:
import pandas as pd
import cobra

#### Load csv files with transcript data 
This step can be replaced with import from EDD 

In [2]:
# table of data set name -> melted csv file with the transcript measurements
# (glucose files come from the yoneda folder, phenol files from the henson folder)
transcript_csv_paths = {
    'glucose_cpm': '../winston_data/october_19_2021/yoneda/yoneda_reprocess_CPM_melted.csv',
    'glucose_fpkm': '../winston_data/october_19_2021/yoneda/yoneda_reprocess_FPKM_melted.csv',
    'glucose_mr': '../winston_data/october_19_2021/yoneda/yoneda_reprocess_MR_melted.csv',
    'glucose_tmm': '../winston_data/october_19_2021/yoneda/yoneda_reprocess_TMM_melted.csv',
    'phenol_cpm': '../winston_data/october_19_2021/henson/henson_reprocess_CPM_melted.csv',
    'phenol_fpkm': '../winston_data/october_19_2021/henson/henson_reprocess_FPKM_melted.csv',
    'phenol_mr': '../winston_data/october_19_2021/henson/henson_reprocess_MR_melted.csv',
    'phenol_tmm': '../winston_data/october_19_2021/henson/henson_reprocess_TMM_melted.csv',
}

# read every csv into a data frame, keyed by its data set name (same order as the table above)
transcript_measurements = {}
for data_set_name, csv_path in transcript_csv_paths.items():
    transcript_measurements[data_set_name] = pd.read_csv(csv_path)

#### Load genome scale model
This is needed to get the model reactions and reaction -> gene mapping

In [3]:
# read the genome scale model from its SBML (xml) file with cobra's SBML reader;
# the resulting `model` object is used below for its reactions and their gene reaction rules
model = cobra.io.read_sbml_model('../GSMs/Ropacus_annotated_curated.xml')

In [4]:
# collect every model reaction whose gene reaction rule is not the empty string
reactions_with_gpr = list(filter(lambda rxn: rxn.gene_reaction_rule != '', model.reactions))
# report how many reactions carry a gene reaction rule
print(f'The are {len(reactions_with_gpr)} reactions that are mapped to transcripts')

The are 1755 reactions that are mapped to transcripts


#### Define a function to find the transcriptomic value for a gene in a given condition 

In [5]:
def transcript_value_for_gene(transcript_df, condition, gene):
    """Look up the transcript measurement of a single gene in a single condition.

    Parameters
    ----------
    transcript_df : pandas.DataFrame
        Table of measurements; the columns 'Line Name', 'Measurement Type'
        and 'Value' are read.
    condition
        Value matched against the 'Line Name' column.
    gene
        Value matched against the 'Measurement Type' column.

    Returns
    -------
    float or int
        The 'Value' entry converted to float when exactly one row matches.
        0 when the measurement is missing (no row) or duplicated (several rows).
    """
    # rows that belong to both the requested condition and the requested gene
    is_match = (transcript_df['Line Name'] == condition) & (transcript_df['Measurement Type'] == gene)
    matching_values = transcript_df.loc[is_match, 'Value']

    # a missing or ambiguous (duplicated) measurement is treated as no expression
    if len(matching_values) != 1:
        return 0

    # pull the scalar out explicitly: calling float() directly on a one-element
    # Series is deprecated in recent pandas releases
    return float(matching_values.iloc[0])

#### Define a function to map a reaction to a total transcript value
Note: a reaction may be mapped to multiple genes. Those genes may be required in series or in parallel

In [6]:
def transcript_value_for_reaction(transcript_df, condition, gene_reaction_rule):
    """Collapse the transcript values of a reaction's genes into one number.

    The rule string is split on ' or ' into alternatives, and each alternative
    is split on ' and ' into the genes it lists. Every alternative contributes
    the smallest transcript value among its genes, and the contributions of all
    alternatives are added together.

    Parameters
    ----------
    transcript_df : pandas.DataFrame
        Measurement table handed on to transcript_value_for_gene.
    condition
        Condition handed on to transcript_value_for_gene.
    gene_reaction_rule : str
        Gene reaction rule using ' and ' / ' or ' as separators.

    Returns
    -------
    The summed transcript value over all alternatives.
    """
    # NOTE(review): parentheses and spaces are only stripped from the ends of each
    # piece, so this appears to assume flat "(a and b) or (c and d)" style rules;
    # confirm that nested rules do not occur in the model.
    gene_groups = {
        frozenset(gene_id.strip('() ') for gene_id in alternative.strip('() ').split(' and '))
        for alternative in gene_reaction_rule.split(' or ')
    }

    # accumulate the limiting (lowest) transcript value of every alternative
    reaction_total = 0
    for gene_group in gene_groups:
        limiting_value = min(
            transcript_value_for_gene(transcript_df, condition, gene_id)
            for gene_id in gene_group
        )
        reaction_total += limiting_value

    return reaction_total

#### Define a function to return a dataframe with mapped transcript values for all reactions in all conditions

In [7]:
def map_transcripts(transcript_df, model):
    """Map transcript measurements onto every gene-associated reaction of a model.

    Parameters
    ----------
    transcript_df : pandas.DataFrame
        Measurement table; its 'Line Name' column defines the conditions and the
        table is passed on to transcript_value_for_reaction.
    model
        Object whose `reactions` each expose `id` and `gene_reaction_rule`.

    Returns
    -------
    pandas.DataFrame
        One column per condition (in sorted order) and one row per reaction id,
        holding the value returned by transcript_value_for_reaction.

    Notes
    -----
    Prints each condition name as it is processed to show progress.
    """
    # unique conditions in the transcript data frame, in sorted order
    conditions = sorted(set(transcript_df['Line Name']))

    # only reactions with a non-empty gene reaction rule can be mapped
    reactions_with_gpr = [r for r in model.reactions if r.gene_reaction_rule != '']

    # condition -> {reaction id -> mapped transcript value}
    reaction_id_to_transcript_mapping = {}

    for condition in conditions:
        # keep track of the function's progress
        print(condition)

        # narrow the table to this condition once, so the per-gene lookups below
        # scan only this condition's rows instead of the whole table; the lookups
        # still filter on the condition, so the results are unchanged
        condition_df = transcript_df[transcript_df['Line Name'] == condition]

        # mapped transcripts for all reactions in this condition
        reaction_id_to_transcript_mapping[condition] = {
            r.id: transcript_value_for_reaction(condition_df, condition, r.gene_reaction_rule)
            for r in reactions_with_gpr
        }

    return pd.DataFrame(reaction_id_to_transcript_mapping)

In [8]:
# for every data set: map its transcripts onto the model reactions and write the result to csv
for data_set, data in transcript_measurements.items():
    # show which data set is being processed
    print(data_set)
    mapped_transcripts = map_transcripts(data, model)
    mapped_transcripts.to_csv(f'../transcript_mapping/{data_set}.csv')

glucose_cpm
EVOL33-G-R1
EVOL33-G-R2
EVOL33-G-R3
EVOL33-H-R1
EVOL33-H-R2
EVOL33-H-R3
EVOL33-L-R1
EVOL33-L-R2
EVOL33-L-R3
EVOL40-G-R1
EVOL40-G-R2
EVOL40-G-R3
EVOL40-H-R1
EVOL40-H-R2
EVOL40-H-R3
EVOL40-L-R1
EVOL40-L-R2
EVOL40-L-R3
WT-G-R1
WT-G-R2
WT-G-R3
WT-L-R1
WT-L-R2
WT-L-R3
glucose_fpkm
EVOL33-G-R1
EVOL33-G-R2
EVOL33-G-R3
EVOL33-H-R1
EVOL33-H-R2
EVOL33-H-R3
EVOL33-L-R1
EVOL33-L-R2
EVOL33-L-R3
EVOL40-G-R1
EVOL40-G-R2
EVOL40-G-R3
EVOL40-H-R1
EVOL40-H-R2
EVOL40-H-R3
EVOL40-L-R1
EVOL40-L-R2
EVOL40-L-R3
WT-G-R1
WT-G-R2
WT-G-R3
WT-L-R1
WT-L-R2
WT-L-R3
glucose_mr
EVOL33-G-R1
EVOL33-G-R2
EVOL33-G-R3
EVOL33-H-R1
EVOL33-H-R2
EVOL33-H-R3
EVOL33-L-R1
EVOL33-L-R2
EVOL33-L-R3
EVOL40-G-R1
EVOL40-G-R2
EVOL40-G-R3
EVOL40-H-R1
EVOL40-H-R2
EVOL40-H-R3
EVOL40-L-R1
EVOL40-L-R2
EVOL40-L-R3
WT-G-R1
WT-G-R2
WT-G-R3
WT-L-R1
WT-L-R2
WT-L-R3
glucose_tmm
EVOL33-G-R1
EVOL33-G-R2
EVOL33-G-R3
EVOL33-H-R1
EVOL33-H-R2
EVOL33-H-R3
EVOL33-L-R1
EVOL33-L-R2
EVOL33-L-R3
EVOL40-G-R1
EVOL40-G-R2
EVOL40-G-R3
EVOL40-H-R1
EVOL