# Part 3.1.1. Reconstructing pathways for uptake of oligosaccharides from cellulose

In [1]:
import pandas as pd
import reframed
from reframed import Metabolite, GPRAssociation,Gene,Protein, ReactionType,CBReaction,Environment,FVA
import json
from collections import OrderedDict

## Reconstructing pathway

### Metabolites
    

In [2]:
metabolites = pd.read_excel('/Users/idunmariaburgos/Documents/Work/Project/Ruminiclostridium cellulolyticum part 2/Polysaccharide degrading pathways.xlsx',
                              sheet_name="Mets.  Cat. of cellodextrins ", 
                              usecols="A:E")
metabolites

Unnamed: 0,Name,Identifier,Compartment,Formula,Charge
0,Cellulose (n=5 repeating units),M_cell5_e,e,C30H52O26,0
1,Cellulose (n=5 repeating units),M_cell5_c,c,C30H52O26,0
2,Cellulose (n=4 repeating units),M_cell4_e,e,C24H42O21,0
3,Cellulose (n=4 repeating units),M_cell4_c,c,C24H42O21,0
4,Cellulose (n=3 repeating units),M_cell3_e,e,C18H32O16,0
5,Cellulose (n=3 repeating units),M_cell3_c,c,C18H32O16,0


In [3]:
mets = []

for index, row in metabolites.iterrows():
    # Create metabolite object
    met_id=row['Identifier']
    name=row['Name']
    compartment="C_"+row['Compartment']

    met = Metabolite(met_id=met_id, name=name,compartment=compartment)
    
    # Add metadata  
    formula = row['Formula']
    charge = row['Charge']
    
    met.metadata=OrderedDict({'FORMULA':formula,
                             'CHARGE':str(charge)})                   
    mets.append(met)

In [4]:
mets

[Cellulose (n=5 repeating units),
 Cellulose (n=5 repeating units),
 Cellulose (n=4 repeating units),
 Cellulose (n=4 repeating units),
 Cellulose (n=3 repeating units),
 Cellulose (n=3 repeating units)]

### Reactions

In [5]:
reactions = pd.read_excel('/Users/idunmariaburgos/Documents/Work/Project/Ruminiclostridium cellulolyticum part 2/Polysaccharide degrading pathways.xlsx',
                              sheet_name="Rxns.  Cat. of cellodextrins ", 
                              usecols="A:F")

In [6]:
reactions

Unnamed: 0,Enzyme,Identifier,Gene,Stoichiometry,Transport,Type
0,ABC transporter cellobiose,R_Cellbabc,Ccel_2112 and Ccel_2111 and Ccel_2110,"{""M_cellb_e"":-1, ""M_atp_c"":-2, ""M_h2o_c"":-1, ""...",1,ABC-transporter
1,ABC transporter cellodextrin G3,R_cell3abc,Ccel_2112 and Ccel_2111 and Ccel_2110,"{""M_cell3_e"":-1, ""M_atp_c"":-2, ""M_h2o_c"":-1, ""...",1,ABC-transporter
2,ABC transporter cellodextrin G4,R_cell4abc,Ccel_2112 and Ccel_2111 and Ccel_2110,"{""M_cell4_e"":-1, ""M_atp_c"":-2, ""M_h2o_c"":-1, ""...",1,ABC-transporter
3,ABC transporter cellodextrin G5,R_cell5abc,Ccel_2112 and Ccel_2111 and Ccel_2110,"{""M_cell5_e"":-1, ""M_atp_c"":-2, ""M_h2o_c"":-1, ""...",1,ABC-transporter
4,"cellodextrin phosphorylase CdpA G4, CdpB",R_CEPA3,Ccel_1439 or Ccel_2354,"{""M_cell4_c"": -1,""M_pi_c"": -1, ""M_cell3_c"": 1,...",0,Glycosyl hydrolases Family 94
5,cellodextrin phosphorylase CdpA G5,R_CEPA4,Ccel_1439,"{""M_cell5_c"": -1,""M_pi_c"": -1, ""M_cell4_c"": 1,...",0,Glycosyl hydrolases Family 94
6,cellodextrin phisphorylase CdpC,R_CEPA2,Ccel_3412,"{""M_cell3_c"": -1,""M_pi_c"": -1, ""M_cellb_c"": 1,...",0,Glycosyl hydrolases Family 94


**From gene string find GPR**

- process gene string and find all genes
    - For all genes: find protein ID. 
        - For each gene: Create Gene(gene_id=protein_id, name=None?)
 - Create Protein()
     - protein.genes= list of genes
 - Create GPRAssociation()
     - gpr.proteins = list of proteins

In [7]:
%store -r gene_protein_map 

In [8]:
gene_protein_map.head(3)

Unnamed: 0,Entry,Entry name,Protein names,Gene names,Cross-reference (RefSeq)
0,B8I4G1,LEUD_RUMCH,3-isopropylmalate dehydratase small subunit (E...,leuD Ccel_0127,G_WP_012634581_1
1,B8I8F2,UVRC_RUMCH,UvrABC system protein C (Protein UvrC) (Excinu...,uvrC Ccel_0807,G_WP_015924347_1
2,B8I567,UPP_RUMCH,Uracil phosphoribosyltransferase (EC 2.4.2.9) ...,upp Ccel_0260,G_WP_012634712_1


In [9]:
def gene_str_to_GPR(gene_string, gene_protein_map):
    # This is meant to be used when there is only one protein complex in the string (in other word it can only handle 'and' associations and not 'or')

    genes_unfiltered = gene_string.split(' ')
    gpr=GPRAssociation()
    proteins=[]
    genes = []

    # Find the gene id (actually protein id, but in this case considered as gene id). If there is no ID, keep the old one. 
    i =0
    while i<len(genes_unfiltered):
        
        # If the substring is a gene id
        if genes_unfiltered[i]!='and' and genes_unfiltered[i]!='or':
            gene = gene_protein_map.loc[gene_protein_map['Gene names'].str.contains(genes_unfiltered[i])]['Cross-reference (RefSeq)'].values[0]

            # If there is a matching protein Id, add this to the gene list. 
            if type(gene)!=float:
                genes.append(gene)

            # If there is NOT a matching protein Id, add gene ID. 
            else:
                genes.append("G_" + genes_unfiltered[i])
                
        # If it's time to make a new protein        
        if genes_unfiltered[i]=="or" or i==len(genes_unfiltered)-1:
            # Create protein object
            protein=Protein()
            protein.genes=genes

            # Add protein to list of proteins
            proteins.append(protein) 
            genes=[]
        i=i+1

    gpr.proteins=proteins
                
    return gpr

 

**Create reaction objects**

In [10]:
rxns=[]
gprs={}

for index, row in reactions.iterrows():
    
    reaction_id = row['Identifier']
    name = row['Enzyme']
    reversible = False
    stoichiometry = json.loads(row['Stoichiometry'])
    reaction_type = ReactionType.ENZYMATIC
    
    if row['Transport']==1:
        reaction_type=ReactionType.TRANSPORT 
    
    rxn = CBReaction(reaction_id=reaction_id, name=name, reversible=reversible, stoichiometry=stoichiometry, reaction_type=reaction_type)
    rxns.append(rxn)
    
    gprs[reaction_id] = gene_str_to_GPR(row['Gene'],gene_protein_map)


In [11]:
gprs

{'R_Cellbabc': (G_WP_015925557_1 and G_WP_015925556_1 and G_WP_015925555_1),
 'R_cell3abc': (G_WP_015925557_1 and G_WP_015925556_1 and G_WP_015925555_1),
 'R_cell4abc': (G_WP_015925557_1 and G_WP_015925556_1 and G_WP_015925555_1),
 'R_cell5abc': (G_WP_015925557_1 and G_WP_015925556_1 and G_WP_015925555_1),
 'R_CEPA3': (G_WP_015924937_1 or G_WP_015925780_1),
 'R_CEPA4': G_WP_015924937_1,
 'R_CEPA2': G_WP_015926752_1}

### Add new metabolites and reactions to model

In [12]:
model = reframed.load_cbmodel('model_c_H10_part2_3_1.xml')

In [13]:
model.summary()

Metabolites:
C_c 852
C_e 214
C_p 184

Reactions:
enzymatic 878
transport 424
exchange 210
sink 0
other 244


In [14]:
len(model.genes)

728

In [15]:
for met in mets:
    model.add_metabolite(met)

In [16]:
for rxn in rxns:
    model.add_reaction(rxn)
    model.set_gpr_association(rxn.id,gprs[rxn.id])

In [17]:
model.summary()

Metabolites:
C_c 855
C_e 217
C_p 184

Reactions:
enzymatic 881
transport 428
exchange 210
sink 0
other 244


In [18]:
len(model.genes)

732

### Add exchange reactions for cellodextrins

In [19]:
mets_exchange = [met.id for met in mets if met.compartment=="C_e"]
rxns_exchange = []
for met in mets_exchange:
    rxn_id = "R_EX_" + met[2:]
    name = "Exchange of " + model.metabolites[met].name
    reversible=True
    stoichiometry =OrderedDict([(met, -1.0)])
    reaction_type = ReactionType.EXCHANGE
    rxns_exchange.append(CBReaction(reaction_id=rxn_id, name=name, reversible=reversible, stoichiometry=stoichiometry, reaction_type=reaction_type))

In [20]:
rxns_exchange

[R_EX_cell5_e: M_cell5_e <-> ,
 R_EX_cell4_e: M_cell4_e <-> ,
 R_EX_cell3_e: M_cell3_e <-> ]

In [21]:
model.summary()

Metabolites:
C_c 855
C_e 217
C_p 184

Reactions:
enzymatic 881
transport 428
exchange 210
sink 0
other 244


## Verifying that new reactions can carry flux with FVA

**Creating an environment from all exchange reactions in the model.**

In [22]:
env = Environment.complete(model, max_uptake=10)

In [23]:
all_rxns= rxns_exchange + rxns
rxn_ids = [rxn.id for rxn in all_rxns]

**Predict flux with all exchange reactions open** 

In [24]:
sol = FVA(model,constraints=env, reactions= rxn_ids)

  warn(f"Objective variable not previously declared: {r_id}")
  warn(f"Objective variable not previously declared: {r_id}")
  warn(f"Objective variable not previously declared: {r_id}")


In [25]:
sol

{'R_EX_cell5_e': [0.0, 0.0],
 'R_EX_cell4_e': [0.0, 0.0],
 'R_EX_cell3_e': [0.0, 0.0],
 'R_Cellbabc': [0.0, 10.0],
 'R_cell3abc': [0.0, 0.0],
 'R_cell4abc': [0.0, 0.0],
 'R_cell5abc': [0.0, 0.0],
 'R_CEPA3': [0.0, 0.0],
 'R_CEPA4': [0.0, 0.0],
 'R_CEPA2': [0.0, 0.0]}

## Checking if genes included are involved in other enzymatic reactions

In [26]:
genes = [rxn.get_genes() for rxn in rxns]
    

In [27]:
genes_flat = list(set([item for sublist in genes for item in sublist]))

In [28]:
genes_flat

['G_WP_015925555_1',
 'G_WP_015926752_1',
 'G_WP_015925557_1',
 'G_WP_015925780_1',
 'G_WP_015924937_1',
 'G_WP_015925556_1']

In [29]:
gene_reaction_dict= {}
for gene in genes_flat:
    try:
        print("Gene: " + gene + ", Reactions: " +  str(model.gene_to_reaction_lookup()[gene]))
        gene_reaction_dict[gene]=model.gene_to_reaction_lookup()[gene]
    except:
        print("Gene: " + gene + " not in model ")

Gene: G_WP_015925555_1, Reactions: ['R_ARBabc', 'R_Cellbabc', 'R_cell3abc', 'R_cell4abc', 'R_cell5abc']
Gene: G_WP_015926752_1, Reactions: ['R_CEPA2']
Gene: G_WP_015925557_1, Reactions: ['R_Cellbabc', 'R_cell3abc', 'R_cell4abc', 'R_cell5abc']
Gene: G_WP_015925780_1, Reactions: ['R_CEPA3']
Gene: G_WP_015924937_1, Reactions: ['R_CEPA3', 'R_CEPA4']
Gene: G_WP_015925556_1, Reactions: ['R_ARBabc', 'R_Cellbabc', 'R_cell3abc', 'R_cell4abc', 'R_cell5abc']


In [30]:
def prGreen(skk): print("\033[92m {}\033[00m" .format(skk))

In [31]:
print("Green reactions are the reactions that were included in this Jupyter Notebook\n")
for key in gene_reaction_dict.keys():
    print("Gene: " + key)
    for rxn in gene_reaction_dict[key]:
        if rxn in rxn_ids:
            prGreen(" " + str(model.reactions[rxn]))
        else:
            print("  " + str(model.reactions[rxn]))

Green reactions are the reactions that were included in this Jupyter Notebook

Gene: G_WP_015925555_1
  R_ARBabc: M_arab__L_e + M_atp_c + M_h2o_c --> M_adp_c + M_arab__L_c + M_h_c + M_pi_c
[92m  R_Cellbabc: M_cellb_e + 2 M_atp_c + M_h2o_c --> M_cellb_c + 2 M_adp_c + M_h_c + 2 M_pi_c[00m
[92m  R_cell3abc: M_cell3_e + 2 M_atp_c + M_h2o_c --> M_cell3_c + 2 M_adp_c + M_h_c + 2 M_pi_c[00m
[92m  R_cell4abc: M_cell4_e + 2 M_atp_c + M_h2o_c --> M_cell4_c + 2 M_adp_c + M_h_c + 2 M_pi_c[00m
[92m  R_cell5abc: M_cell5_e + 2 M_atp_c + M_h2o_c --> M_cell5_c + 2 M_adp_c + M_h_c + 2 M_pi_c[00m
Gene: G_WP_015926752_1
[92m  R_CEPA2: M_cell3_c + M_pi_c --> M_cellb_c + M_g1p_c[00m
Gene: G_WP_015925557_1
[92m  R_Cellbabc: M_cellb_e + 2 M_atp_c + M_h2o_c --> M_cellb_c + 2 M_adp_c + M_h_c + 2 M_pi_c[00m
[92m  R_cell3abc: M_cell3_e + 2 M_atp_c + M_h2o_c --> M_cell3_c + 2 M_adp_c + M_h_c + 2 M_pi_c[00m
[92m  R_cell4abc: M_cell4_e + 2 M_atp_c + M_h2o_c --> M_cell4_c + 2 M_adp_c + M_h_c + 2 M_pi_c

In [41]:
model.reactions.R_GLCabc.gpr

(G_WP_242651738_1 or (G_WP_012634654_1 and G_WP_015926592_1) or (G_WP_015924545_1 and G_WP_015924646_1))

In [46]:
model.reactions.R_ARBabc.gpr

(G_WP_015925555_1 and G_WP_015925556_1)

## <span style="color: blue;">Summary </span>

In [32]:
model.update()

In [33]:
model.id = "model_c_H10_part3_1_1"

In [34]:
reframed.save_cbmodel(model,filename="model_c_H10_part3_1_1.xml")

In [35]:
model_new = reframed.load_cbmodel('model_c_H10_part3_1_1.xml')

In [36]:
model_prev = reframed.load_cbmodel('model_cellulolyticum_H10.xml')

In [37]:
models_dict={model.id:{} for model in [model_new,model_prev]}
models_rxn_dict={model.id:{} for model in [model_new,model_prev]}
for model in [model,model_prev]:
    models_dict[model.id]['Reactions']=len(model.reactions)
    models_dict[model.id]['Metabolites']=len(model.metabolites)
    models_dict[model.id]['Genes']=len(model.genes)
    
    models_rxn_dict[model.id]['Enzymatic']=len(model.get_reactions_by_type(reframed.ReactionType.ENZYMATIC))
    models_rxn_dict[model.id]['Exchange']=len(model.get_reactions_by_type(reframed.ReactionType.EXCHANGE))
    models_rxn_dict[model.id]['Transport']=len(model.get_reactions_by_type(reframed.ReactionType.TRANSPORT))
    models_rxn_dict[model.id]['Sink']=len(model.get_reactions_by_type(reframed.ReactionType.SINK))
    models_rxn_dict[model.id]['Other']=len(model.get_reactions_by_type(reframed.ReactionType.OTHER))
    

**Overview models**

In [38]:
pd.DataFrame(models_dict)

Unnamed: 0,model_c_H10_part3_1_1,model_cellulolyticum_H10
Reactions,1763,1811
Metabolites,1256,1250
Genes,732,733


**Overview reactions in models**

In [39]:
pd.DataFrame(models_rxn_dict)

Unnamed: 0,model_c_H10_part3_1_1,model_cellulolyticum_H10
Enzymatic,881,883
Exchange,210,210
Transport,428,475
Sink,0,0
Other,244,243
