In [1]:
from libchebipy._chebi_entity import ChebiEntity
from Bio import Entrez, SeqIO
import pandas as pd
import reframed
import os
import json
import numpy as np
import collections
import sys
import seaborn as sns
import copy
import subprocess
#import matplotlib.pyplot as plt

from reframed.io.sbml import parse_gpr_rule

sys.path.append('../functions')

import translation_dicts
import general_functions as general_func
import EGC as EGC
import MAG_environments

### Prepare data 

In [2]:
# Execute the four preprocessing scripts (parts u, v, x, y) in this order.
# `-i` makes each script run inside the notebook's own namespace.
%run -i "../a. Notebooks - unclassified but important/python_scripts/Part u. Processing MAG data - From genbank to fasta.py"

%run -i "../a. Notebooks - unclassified but important/python_scripts/Part v. Defining metabolite classes.py"

%run -i "../a. Notebooks - unclassified but important/python_scripts/Part x. MAG-community association.py"

%run -i "../a. Notebooks - unclassified but important/python_scripts/Part y. Community production.py"


**CONVERT FROM GENBANK TO FASTA**
	 Converting and saving...
**DEFINING METABOLITE CLASSES**
	 Get chebi IDs from model metadata
	 Assign compound class
	 Save data
**CREATE MAG-COMMUNITY ASSOCIATION**
	 Save data
**COMPOUNDS PRODUCED EXPERIMENTALLY BY COMMUNITY**
	 Save data


## Create models without constraints

## Create constraints

In [8]:
# Translation tables provided by the project helper module.
(
    compounds_dict,
    source_dict,
    substrate_dict,
    gas_sheet_dict,
    community_dict,
) = translation_dicts.translation_dicts()
all_Mags_for_paper_analysis = general_func.read_allmags_data()

# MAG -> community table: no header row, first column used as the index.
MAG2community_id = pd.read_csv(
    "../output/MAG2community_id.tsv", sep="\t", header=None, index_col=0
)

# One MAG id per line; tab characters are stripped from every entry.
with open("../output/relevant_MAGs_99.txt") as handle:
    relevant_MAGs = [entry.replace("\t", "") for entry in handle.read().split("\n")]

with open("../output/community_production_names.json") as handle:
    community_production_names = json.load(handle)

with open("../output/compounds_dict_list.json") as handle:
    compounds_dict_list = json.load(handle)

# NB: this rebinds compounds_dict, replacing the value unpacked at the top of the cell.
with open("../output/compounds_dict.json") as handle:
    compounds_dict = json.load(handle)

In [9]:
# Load every SBML model (*.xml) in the unconstrained folder, keyed by file name without extension.
no_constraints_dir = "../output/GEMs/GEMs_no_constraints/"
directory = os.fsencode(no_constraints_dir)
GEMs_dict = {}
for file in os.listdir(directory):
    filename = os.fsdecode(file)
    if not filename.endswith(".xml"):
        continue
    GEMs_dict[filename[:-4]] = reframed.load_cbmodel(no_constraints_dir + filename)

##### Defining soft constraints

- Excluding elements that can already be produced by a community member in complete media
- Focus on 99% most abundant species

##### Find the producers (from the 99% most abundant species)

In [10]:
## Find the community they belong to
MAG2community_id_most_abundant = MAG2community_id[MAG2community_id.index.isin(relevant_MAGs)]
enrich_groups_top = MAG2community_id_most_abundant.groupby(1).groups  # top 99

## Find producers in COMPLETE media and growth=0

# For each community set the default production to False
comm_producers_top = {
    community_id: {
        MAG: {compound: False for compound in compounds.keys()}
        for MAG in enrich_groups_top[community_id]
    }
    for community_id, compounds in community_production_names.items()
}

for community_id, MAGs in enrich_groups_top.items():
    for MAG in MAGs:
        model = GEMs_dict[MAG].copy()

        complete_env = reframed.Environment.from_model(model)

        # The exchange reactions are the same for every compound of this model,
        # so look them up once instead of once per candidate metabolite.
        exchange_rxns = set(model.get_exchange_reactions())

        for compound in community_production_names[community_id].keys():
            # A compound name can map to several metabolite ids.
            candidate_rxns = ["R_EX_" + met + "_e" for met in compounds_dict_list[compound]]
            ex_rxns = [rxn for rxn in candidate_rxns if rxn in exchange_rxns]

            # No exchange reaction for this compound: keep the default (False).
            if len(ex_rxns) == 0:
                continue

            sol = reframed.FVA(model, constraints=complete_env, reactions=ex_rxns)
            # Sum of the FVA upper bounds over all matching exchange reactions.
            total_prod = sum(value[1] for value in sol.values())
            comm_producers_top[community_id][MAG][compound] = total_prod > 1e-6

Set parameter Username
Academic license - for non-commercial use only - expires 2025-03-14


In [11]:
# Per community: for each compound, count the MAGs flagged as producers.
dfs_community_count_top = {}
for community_id, producers_dict in comm_producers_top.items():
    # Outer keys (MAGs) become columns, inner keys (compounds) become rows.
    producer_table = pd.DataFrame(producers_dict)
    dfs_community_count_top[community_id] = producer_table.sum(axis=1).rename("top99%")

##### Define soft constraints

In [12]:
# For each community, give every compound with zero producers a soft-constraint
# entry of 1 on its exchange reaction.
soft_constraints_new = {
    community_id: {
        "R_EX_" + compounds_dict[compound] + "_e": 1
        for compound in producers_df[producers_df == 0].index
    }
    for community_id, producers_df in dfs_community_count_top.items()
}

##### Media for gapfilling during reconstruction

We know that some of these compounds are produced when a specific electron donor is present. 

In [13]:
media_db = pd.read_csv("https://raw.githubusercontent.com/cdanielmachado/carveme/master/carveme/data/input/media_db.tsv",sep="\t")

# Start from the anaerobic LB medium and rename it.
lb_02 = media_db[media_db.medium=="LB[-O2]"].copy()
lb_02.medium = lb_02.medium.map(lambda x:x.replace("LB[-O2]","LB_extend"))

# Extra compounds as (compound id, name) pairs.
# Previously each one was assigned to `lb_02.loc[-1]`, so every assignment
# overwrote the same row and only the last compound (Lactose) was kept.
additional_compounds = [
    ("lac__L", "L-Lactate"),
    ("etoh", "Ethanol"),
    ("ac", "Acetate"),
    ("ppa", "Propionate (n-C3:0)"),
    ("xyl__D", "D-Xylose"),
    ("lcts", "Lactose"),
]
# Values are given positionally, in the column order of the media table
# (as the original list assignments did).
additional_rows = pd.DataFrame(
    [
        ["LB_extend", "Additional elements for MCCA production", compound, name]
        for compound, name in additional_compounds
    ],
    columns=lb_02.columns,
)

# Append the new rows and renumber the index from 0.
lb_02 = pd.concat([lb_02, additional_rows], ignore_index=True)

##### Save data

In [14]:
# One two-column TSV (reaction id, value) per community, without a header row.
for community_id, dict_ in soft_constraints_new.items():
    constraint_table = pd.Series(dict_).to_frame()
    out_path = "./output/SC_" + community_id + ".tsv"
    constraint_table.to_csv(out_path, sep="\t", header=False, index_label=False)

##### Check that everything is as it should be

In [15]:
# Compare the freshly computed soft constraints with the stored TSV files.
for community_id, dict_ in soft_constraints_new.items():
    SC_media_test_old = pd.read_csv("../output/soft_constraints/SC_"+community_id+".tsv",
                                          sep="\t",
                                          header=None)
    SC_media_test_new = pd.DataFrame(pd.Series(dict_)).reset_index()
    SC_media_test_new.columns=[0,1]

    # The previous check `df[df == other].shape == df.shape` is always true,
    # because boolean-mask indexing keeps the frame's shape. Compare the shape
    # first, then every cell.
    assert SC_media_test_old.shape == SC_media_test_new.shape, \
        "Soft constraints for " + community_id + " differ in shape from the stored file"
    assert (SC_media_test_old == SC_media_test_new).all().all(), \
        "Soft constraints for " + community_id + " differ in content from the stored file"

In [16]:
# Compare the rebuilt medium table with the stored one.
old_lb_02 = pd.read_csv("../output/soft_constraints/SC_media_db.tsv",
                                         sep="\t")

# `old_lb_02[old_lb_02 == lb_02].shape` always equals `old_lb_02.shape`, so the
# previous assertions never compared values. Check the shape, then every cell.
assert old_lb_02.shape == lb_02.shape, "Medium table differs in shape from the stored file"
assert (old_lb_02 == lb_02).all().all(), "Medium table differs in content from the stored file"

## Create models with constraints

## Select models

We have different sets of reconstructed models.

- **no_constr**: no soft constraints included. This is the ideal, but not all metabolites were produced.
- **constr0_1, constr0_5**: Will likely add reactions needed to support the phenotype, but it is a biased approach. May overpredict the number of models carrying the ability. 


Strategy

1. Find the difference between the models
2. **Selection**: The models that required the least amount of changes (symmetric difference) to acquire the desired phenotype.



In [17]:
# Re-create the translation dictionaries; this rebinds compounds_dict, which an
# earlier cell had replaced with the content of compounds_dict.json.
compounds_dict, source_dict,substrate_dict, gas_sheet_dict, community_dict = translation_dicts.translation_dicts()

In [18]:
# Leave CH15-bin.15 out of the model comparison (it is added back as "no_constr"
# before saving). Building a filtered list instead of calling `.remove()` keeps
# the cell safe to re-run: `.remove()` raises ValueError once the id is gone.
relevant_MAGs = [MAG_id for MAG_id in relevant_MAGs if MAG_id != 'CH15-bin.15']

**Load models made without soft constraints**

In [19]:
# Load the models reconstructed without soft constraints under the "no_constr" key.
no_constraints_dir = "../output/GEMs/GEMs_no_constraints/"
directory = os.fsencode(no_constraints_dir)

GEMs_dict = {"no_constr": {}}
for file in os.listdir(directory):
    filename = os.fsdecode(file)
    if not filename.endswith(".xml"):
        continue
    GEMs_dict["no_constr"][filename[:-4]] = reframed.load_cbmodel(no_constraints_dir + filename)


**Load models made with soft constraints**

In [20]:
# Folder of reconstructed models for each soft-constraint setting.
directories = {
               "constr0_1":"../output/GEMs/GEMs_intermediate/GEMs_soft_constraints_score_0.1/"
              }

for id_,directory_str in directories.items():
    # List the folder that belongs to this entry. The listing used to be
    # hard-coded to the score_0.1 folder, so any extra entry in `directories`
    # would have listed the wrong folder.
    directory = os.fsencode(directory_str)

    GEMs_dict[id_] = {}
    for file in os.listdir(directory):

        filename = os.fsdecode(file)
        if filename.endswith(".xml"):
            GEMs_dict[id_][filename[:-4]]= reframed.load_cbmodel(directory_str+filename)


#### Find difference between models made with different reconstruction parameters

- new_rxns_count = number of new reactions
- symmetric_diff = number of lost reactions + number of new reactions 

In [21]:
# For every constrained reconstruction, compare each MAG's reaction set with
# the unconstrained reconstruction of the same MAG.
difference_dict = {}

for constr_status in directories.keys():
    new_rxns_by_MAG = {}
    sym_diff_by_MAG = {}

    for MAG in relevant_MAGs:
        rxns_constr = set(GEMs_dict[constr_status][MAG].reactions)
        rxns_no_constr = set(GEMs_dict["no_constr"][MAG].reactions)

        # reactions present only in the constrained model
        new_rxns_by_MAG[MAG] = len(rxns_constr - rxns_no_constr)
        # reactions present in exactly one of the two models
        sym_diff_by_MAG[MAG] = len(rxns_constr ^ rxns_no_constr)

    difference_dict[constr_status] = {
        "new_rxns_count": new_rxns_by_MAG,
        "symmetric_diff": sym_diff_by_MAG,
    }

In [22]:
# Flatten the nested dict into columns keyed by (constraint status, difference type).
diff_columns = {}
for constr_status, diffs_by_type in difference_dict.items():
    for diff_type, per_MAG in diffs_by_type.items():
        diff_columns[(constr_status, diff_type)] = per_MAG

diff_df = pd.DataFrame.from_dict(diff_columns)

#### Select models: The best candidates to fulfill the community metabolic phenotype 

Here we focus on the most promising sets of reconstructed models. For this we chose:

- **constr0_1**: low soft constraint score will likely lead to addition of fewer reactions
- **no_constr**: no soft constraints included. This is the ideal, but not all metabolites were produced.


**Selection criteria**: The models that required the least amount of changes (symmetric difference) to acquire the desired phenotype.

In [23]:
# change this if you want to study something different.
# The value is used as a key into GEMs_dict and as a column level of diff_df.
soft_constraint_selected = "constr0_1"

In [24]:
# NB: This function does NOT confirm that the metabolite is produced.
def best_candidate(SC_rxns,MAGs,diff_df_select):
    """
    For each exchange reaction, pick the community member whose constrained
    model contains the reaction and has the smallest symmetric difference.

    Input:
    SC_rxns - list of exchanged metabolites of community (exchange reactions)
    MAGs - list of MAGs in this community
    diff_df_select - dataframe with increase in rxns and symmetric difference
                     (indexed by MAG, with a "symmetric_diff" column)

    Returns:
    dict mapping each reaction id to {"MAG": <MAG id or None>, "symmetric_diff": <value>}.
    When no member carries the reaction the entry is {"MAG": None, "symmetric_diff": 1000}.

    Reads the module-level GEMs_dict and soft_constraint_selected.
    Ties are resolved in favour of the MAG that appears first in MAGs.
    """
    NO_CANDIDATE_DIFF = 1000  # placeholder reported when no member carries the reaction

    rxn_candidates = {}

    for rxn in SC_rxns:
        best_MAG = None
        best_diff = None

        for MAG in MAGs:
            # If reaction is not in model it is not a candidate
            if rxn not in GEMs_dict[soft_constraint_selected][MAG].reactions:
                continue

            symmetric_diff = diff_df_select.loc[MAG, "symmetric_diff"]

            # The first candidate is always accepted. The placeholder value used
            # to double as the comparison threshold, so a candidate with a
            # symmetric difference of 1000 or more could never be selected.
            if best_diff is None or symmetric_diff < best_diff:
                best_MAG = MAG
                best_diff = symmetric_diff

        rxn_candidates[rxn] = {
            "MAG": best_MAG,
            "symmetric_diff": NO_CANDIDATE_DIFF if best_diff is None else best_diff,
        }

    return rxn_candidates

**Find best candidates**

Given that we expect certain compounds to be produced, which of the models created with soft constraints are the best candidates? The code below selects exactly one model per community for each produced compound.


In [25]:
# Difference metrics for the selected soft-constraint setting only.
diff_df_select = diff_df.xs(soft_constraint_selected,axis=1)

# Models taken either from 'soft_constraint_selected' or from 'no_constr'.
GEMs_dict["adapt"] = {}

# Records, per MAG, which of the two reconstructions the adapted model came from.
GEMs_adapt = {}

rxn_candidates_all = {}

for community_name, community_id in community_dict.items():
    # Community members, restricted to the relevant (top 99%) MAGs.
    members = MAG2community_id[MAG2community_id[1]==community_id].index.values
    MAGs = [MAG for MAG in members if MAG in relevant_MAGs]

    # Exchange reactions listed in this community's soft-constraint file.
    soft_constraints = pd.read_csv(f"../output/soft_constraints/SC_{community_id}.tsv", header=None, sep="\t")
    SC_rxns = soft_constraints[0].values

    # Best candidate among the members for each of those reactions.
    rxn_candidates = best_candidate(SC_rxns,MAGs,diff_df_select)
    rxn_candidates_all[community_id] = rxn_candidates

    MAGs_best_candidates = {candidate_dict["MAG"] for candidate_dict in rxn_candidates.values()}

    # A best candidate keeps its constrained model; everyone else keeps the unconstrained one.
    for MAG in MAGs:
        origin = soft_constraint_selected if MAG in MAGs_best_candidates else "no_constr"
        GEMs_dict["adapt"][MAG] = GEMs_dict[origin][MAG].copy()
        GEMs_adapt[MAG] = origin


Some candidates are quite different from the original model. Here the symmetric diff shows how many reactions are different between the community members. 

In [26]:
# One row per (community, reaction) with the selected MAG and its symmetric difference.
candidate_records = {}
for community_id, candidates_by_rxn in rxn_candidates_all.items():
    for rxn, candidate in candidates_by_rxn.items():
        candidate_records[(community_id, rxn)] = candidate

best_candidates = pd.DataFrame.from_dict(candidate_records).transpose()

##### Save data

In [27]:
# CH15-bin.15 was removed from relevant_MAGs before the comparison; record it as unconstrained.
GEMs_adapt["CH15-bin.15"]="no_constr"

In [28]:
# Store the MAG -> model-origin mapping as JSON.
with open("./output/GEMs_adapt/GEMs_adapt.json", "w") as json_out:
    json.dump(GEMs_adapt, json_out)

##### Save models

In [29]:
# Add the unconstrained CH15-bin.15 model to the adapted set so it is saved with the others.
GEMs_dict["adapt"]["CH15-bin.15"] = GEMs_dict["no_constr"]["CH15-bin.15"].copy()

In [30]:
# Write every adapted model to ./output/GEMs_adapt/<MAG>.xml
for MAG,model in GEMs_dict["adapt"].items():
    reframed.save_cbmodel(model,"./output/GEMs_adapt/"+MAG+".xml")

##### Check that they are the same

In [31]:
# Reload the stored adapted models and compare their reaction ids with the in-memory ones.
GEMs_adapt_old = {}

adapt_dir = "../output/GEMs/GEMs_intermediate/GEMs_adapt/"
directory = os.fsencode(adapt_dir)
for file in os.listdir(directory):
    filename = os.fsdecode(file)
    MAG_id = filename[:-4]
    print(MAG_id)  # printed for every directory entry, before the extension filter

    if not filename.endswith(".xml"):
        continue

    GEMs_adapt_old[MAG_id] = reframed.load_cbmodel(adapt_dir + filename)
    assert set(GEMs_adapt_old[MAG_id].reactions) == set(GEMs_dict["adapt"][MAG_id].reactions)
        

CH7-bin.18
CH15-bin.7
CH13-bin.0
CH1-bin.4
CH13-bin.1
CH13-bin.11
CH15-bin.6
CH7-bin.2
CH13-bin.13
CH1-bin.10
CH1-bin.6
CH13-bin.2
CH13-bin.12
CH15-bin.5
CH7-bin.1
CH15-bin.1
CH1-bin.3
CH13-bin.17
CH15-bin.0
CH7-bin.4
CH7-bin.23
CH7-bin.6
CH15-bin.22
CH15-bin.2
CH1-bin.1
CH13-bin.4
CH13-bin.14
CH15-bin.23
CH7-bin.20
CH8-bin.8
CH8-bin.9
CH8-bin.25
CH14-bin.4
CH14-bin.1
CH8-bin.21
CH14-bin.2
CH8-bin.22
CH8-bin.2
CH8-bin.29
CH8-bin.7
CH8-bin.6
CH3-bin.2
CH8-bin.14
CH8-bin.16
CH3-bin.0
CH8-bin.5
CH3-bin.1
CH8-bin.17
CH7-bin.11
CH9-bin.1
CH15-bin.12
CH13-bin.25
CH15-bin.13
CH9-bin.0
CH7-bin.12
CH9-bin.2
CH7-bin.9
CH15-bin.10
CH7-bin.8
CH7-bin.13
CH7-bin.17
CH15-bin.8
CH15-bin.15
CH7-bin.16
CH9-bin.6
CH9-bin.4
CH15-bin.17
CH1-bin.8
CH1-bin.9
CH15-bin.16
CH7-bin.15
CH9-bin.5


### Find transporters

### Fixing acetate transporters

In [83]:
# Re-read the full list of relevant MAGs: one id per line, tab characters stripped.
with open("../output/relevant_MAGs_99.txt") as handle:
    relevant_MAGs = [entry.replace("\t", "") for entry in handle.read().split("\n")]


##### Load models

In [84]:
# Load the adapted models of the relevant MAGs only.
GEMs_dict = {}

adapt_dir = "../output/GEMs/GEMs_intermediate/GEMs_adapt/"
directory = os.fsencode(adapt_dir)

for file in os.listdir(directory):
    filename = os.fsdecode(file)
    is_relevant_model = filename.endswith(".xml") and filename[:-4] in relevant_MAGs
    if not is_relevant_model:
        continue
    GEMs_dict[filename[:-4]] = reframed.load_cbmodel(adapt_dir + filename)


##### Find acetate producers/consumers

Find which models have GPRs for producing acetate through phosphotransacetylase (R_PTAr) and acetate kinase (R_ACKr).

In [86]:
# Reactions that must both be present, each with a GPR, for a MAG to count as
# having the acetate-producing enzymes.
acetate_enzyme_rxns = ("R_ACKr", "R_PTAr")

has_enzymes_for_acetate = {}
for MAG, GEM in GEMs_dict.items():
    # `is not None` is the idiomatic singleton test; `!= None` can be
    # overridden by an object's own comparison methods.
    has_enzymes_for_acetate[MAG] = all(
        rxn_id in GEM.reactions and GEM.reactions[rxn_id].gpr is not None
        for rxn_id in acetate_enzyme_rxns
    )

# MAG ids for which both reactions carry a GPR (dictionary order is preserved).
has_enzymes_for_acetate_MAGs = [MAG for MAG, has_both in has_enzymes_for_acetate.items() if has_both]

##### Find hits on the ACt2r protein from the TCDB database

In [88]:
# MAG ids with an accepted acetate-transporter hit, and the details of each hit.
ACt2r_MAGs = []
ACt2r_MAGs_data = []

# Column labels assigned to the header-less hit tables.
hit_columns = ["query acc.ver", "subject acc.ver", "% identity", "alignment length", "mismatches", "gap opens", "q. start", "q. end", "s. start", "s. end", "evalue", "bit score"]
# TCDB ids searched for in the query accession; the first match wins.
acetate_tcdb_ids = ("2.A.1.13.1", "2.A.21.7.3")

for filename in os.listdir("transporters/"):
    if not filename.endswith(".tsv"):
        continue

    transport = pd.read_csv("transporters/" + filename, sep="\t", header=None)
    transport.columns = hit_columns
    transport = transport.reset_index()

    # NOTE(review): tables with exactly one row are skipped by this test --
    # confirm that a single hit is really meant to be ignored.
    if transport.shape[0] <= 1:
        continue

    # Put the hit with the lowest e-value in row 0.
    transport = transport.sort_values(by="evalue", ascending=True).reset_index()

    query = transport.loc[0, "query acc.ver"]
    gene = transport.loc[0, "subject acc.ver"]
    best_evalue = transport.loc[0, "evalue"]
    bit_score = transport.loc[0, "bit score"]

    if not (best_evalue < 1e-5 and bit_score > 20):
        continue

    tcdb_id = next((candidate for candidate in acetate_tcdb_ids if candidate in query), None)
    if tcdb_id is not None:
        MAG_id = filename[:-4]
        ACt2r_MAGs.append(MAG_id)
        ACt2r_MAGs_data.append((MAG_id, gene, best_evalue, bit_score, tcdb_id))


ACt2r_MAGs_df = (
    pd.DataFrame(ACt2r_MAGs_data, columns=["MAG", "gene", "evalue", "bit_score", "TCDB_id"])
    .sort_values("evalue")
    .set_index("MAG")
)

##### Add acetate transport

Two different conditions.

1. The MAG has a hit in the TCDB database -> add reversible reaction
2. The MAG has a hit for the enzymes in acetate production -> add only producing reaction

In [89]:
# Rebuild acetate transport in every model:
#   1. TCDB hit for the MAG          -> reversible R_ACt2r with the hit gene as GPR
#   2. otherwise, acetate enzymes    -> R_ACt2r in the producing direction only
#   3. neither                       -> no acetate transport reaction
GEMs_dict2 = {}

for MAG,model in GEMs_dict.items():
    model_copy = model.copy()

    # Remove any acetate transporters already present in the reconstruction.
    for rxn_id in ("R_Acabc", "R_ACt2r"):
        if rxn_id in model_copy.reactions:
            model_copy.remove_reaction(rxn_id)

    if MAG in ACt2r_MAGs_df.index.values:
        model_copy.add_reaction_from_str("R_ACt2r: M_ac_c + M_h_c --> M_ac_e + M_h_e")

        # Make the *reaction* reversible. These attributes were previously set
        # on the model object, which left the reaction irreversible.
        acetate_transport = model_copy.reactions["R_ACt2r"]
        acetate_transport.reversible = True
        acetate_transport.lb = -1000

        GPR = parse_gpr_rule(ACt2r_MAGs_df.loc[MAG,"gene"])
        model_copy.set_gpr_association("R_ACt2r",GPR)

    # Membership test for this MAG. The bare list used to be tested for
    # truthiness, so every MAG without a TCDB hit received the reaction.
    elif MAG in has_enzymes_for_acetate_MAGs:
        model_copy.add_reaction_from_str("R_ACt2r: M_ac_c + M_h_c --> M_ac_e + M_h_e")

    GEMs_dict2[MAG]=model_copy

In [91]:
# Run the project's EGC identifier on every model after the acetate-transport edit.
# (Presumably a check for energy-generating cycles, judging by the printed message -- confirm in EGC.py.)
for MAG,model in GEMs_dict2.items():
    print(MAG)
    EGC.EGC_identifier(model,print_results=False)

CH7-bin.18
[92mThere are NO energy producing cycles in the model[0m
CH15-bin.7
[92mThere are NO energy producing cycles in the model[0m
CH13-bin.0
[92mThere are NO energy producing cycles in the model[0m
CH1-bin.4
[92mThere are NO energy producing cycles in the model[0m
CH13-bin.1
[92mThere are NO energy producing cycles in the model[0m
CH13-bin.11
[92mThere are NO energy producing cycles in the model[0m
CH15-bin.6
[92mThere are NO energy producing cycles in the model[0m
CH7-bin.2
[92mThere are NO energy producing cycles in the model[0m
CH13-bin.13
[92mThere are NO energy producing cycles in the model[0m
CH1-bin.10
[92mThere are NO energy producing cycles in the model[0m
CH1-bin.6
[92mThere are NO energy producing cycles in the model[0m
CH13-bin.2
[92mThere are NO energy producing cycles in the model[0m
CH13-bin.12
[92mThere are NO energy producing cycles in the model[0m
CH15-bin.5
[92mThere are NO energy producing cycles in the model[0m
CH7-bin.1
[92mThere

### Removing oxygen related reactions

In [93]:
# Strip extracellular oxygen and every reaction that uses it from each model.
GEMs_dict3 = {}
for MAG, model in GEMs_dict2.items():
    model_copy = model.copy()

    # Models without extracellular oxygen are passed through unchanged. They
    # were previously skipped, which left them out of GEMs_dict3 and made the
    # later GEMs_dict[MAG] lookups fail for those MAGs.
    if "M_o2_e" in model_copy.metabolites:
        model_copy.remove_reactions(model_copy.get_metabolite_reactions("M_o2_e"))
        model_copy.remove_metabolite("M_o2_e")
        # Only remove the exchange reaction if it is still in the model.
        if "R_EX_o2_e" in model_copy.reactions:
            model_copy.remove_reaction("R_EX_o2_e")

    GEMs_dict3[MAG]=model_copy

##### Save models

# Call update() on each edited model, then write it to the GEMs_ACt2r folder as <MAG>.xml.
for MAG,model in GEMs_dict3.items():
    model.update()
    reframed.save_cbmodel(model,"../output/GEMs/GEMs_intermediate/GEMs_ACt2r/"+MAG+".xml")

In [96]:
# Reload each stored model and compare its reaction ids with the in-memory version.
act2r_dir = "../output/GEMs/GEMs_intermediate/GEMs_ACt2r/"
directory = os.fsencode(act2r_dir)
for file in os.listdir(directory):
    filename = os.fsdecode(file)
    if not filename.endswith(".xml"):
        continue

    model_old = reframed.load_cbmodel(act2r_dir + filename)
    in_memory_rxns = set(GEMs_dict3[filename[:-4]].reactions)
    assert set(model_old.reactions)==in_memory_rxns

## Gap-filling with media - creating media 

This part builds on the reconstruction with soft constraints. The models selected from that process are further used here. 


Strategy:
- **Make media based on..**
    - **product prediction of models able to grow**
    - **Filter by CHEBI class**

In [102]:
# Re-create the translation dictionaries for this section (rebinds all five names).
compounds_dict, source_dict,substrate_dict, gas_sheet_dict, community_dict = translation_dicts.translation_dicts()

In [115]:
# Same helper call as the one that filled all_Mags_for_paper_analysis earlier in the notebook.
all_mags_paper = general_func.read_allmags_data()

##### Load models made without soft constraints

In [97]:
# GEMs_dict now refers to the same dict object as GEMs_dict3 (no copy is made),
# so in-place changes to these models are visible through both names.
GEMs_dict = GEMs_dict3

##### Define environment

In [101]:
# Environments from the project helper; indexed by community_id in the cells below.
syncon_environments = MAG_environments.community_syncon_environments()

#### Modify Environment to support community growth

##### FBA growth predictions

In [111]:
# Rename the single column in place. NB: earlier cells address this column as `1`,
# so they will not work if re-run after this cell.
MAG2community_id.columns=["community_id"]

# Mapping community_id -> index labels (MAG ids) belonging to that community.
community_groups = MAG2community_id.groupby(by="community_id").groups

In [112]:
# FBA for every relevant MAG under its community's environment.
FBA_growth = {}
for community_id, MAGs in community_groups.items():
    community_solutions = {}
    FBA_growth[community_id] = community_solutions

    for MAG in MAGs:
        if MAG not in relevant_MAGs:
            continue

        model = GEMs_dict[MAG]
        # The environment is applied to the stored model in place.
        syncon_environments[community_id].apply(model,inplace=True,exclusive=True,warning=False)
        sol = reframed.FBA(model)

        # Stored as returned, which may be None.
        community_solutions[MAG] = sol

In [113]:
# Objective value per MAG. A missing solution (None) becomes NaN instead of
# raising AttributeError on `sol.fobj`; the float dtype also turns a None
# objective value into NaN.
growth_rates = pd.Series(
    {
        GEM: (sol.fobj if sol is not None else np.nan)
        for community_id, GEM_sol_dict in FBA_growth.items()
        for GEM, sol in GEM_sol_dict.items()
    },
    dtype=float,
)

# Put the growth values next to the community id; the unnamed series becomes column 0.
growth_community_df = pd.concat([MAG2community_id, growth_rates], axis=1)
# Change from float to False or positive (NaN compares as False)
growth_community_df["Grows"] = growth_community_df[0].map(lambda x:x>1e-6)
# Drop the growth float column
growth_community_df.drop(0,axis=1,inplace=True)

**When considering top 99 of members all but Cow_Manure on xylan have a growing community member**

In [119]:
# Index labels (MAG ids) of the rows where Grows is True.
MAG_can_grow = growth_community_df[growth_community_df.Grows].index

##### FVA prediction of bacteria that can survive in the media at obj_frac=0

1. Find compounds produced by growing community members
2. Filter by CHEBI class

Run FVA for exchange reactions of growing community members

In [120]:
# FVA over the exchange reactions of every growing MAG, per community.
FVA_production = {}

for community_id, MAGs in community_groups.items():
    community_ranges = {}
    FVA_production[community_id] = community_ranges

    # MAGs that cannot grow are left out.
    for MAG in (candidate for candidate in MAGs if candidate in MAG_can_grow):
        model = GEMs_dict[MAG]

        # Apply the community medium to the stored model in place.
        syncon_environments[community_id].apply(model,inplace=True,exclusive=True,warning=False)

        # Flux ranges with obj_frac=0
        exchange_rxns = model.get_exchange_reactions()
        community_ranges[MAG] = reframed.FVA(model,reactions=exchange_rxns,obj_frac=0.0)
        

**Find compounds that are being produced**

In [121]:
# Keep, per MAG, the exchange reactions whose FVA upper bound is positive.
FVA_production_copy = {}
for community, ranges_by_MAG in FVA_production.items():
    FVA_production_copy[community] = {
        MAG: [rxn for rxn, sol in flux_ranges.items() if sol[1]>1e-6]
        for MAG, flux_ranges in ranges_by_MAG.items()
    }

**Combine the results from each community member into community level**

In [122]:
# Merge the member lists into one de-duplicated list per community.
community_prod = {
    community_name: list(set([rxn for rxns in mag_prod.values() for rxn in rxns]))
    for community_name, mag_prod in FVA_production_copy.items()
}

#### Filter by chebi_class

In [123]:
# Some compounds are not interesting for us when it comes to exchange
# These values are matched against the "self defined super class" column of met_chebi_class.
ignore_classes = ["other","inorganic ions and atoms","oligopeptide","simple sugars","cellodextrin","carbohydrate derivative","carbohydrate acid","oligosaccharides"]

In [124]:
met_chebi_class = pd.read_csv("../output/met_chebi_class.tsv",sep="\t",index_col=0)

# Drop the metabolites whose super class is on the ignore list ...
keep_mask = ~met_chebi_class["self defined super class"].isin(ignore_classes)
met_chebi_class_reduced = met_chebi_class[keep_mask].copy()

# ... but always keep D-glucose and D-xylose.
for met_id in ("M_glc__D_e", "M_xyl__D_e"):
    met_chebi_class_reduced.loc[met_id] = met_chebi_class.loc[met_id]


In [126]:
# Mapping: index label of met_chebi_class_reduced (metabolite id) -> value of its "chebi class" column.
met_chebi_class_dict = met_chebi_class_reduced["chebi class"].to_dict()

In [127]:
# Per community: the produced exchange reactions whose metabolite has a retained
# chebi class, together with that class.
community_prod_dfs = {}
for community_name, produced_rxns in community_prod.items():
    # "R_EX_<met>_e" -> "M_<met>_e"
    classified = [rxn for rxn in produced_rxns if "M_"+rxn[5:] in met_chebi_class_dict]
    community_prod_dfs[community_name] = pd.DataFrame({
        "rxns": classified,
        "chebi_class": [met_chebi_class_dict["M_"+rxn[5:]] for rxn in classified],
    })

##### Make new environments

In [130]:
# Universe model; used below to filter compounds and to look up their names.
model_uni = reframed.load_cbmodel("../input/universe_bacteria.xml")

In [131]:
# Abbreviation -> full name. These rebind the substrate_dict / source_dict
# returned by translation_dicts earlier in the notebook.
substrate_dict = {'X': 'Xylan', 'A': 'Avicel', 'P': 'PASC'}
source_dict = {'M':'Marshland soil','CD':'Compost and Digestate', 'CM':'Cow manure'}

In [132]:
# Per community: metabolites of the original environment plus those produced by its members.
syncon_media = {}
for community_id in MAG2community_id.community_id.unique():
    # "R_EX_<met>_e" -> "<met>" for both sources
    environment_mets = [rxn[5:-2] for rxn in syncon_environments[community_id].keys()]
    produced_mets = [rxn[5:-2] for rxn in community_prod_dfs[community_id].rxns.values]

    syncon_media[community_id] = set(environment_mets + produced_mets)

In [133]:
# Build one medium table per community and save them as a single TSV.
media_dfs = []

for community_id,compounds in syncon_media.items():
    # community_id has the form "<source>_<substrate>"
    media_id_list = community_id.split("_")

    source = media_id_list[0]
    substrate = media_id_list[1]

    # NB! Some of the compounds they used are not in the universal model!
    compounds = [met for met in compounds if "M_"+met+"_e" in model_uni.metabolites]
    compounds_names = [model_uni.metabolites["M_"+met+"_e"].name for met in compounds]

    media_df = pd.DataFrame({"medium":[community_id for met in compounds],
               "description": ["Media + produced compounds in "+source_dict[source]+" on " +substrate_dict[substrate] for met in compounds],
               "compound":compounds,
               "name":compounds_names})
    media_dfs.append(media_df)

# Combine the per-community tables before saving. `media_total_df` was
# previously only defined in a later cell, so this save raised NameError on a
# fresh kernel.
media_total_df = pd.concat(media_dfs).reset_index(drop=True)

media_total_df.to_csv("../output/gapfill_media/gapfill_media.tsv",sep="\t",index=None)

##### Check that everything is as expected

In [153]:
# Stored medium table vs. the table rebuilt from media_dfs.
# NOTE(review): the cell above writes to this same path, so when run in order the
# "old" table is the file that was just written -- confirm this is the intended comparison.
media_total_df_old = pd.read_csv("../output/gapfill_media/gapfill_media.tsv",sep="\t")
media_total_df = pd.concat(media_dfs).reset_index(drop=True)

In [154]:
# For every medium, the set of compounds must match between the new and the stored table.
for community_id in media_total_df.medium.unique():
    print(community_id)
    new_compounds = set(media_total_df[media_total_df.medium==community_id].compound.values)
    old_compounds = set(media_total_df_old[media_total_df_old.medium==community_id].compound.values)
    assert new_compounds==old_compounds
    

CD_A
CD_P
CD_X
CM_A
CM_P
CM_X
M_P
M_X


## Gapfill models