# Creating models step-by-step

In [None]:
import os
import json
import numpy as np
import pandas as pd
import collections
import sys
import seaborn as sns
import copy
import subprocess

from libchebipy._chebi_entity import ChebiEntity
from Bio import Entrez, SeqIO
import reframed
from reframed.io.sbml import parse_gpr_rule

sys.path.append('../functions')

import translation_dicts
import general_functions as general_func
import EGC as EGC
import MAG_environments

# Prepare data 

In [None]:
! mkdir ../output/
! mkdir ../output/MAGs_fasta/

%run -i "./python_scripts_process_data/1. Processing MAG data - From genbank to fasta.py"

%run -i "./python_scripts_process_data/2. Defining metabolite classes.py"

%run -i "./python_scripts_process_data/3. MAG-community association.py"

%run -i "./python_scripts_process_data/4. Community production.py"


# Build genome-scale metabolic models

## Create models without constraints

In [None]:
! mkdir ../output/GEMs
! mkdir ../output/GEMs/GEMs_no_constraints/

In [None]:
# Activate MAGs_RecAndAn2 (presumably a conda environment) and run the
# model-building shell script in the same shell invocation.
# NOTE(review): the two commands are joined with ';', so the script still runs
# when the activation fails - confirm that this is intended.
! source activate MAGs_RecAndAn2; ./bash_scripts/make_GEMs2.sh

## Create models with constraints

#### Create constraints

In [None]:
! mkdir ../output/soft_constraints/

In [None]:
# Run the soft-constraint script in this notebook's namespace (-i).
# NOTE(review): a later cell reads `soft_constraints_new`, which is presumably
# defined by this script - confirm.
%run -i "python_scripts/1. create soft constraints.py"

In [None]:
# Load the CD_X soft-constraint table stored under output_30_08_24
# (presumably an earlier run kept as reference). The file has no header row.
SC_media_test_old = pd.read_csv(
    "../output_30_08_24/soft_constraints/SC_CD_X.tsv", sep="\t", header=None
)
SC_media_test_old

In [None]:
# Turn the CD_X entry of soft_constraints_new into a table, with the former
# index moved into a regular column, for comparison with the old table.
SC_media_test_new = pd.Series(soft_constraints_new["CD_X"]).to_frame().reset_index()
SC_media_test_new

#### Create models 

In [None]:
! mkdir ../output/GEMs/GEMs_intermediate/


In [None]:
# Activate MAGs_RecAndAn2 (presumably a conda environment) and run the
# soft-constraint model-building script with 0.1 as its only argument.
# NOTE(review): 0.1 presumably corresponds to the "constr0_1" model set used
# further down - confirm. The commands are joined with ';', so the script
# still runs when the activation fails.
! source activate MAGs_RecAndAn2; ./bash_scripts/make_GEMs_soft_constraints.sh 0.1


## Select models

We have different sets of reconstructed models.

- **no_constr**: no soft constraints included. This is the ideal case, but these models did not produce all of the expected metabolites.
- **constr0_1**: soft constraints included. This will likely add the reactions needed to support the phenotype, but it is a biased approach and may overpredict the number of models that carry the ability.

Strategy

1. Find the difference between the models
2. **Selection**: Select the models that required the fewest changes (smallest symmetric difference) to acquire the desired phenotype.

In [None]:
! mkdir ../output/GEMs/GEMs_intermediate/GEMs_adapt/

In [None]:
# Run the model-selection script in this notebook's namespace (-i).
# NOTE(review): later cells read `GEMs_dict` and `best_candidates`, which are
# presumably defined by this script - confirm.
%run -i "python_scripts/2. Select models.py"

In [None]:
print("\t Assertions...")

# Folder holding the "adapt" models stored under output_30_08_24; they are
# compared against the models in GEMs_dict["adapt"].
directory = "../output_30_08_24/GEMs/GEMs_intermediate/GEMs_adapt/"

# Per MAG: reactions found only in the stored model ("original") and only in
# the GEMs_dict model ("new").
difference = {}

for filename in os.listdir(directory):
    # Skip everything that is not an SBML file *before* creating an entry, so
    # stray files (e.g. hidden system files) do not leave empty records with
    # a mangled key behind.
    if not filename.endswith(".xml"):
        continue

    mag_id = filename[:-4]  # file name without the ".xml" suffix
    print(mag_id)

    model = reframed.load_cbmodel(directory + filename)
    stored_rxns = set(model.reactions)
    current_rxns = set(GEMs_dict["adapt"][mag_id].reactions)

    difference[mag_id] = {
        "original": stored_rxns - current_rxns,
        "new": current_rxns - stored_rxns,
    }
        

In [None]:
# One row per MAG, ranked by how many reactions are listed in its "new" set
# (largest first). Sorting the raw column would compare the sets themselves,
# i.e. by subset relation, which does not order them by size. Entries that are
# not sets (missing values) are ranked last.
pd.DataFrame(difference).T.sort_values(
    "new",
    ascending=False,
    key=lambda col: col.map(lambda rxns: len(rxns) if isinstance(rxns, set) else -1),
)

In [None]:
# Show the top-level keys of GEMs_dict (other cells index it with
# "no_constr", "constr0_1" and "adapt").
GEMs_dict.keys()

In [None]:
# Number of reactions present in only one of the two CH14-bin.1 models
# ("no_constr" vs. "constr0_1").
len(
    set(GEMs_dict["no_constr"]["CH14-bin.1"].reactions)
    ^ set(GEMs_dict["constr0_1"]["CH14-bin.1"].reactions)
)

In [None]:
# Number of reactions present in only one of the two CH14-bin.2 models
# ("no_constr" vs. "constr0_1").
len(
    set(GEMs_dict["no_constr"]["CH14-bin.2"].reactions)
    ^ set(GEMs_dict["constr0_1"]["CH14-bin.2"].reactions)
)

In [None]:
# Display best_candidates with missing values dropped.
# NOTE(review): best_candidates is not defined in this notebook; presumably a
# pandas object created by "2. Select models.py" - confirm.
best_candidates.dropna()

In [None]:
# Reactions that are in `model` but not in the matching GEMs_dict["adapt"]
# model. `model` and `filename` are whatever the last executed loop left
# behind, so this inspects a single MAG only.
set(model.reactions).difference(GEMs_dict["adapt"][filename[:-4]].reactions)

In [None]:
# Reactions that are in the GEMs_dict["adapt"] model but not in `model`.
# `model` and `filename` are whatever the last executed loop left behind, so
# this inspects a single MAG only.
set(GEMs_dict["adapt"][filename[:-4]].reactions).difference(model.reactions)

## Fixing acetate, acetaldehyde, and O2 transporters 

####  Find transporters

In [None]:
! mkdir ../output/transporters

In [None]:
# Activate MAGs_RecAndAn2 (presumably a conda environment), append
# /usr/local/ncbi/blast/bin/ to PATH for this shell invocation (presumably so
# the BLAST executables are found) and run the transporter prediction script.
# NOTE(review): the commands are joined with ';', so the script still runs
# when an earlier step fails - confirm that this is intended.
! source activate MAGs_RecAndAn2; export PATH=$PATH:/usr/local/ncbi/blast/bin/; ./bash_scripts/acetate_transport_predict.sh

#### Fixing transporters

In [None]:
! mkdir ../output/GEMs/GEMs_intermediate/GEMs_ACt2r/

In [None]:
# Run the transporter-fixing script in this notebook's namespace (-i).
# NOTE(review): the next cell reads `GEMs_dict3`, which is presumably defined
# by this script - confirm.
%run -i "python_scripts/3. Fixing acetate acetaldehyde and O2 transporters.py"

In [None]:
# Folder holding the transporter-fixed models stored under output_30_08_24;
# they are compared against the models in GEMs_dict3.
directory = "../output_30_08_24/GEMs/GEMs_intermediate/GEMs_ACt2r/"

# Start from an empty dict: `difference` is also used by an earlier comparison
# with a different value layout, and leftovers from it would distort the
# per-MAG counts computed from this dict.
difference = {}

for filename in os.listdir(directory):
    # Only SBML files are models
    if not filename.endswith(".xml"):
        continue

    mag_id = filename[:-4]  # file name without the ".xml" suffix
    model_old = reframed.load_cbmodel(directory + filename)

    # Reactions present in exactly one of the two versions of the model
    difference[mag_id] = set(model_old.reactions) ^ set(GEMs_dict3[mag_id].reactions)
        

In [None]:
# Size of each MAG's entry in `difference`, sorted ascending; tail(20) shows
# the 20 largest.
pd.Series(
    {mag_id: len(changed) for mag_id, changed in difference.items()}
).sort_values().tail(20)

In [None]:
# Inspect the entry stored in `difference` for MAG CH15-bin.13.
difference["CH15-bin.13"]

In [None]:
# Inspect the entry stored in `difference` for MAG CH7-bin.8.
difference["CH7-bin.8"]

## Gap-filling with media - creating media 

This part builds on the reconstruction with soft constraints: the models selected in that process are the ones used here.


Strategy:
- **Make media based on:**
    - **the product predictions of the models that are able to grow**
    - **filtering by ChEBI class**

In [None]:
! mkdir ../output/gapfill_media

In [None]:
# Run the media-creation script in this notebook's namespace (-i).
# NOTE(review): a later cell reads `media_dfs`, which is presumably defined by
# this script - confirm.
%run -i "python_scripts/4. Create media for gapfilling.py"

##### Check that everything is as expected

In [None]:
# Media table stored under output_30_08_24 (presumably an earlier run kept as
# reference).
media_total_df_old = pd.read_csv(
    "../output_30_08_24/gapfill_media/gapfill_media.tsv",
    sep="\t",
)

# Stack the tables in media_dfs into one table with a fresh 0..n-1 index.
media_total_df = pd.concat(media_dfs, ignore_index=True)

In [None]:
# Check, medium by medium, that the new media table lists exactly the same
# compounds as the old one.
for community_id in media_total_df.medium.unique():
    print(community_id)

    # CD_X and CM_P are excluded from the strict check.
    if community_id in ("CD_X", "CM_P"):
        continue

    new_compounds = set(media_total_df[media_total_df.medium == community_id].compound.values)
    old_compounds = set(media_total_df_old[media_total_df_old.medium == community_id].compound.values)

    # Report which compounds differ, so a failure can be diagnosed directly
    # from the error message.
    assert new_compounds == old_compounds, (
        f"{community_id}: compounds found in only one of the two media tables: "
        f"{new_compounds ^ old_compounds}"
    )
    

In [None]:
# Number of CD_X compounds present in both the new and the old media table.
len(
    set(media_total_df[media_total_df.medium == "CD_X"].compound.values)
    & set(media_total_df_old[media_total_df_old.medium == "CD_X"].compound.values)
)

In [None]:
# CD_X compounds that appear only in the new media table.
set(media_total_df[media_total_df.medium == "CD_X"].compound.values).difference(
    media_total_df_old[media_total_df_old.medium == "CD_X"].compound.values
)


In [None]:
# CD_X compounds that appear only in the old media table.
set(media_total_df_old[media_total_df_old.medium == "CD_X"].compound.values).difference(
    media_total_df[media_total_df.medium == "CD_X"].compound.values
)

In [None]:
# Number of CM_P compounds present in both the new and the old media table.
len(
    set(media_total_df[media_total_df.medium == "CM_P"].compound.values)
    & set(media_total_df_old[media_total_df_old.medium == "CM_P"].compound.values)
)


In [None]:
# CM_P compounds that appear only in the new media table.
set(media_total_df[media_total_df.medium == "CM_P"].compound.values).difference(
    media_total_df_old[media_total_df_old.medium == "CM_P"].compound.values
)


In [None]:
# CM_P compounds that appear only in the old media table.
set(media_total_df_old[media_total_df_old.medium == "CM_P"].compound.values).difference(
    media_total_df[media_total_df.medium == "CM_P"].compound.values
)

## Gapfill models

In [None]:
! mkdir ../output/GEMs/GEMs_final

In [None]:
# Activate MAGs_RecAndAn2 (presumably a conda environment) and run the
# gap-filling shell script in the same shell invocation.
# NOTE(review): the two commands are joined with ';', so the script still runs
# when the activation fails - confirm that this is intended.
! source activate MAGs_RecAndAn2; ./bash_scripts/gapfill_GEMs_media.sh


## Final changes

#### Load models

In [None]:
# Final models, keyed by file name without the ".xml" suffix.
GEMs_adapt_media = {}

directory = "../output/GEMs/GEMs_final/"

for filename in os.listdir(directory):
    # Only load SBML files, as the other model-loading loops in this notebook
    # do; any other file in the folder would make load_cbmodel fail.
    if not filename.endswith(".xml"):
        continue

    GEMs_adapt_media[filename[:-4]] = reframed.load_cbmodel(directory + filename)


In [None]:
# Strip the extracellular metabolite M_2h3mb_e, together with every reaction
# it takes part in, from each model that has the exchange reaction
# R_EX_2h3mb_e. The affected MAGs are printed.
for MAG, model in GEMs_adapt_media.items():
    if "R_EX_2h3mb_e" not in model.reactions:
        continue

    print(MAG)
    model.remove_reactions(model.get_metabolite_reactions("M_2h3mb_e"))
    model.remove_metabolite("M_2h3mb_e")
    model.update()
    # The model object is changed in place, so the dictionary already refers
    # to the updated model.
        
        

In [None]:
# Write every model back to the final-model folder as <MAG>.xml, replacing the
# file it was loaded from.
for MAG, model in GEMs_adapt_media.items():
    reframed.save_cbmodel(model, filename=f"../output/GEMs/GEMs_final/{MAG}.xml")