# CarveMe Model Curation

This notebook applies the curation method from notebook B to 5587 bacterial genome-scale models that were generated with CarveMe.

Imports

In [None]:
import pandas as pd
import os
from string import ascii_lowercase
import gzip
import cobra
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import gaussian_kde

from xml.dom.minidom import parse, parseString
from xml.dom import minidom 
import xml.etree.ElementTree as ET

from functions.curation import curate 

# Measure the improvement achieved by the curation function
Define functions for this section

In [None]:
def get_smbl_model(model_path):
    """Load a gzip-compressed SBML file as a COBRApy model.

    The archive is decompressed, written to ``temp_model.xml`` in the current
    working directory, and that file is parsed with ``cobra.io.read_sbml_model``.

    Parameters
    ----------
    model_path : str
        Path to a ``.xml.gz`` SBML file containing UTF-8 text.

    Returns
    -------
    The object returned by ``cobra.io.read_sbml_model``.
    """
    # Context managers guarantee that both handles are closed.
    with gzip.open(model_path) as input_file:
        unzipped_content = input_file.read()

    unzipped_content_string = str(unzipped_content, 'utf-8')

    # The temp file must be closed (and therefore flushed) *before* it is
    # parsed, otherwise the reader can see an empty or truncated document.
    # Writing as UTF-8 mirrors the decoding above on every platform.
    with open("temp_model.xml", "w", encoding="utf-8") as temp_file:
        temp_file.write(unzipped_content_string)

    return cobra.io.read_sbml_model("temp_model.xml")

def should_be_balanced(r):
    """Tell whether reaction ``r`` should be included in mass-balance checks.

    Reactions whose ``id`` begins with ``'EX_'``, ``'sink_'`` or ``'Growth'``
    are excluded (returns ``False``); every other reaction returns ``True``.
    """
    excluded_prefixes = ('EX_', 'sink_', 'Growth')
    return not r.id.startswith(excluded_prefixes)
    
def number_unbalanced_reactions(model):
    """Count reactions that should be balanced but report a mass imbalance.

    A reaction is counted when ``should_be_balanced`` accepts it and its
    ``check_mass_balance()`` result is anything other than an empty dict.
    """
    unbalanced_total = 0
    for reaction in model.reactions:
        # The prefix filter comes first so excluded reactions are never checked.
        if should_be_balanced(reaction) and reaction.check_mass_balance() != {}:
            unbalanced_total += 1
    return unbalanced_total
    
def number_undefined_metabolites(model):
    """Count metabolites whose ``formula`` string contains a ``';'``."""
    return sum(1 for metabolite in model.metabolites if ';' in metabolite.formula)

def get_model_stats(model):
    """Return ``(unbalanced reaction count, undefined metabolite count)`` for ``model``."""
    unbalanced_count = number_unbalanced_reactions(model)
    undefined_count = number_undefined_metabolites(model)
    return unbalanced_count, undefined_count

def get_model_stats_from_path(model_path):
    """Load the gzipped model at ``model_path`` and return its two counts.

    Returns ``(unbalanced reaction count, undefined metabolite count)``.
    """
    loaded_model = get_smbl_model(model_path)
    unbalanced_count = number_unbalanced_reactions(loaded_model)
    undefined_count = number_undefined_metabolites(loaded_model)
    return unbalanced_count, undefined_count

def get_smbl_model_from_path(model_path):
    """Load a gzip-compressed SBML file as a COBRApy model.

    The archive is decompressed, written to ``temp_model.xml`` in the current
    working directory, and that file is parsed with ``cobra.io.read_sbml_model``.

    Parameters
    ----------
    model_path : str
        Path to a ``.xml.gz`` SBML file containing UTF-8 text.

    Returns
    -------
    The object returned by ``cobra.io.read_sbml_model``.
    """
    # Context managers guarantee that both handles are closed.
    with gzip.open(model_path) as input_file:
        unzipped_content = input_file.read()

    unzipped_content_string = str(unzipped_content, 'utf-8')

    # Close (and therefore flush) the temp file before parsing it; an open,
    # buffered handle can leave the file empty or truncated on disk.
    with open("temp_model.xml", "w", encoding="utf-8") as temp_file:
        temp_file.write(unzipped_content_string)

    return cobra.io.read_sbml_model("temp_model.xml")

def curate_from_path(model_path):
    """Load the gzipped model at ``model_path`` and return ``curate``'s result for it."""
    return curate(get_smbl_model(model_path))

Get number of models in this directory

In [None]:
num_genus = 0
num_model = 0

# Directory layout walked here: models/<letter>/<folder>/<model file>.
# Each second-level folder is counted as one genus.
for char in ascii_lowercase:
    letter_dir = os.path.join('models', char)
    # A letter without a folder simply contributes nothing.
    if not os.path.isdir(letter_dir):
        continue
    for genus in os.listdir(letter_dir):
        genus_dir = os.path.join(letter_dir, genus)
        # Skip stray files (anything that is not a folder) instead of crashing
        # when trying to list them.
        if not os.path.isdir(genus_dir):
            continue
        num_genus += 1
        num_model += len(os.listdir(genus_dir))

print(f'There are {num_model} models representing {num_genus} genera (plural of genus)')

Get all unbalanced reactions and undefined metabolites before and after curation

In [None]:
%%time
model_data = {}

# for char in ascii_lowercase:
for char in ['a']:
    print(char)
    for species in os.listdir(os.path.join(f'models/{char}')):
        print(species)
        for model_filename in os.listdir(os.path.join(f'models/{char}', species)):
            print(model_filename)

            # get model and curated model
            model_path = os.path.join(f'models/{char}', species, model_filename)
            model = get_smbl_model_from_path(model_path)
            curated_model = curate_from_path(model_path)

            unbalanced_original, undefined_original = get_model_stats(model)
            unbalanced_curated, undefined_curated = get_model_stats(curated_model)


            model_data[model.id] = [unbalanced_original, unbalanced_curated, undefined_original, undefined_curated]

columns = ['old unbalanced reactions', 'new unbalanced reactions', 'old undefined metabolites', 'new undefined metabolites']
model_data_df=pd.DataFrame.from_dict(model_data, orient='index', columns=columns)
model_data_df

In [None]:
# Re-display the per-model results table (one row per model id).
model_data_df

Print output summary of above table

In [None]:
# Column means over all processed models, before and after curation.
mean_unbalanced_before = model_data_df['old unbalanced reactions'].mean()
mean_unbalanced_after = model_data_df['new unbalanced reactions'].mean()

print(f"The original model had on average {mean_unbalanced_before} unbalanced reactions")
print(f"The curated models had on average {mean_unbalanced_after} unbalanced reactions")

### Plot density curves

In [None]:
old_data = list(model_data_df['old unbalanced reactions'])
new_data = list(model_data_df['new unbalanced reactions'])

# A scalar bw_method is used directly as the KDE bandwidth factor. This is the
# public equivalent of overriding covariance_factor and calling the private
# _compute_covariance(). Both curves share one factor so they are comparable.
BANDWIDTH_FACTOR = .25
old_density = gaussian_kde(old_data, bw_method=BANDWIDTH_FACTOR)
new_density = gaussian_kde(new_data, bw_method=BANDWIDTH_FACTOR)

# Cover at least 0-100, but extend the axis when a model has more unbalanced
# reactions than that, so the curve is not cut off.
x_max = max(100, max(old_data), max(new_data))
xs = np.linspace(0, x_max, 2000)

fig, ax = plt.subplots(figsize=(10,5))

ax.plot(xs, old_density(xs), label='Before Curation')
ax.plot(xs, new_density(xs), label='After Curation')
# Report the number of models actually plotted rather than a fixed count.
ax.set_title(f'Unbalanced Reactions in {len(model_data_df)} GSMs Before and After Curation')
ax.set_ylabel('Relative Frequency')
ax.set_xlabel('Number of Unbalanced Reactions')
ax.legend()

plt.savefig("Unbalanced_reaction_distribution.png", dpi=150)
plt.show()

### Plot Scatter Plot

In [None]:
old_data = list(model_data_df['old unbalanced reactions'])
new_data = list(model_data_df['new unbalanced reactions'])

fig, ax = plt.subplots(figsize=(7,7))

# Overall scalar extremes across BOTH series. min(old_data, new_data) would
# compare the two lists with each other and return a whole list, which breaks
# the diagonal reference line below.
minimum = min(old_data + new_data)
maximum = max(old_data + new_data)

ax.scatter(old_data, new_data)
# y = x reference line: points below it improved after curation.
ax.plot([minimum, maximum], [minimum, maximum], 'r')
# Report the number of models actually plotted rather than a fixed count.
ax.set_title(f'Unbalanced Reactions in {len(model_data_df)} GSMs Before and After Curation')
ax.set_ylabel('Number of Unbalanced Reactions After Curation')
ax.set_xlabel('Number of Unbalanced Reactions Before Curation')

plt.savefig("Unbalanced_reaction_scatter.png", dpi=150)
plt.show()

# Can probably delete everything below this 

Test if models really have no undefined metabolites

In [None]:
# Load one specific gzipped model (Rhodococcus jostii RHA1) for manual inspection.
model = get_smbl_model('models/r/rhodococcus/Rhodococcus_jostii_RHA1.xml.gz')

In [None]:
# Every reaction with a non-empty mass-balance report. The EX_/sink_/Growth
# prefix filter is deliberately not applied here.
unbalanced_reactions = [rxn for rxn in model.reactions if rxn.check_mass_balance() != {}]

for rxn in unbalanced_reactions:
    print(rxn)
    print(rxn.check_mass_balance())
    # List the formula of each participating metabolite.
    for met in rxn.metabolites:
        print(met.formula)
    print()
    

In [None]:
# Read an uncompressed SBML file directly with COBRApy and display the result.
model = cobra.io.read_sbml_model("Ropacus_carveme_grampos.xml")
model

In [None]:
# Unbalanced-reaction count of `model` as currently loaded.
number_unbalanced_reactions(model)

In [None]:
# Count of metabolites in `model` whose formula contains ';'.
number_undefined_metabolites(model)

In [None]:
# Run the imported curation routine and keep its return value under a separate name.
# NOTE(review): whether curate() also modifies `model` in place cannot be seen here — confirm.
New_model = curate(model)

In [None]:
# Unbalanced-reaction count of the model returned by curate().
number_unbalanced_reactions(New_model)

In [None]:
# Count of metabolites with ';' in their formula in the model returned by curate().
number_undefined_metabolites(New_model)

In [None]:
# Label the curated model, which is the object actually written to disk below.
# Previously the metadata was set on `model`, so it only reached the output
# file if curate() happened to return that same object.
New_model.id = 'ropacus_curated_by_curation_py_version_1'
New_model.name = 'Rhodococcus opacus PD630 draft curation.py'
New_model.description = 'Rhodococcus opacus PD630 model curated by python function'

cobra.io.write_sbml_model(New_model, "Ropacus_curated_by_function_1.xml")

In [None]:
# check_mass_balance has to be *called*: comparing the bound method itself with
# {} is always True, which would count every reaction that passes the filter.
len([r for r in model.reactions if should_be_balanced(r) and r.check_mass_balance() != {}])

In [None]:
# Sanity check: the id 'Growth' starts with an excluded prefix, so this is expected to be False.
should_be_balanced(model.reactions.get_by_id('Growth'))

Loop over all models which are sorted alphabetically and by genus

In [None]:
letter_dir = 'models/a'
a_model_folders = os.listdir(letter_dir)
model_data = {}

for species in a_model_folders:
    species_dir = os.path.join(letter_dir, species)
    for model_filename in os.listdir(species_dir):
        model_path = os.path.join(species_dir, model_filename)
        model = get_smbl_model(model_path)

        # Take the "before" counts first, then curate the same loaded model.
        model_unbalanced = number_unbalanced_reactions(model)
        model_undefined = number_undefined_metabolites(model)

        curated_model = curate(model)
        curated_unbalanced = number_unbalanced_reactions(curated_model)
        curated_undefined = number_undefined_metabolites(curated_model)

        model_data[model.id] = [
            model_unbalanced,
            curated_unbalanced,
            model_undefined,
            curated_undefined,
        ]

# Without orient='index' each model id becomes a column and the four counts become rows.
df = pd.DataFrame.from_dict(model_data)
df

In [None]:
# One row per model id; column order matches the lists stored in model_data.
columns = [
    'old unbalanced reactions',
    'new unbalanced reactions',
    'old undefined metabolites',
    'new undefined metabolites',
]
df = pd.DataFrame.from_dict(model_data, orient='index', columns=columns)
df

### Use models/a/acidibacillus/Acidibacillus_ferrooxidans_SLC66.xml.gz as a test file for curate function

In [None]:
model_path = 'models/a/acidibacillus/Acidibacillus_ferrooxidans_SLC66.xml.gz'

# Decompress the archive; the context manager closes the gzip handle.
with gzip.open(model_path) as infile:
    unzipped_content = infile.read()

unzipped_content_string = str(unzipped_content, 'utf-8')

# Write the XML next to the notebook. Leaving the `with` block closes and
# flushes the file, so later readers see the complete document. The explicit
# encoding mirrors the UTF-8 decoding above on every platform.
with open("test_model.xml", "w", encoding="utf-8") as myfile:
    myfile.write(unzipped_content_string)

In [None]:
def get_smbl_model(model_path):
    """Load a gzip-compressed SBML file as a COBRApy model.

    The archive is decompressed, written to ``temp_model.xml`` in the current
    working directory, and that file is parsed with ``cobra.io.read_sbml_model``.

    Parameters
    ----------
    model_path : str
        Path to a ``.xml.gz`` SBML file containing UTF-8 text.

    Returns
    -------
    The object returned by ``cobra.io.read_sbml_model``.
    """
    # Context managers guarantee that both handles are closed.
    with gzip.open(model_path) as input_file:
        unzipped_content = input_file.read()

    unzipped_content_string = str(unzipped_content, 'utf-8')

    # The temp file must be closed (and therefore flushed) before it is parsed,
    # otherwise the reader can see an empty or truncated document.
    with open("temp_model.xml", "w", encoding="utf-8") as temp_file:
        temp_file.write(unzipped_content_string)

    return cobra.io.read_sbml_model("temp_model.xml")
    
    
#     myfile = open("test_model.xml", "w")
#     myfile.write(unzipped_content_string)

In [None]:
# Load the Acidibacillus test model through the gzip helper and display it.
model_path = 'models/a/acidibacillus/Acidibacillus_ferrooxidans_SLC66.xml.gz'
model = get_smbl_model(model_path)
model

In [None]:
# Path of the test model.
# NOTE(review): file_name is not referenced by any later cell visible in this notebook.
file_name = 'models/a/acidibacillus/Acidibacillus_ferrooxidans_SLC66.xml.gz'


In [None]:
# Every other call in this notebook passes the model to curate(); the bare
# curate() call here had no argument.
# NOTE(review): assumes the model loaded from the test file above is the intended input — confirm.
curate(model)

Loop over all files in models folder

In [None]:
# for letter in ascii_lowercase:
#     print(letter)

In [None]:
a_model_folders = os.listdir('models/a')

# Preview of the first ten folders: name, number of files, then each file path.
for species in a_model_folders[:10]:
    print(species)
    species_dir = os.path.join('models/a', species)
    model_filenames = os.listdir(species_dir)
    print(len(model_filenames))
    for model_filename in model_filenames:
        model_path = os.path.join(species_dir, model_filename)
        print(model_path)
    print()

In [None]:
# First entry returned for the acidibacillus folder (os.listdir order is arbitrary).
os.listdir("models/a/acidibacillus")[0]

In [None]:
# Scratch cell: a bare path string; nothing is read from or written to it here.
'carveme/Ropacus_5_reaction_deletions.xml'

In [None]:
# Read the decompressed bytes and keep everything after the first two.
with gzip.open("models/a/acidibacillus/Acidibacillus_ferrooxidans_SLC66.xml.gz", 'rb') as gz_handle:
    raw_bytes = gz_handle.read()
    file_content = raw_bytes[2:]
    
#     print(file_content)
#     model = cobra.io.read_sbml_model(file_content)
#     model
    

In [None]:
# Open the gzip archive and keep the file object (nothing is read yet).
# NOTE(review): this handle is never closed in the cells visible here.
unzipped_xml = gzip.open("models/a/acidibacillus/Acidibacillus_ferrooxidans_SLC66.xml.gz")

In [None]:
# Display the repr of the open gzip file object.
unzipped_xml

In [None]:
# model = cobra.io.read_sbml_model(unzipped_xml)
# model

In [None]:
# NOTE(review): hard-coded absolute Windows path; this cell only works on a
# machine that has this folder.
directory = r'C:\Users\admin'
image_suffixes = (".jpg", ".png")

# Print the full path of every .jpg/.png file directly inside `directory`.
for filename in os.listdir(directory):
    if not filename.endswith(image_suffixes):
        continue
    print(os.path.join(directory, filename))