#### Libraries

In [1]:
import pandas as pd
import numpy as np

from bs4 import BeautifulSoup
from requests_html import HTMLSession

from tqdm.notebook import tqdm

In [2]:
# Load each published reaction table into its own DataFrame.
# header=1 takes the column names from the second row of the sheet.
camel_df = pd.read_excel(
    '../Data/1-s2.0-S1096717618302258-mmc1.xlsx',
    header=1,
)
hefzi_df = pd.read_excel('../Data/hefzi_final.xlsx')
fouladiha_df = pd.read_excel(
    '../Data/10529_2020_3021_MOESM1_ESM.xlsx',
    sheet_name='Supplementary Table 10',
    header=1,
)
yeo_df = pd.read_excel(
    '../Data/Supplementary Data.xlsx',
    sheet_name='Data S2',
)

In [3]:
# Standardization of the column names and addition of the tag columns.
#
# Every source gets (a) a rename map that brings its headers to the shared
# vocabulary and (b) four tag columns at positions 0-3, with 'X' in the column
# of the dataset the rows come from and NaN in the other three.
# hefzi_df needs no renaming, so its map is empty.
tag_columns = ['cam', 'hef', 'fou', 'yeo']
sources = [
    ('cam', camel_df, {'Reaction ID':'Reaction', 'Initial reaction in model':'Reaction Formula', 'Reaction name':'Reaction Name', 'Justification':'Curation Notes'}),
    ('hef', hefzi_df, {}),
    ('fou', fouladiha_df, {'Abbreviation':'Reaction', 'Description':'Reaction Name', 'Reaction':'Reaction Formula'}),
    ('yeo', yeo_df, {'Rxn':'Reaction', 'Subsystem (iCHO1766)':'Subsystem'}),
]

for own_tag, source_df, renames in sources:
    # Re-running this cell must be a no-op: a second rename would turn
    # fouladiha's new 'Reaction' column into a duplicate 'Reaction Formula',
    # and DataFrame.insert raises when the tag column already exists.
    if all(tag in source_df.columns for tag in tag_columns):
        continue
    if renames:
        source_df.rename(columns=renames, inplace=True)
    for position, tag in enumerate(tag_columns):
        source_df.insert(loc=position, column=tag, value='X' if tag == own_tag else np.nan)


hefzi_df

Unnamed: 0,cam,hef,fou,yeo,Reaction,Reaction Name,Reaction Formula,GPR,Subsystem,Lower bound,Upper bound,Curation Notes,References
0,,X,,,10FTHF5GLUtl,"5-glutamyl-10FTHF transport, lysosomal",10fthf5glu_c --> 10fthf5glu_l,,"TRANSPORT, LYSOSOMAL",0.0,1000.0,,
1,,X,,,10FTHF5GLUtm,"5-glutamyl-10FTHF transport, mitochondrial",10fthf5glu_m --> 10fthf5glu_c,,"TRANSPORT, MITOCHONDRIAL",0.0,1000.0,,
2,,X,,,10FTHF6GLUtl,"6-glutamyl-10FTHF transport, lysosomal",10fthf6glu_c --> 10fthf6glu_l,,"TRANSPORT, LYSOSOMAL",0.0,1000.0,,
3,,X,,,10FTHF6GLUtm,"6-glutamyl-10FTHF transport, mitochondrial",10fthf6glu_m --> 10fthf6glu_c,,"TRANSPORT, MITOCHONDRIAL",0.0,1000.0,,
4,,X,,,10FTHF7GLUtl,"7-glutamyl-10FTHF transport, lysosomal",10fthf7glu_c --> 10fthf7glu_l,,"TRANSPORT, LYSOSOMAL",0.0,1000.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6658,,X,,,r2534,Major Facilitator(MFS) TCDB:2.A.1.44.1,thr_L_e <=> thr_L_c,,"TRANSPORT, EXTRACELLULAR",-1000.0,1000.0,,
6659,,X,,,r2535,Major Facilitator(MFS) TCDB:2.A.1.44.1,hom_L_e <=> hom_L_c,,"TRANSPORT, EXTRACELLULAR",-1000.0,1000.0,,
6660,,X,,,r2537,Utilized transport,lnlncgcoa_c <=> lnlncgcoa_r,,"TRANSPORT, ENDOPLASMIC RETICULAR",-1000.0,1000.0,,
6661,,X,,,r2538,Utilized transport,dlnlcgcoa_c <=> dlnlcgcoa_r,,"TRANSPORT, ENDOPLASMIC RETICULAR",-1000.0,1000.0,,


In [4]:
# Union of the column names of the four datasets, in order of first appearance
# (hefzi, then fouladiha, then yeo, then camel).
all_column_names = (
    hefzi_df.columns.to_list()
    + fouladiha_df.columns.to_list()
    + yeo_df.columns.to_list()
    + camel_df.columns.to_list()
)

# dict keys keep insertion order, so this drops repeated names in a single pass
# while leaving the first occurrence of each name in place.
cols = list(dict.fromkeys(all_column_names))

cols

['cam',
 'hef',
 'fou',
 'yeo',
 'Reaction',
 'Reaction Name',
 'Reaction Formula',
 'GPR',
 'Subsystem',
 'Lower bound',
 'Upper bound',
 'Curation Notes',
 'References',
 'Genes',
 'Protein',
 'Reversible',
 'Objective',
 'Proteins',
 'EC Number',
 'Mol wt',
 'kcat_forward',
 'kcat_backward',
 'Subsystem (iCHO2291)',
 'Reaction ID Camels Models']

In [5]:
def add_col(df, columns=None):
    '''
    Return a copy of ``df`` extended with the columns it is missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend. It is not modified; ``reindex`` returns a new frame.
    columns : iterable of str, optional
        Column names the result must contain. When omitted, the notebook-level
        ``cols`` list is used, which is what the one-argument call relied on.

    Returns
    -------
    pandas.DataFrame
        ``df`` with its original columns first, followed by every name from
        ``columns`` that ``df`` lacked. The added columns hold only NaN.
    '''
    if columns is None:
        columns = cols

    # dict.fromkeys drops repeated names so no column is appended twice.
    missing = [name for name in dict.fromkeys(columns) if name not in df.columns]
    return df.reindex(columns=df.columns.tolist() + missing)

In [6]:
# Give all four datasets the same set of columns: add_col appends whichever
# names from the unified list a frame does not have yet.
hefzi_df, fouladiha_df, yeo_df, camel_df = map(
    add_col,
    (hefzi_df, fouladiha_df, yeo_df, camel_df),
)

hefzi_df

Unnamed: 0,cam,hef,fou,yeo,Reaction,Reaction Name,Reaction Formula,GPR,Subsystem,Lower bound,...,Protein,Reversible,Objective,Proteins,EC Number,Mol wt,kcat_forward,kcat_backward,Subsystem (iCHO2291),Reaction ID Camels Models
0,,X,,,10FTHF5GLUtl,"5-glutamyl-10FTHF transport, lysosomal",10fthf5glu_c --> 10fthf5glu_l,,"TRANSPORT, LYSOSOMAL",0.0,...,,,,,,,,,,
1,,X,,,10FTHF5GLUtm,"5-glutamyl-10FTHF transport, mitochondrial",10fthf5glu_m --> 10fthf5glu_c,,"TRANSPORT, MITOCHONDRIAL",0.0,...,,,,,,,,,,
2,,X,,,10FTHF6GLUtl,"6-glutamyl-10FTHF transport, lysosomal",10fthf6glu_c --> 10fthf6glu_l,,"TRANSPORT, LYSOSOMAL",0.0,...,,,,,,,,,,
3,,X,,,10FTHF6GLUtm,"6-glutamyl-10FTHF transport, mitochondrial",10fthf6glu_m --> 10fthf6glu_c,,"TRANSPORT, MITOCHONDRIAL",0.0,...,,,,,,,,,,
4,,X,,,10FTHF7GLUtl,"7-glutamyl-10FTHF transport, lysosomal",10fthf7glu_c --> 10fthf7glu_l,,"TRANSPORT, LYSOSOMAL",0.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6658,,X,,,r2534,Major Facilitator(MFS) TCDB:2.A.1.44.1,thr_L_e <=> thr_L_c,,"TRANSPORT, EXTRACELLULAR",-1000.0,...,,,,,,,,,,
6659,,X,,,r2535,Major Facilitator(MFS) TCDB:2.A.1.44.1,hom_L_e <=> hom_L_c,,"TRANSPORT, EXTRACELLULAR",-1000.0,...,,,,,,,,,,
6660,,X,,,r2537,Utilized transport,lnlncgcoa_c <=> lnlncgcoa_r,,"TRANSPORT, ENDOPLASMIC RETICULAR",-1000.0,...,,,,,,,,,,
6661,,X,,,r2538,Utilized transport,dlnlcgcoa_c <=> dlnlcgcoa_r,,"TRANSPORT, ENDOPLASMIC RETICULAR",-1000.0,...,,,,,,,,,,


In [7]:
# Put the columns of every dataset in one shared order.
# NOTE(review): 'Proteins' appears in the unified `cols` list but not in this
# order list, so the selection below leaves that column out - confirm this is intended.
column_order = [
    'cam', 'hef', 'fou', 'yeo',
    'Reaction', 'Reaction Name', 'Reaction Formula', 'GPR',
    'Subsystem', 'Subsystem (iCHO2291)',
    'Genes', 'Protein', 'EC Number', 'Mol wt',
    'kcat_forward', 'kcat_backward',
    'Reversible', 'Lower bound', 'Upper bound', 'Objective',
    'Curation Notes', 'References', 'Reaction ID Camels Models',
]

fouladiha_df = fouladiha_df[column_order]
yeo_df = yeo_df[column_order]
hefzi_df = hefzi_df[column_order]
camel_df = camel_df[column_order]

hefzi_df

Unnamed: 0,cam,hef,fou,yeo,Reaction,Reaction Name,Reaction Formula,GPR,Subsystem,Subsystem (iCHO2291),...,Mol wt,kcat_forward,kcat_backward,Reversible,Lower bound,Upper bound,Objective,Curation Notes,References,Reaction ID Camels Models
0,,X,,,10FTHF5GLUtl,"5-glutamyl-10FTHF transport, lysosomal",10fthf5glu_c --> 10fthf5glu_l,,"TRANSPORT, LYSOSOMAL",,...,,,,,0.0,1000.0,,,,
1,,X,,,10FTHF5GLUtm,"5-glutamyl-10FTHF transport, mitochondrial",10fthf5glu_m --> 10fthf5glu_c,,"TRANSPORT, MITOCHONDRIAL",,...,,,,,0.0,1000.0,,,,
2,,X,,,10FTHF6GLUtl,"6-glutamyl-10FTHF transport, lysosomal",10fthf6glu_c --> 10fthf6glu_l,,"TRANSPORT, LYSOSOMAL",,...,,,,,0.0,1000.0,,,,
3,,X,,,10FTHF6GLUtm,"6-glutamyl-10FTHF transport, mitochondrial",10fthf6glu_m --> 10fthf6glu_c,,"TRANSPORT, MITOCHONDRIAL",,...,,,,,0.0,1000.0,,,,
4,,X,,,10FTHF7GLUtl,"7-glutamyl-10FTHF transport, lysosomal",10fthf7glu_c --> 10fthf7glu_l,,"TRANSPORT, LYSOSOMAL",,...,,,,,0.0,1000.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6658,,X,,,r2534,Major Facilitator(MFS) TCDB:2.A.1.44.1,thr_L_e <=> thr_L_c,,"TRANSPORT, EXTRACELLULAR",,...,,,,,-1000.0,1000.0,,,,
6659,,X,,,r2535,Major Facilitator(MFS) TCDB:2.A.1.44.1,hom_L_e <=> hom_L_c,,"TRANSPORT, EXTRACELLULAR",,...,,,,,-1000.0,1000.0,,,,
6660,,X,,,r2537,Utilized transport,lnlncgcoa_c <=> lnlncgcoa_r,,"TRANSPORT, ENDOPLASMIC RETICULAR",,...,,,,,-1000.0,1000.0,,,,
6661,,X,,,r2538,Utilized transport,dlnlcgcoa_c <=> dlnlcgcoa_r,,"TRANSPORT, ENDOPLASMIC RETICULAR",,...,,,,,-1000.0,1000.0,,,,


In [8]:
# Stack the four harmonised tables into a single frame;
# ignore_index discards the per-source row labels and numbers the rows from 0.
all_dfs = pd.concat(
    [camel_df, hefzi_df, fouladiha_df, yeo_df],
    ignore_index=True,
)

all_dfs #20940 rows/reactions (many of them repeated)

Unnamed: 0,cam,hef,fou,yeo,Reaction,Reaction Name,Reaction Formula,GPR,Subsystem,Subsystem (iCHO2291),...,Mol wt,kcat_forward,kcat_backward,Reversible,Lower bound,Upper bound,Objective,Curation Notes,References,Reaction ID Camels Models
0,X,,,,GLCt1r,glucose transport (uniport),glc_D_e --> glc_D_c,,,,...,,,,,0.0,,,Unidrectional transporters,,
1,X,,,,GLCt2r,D-glucose transport in via proton symport,glc_D_e + h_e --> glc_D_c + h_c,,,,...,,,,,0.0,,,Unidrectional transporters,,
2,X,,,,GLCt2_2,D-glucose transport in via proton symport,glc_D_e + 2.0 h_e --> glc_D_c + 2.0 h_c,,,,...,,,,,0.0,,,Unidrectional transporters,,
3,X,,,,RE1342C,RE1342,nad_c + sbt_D_c --> glc_D_c + h_c + nadh_c,,,,...,,,,,0.0,,,Unidrectional transporters,,
4,X,,,,FRUt4,D-fructose transport via sodium cotransport,fru_e + na1_e --> fru_c + na1_c,,,,...,,,,,0.0,,,Unidrectional transporters,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20935,,,,X,RTOTALFATPc,,,,R GROUP SYNTHESIS,Exchange/demand/sink reaction,...,,,,,,,,,,
20936,,,,X,RTOTALt,,,,"TRANSPORT, EXTRACELLULAR",Transport,...,,,,,,,,,,
20937,,,,X,Rtotaltl,,,,"TRANSPORT, LYSOSOMAL",Transport,...,,,,,,,,,,
20938,,,,X,Rtotaltp,,,,"TRANSPORT, PEROXISOMAL",Transport,...,,,,,,,,,,


In [9]:
#def f(x):
#    d = {}
#    d['GPR'] = x['GPR'].dropna().apply(lambda x: len(x.unique())) > 1
#    d['a_max'] = x['a'].max()
#    d['b_mean'] = x['b'].mean()
#    d['c_d_prodsum'] = (x['c'] * x['d']).sum()
#    return pd.Series(d, index=['GPR'])#, 'a_max', 'b_mean', 'c_d_prodsum'])

In [10]:
#all_dfs1 = all_dfs.groupby(['Reaction'], group_keys=True).apply(lambda x: f(x))
#all_dfs1

In [11]:
# Group the entire dataset by the 'Reaction' column.
# The apply callback returns each group unchanged, so nothing is aggregated here;
# in the output rendered below the rows are indexed by (Reaction, original row label).
all_dfs1 = all_dfs.groupby(['Reaction'], group_keys=True).apply(lambda x: x)

# Save merged datasets as Excel file
all_dfs1.to_excel('../Data/all_dfs1.xlsx')

all_dfs1

Unnamed: 0_level_0,Unnamed: 1_level_0,cam,hef,fou,yeo,Reaction,Reaction Name,Reaction Formula,GPR,Subsystem,Subsystem (iCHO2291),...,Mol wt,kcat_forward,kcat_backward,Reversible,Lower bound,Upper bound,Objective,Curation Notes,References,Reaction ID Camels Models
Reaction,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
AASAD3m,556,X,,,,AASAD3m,L-aminoadipate-semialdehyde dehydrogenase (NAD...,L2aadp6sa_m + h2o_m + nad_m --> L2aadp_m + 2....,,,,...,,,,,,,,AASAD3 equivalent human enzyme mainly localize...,,
CAT,571,X,,,,CAT,hydrogen-peroxide:hydrogen-peroxide oxidoreduc...,2.0 h2o2_c --> 2.0 h2o_c + o2_c,,,,...,,,,,,,,This enzyme is present in CHO cell but it is s...,,
CATm,569,X,,,,CATm,catalase,2.0 h2o2_m --> 2.0 h2o_m + o2_m,,,,...,,,,,,,,This enzyme is present in CHO cell but it is s...,,
CATp,570,X,,,,CATp,"catalase A, peroxisomal",2.0 h2o2_x --> 2.0 h2o_x + o2_x,,,,...,,,,,,,,This enzyme is present in CHO cell but it is s...,,
CYOOm2_cho,575,X,,,,CYOOm2_cho,"cytochrome c oxidase, mitochondrial Complex IV",4.0 focytC_m + 8.0 h_m + o2_m --> 4.0 ficytC_...,,,,...,,,,,,,,Only the reaction generating superoxide anion ...,,Reaction ID kept : CYOOm3_cho
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
r2534,7263,,X,,,r2534,Major Facilitator(MFS) TCDB:2.A.1.44.1,thr_L_e <=> thr_L_c,,"TRANSPORT, EXTRACELLULAR",,...,,,,,-1000.0,1000.0,,,,
r2535,7264,,X,,,r2535,Major Facilitator(MFS) TCDB:2.A.1.44.1,hom_L_e <=> hom_L_c,,"TRANSPORT, EXTRACELLULAR",,...,,,,,-1000.0,1000.0,,,,
r2537,7265,,X,,,r2537,Utilized transport,lnlncgcoa_c <=> lnlncgcoa_r,,"TRANSPORT, ENDOPLASMIC RETICULAR",,...,,,,,-1000.0,1000.0,,,,
r2538,7266,,X,,,r2538,Utilized transport,dlnlcgcoa_c <=> dlnlcgcoa_r,,"TRANSPORT, ENDOPLASMIC RETICULAR",,...,,,,,-1000.0,1000.0,,,,


In [None]:
#def f(x):
#    d = {}
#    d['EC Number_m'] = x['EC Number'].first()
#    d['a_max'] = x['a'].max()
#    d['b_mean'] = x['b'].mean()
#    d['c_d_prodsum'] = (x['c'] * x['d']).sum()
#    return pd.Series(d, index=['EC Number_m', 'a_max', 'b_mean', 'c_d_prodsum'])

# Collapse the merged table to one row per 'Reaction' value: for every column
# GroupBy.first keeps the first non-missing entry found within the group.
rows_by_reaction = all_dfs.groupby('Reaction')
all_dfs2 = rows_by_reaction.first()

# Write the collapsed table to disk, then display it.
all_dfs2.to_excel('../Data/all_dfs2.xlsx')
all_dfs2

In [None]:
# These functions fetch information from the BiGG database

def get_rxninfo(rxn, timeout=30):
    '''
    Download the BiGG page of a reaction and return it parsed.

    The iCHOv1 model page is requested first. If it does not answer with
    HTTP 200, the universal BiGG reaction page is requested instead.

    Parameters
    ----------
    rxn : str
        Reaction identifier; it is appended verbatim to the page URL.
    timeout : float, optional
        Seconds to wait for each HTTP request before giving up, so a stalled
        connection cannot block the notebook forever.

    Returns
    -------
    bs4.BeautifulSoup
        The parsed HTML of the page that was fetched.

    Raises
    ------
    requests.HTTPError
        If the fallback request also answers with a 4xx/5xx status. An error
        page is no longer parsed as if it were a reaction page. Any other
        non-200 answer is still parsed, as before.
    '''
    # The context manager closes the session once the downloads are done,
    # instead of leaving one open session behind per call.
    with HTMLSession() as session:
        response = session.get('http://bigg.ucsd.edu/models/iCHOv1/reactions/'+rxn, timeout=timeout)
        if response.status_code != 200:
            print('Status code:', response.status_code,f'Failed to fetch info on {rxn} from iCHOv1 model')
            response = session.get('http://bigg.ucsd.edu/universal/reactions/'+rxn, timeout=timeout)
            if response.status_code != 200:
                print('Status code:', response.status_code,f'Failed to fetch info on {rxn} from BiGG')
                response.raise_for_status()

    # Parse using Beautiful Soup
    rxn_doc = BeautifulSoup(response.text,'html.parser')

    return rxn_doc


def bigg_attributes(rxn):
    '''
    Extract the reaction attributes from a page returned by get_rxninfo.

    The values are read from fixed positions in the page's <p> elements.
    Pages whose title ends with 'iCHOv1' use positions 0, 2 and 5; every
    other page uses positions 0 and 1 and has no subsystem.

    Returns
    -------
    tuple of str
        (reaction description, reaction formula, subsystem); the subsystem
        is the empty string for pages whose title does not end with 'iCHOv1'.
    '''
    from_icho_model = rxn.title.get_text().endswith('iCHOv1')
    paragraphs = rxn.find_all("p")

    # The description is the first paragraph on both page layouts.
    description = paragraphs[0].get_text()

    if from_icho_model:
        formula = paragraphs[2].get_text()
        subsystem = paragraphs[5].get_text()
        return description, formula, subsystem

    formula = paragraphs[1].get_text()
    return description, formula, ''


In [None]:
#all_dfs3 = pd.DataFrame(columns = ['Reactions', 'Reaction Description', 'Reaction Formula', 'Subsystem'])
att = []

# For every reaction of the collapsed table, look the reaction up in BiGG and
# fill in the formula / name where the table has no value yet.
for r in tqdm(all_dfs2.index):
    try:
        rxn = get_rxninfo(r)
        rxn_d, form, subsystem = bigg_attributes(rxn)
    except Exception:
        # One reaction without a usable BiGG page must not stop the lookup of
        # all the others; `except Exception` still lets Ctrl-C interrupt the loop.
        print(f'Could not find info for {r}')
        continue

    print(rxn_d, form, subsystem)

    # pd.isna recognises both None and NaN (an `== None` test misses NaN).
    # .at writes into all_dfs2 itself by row label, unlike chained `df[col][i] = ...`.
    if pd.isna(all_dfs2.at[r, 'Reaction Formula']):
        all_dfs2.at[r, 'Reaction Formula'] = form
    if pd.isna(all_dfs2.at[r, 'Reaction Name']):
        all_dfs2.at[r, 'Reaction Name'] = rxn_d

all_dfs2.to_excel('../Data/all_dfs2.xlsx')

In [None]:
# Scratch inspection: list every <p> element of the page currently held in `rxn`
# (`rxn` is whatever an earlier cell assigned last, so the result depends on run order).
rxn.find_all("p")

In [None]:
# Scratch inspection: text of the third <p> element (position 2) of the page held in `rxn`.
rxn.find_all("p")[2].get_text()

In [None]:
# Scratch inspection: list the <div> elements with CSS class "col-lg-8" of the page held in `rxn`.
rxn.find_all("div", class_="col-lg-8")

In [None]:
# Fetch a single reaction page by hand and print whether its title ends with
# 'iCHOv1', the same test bigg_attributes uses to choose between its two page layouts.
rxn = get_rxninfo('5G2OXPTtx')
print(rxn.title.get_text().endswith('iCHOv1'))


In [None]:
# Scratch inspection: text of the fourth <p> element (position 3) of the page held in `rxn`.
rxn.find_all("p")[3].get_text()

In [None]:
# Print every reaction id ending in '_cho' together with its position and, when
# the entry directly before it is the same id without that suffix, print that
# entry and its position as well.
cho_suffix = '_cho'
for i, r in enumerate(tqdm(all_dfs2.index)):
    if not r.endswith(cho_suffix):
        continue
    print(r,i)

    # Strip only the trailing suffix: splitting on the first '_' would cut ids
    # that contain further underscores down to their first fragment.
    base_id = r[:-len(cho_suffix)]

    # i > 0 keeps index[i-1] from wrapping around to the last label.
    # NOTE(review): only the directly preceding label is compared; a base id
    # located elsewhere in the index is not reported - confirm this is intended.
    if i > 0 and all_dfs2.index[i-1] == base_id:
        print(all_dfs2.index[i-1],i-1)