# Processing data from UniProt

In [1]:
import pandas as pd

from reframed import load_cbmodel

In [2]:
# Read the tab-separated UniProt proteome export into a DataFrame
# (read_table uses a tab as its default field separator).
gene_protein_map = pd.read_table('uniprot/uniprot-proteome_UP000001349.tab')
gene_protein_map

Unnamed: 0,Entry,Entry name,Protein names,Gene names,Cross-reference (RefSeq)
0,B8I4G1,LEUD_RUMCH,3-isopropylmalate dehydratase small subunit (E...,leuD Ccel_0127,WP_012634581.1;
1,B8I8F2,UVRC_RUMCH,UvrABC system protein C (Protein UvrC) (Excinu...,uvrC Ccel_0807,WP_015924347.1;
2,B8I567,UPP_RUMCH,Uracil phosphoribosyltransferase (EC 2.4.2.9) ...,upp Ccel_0260,WP_012634712.1;
3,B8I364,PDXT_RUMCH,Pyridoxal 5'-phosphate synthase subunit PdxT (...,pdxT Ccel_1859,WP_015925312.1;
4,B8I176,RL21_RUMCH,50S ribosomal protein L21,rplU Ccel_1320,WP_015924822.1;
...,...,...,...,...,...
3282,B8I8G0,B8I8G0_RUMCH,Lytic transglycosylase catalytic,Ccel_0815,WP_015924355.1;
3283,B8I473,B8I473_RUMCH,5-formyltetrahydrofolate cyclo-ligase (EC 6.3....,Ccel_2164,WP_015925601.1;
3284,B8I0E1,B8I0E1_RUMCH,Uncharacterized protein,Ccel_3176,
3285,B8I126,B8I126_RUMCH,Uncharacterized protein,Ccel_3046 Ccel_3293,WP_015926397.1;


In [3]:
# Cleaning up and preparing for comparison with the model
def _refseq_to_model_gene_id(refseq):
    """Rewrite one RefSeq cross-reference as 'G_<accession>_<version>'.

    Example: 'WP_012634581.1;' -> 'G_WP_012634581_1'.

    Non-string values (e.g. NaN for rows without a cross-reference) and
    strings that do not contain exactly one '.' are returned unchanged;
    this also makes the conversion safe to re-run on already-converted values.
    """
    if not isinstance(refseq, str):
        return refseq
    accession, dot, version = refseq.partition('.')
    if not dot or '.' in version:
        # No dot, or more than one dot (e.g. several accessions in one
        # field): leave the value as it is.
        return refseq
    # Strip only the trailing ';' separator rather than blindly dropping the
    # last character, so a value without the separator keeps its version digit.
    return 'G_' + accession + '_' + version.rstrip(';')


# Convert the whole column in one pass; this does not rely on the frame
# having a default 0..n-1 index the way positional .loc writes would.
gene_protein_map['Cross-reference (RefSeq)'] = (
    gene_protein_map['Cross-reference (RefSeq)'].map(_refseq_to_model_gene_id)
)

In [4]:
# Persist the cleaned table with IPython's %store magic so it can be
# restored in another kernel/notebook via `%store -r gene_protein_map`.
%store gene_protein_map

Stored 'gene_protein_map' (DataFrame)


#### Get GPR relationship for CarveMe model

CarveMe models automatically use protein IDs instead of gene IDs. Here the gene-protein-reaction (GPR) relationship is shown.

In [5]:
# Load the constraint-based model from its XML file using reframed's loader.
model = load_cbmodel('model_cellulolyticum_H10.xml') 

In [6]:
# Collect the protein IDs referenced by each reaction's GPR rule. The outer
# list has one entry per reaction, in the iteration order of model.reactions.
# Reactions without a GPR get an empty placeholder so positions stay aligned.
carveme_proteins = [
    gpr.get_genes() if gpr else {}
    for gpr in (model.reactions[rxn_id].gpr for rxn_id in model.reactions)
]


In [7]:
# Get the gene IDs (Ccel_ locus tags) for each protein involved in each reaction.

# Build the protein -> locus-tag lookup once, instead of re-scanning the whole
# table with a boolean mask for every protein of every reaction.
locus_tags_by_protein = {}
for refseq_id, gene_names in zip(gene_protein_map['Cross-reference (RefSeq)'],
                                 gene_protein_map['Gene names']):
    # Rows missing either field (NaN) cannot contribute a mapping; skipping
    # them also avoids calling .split() on a non-string gene-name value.
    if not isinstance(refseq_id, str) or not isinstance(gene_names, str):
        continue
    # 'Gene names' may hold several space-separated names; keep only the
    # elements that carry a Ccel_ locus tag.
    locus_tags = [name for name in gene_names.split(' ') if "Ccel_" in name]
    # The mapping is (almost) one-to-one, but when several rows share the same
    # protein ID all of their locus tags are kept, not just the first row's.
    locus_tags_by_protein.setdefault(refseq_id, []).extend(locus_tags)

# One list of locus tags per reaction, aligned with carveme_proteins; proteins
# that have no match in the UniProt table contribute nothing.
carveme_genes_inrxn = [
    [gene
     for protein in proteins
     for gene in locus_tags_by_protein.get(protein, [])]
    for proteins in carveme_proteins
]

In [8]:
# Flatten each reaction's gene list into a single comma-separated string and
# pair it with the reaction ID in a two-column DataFrame.
carveme_genes_inrxn_str = list(map(", ".join, carveme_genes_inrxn))
reaction_gene_map = pd.DataFrame(
    {
        "CarveMe rxn ID": [*model.reactions],
        "Genes": carveme_genes_inrxn_str,
    }
)
reaction_gene_map

Unnamed: 0,CarveMe rxn ID,Genes
0,R_12DGR120tipp,
1,R_12DGR140tipp,
2,R_12DGR141tipp,
3,R_12DGR160tipp,
4,R_12DGR161tipp,
...,...,...
1806,R_EX_xylb_e,
1807,R_EX_zn2_e,
1808,Growth,
1809,R_ATPM,


In [9]:
# Persist the reaction-to-gene table with IPython's %store magic so it can be
# restored in another kernel/notebook via `%store -r reaction_gene_map`.
%store reaction_gene_map

Stored 'reaction_gene_map' (DataFrame)
