# Processing data from UniProt

In [10]:
import pandas as pd

from reframed import load_cbmodel

In [11]:
# Read the tab-separated UniProt proteome export into a DataFrame and display it.
UNIPROT_TABLE_PATH = '../input/uniprot-proteome_UP000001349.tab'
gene_protein_map = pd.read_csv(UNIPROT_TABLE_PATH, sep='\t')
gene_protein_map

Unnamed: 0,Entry,Entry name,Protein names,Gene names,Cross-reference (RefSeq)
0,B8I4G1,LEUD_RUMCH,3-isopropylmalate dehydratase small subunit (E...,leuD Ccel_0127,WP_012634581.1;
1,B8I8F2,UVRC_RUMCH,UvrABC system protein C (Protein UvrC) (Excinu...,uvrC Ccel_0807,WP_015924347.1;
2,B8I567,UPP_RUMCH,Uracil phosphoribosyltransferase (EC 2.4.2.9) ...,upp Ccel_0260,WP_012634712.1;
3,B8I364,PDXT_RUMCH,Pyridoxal 5'-phosphate synthase subunit PdxT (...,pdxT Ccel_1859,WP_015925312.1;
4,B8I176,RL21_RUMCH,50S ribosomal protein L21,rplU Ccel_1320,WP_015924822.1;
...,...,...,...,...,...
3282,B8I8G0,B8I8G0_RUMCH,Lytic transglycosylase catalytic,Ccel_0815,WP_015924355.1;
3283,B8I473,B8I473_RUMCH,5-formyltetrahydrofolate cyclo-ligase (EC 6.3....,Ccel_2164,WP_015925601.1;
3284,B8I0E1,B8I0E1_RUMCH,Uncharacterized protein,Ccel_3176,
3285,B8I126,B8I126_RUMCH,Uncharacterized protein,Ccel_3046 Ccel_3293,WP_015926397.1;


In [18]:
# Locus tags of the genes whose UniProt entries are looked up in the next cell.
genes = {
    'Ccel_1439',
    'Ccel_2110',
    'Ccel_2111',
    'Ccel_2112',
    'Ccel_2354',
    'Ccel_3412',
}

In [21]:
# Select the UniProt rows that belong to the genes of interest.
# A 'Gene names' cell can hold several space-separated names (for example
# 'leuD Ccel_0127'), so every name in the cell is compared against `genes`
# instead of requiring the whole cell to equal one locus tag.
# Non-string cells (e.g. missing values) never match.
genes_of_interest = set(genes)
has_gene_of_interest = gene_protein_map['Gene names'].map(
    lambda names: isinstance(names, str)
    and not genes_of_interest.isdisjoint(names.split())
).astype(bool)
gene_protein_map[has_gene_of_interest]

Unnamed: 0,Entry,Entry name,Protein names,Gene names,Cross-reference (RefSeq)
1209,B8I1W7,B8I1W7_RUMCH,Glycosyltransferase 36,Ccel_1439,G_WP_015924937_1
1279,B8I422,B8I422_RUMCH,Binding-protein-dependent transport systems in...,Ccel_2110,G_WP_015925555_1
2191,B8I423,B8I423_RUMCH,Binding-protein-dependent transport systems in...,Ccel_2111,G_WP_015925556_1
3033,B8I424,B8I424_RUMCH,Extracellular solute-binding protein family 1,Ccel_2112,G_WP_015925557_1
3224,B8I1R5,B8I1R5_RUMCH,Glycosyltransferase 36,Ccel_3412,G_WP_015926752_1
3264,B8I5F2,B8I5F2_RUMCH,Glycosyltransferase 36,Ccel_2354,G_WP_015925780_1


In [12]:
# Cleaning up and preparing for comparison with the model:
# rewrite RefSeq cross-references such as 'WP_012634581.1;' as 'G_WP_012634581_1'.
def _format_refseq_id(refseq):
    """Return the 'G_<accession>_<version>' form of one RefSeq cross-reference.

    Values that are not strings (e.g. missing cross-references) and strings
    that do not contain exactly one '.' are returned unchanged, so applying
    the function to an already formatted value is a no-op.
    """
    if not isinstance(refseq, str):
        return refseq
    parts = refseq.split('.')
    if len(parts) != 2:
        # NOTE(review): cells without exactly one '.' (possibly several
        # cross-references in one cell) are left as they are -- confirm intended.
        return refseq
    accession, version = parts
    # Strip only the trailing ';' separator, never a digit of the version.
    return 'G_' + accession + '_' + version.rstrip(';')


# Map over the column instead of writing through enumerate() positions, so the
# result does not depend on the frame having a default 0..n-1 index.
gene_protein_map['Cross-reference (RefSeq)'] = (
    gene_protein_map['Cross-reference (RefSeq)'].map(_format_refseq_id)
)

#### Get GPR relationship for CarveMe model

CarveMe automatically uses protein IDs instead of gene IDs in the model. Here the gene-protein-reaction (GPR) relationship of each reaction is mapped back to gene IDs.

In [13]:
# Display the table again to inspect the reformatted RefSeq identifiers.
gene_protein_map

Unnamed: 0,Entry,Entry name,Protein names,Gene names,Cross-reference (RefSeq)
0,B8I4G1,LEUD_RUMCH,3-isopropylmalate dehydratase small subunit (E...,leuD Ccel_0127,G_WP_012634581_1
1,B8I8F2,UVRC_RUMCH,UvrABC system protein C (Protein UvrC) (Excinu...,uvrC Ccel_0807,G_WP_015924347_1
2,B8I567,UPP_RUMCH,Uracil phosphoribosyltransferase (EC 2.4.2.9) ...,upp Ccel_0260,G_WP_012634712_1
3,B8I364,PDXT_RUMCH,Pyridoxal 5'-phosphate synthase subunit PdxT (...,pdxT Ccel_1859,G_WP_015925312_1
4,B8I176,RL21_RUMCH,50S ribosomal protein L21,rplU Ccel_1320,G_WP_015924822_1
...,...,...,...,...,...
3282,B8I8G0,B8I8G0_RUMCH,Lytic transglycosylase catalytic,Ccel_0815,G_WP_015924355_1
3283,B8I473,B8I473_RUMCH,5-formyltetrahydrofolate cyclo-ligase (EC 6.3....,Ccel_2164,G_WP_015925601_1
3284,B8I0E1,B8I0E1_RUMCH,Uncharacterized protein,Ccel_3176,
3285,B8I126,B8I126_RUMCH,Uncharacterized protein,Ccel_3046 Ccel_3293,G_WP_015926397_1


In [4]:
# Load the draft model (XML file) with reframed's load_cbmodel.
DRAFT_MODEL_PATH = '../models/RcH10_draft.xml'
model = load_cbmodel(DRAFT_MODEL_PATH)

In [5]:
# Get the subset of proteins for each reaction; the outer list has one entry
# per reaction, in the iteration order of model.reactions.
# Reactions that have no GPR associated with them get an empty set. Note that
# set() is used on purpose: a bare {} would be an empty dict, not a set.
# NOTE(review): assumes gpr.get_genes() returns a set -- confirm against reframed.
carveme_proteins = [
    model.reactions[reaction_id].gpr.get_genes()
    if model.reactions[reaction_id].gpr
    else set()
    for reaction_id in model.reactions
]


In [6]:
# Get the gene IDs (Ccel_ locus tags) for the proteins involved in each reaction.

def _ccel_genes_by_refseq(protein_table):
    """Index the Ccel_ gene names of `protein_table` by RefSeq identifier.

    Parameters
    ----------
    protein_table : pandas.DataFrame
        Table with the columns 'Cross-reference (RefSeq)' and 'Gene names'.

    Returns
    -------
    dict
        Maps each RefSeq identifier (string) to the list of names from the
        'Gene names' column that contain 'Ccel_'. Names from every row that
        shares the identifier are included, in row order. Rows where either
        column is not a string (e.g. missing values) are skipped.
    """
    genes_by_refseq = {}
    refseq_and_names = zip(protein_table['Cross-reference (RefSeq)'],
                           protein_table['Gene names'])
    for refseq_id, gene_names in refseq_and_names:
        if not isinstance(refseq_id, str) or not isinstance(gene_names, str):
            continue
        # Some cells contain several names separated by ' '; only keep the
        # elements that carry a Ccel_ value.
        ccel_genes = [name for name in gene_names.split(' ') if 'Ccel_' in name]
        genes_by_refseq.setdefault(refseq_id, []).extend(ccel_genes)
    return genes_by_refseq


# Build the lookup once instead of scanning the whole table for every protein.
_genes_by_refseq = _ccel_genes_by_refseq(gene_protein_map)

# One list of gene IDs per reaction, aligned with carveme_proteins. Proteins
# without a match in the table contribute nothing. Comprehension variables are
# local, so the notebook-level `genes` set is not overwritten by this cell.
carveme_genes_inrxn = [
    [gene for protein in proteins for gene in _genes_by_refseq.get(protein, [])]
    for proteins in carveme_proteins
]

In [7]:
# Store the mapping in a DataFrame: one row per reaction, with the genes of
# that reaction joined into a single comma-separated string.
carveme_genes_inrxn_str = list(map(", ".join, carveme_genes_inrxn))
reaction_gene_map = pd.DataFrame(
    {
        "CarveMe rxn ID": list(model.reactions),
        "Genes": carveme_genes_inrxn_str,
    }
)
reaction_gene_map

Unnamed: 0,CarveMe rxn ID,Genes
0,R_12DGR120tipp,
1,R_12DGR140tipp,
2,R_12DGR141tipp,
3,R_12DGR160tipp,
4,R_12DGR161tipp,
...,...,...
1628,Growth,
1629,R_ATPM,
1630,R_BZDH,
1631,R_OXADC,


In [9]:
# Persist the DataFrame with IPython's %store magic so it can be restored in
# another session with `%store -r reaction_gene_map`.
%store reaction_gene_map

Stored 'reaction_gene_map' (DataFrame)
