*MeNu GUIDE*

# Match Genes and Enzymes

In [1]:
import os
import pandas as pd

In [None]:
kegg_folder = '/path/to/kegg/data/folder/'
vmh_folder = '/path/to/vmh/data/folder/'
processed_data_folder = "/path/to/processed/data/folder/"

In [2]:
kegg_proteins = pd.read_csv(os.path.join(kegg_folder, "kegg_enzymes.csv"))
kegg_genes = pd.read_csv(os.path.join(kegg_folder, "kegg_enzyme_genes.csv"))
vmh_genes = pd.read_csv(os.path.join(vmh_folder, "vmh_human_genes.csv"))

## Clean up KEGG genes table

In [3]:
kegg_genes = kegg_genes[kegg_genes.gene.str.contains('\(')]
kegg_genes.loc[:, 'gene_name'] = kegg_genes.gene.apply(lambda x: x.split('(')[1].rstrip(')'))
kegg_genes.loc[:, 'kegg_gene_id'] = kegg_genes.gene.apply(lambda x: f"hsa:{x.split('(')[0]}")
kegg_genes = kegg_genes.drop(columns='gene')
kegg_genes = kegg_genes.rename(columns={'enzyme_id': 'kegg_enzyme_id'})
kegg_genes

Unnamed: 0,kegg_enzyme_id,gene_name,kegg_gene_id
0,EC 1.1.1.1,ADH1A,hsa:124
1,EC 1.1.1.1,ADH1B,hsa:125
2,EC 1.1.1.1,ADH1C,hsa:126
3,EC 1.1.1.1,ADH4,hsa:127
4,EC 1.1.1.1,ADH5,hsa:128
...,...,...,...
3744,EC 7.6.2.3,ABCC1,hsa:4363
3745,EC 7.6.2.4,ABCD1,hsa:215
3746,EC 7.6.2.4,ABCD2,hsa:225
3747,EC 7.6.2.4,ABCD3,hsa:5825


## Merge KEGG genes and proteins tables

In [4]:
kegg_proteins = kegg_proteins.merge(kegg_genes, how='inner', on='kegg_enzyme_id')
kegg_proteins = kegg_proteins.rename(columns={'gene_name': 'symbol'})
kegg_proteins

Unnamed: 0,kegg_enzyme_id,enzyme_name,enzyme_class,symbol,kegg_gene_id
0,EC 1.1.1.1,alcohol dehydrogenase,Oxidoreductases,ADH1A,hsa:124
1,EC 1.1.1.1,alcohol dehydrogenase,Oxidoreductases,ADH1B,hsa:125
2,EC 1.1.1.1,alcohol dehydrogenase,Oxidoreductases,ADH1C,hsa:126
3,EC 1.1.1.1,alcohol dehydrogenase,Oxidoreductases,ADH4,hsa:127
4,EC 1.1.1.1,alcohol dehydrogenase,Oxidoreductases,ADH5,hsa:128
...,...,...,...,...,...
3742,EC 7.6.2.3,ABC-type glutathione-S-conjugate transporter,Translocases,ABCC1,hsa:4363
3743,EC 7.6.2.4,ABC-type fatty-acyl-CoA transporter,Translocases,ABCD1,hsa:215
3744,EC 7.6.2.4,ABC-type fatty-acyl-CoA transporter,Translocases,ABCD2,hsa:225
3745,EC 7.6.2.4,ABC-type fatty-acyl-CoA transporter,Translocases,ABCD3,hsa:5825


## Match VMH & KEGG

In [5]:
vmh_genes = vmh_genes.drop(columns=['Unnamed: 0', 'genes_id', 'abbreviation'])

In [7]:
kegg_vmh_merge = vmh_genes.merge(kegg_proteins, how='outer', on='symbol')
print(f"Number of unmatched VMH genes: {len(kegg_vmh_merge[kegg_vmh_merge.gene_number.isna()])}")
print(f"Number of unmatched KEGG genes: {len(kegg_vmh_merge[kegg_vmh_merge.kegg_enzyme_id.isna()])}")
print(f"Number of matched genes: {len(kegg_vmh_merge[(kegg_vmh_merge.kegg_enzyme_id.notna()) & (kegg_vmh_merge.gene_number.notna())])}")

Number of unmatched VMH genes: 1103
Number of unmatched KEGG genes: 1018
Number of matched genes: 2991


In [8]:
kegg_vmh_merge.to_csv(os.path.join(processed_data_folder, "proteins_genes_kegg_vmh.csv"), index=False)

In [14]:
kegg_vmh_merge_genes = kegg_vmh_merge[['symbol', 'gene_number', 'chromosome', 'description', 'ensembl_gene', 'chebl_id', 'uniprot_gname', 'kegg_gene_id']]
kegg_vmh_merge_genes.to_csv(os.path.join(processed_data_folder, "genes_kegg_vmh.csv"), index=False)

In [15]:
kegg_vmh_merge_genes_to_enzymes = kegg_vmh_merge[['symbol', 'kegg_enzyme_id']]
kegg_vmh_merge_genes_to_enzymes = kegg_vmh_merge_genes_to_enzymes[kegg_vmh_merge_genes_to_enzymes.kegg_enzyme_id.notna()]
kegg_vmh_merge_genes_to_enzymes = kegg_vmh_merge_genes_to_enzymes.drop_duplicates()
kegg_vmh_merge_genes_to_enzymes.to_csv(os.path.join(processed_data_folder, "kegg_enzyme_gene_symbols.csv"), index=False)

In [16]:
kegg_vmh_merge_genes_to_enzymes[kegg_vmh_merge_genes_to_enzymes.kegg_enzyme_id == 'EC 1.1.1.1']

Unnamed: 0,symbol,kegg_enzyme_id
203,ADH1A,EC 1.1.1.1
204,ADH1B,EC 1.1.1.1
205,ADH1C,EC 1.1.1.1
206,ADH4,EC 1.1.1.1
207,ADH5,EC 1.1.1.1
209,ADH6,EC 1.1.1.1
210,ADH7,EC 1.1.1.1
