# Mentha DB

Data source: https://mentha.uniroma2.it/download.php (download page; the notebook below reads the local archive `2023-09-25.zip`).

In [1]:
import pandas as pd
import json

In [2]:
# Load the Mentha dump straight from the zip archive; fields are ';'-separated.
mentha_all_df = pd.read_table(
    '2023-09-25.zip',
    sep=';',
)

In [4]:
# Preview the loaded interaction table (pandas truncates the middle rows).
mentha_all_df

Unnamed: 0,Protein A,Gene A,Taxon A,Protein B,Gene B,Taxon B,Score,PMID
0,Q9UT39,Q9UT39,284812,O42868,SSU72,284812,0.236,14617822 24945319
1,Q9H115,NAPB,9606,P21579,SYT1,9606,0.236,unassigned1304
2,Q9UT08,PAA1,284812,O94513,INT6,284812,0.309,22119525 24634168
3,P53886,APC1,4932,P16522,CDC23,4932,0.623,8895471 16429126 9469814 16481473
4,Q9UT08,PAA1,284812,O74762,RPN2,284812,0.309,22119525 24634168
...,...,...,...,...,...,...,...,...
741332,Q9UST6,SPC24,284812,Q10198,NDC80,284812,0.702,15728720 15371542
741333,Q9UTE3,SEB1,284812,P36594,RPB1,284812,0.454,28367989
741334,Q13077,TRAF1,9606,P21580,TNFAIP3,9606,0.659,8692885 25996949 9928991
741335,Q9UT46,MOC3,284812,P0CT53,TEF101,284812,0.332,19682301


In [5]:
# Inspect the interactor-A identifiers; these are the keys later looked up in
# the UniProt -> HGNC mapping.
mentha_all_df['Protein A']

0         Q9UT39
1         Q9H115
2         Q9UT08
3         P53886
4         Q9UT08
           ...  
741332    Q9UST6
741333    Q9UTE3
741334    Q13077
741335    Q9UT46
741336    Q9BYN8
Name: Protein A, Length: 741337, dtype: object

In [6]:
# Inspect the interactor-B identifiers; these are the keys later looked up in
# the UniProt -> HGNC mapping.
mentha_all_df['Protein B']

0         O42868
1         P21579
2         O94513
3         P16522
4         O74762
           ...  
741332    Q10198
741333    P36594
741334    P21580
741335    P0CT53
741336    Q00013
Name: Protein B, Length: 741337, dtype: object

## UniProt to HGNC mappings

In [8]:
# Load the UniProt id-mapping table. The file has no header row, so pandas
# labels the columns 0, 1, 2; give those positions descriptive names.
idmapping_uniprot_hgnc_df = pd.read_table(
    '../STRING_DB/uniprot_mappings/idmapping_uniprot_hgnc.txt',
    header=None,
).rename(
    columns={
        0: 'uniprot_id',
        1: 'id_type',
        2: 'mapped_id',
    }
)

In [9]:
# Lookup table: uniprot_id -> set of every mapped_id listed for it.
# A set is used because one accession can appear on several rows.
idmapping_uniprot_hgnc_dict = (
    idmapping_uniprot_hgnc_df
    .groupby('uniprot_id')['mapped_id']
    .agg(set)
    .to_dict()
)

In [10]:
# Attach the set of HGNC ids for each interactor. Identifiers that are absent
# from the mapping come back as None (dict.get's default).
for source_col, target_col in (
    ('Protein A', 'prot_a_hgnc'),
    ('Protein B', 'prot_b_hgnc'),
):
    mentha_all_df[target_col] = (
        mentha_all_df[source_col].map(idmapping_uniprot_hgnc_dict.get)
    )

In [11]:
# Expand the set-valued HGNC columns so every row carries exactly one
# (prot_a_hgnc, prot_b_hgnc) combination, then drop rows where either
# interactor has no HGNC id.
#
# dropna is restricted to the two mapping columns on purpose: a bare dropna()
# would also discard interactions that merely have a missing value in an
# unrelated column (e.g. a gene symbol that pandas parsed as NaN).
mentha_all_hgnc_df = (
    mentha_all_df
        .explode('prot_a_hgnc')
        .explode('prot_b_hgnc')
        .dropna(subset=['prot_a_hgnc', 'prot_b_hgnc'])
)

In [12]:
# Preview the HGNC-mapped interactions (one HGNC id per interactor per row).
mentha_all_hgnc_df

Unnamed: 0,Protein A,Gene A,Taxon A,Protein B,Gene B,Taxon B,Score,PMID,prot_a_hgnc,prot_b_hgnc
1,Q9BRQ4,C11ORF70,9606,Q15750,TAB1,9606,0.126,26186194,HGNC:28188,HGNC:18157
4,P27449,ATP6V0C,9606,P06576,ATP5B,9606,0.126,28514442,HGNC:855,HGNC:830
5,Q9BRR6,ADPGK,9606,O14657,TOR1B,9606,0.126,28514442,HGNC:25250,HGNC:11995
6,Q96BS2,TESC,9606,Q92633,LPAR1,9606,0.126,28514442,HGNC:26065,HGNC:3166
8,Q9BRQ6,CHCHD6,9606,O75477,ERLIN1,9606,0.081,22939629,HGNC:28184,HGNC:16947
...,...,...,...,...,...,...,...,...,...,...
741327,Q92633,LPAR1,9606,Q8WY22,BRI3BP,9606,0.126,28514442,HGNC:3166,HGNC:14251
741330,Q9BRQ6,CHCHD6,9606,P05067,APP,9606,0.183,21832049,HGNC:28184,HGNC:620
741331,Q92685,ALG3,9606,Q92633,LPAR1,9606,0.126,26186194,HGNC:23056,HGNC:3166
741334,Q9BRQ4,C11ORF70,9606,Q92526,CCT6B,9606,0.126,26186194,HGNC:28188,HGNC:1621


In [13]:
# Show the distinct (HGNC A, HGNC B, PMID) combinations; the result is only
# displayed, not stored.
mentha_all_hgnc_df.loc[
    :, ['prot_a_hgnc', 'prot_b_hgnc', 'PMID']
].drop_duplicates()

Unnamed: 0,prot_a_hgnc,prot_b_hgnc,PMID
1,HGNC:28188,HGNC:18157,26186194
4,HGNC:855,HGNC:830,28514442
5,HGNC:25250,HGNC:11995,28514442
6,HGNC:26065,HGNC:3166,28514442
8,HGNC:28184,HGNC:16947,22939629
...,...,...,...
741327,HGNC:3166,HGNC:14251,28514442
741330,HGNC:28184,HGNC:620,21832049
741331,HGNC:23056,HGNC:3166,26186194
741334,HGNC:28188,HGNC:1621,26186194


## Mapping to UMLS

In [15]:
# Location of the pickled UMLS frame, relative to this notebook's directory.
mrconso_path = '../../UMLS_Metathesaurus/mrconso_and_semtypes_2022AA_df.pkl'

In [16]:
# Load the pickled UMLS frame (presumably MRCONSO rows joined with semantic
# types, judging by the file name -- TODO confirm).
# NOTE: unpickling can execute arbitrary code, so only load files from a
# trusted source.
mrconso_st_df = pd.read_pickle(mrconso_path)

In [17]:
# Keep only the rows whose SAB column is 'HGNC', and only the columns needed
# to go from an HGNC code to a CUI (STR is kept for readability).
mrconso_hgnc_df = mrconso_st_df.loc[
    mrconso_st_df['SAB'] == 'HGNC',
    ['CUI', 'STR', 'CODE'],
]

In [18]:
# Preview the HGNC subset: several STR rows can share one CUI / CODE pair.
mrconso_hgnc_df

Unnamed: 0,CUI,STR,CODE
235919,C0008288,CIPC gene,HGNC:20365
235921,C0008288,CIPC,HGNC:20365
235923,C0008288,"CLOCK-interacting protein, circadian",HGNC:20365
235925,C0008288,CLOCK interacting pacemaker,HGNC:20365
235926,C0008288,KIAA1737,HGNC:20365
...,...,...,...
16596953,C5446057,TPM2P1,HGNC:55137
16596954,C5446057,TPM2 pseudogene 1,HGNC:55137
16596955,C5446058,TRMT1P1 gene,HGNC:55145
16596957,C5446058,TRMT1 pseudogene 1,HGNC:55145


In [19]:
# Lookup table: HGNC code -> set of every CUI listed for that code.
mrconso_hgnc_to_cui_dict = (
    mrconso_hgnc_df
    .groupby('CODE')['CUI']
    .agg(set)
    .to_dict()
)

In [20]:
# Re-display the HGNC-mapped interactions before the CUI columns are added.
mentha_all_hgnc_df

Unnamed: 0,Protein A,Gene A,Taxon A,Protein B,Gene B,Taxon B,Score,PMID,prot_a_hgnc,prot_b_hgnc
1,Q9BRQ4,C11ORF70,9606,Q15750,TAB1,9606,0.126,26186194,HGNC:28188,HGNC:18157
4,P27449,ATP6V0C,9606,P06576,ATP5B,9606,0.126,28514442,HGNC:855,HGNC:830
5,Q9BRR6,ADPGK,9606,O14657,TOR1B,9606,0.126,28514442,HGNC:25250,HGNC:11995
6,Q96BS2,TESC,9606,Q92633,LPAR1,9606,0.126,28514442,HGNC:26065,HGNC:3166
8,Q9BRQ6,CHCHD6,9606,O75477,ERLIN1,9606,0.081,22939629,HGNC:28184,HGNC:16947
...,...,...,...,...,...,...,...,...,...,...
741327,Q92633,LPAR1,9606,Q8WY22,BRI3BP,9606,0.126,28514442,HGNC:3166,HGNC:14251
741330,Q9BRQ6,CHCHD6,9606,P05067,APP,9606,0.183,21832049,HGNC:28184,HGNC:620
741331,Q92685,ALG3,9606,Q92633,LPAR1,9606,0.126,26186194,HGNC:23056,HGNC:3166
741334,Q9BRQ4,C11ORF70,9606,Q92526,CCT6B,9606,0.126,26186194,HGNC:28188,HGNC:1621


In [21]:
# Attach the set of CUIs for each interactor's HGNC id. HGNC ids that are not
# in the lookup come back as None (dict.get's default).
for hgnc_col, cui_col in (
    ('prot_a_hgnc', 'prot_a_cui'),
    ('prot_b_hgnc', 'prot_b_cui'),
):
    mentha_all_hgnc_df[cui_col] = (
        mentha_all_hgnc_df[hgnc_col].map(mrconso_hgnc_to_cui_dict.get)
    )

In [22]:
# Expand the set-valued CUI columns so every row carries exactly one
# (prot_a_cui, prot_b_cui) combination, then drop rows where either interactor
# has no CUI.
#
# dropna is restricted to the two CUI columns on purpose: a bare dropna()
# would also discard rows that only have a missing value in an unrelated
# column.
mentha_all_cui_df = (
    mentha_all_hgnc_df
        .explode('prot_a_cui')
        .explode('prot_b_cui')
        .dropna(subset=['prot_a_cui', 'prot_b_cui'])
)
# Row count after mapping, shown as the cell output.
len(mentha_all_cui_df)

346462

In [23]:
# Preview the CUI-mapped interactions (one CUI per interactor per row).
mentha_all_cui_df

Unnamed: 0,Protein A,Gene A,Taxon A,Protein B,Gene B,Taxon B,Score,PMID,prot_a_hgnc,prot_b_hgnc,prot_a_cui,prot_b_cui
1,Q9BRQ4,C11ORF70,9606,Q15750,TAB1,9606,0.126,26186194,HGNC:28188,HGNC:18157,C1824336,C1334476
4,P27449,ATP6V0C,9606,P06576,ATP5B,9606,0.126,28514442,HGNC:855,HGNC:830,C1412675,C1412653
5,Q9BRR6,ADPGK,9606,O14657,TOR1B,9606,0.126,28514442,HGNC:25250,HGNC:11995,C1540190,C1420862
6,Q96BS2,TESC,9606,Q92633,LPAR1,9606,0.126,28514442,HGNC:26065,HGNC:3166,C1823264,C1414254
8,Q9BRQ6,CHCHD6,9606,O75477,ERLIN1,9606,0.081,22939629,HGNC:28184,HGNC:16947,C1428678,C1539836
...,...,...,...,...,...,...,...,...,...,...,...,...
741327,Q92633,LPAR1,9606,Q8WY22,BRI3BP,9606,0.126,28514442,HGNC:3166,HGNC:14251,C1414254,C1422512
741330,Q9BRQ6,CHCHD6,9606,P05067,APP,9606,0.183,21832049,HGNC:28184,HGNC:620,C1428678,C1364818
741331,Q92685,ALG3,9606,Q92633,LPAR1,9606,0.126,26186194,HGNC:23056,HGNC:3166,C1427879,C1414254
741334,Q9BRQ4,C11ORF70,9606,Q92526,CCT6B,9606,0.126,26186194,HGNC:28188,HGNC:1621,C1824336,C1413199


In [24]:
# Persist the full CUI-mapped table next to the notebook for later reuse.
mentha_all_cui_df.to_pickle('mentha_2023_09_25_all_cui_df.pkl')

In [67]:
# Collapse the (A, B) CUI rows into unique unordered pairs: sorting inside
# each pair makes (x, y) and (y, x) the same tuple, so the set keeps one copy.
#
# The outer sorted() gives the resulting list a stable order. Iterating a set
# of string tuples directly can yield a different order on each interpreter
# run (string hash randomization), which would make the saved JSON differ
# between otherwise identical runs.
mentha_all_cui_mapped_pairs = sorted({
    tuple(sorted(pair))
    for pair in zip(
        mentha_all_cui_df['prot_a_cui'],
        mentha_all_cui_df['prot_b_cui'],
    )
})
# Number of unique unordered pairs, shown as the cell output.
len(mentha_all_cui_mapped_pairs)

344037

## Saving

In [70]:
# Write the unique CUI pairs to disk; JSON has no tuple type, so each pair is
# stored as a two-element array.
with open('mentha_pairs_all.json', 'w') as f:
    f.write(json.dumps(mentha_all_cui_mapped_pairs))