# Comparison of gene ID mapping between [g:convert](https://biit.cs.ut.ee/gprofiler/convert) (g:profiler) and [mygene](https://pypi.org/project/mygene/)

In [1]:
#Imports
import os, requests
import numpy as np
import pandas as pd
# Display limits for rendered DataFrames: up to 300 rows and 100 columns
pd.options.display.max_rows = 300
pd.options.display.max_columns = 100

import mygene

### Import ENSEMBL IDs

In [2]:
## Load the gene list
# The `squeeze=True` argument of `read_csv` was deprecated in pandas 1.4 and
# removed in pandas 2.0. Squeezing the column axis explicitly turns the
# single-column frame into a Series on old and new pandas versions alike.
ENSEMBL_IDs = pd.read_csv('ENSEMBL_IDs.csv').squeeze('columns')
print(len(ENSEMBL_IDs))
ENSEMBL_IDs.head()

58434


0    ENSG00000000003.14
1     ENSG00000000005.6
2    ENSG00000000419.12
3    ENSG00000000457.14
4    ENSG00000000460.17
Name: Gene, dtype: object

* Trim version in gene IDs (after the dot)

In [3]:
# Keep only the part of each ID that precedes the first '.' (drops the version suffix)
ENSEMBL_trimmed = [versioned_id.partition('.')[0] for versioned_id in ENSEMBL_IDs]
ENSEMBL_trimmed[:5]

['ENSG00000000003',
 'ENSG00000000005',
 'ENSG00000000419',
 'ENSG00000000457',
 'ENSG00000000460']

### Map gene IDs to symbols 

* [g:convert](https://biit.cs.ut.ee/gprofiler/convert) (g:profiler)

In [4]:
%%time
r = requests.post(
    url='https://biit.cs.ut.ee/gprofiler/api/convert/convert/',
    json={
        'organism':'hsapiens',
        'target':'ENSG', #multiple other databases can be used, but targeting the ENSEMBL database seemed to recover the highest number of symbols in the 'name' field
        'query':ENSEMBL_trimmed,
        }
    )
gconvert = pd.DataFrame(r.json()['result'])

CPU times: user 420 ms, sys: 58.6 ms, total: 479 ms
Wall time: 7.62 s


In [5]:
# Dimensions of the g:convert result, followed by a preview of its first rows
n_rows, n_cols = gconvert.shape
print((n_rows, n_cols))
gconvert.head()

(58434, 8)


Unnamed: 0,converted,description,incoming,n_converted,n_incoming,name,namespaces,query
0,ENSG00000000003,tetraspanin 6 [Source:HGNC Symbol;Acc:HGNC:11858],ENSG00000000003,1,1,TSPAN6,"ARRAYEXPRESS,ENSG",query_1
1,ENSG00000000005,tenomodulin [Source:HGNC Symbol;Acc:HGNC:17757],ENSG00000000005,1,2,TNMD,"ARRAYEXPRESS,ENSG",query_1
2,ENSG00000000419,dolichyl-phosphate mannosyltransferase subunit...,ENSG00000000419,1,3,DPM1,"ARRAYEXPRESS,ENSG",query_1
3,ENSG00000000457,SCY1 like pseudokinase 3 [Source:HGNC Symbol;A...,ENSG00000000457,1,4,SCYL3,"ARRAYEXPRESS,ENSG",query_1
4,ENSG00000000460,chromosome 1 open reading frame 112 [Source:HG...,ENSG00000000460,1,5,C1orf112,"ARRAYEXPRESS,ENSG",query_1


The length is the same as the submitted list, so each ENSEMBL ID corresponds to one row of the result, and it seems that no ENSEMBL ID was matched to multiple names. Let's check:

In [6]:
# Number of rows whose submitted ID ('incoming') appears more than once
repeated_incoming = gconvert.duplicated(subset='incoming', keep=False)
len(gconvert.loc[repeated_incoming])

0

Now let's check how many entries were not found in g:profiler's database:

In [7]:
# Treat the literal string 'None' as a missing value, then count the rows
# for which the 'converted' column is missing
gconvert = gconvert.replace('None', np.nan)
unconverted = gconvert['converted'].isna()
len(gconvert[unconverted])

263

Let's also check how many ENSEMBL IDs were matched to the same symbol as another ID:

In [8]:
# Rows whose 'name' also occurs on another row; rows with a missing name are not counted
shares_name = gconvert.duplicated('name', keep=False)
rows_with_shared_name = gconvert[shares_name].dropna(axis=0, subset=['name'])
len(rows_with_shared_name)

1631

* [mygene](https://pypi.org/project/mygene/)

In [9]:
# Display the version of the installed mygene client
mygene.__version__

'3.1.0'

In [10]:
# Translate Ensembl IDs to gene symbols with mygene.
# mygene is slower to run, so the query result is saved to 'ginfo.csv' and
# re-read from that file whenever it already exists.
ens = ENSEMBL_trimmed #feed ensembl IDs without ".xx"
if not os.path.exists('ginfo.csv'):
    mg = mygene.MyGeneInfo()
    ginfo = mg.querymany(
        ens,
        scopes="ensembl.gene",
        fields="symbol",
        species="human",
        returnall=False,
        as_dataframe=True,
        df_index=False,
    )
    ginfo.to_csv('ginfo.csv')
else:
    ginfo = pd.read_csv('ginfo.csv', index_col='Unnamed: 0')

In [11]:
# Row count of the mygene result, followed by a preview of its first rows
n_hits = len(ginfo)
print(n_hits)
ginfo.head()

58437


Unnamed: 0,_id,_score,notfound,query,symbol
0,7105,19.005505,,ENSG00000000003,TSPAN6
1,64102,19.61876,,ENSG00000000005,TNMD
2,8813,19.647251,,ENSG00000000419,DPM1
3,57147,20.116043,,ENSG00000000457,SCYL3
4,55732,20.142542,,ENSG00000000460,C1orf112


This time the result length is longer than the query, indicating the presence of duplicates (same ENSEMBL ID matched to several symbols)

In [12]:
# Every row whose query ID occurs more than once in the mygene result
repeated_query = ginfo.duplicated(subset='query', keep=False)
ginfo[repeated_query]

Unnamed: 0,_id,_score,notfound,query,symbol
28609,101927745,21.019394,,ENSG00000229425,LOC101927745
28610,105369302,20.87644,,ENSG00000229425,LOC105369302
57809,112268391,6.751291,,ENSG00000285607,LOC112268391
57810,112268392,6.499049,,ENSG00000285607,LOC112268392
57811,112268393,6.499049,,ENSG00000285607,LOC112268393


Let's check what symbol was chosen by g:convert for those entries

In [13]:
# Unique query IDs that occur more than once in the mygene result,
# looked up in the g:convert table via its 'incoming' column
is_repeated = ginfo.duplicated('query', keep=False)
duplicates = ginfo.loc[is_repeated, 'query'].unique()
gconvert[gconvert['incoming'].isin(duplicates)]

Unnamed: 0,converted,description,incoming,n_converted,n_incoming,name,namespaces,query
28609,ENSG00000229425,,ENSG00000229425,1,28610,AJ009632.2,"ARRAYEXPRESS,ENSG",query_1
57808,ENSG00000285607,FAM90A pseudogene [Source:NCBI gene;Acc:112268...,ENSG00000285607,1,57809,AC084121.5,"ARRAYEXPRESS,ENSG",query_1


Now let's check how many entries were not matched to a symbol

In [14]:
# Rows whose 'notfound' flag is True
flagged_not_found = ginfo['notfound'] == True
ginfo[flagged_not_found]

Unnamed: 0,_id,_score,notfound,query,symbol
12270,,,True,ENSG00000168078,
14960,,,True,ENSG00000181013,
16874,,,True,ENSG00000189144,
36013,,,True,ENSG00000241978,
36610,,,True,ENSG00000243444,


And a similar number of entries share their symbol with at least one other entry:

In [15]:
# Rows whose 'symbol' also occurs on another row; rows with a missing symbol are not counted
shares_symbol = ginfo.duplicated('symbol', keep=False)
rows_with_shared_symbol = ginfo[shares_symbol].dropna(axis=0, subset=['symbol'])
len(rows_with_shared_symbol)

1647

### Compare results

* Merge data from both mappings

In [16]:
# Combine both mappings side by side, keyed on the Ensembl ID.
# Only the first mygene row per query ID is kept, so each ID contributes a
# single row from each source.
gconvert_names = gconvert.loc[:, ['incoming', 'name']]
mygene_symbols = ginfo.drop_duplicates('query', keep='first').loc[:, ['query', 'symbol']]
# validate='one_to_one' makes pandas raise a MergeError if either key column
# contains duplicates, instead of silently multiplying rows in the result.
df = gconvert_names.merge(
    mygene_symbols,
    left_on='incoming',
    right_on='query',
    how='outer',
    validate='one_to_one',
)
print(len(df))
df.head()

58434


Unnamed: 0,incoming,name,query,symbol
0,ENSG00000000003,TSPAN6,ENSG00000000003,TSPAN6
1,ENSG00000000005,TNMD,ENSG00000000005,TNMD
2,ENSG00000000419,DPM1,ENSG00000000419,DPM1
3,ENSG00000000457,SCYL3,ENSG00000000457,SCYL3
4,ENSG00000000460,C1orf112,ENSG00000000460,C1orf112


In [17]:
# Verify that the ENSEMBL IDs from both sources agree on every row
ids_agree = df['incoming'] == df['query']
all(ids_agree)

True

In [18]:
# Remove the 'query' column and use the 'incoming' ID as the row index
df = df.drop(columns='query').set_index('incoming')
df.head()

Unnamed: 0_level_0,name,symbol
incoming,Unnamed: 1_level_1,Unnamed: 2_level_1
ENSG00000000003,TSPAN6,TSPAN6
ENSG00000000005,TNMD,TNMD
ENSG00000000419,DPM1,DPM1
ENSG00000000457,SCYL3,SCYL3
ENSG00000000460,C1orf112,C1orf112


* Check differences

In [19]:
# Select the rows where the g:convert 'name' and the mygene 'symbol' differ.
# NaN never compares equal, so rows with a missing value are selected as well.
mismatch = df['name'] != df['symbol']
diff = df[mismatch]
print(len(diff))

1072


In [20]:
# Count the differing rows that have a missing value in at least one column
has_missing = diff.isna().any(axis=1)
len(diff[has_missing])

268

Overall, it looks like g:convert is still using older symbols, whereas mygene is kept more up to date:

In [21]:
# Differing rows for which neither 'name' nor 'symbol' is missing
diff.dropna(axis=0, how='any')

Unnamed: 0_level_0,name,symbol
incoming,Unnamed: 1_level_1,Unnamed: 2_level_1
ENSG00000036549,AC118549.1,ZZZ3
ENSG00000040487,PQLC2,SLC66A1
ENSG00000064205,WISP2,CCN5
ENSG00000065600,TMEM206,PACC1
ENSG00000100167,SEPT3,SEPTIN3
ENSG00000100890,KIAA0391,PRORP
ENSG00000104415,WISP1,CCN4
ENSG00000104964,AES,TLE5
ENSG00000108021,FAM208B,TASOR2
ENSG00000108387,SEPT4,SEPTIN4
