In [1]:
import biomart
import numpy as np
import re
import pandas as pd

import os
# Resolve the data folder relative to the notebook's working directory:
# <cwd>/../../non_anndata_data. os.path is used instead of splitting on '/'
# so this works on any OS and does not fail when the cwd is close to the
# filesystem root.
current_directory = os.getcwd()
data_path = os.path.join(
    os.path.dirname(os.path.dirname(current_directory)),
    'non_anndata_data',
)

In [2]:
### This code is from https://gist.github.com/ben-heil/cffbebf8865795fe2efbbfec041da969.
### I only had to change 'uswest' to 'useast' in 'http://useast.ensembl.org/biomart', since the uswest mirror is retired.

def get_ensembl_mappings():
    """Download the Ensembl gene ID -> HGNC symbol mapping from BioMart.

    Queries the ``hsapiens_gene_ensembl`` dataset on the Ensembl BioMart
    server (useast mirror) and parses the tab-separated response.

    Returns:
        dict: Maps each non-empty ``ensembl_gene_id`` to its ``hgnc_symbol``.
        The symbol may be an empty string. If a gene ID appears on several
        rows, the symbol from the last row is kept.

    Raises:
        ValueError: If a non-blank response row has too few columns to hold
            the gene ID and symbol (for example when the server answered with
            an error message instead of data).
    """
    # Set up connection to server
    server = biomart.BiomartServer('http://useast.ensembl.org/biomart')
    #mart = server.datasets['mmusculus_gene_ensembl']
    mart = server.datasets['hsapiens_gene_ensembl']

    # List the types of data we want; each response row lists them in this order
    attributes = ['ensembl_transcript_id', 'hgnc_symbol',
                  'ensembl_gene_id', 'ensembl_peptide_id']
    # Look the column positions up by name instead of hard-coding indices.
    symbol_col = attributes.index('hgnc_symbol')
    gene_id_col = attributes.index('ensembl_gene_id')
    required_cols = max(symbol_col, gene_id_col) + 1

    # Get the mapping between the attributes
    response = mart.search({'attributes': attributes})
    # UTF-8 is a superset of ASCII, so this decodes everything 'ascii' did.
    data = response.raw.data.decode('utf-8')

    # Store the data in a dict
    ensembl_to_genesymbol = {}
    for row in data.splitlines():
        if not row.strip():
            continue  # ignore blank lines
        fields = row.split('\t')
        if len(fields) < required_cols:
            raise ValueError(
                f'Unexpected BioMart response row (expected at least '
                f'{required_cols} tab-separated columns): {row!r}'
            )
        ensembl_gene = fields[gene_id_col]
        # Avoid a '' key in the dict when a row carries no gene ID.
        if not ensembl_gene:
            continue
        ensembl_to_genesymbol[ensembl_gene] = fields[symbol_col]

    return ensembl_to_genesymbol

In [3]:
# Takes about 3 min
# Fetch the Ensembl gene ID -> gene symbol mapping from the BioMart server
# (this performs the remote query defined in get_ensembl_mappings).
Gene_Dict=get_ensembl_mappings()

In [4]:
# Display the downloaded mapping (Ensembl gene ID -> gene symbol) as a sanity check.
Gene_Dict

{'ENSG00000210049': 'MT-TF',
 'ENSG00000211459': 'MT-RNR1',
 'ENSG00000210077': 'MT-TV',
 'ENSG00000210082': 'MT-RNR2',
 'ENSG00000209082': 'MT-TL1',
 'ENSG00000198888': 'MT-ND1',
 'ENSG00000210100': 'MT-TI',
 'ENSG00000210107': 'MT-TQ',
 'ENSG00000210112': 'MT-TM',
 'ENSG00000198763': 'MT-ND2',
 'ENSG00000210117': 'MT-TW',
 'ENSG00000210127': 'MT-TA',
 'ENSG00000210135': 'MT-TN',
 'ENSG00000210140': 'MT-TC',
 'ENSG00000210144': 'MT-TY',
 'ENSG00000198804': 'MT-CO1',
 'ENSG00000210151': 'MT-TS1',
 'ENSG00000210154': 'MT-TD',
 'ENSG00000198712': 'MT-CO2',
 'ENSG00000210156': 'MT-TK',
 'ENSG00000228253': 'MT-ATP8',
 'ENSG00000198899': 'MT-ATP6',
 'ENSG00000198938': 'MT-CO3',
 'ENSG00000210164': 'MT-TG',
 'ENSG00000198840': 'MT-ND3',
 'ENSG00000210174': 'MT-TR',
 'ENSG00000212907': 'MT-ND4L',
 'ENSG00000198886': 'MT-ND4',
 'ENSG00000210176': 'MT-TH',
 'ENSG00000210184': 'MT-TS2',
 'ENSG00000210191': 'MT-TL2',
 'ENSG00000198786': 'MT-ND5',
 'ENSG00000198695': 'MT-ND6',
 'ENSG00000210194': 

In [5]:
# Remove special characters and capitalize gene symbols
def remove_non_alphanumeric(input_string):
    """Return ``input_string`` with every character outside a-z, A-Z and 0-9 removed."""
    non_alphanumeric = re.compile(r"[^a-zA-Z0-9]")
    cleaned = non_alphanumeric.sub("", input_string)
    return cleaned

# Normalise every symbol in place: strip non-alphanumeric characters, then
# upper-case. The keys (Ensembl gene IDs) are left untouched.
Gene_Dict.update({
    ensembl_id: remove_non_alphanumeric(symbol).upper()
    for ensembl_id, symbol in Gene_Dict.items()
})

In [6]:
# Let's remove all entries whose mapping is not unique; those will have to be assigned manually.

In [7]:
from collections import Counter
def remove_non_unique_entries(dictionary):
    """Return a new dict without the entries whose value occurs more than once.

    Args:
        dictionary: Mapping whose values are hashable.

    Returns:
        dict: The key/value pairs of ``dictionary`` whose value is held by
        exactly one key, in their original order.
    """
    occurrences = Counter(dictionary.values())
    kept = {}
    for key, value in dictionary.items():
        if occurrences[value] != 1:
            continue  # value is shared by several keys -> drop the entry
        kept[key] = value
    return kept

In [8]:
# Keep only the Ensembl IDs whose (normalised) gene symbol belongs to exactly one ID.
Gene_Dict=remove_non_unique_entries(Gene_Dict)

In [9]:
# Flip the mapping so that gene symbols become the keys (symbol -> Ensembl ID).
Gene_Dict = {symbol: ensembl_id for ensembl_id, symbol in Gene_Dict.items()}

In [10]:
# Manual symbol -> Ensembl gene ID assignments for symbols the automatic
# mapping does not cover. Sometimes an alternative gene name is used, and when
# a symbol has several Ensembl IDs the first Google result was picked.
#
# Four entries previously held IDs that are not human gene IDs (two transcript
# IDs 'ENST...' and two mouse gene IDs 'ENSMUSG...'); they are replaced by the
# human gene IDs below. NOTE(review): please confirm these four against Ensembl.
MANUAL_SYMBOL_TO_ENSEMBL = {
    'DDR1': 'ENSG00000204580',
    'ABCF1': 'ENSG00000206490',
    'ECH1': 'ENSG00000104823',
    'HSPA1A': 'ENSG00000204389',
    'PSME1': 'ENSG00000284916',
    'HYOU1': 'ENSG00000280682',
    'EPRS': 'ENSG00000136628',
    'AARS': 'ENSG00000090861',
    'PIP4K2B': 'ENSG00000276293',
    'SQSTM1': 'ENSG00000161011',
    'TOMM70A': 'ENSG00000154174',
    'IER3': 'ENSG00000137331',
    'PNP': 'ENSG00000198805',
    'KIAA0100': 'ENSG00000007202',
    'TJP1': 'ENSG00000104067',
    'KIAA0907': 'ENSG00000132680',
    'PAPD7': 'ENSG00000112941',
    'IKBKAP': 'ENSG00000070061',
    'DUSP11': 'ENSG00000144048',
    'HIST2H2BE': 'ENSG00000184678',  # was transcript ID ENST00000369155
    'WRB': 'ENSG00000182093',
    'PRKACA': 'ENSG00000288516',
    'INTS3': 'ENSG00000262826',
    'GAA': 'ENSG00000171298',
    'SLC37A4': 'ENSG00000137700',
    'PCK2': 'ENSG00000100889',
    'PTPRK': 'ENSG00000152894',
    'KIF5C': 'ENSG00000168280',
    'HAT1': 'ENSG00000128708',
    'KIAA0355': 'ENSG00000166398',
    'TIMM17B': 'ENSG00000126768',
    'DUSP14': 'ENSG00000276023',
    'RPS6KA1': 'ENSG00000117676',
    'PRAF2': 'ENSG00000243279',
    'SKIV2L': 'ENSG00000204351',  # was mouse gene ID ENSMUSG00000040356
    'TMEM5': 'ENSG00000118600',
    'SYNGR3': 'ENSG00000127561',
    'NFATC4': 'ENSG00000100968',
    'DNAJA3': 'ENSG00000103423',
    'RNH1': 'ENSG00000276230',
    'E2F2': 'ENSG00000007968',
    'PTPRC': 'ENSG00000081237',
    'FDFT1': 'ENSG00000079459',
    'HLADRA': 'ENSG00000204287',
    'PSMB8': 'ENSG00000204264',
    'MBOAT7': 'ENSG00000125505',
    'HDGFRP3': 'ENSG00000166503',
    'PRUNE': 'ENSG00000143363',
    'HIST1H2BK': 'ENSG00000197903',
    'CDK7': 'ENSG00000277273',
    'BDH1': 'ENSG00000161267',
    'TBC1D9B': 'ENSG00000197226',
    'HN1L': 'ENSG00000206053',
    'H2AFV': 'ENSG00000105968',
    'KIF1BP': 'ENSG00000198954',
    'KAT6B': 'ENSG00000156650',
    'PLEKHM1': 'ENSG00000225190',
    'KIAA1033': 'ENSG00000136051',  # was transcript ID ENST00000332180
    'SRC': 'ENSG00000197122',
    'FAM69A': 'ENSG00000154511',
    'TMEM110': 'ENSG00000248592',
    'ATP5S': 'ENSG00000125375',
    'NFKBIB': 'ENSG00000104825',
    'CTTN': 'ENSG00000085733',
    'PARP2': 'ENSG00000129484',
    'NSDHL': 'ENSG00000147383',
    'HLADMA': 'ENSG00000243189',
    'ADI1': 'ENSG00000182551',
    'MIF': 'ENSG00000240972',
    'SQRDL': 'ENSG00000137767',
    'UBR7': 'ENSG00000012963',
    'TMEM2': 'ENSG00000135048',
    'ADCK3': 'ENSG00000163050',
    'CCDC92': 'ENSG00000119242',
    'CHMP4A': 'ENSG00000254505',
    'VPS28': 'ENSG00000160948',
    'NARFL': 'ENSG00000103245',
    'FAM57A': 'ENSG00000167695',
    'TIMM22': 'ENSG00000177370',
    'MSRA': 'ENSG00000175806',
    'LRRC16A': 'ENSG00000079691',
    'SLC2A6': 'ENSG00000160326',
    'WDR61': 'ENSG00000140395',
    'FAM63A': 'ENSG00000143409',  # was mouse gene ID ENSMUSG00000038712
    'SLC27A3': 'ENSG00000143554',
    'TSTA3': 'ENSG00000104522',
    'PLSCR3': 'ENSG00000187838',
    'PSME2': 'ENSG00000100911',
    'DAXX': 'ENSG00000204209',
    'ATMIN': 'ENSG00000166454',
    'KIAA0196': 'ENSG00000170871',
}

# Guard against pasting a transcript/protein ID or an ID from another species:
# every manual value must look like a human Ensembl gene ID (ENSG + 11 digits).
malformed_manual_ids = {
    symbol: ensembl_id
    for symbol, ensembl_id in MANUAL_SYMBOL_TO_ENSEMBL.items()
    if not re.fullmatch(r'ENSG\d{11}', ensembl_id)
}
assert not malformed_manual_ids, (
    f'Manual entries are not human Ensembl gene IDs: {malformed_manual_ids}'
)

# Add the manual assignments; they overwrite any existing entry for the same symbol.
Gene_Dict.update(MANUAL_SYMBOL_TO_ENSEMBL)

In [16]:
# Swap keys and values once more. When several keys share a value, the entry
# that comes later in the dict is the one kept.
# NOTE(review): this cell is not idempotent - every re-run flips the
# direction of the mapping again, so run it exactly once before saving.
Gene_Dict = dict(zip(Gene_Dict.values(), Gene_Dict.keys()))

In [12]:
# Persist the final mapping as Gene_Dict.npy inside data_path.
# NOTE(review): Gene_Dict is a plain dict, so NumPy presumably stores it as a
# pickled object array; loading it back would then need
# np.load(path, allow_pickle=True).item() - confirm with the consumers of this file.
np.save(f'{data_path}/Gene_Dict.npy', Gene_Dict)