# Protein Coding Genes 
- get gene symbols from ENSG IDs

In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.simplefilter(action='ignore')

import pyensembl
from biomart import BiomartServer

In [2]:
# Use the Ensembl gene ID to find the gene name
def gene_rename(df, release = 75):
    """Add a 'Gene_symbol' column to ``df`` by looking up each Ensembl gene ID.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'Gene' column; each value is passed to
        ``gene_name_of_gene_id`` of the chosen pyensembl release.
    release : int, default 75
        Ensembl release number handed to ``pyensembl.EnsemblRelease``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, modified in place: a 'Gene_symbol' column is
        added (or overwritten). IDs whose lookup raises get the sentinel
        string "novel_or_none".
    """
    ensembl = pyensembl.EnsemblRelease(release)

    gene_list = []
    for probe in df['Gene']:
        # try pulling the gene name using the ID
        try:
            gene_list.append(ensembl.gene_name_of_gene_id(probe))
        except Exception:
            # Best-effort lookup: any lookup failure maps to the sentinel.
            # `Exception` (not a bare except) so Ctrl-C / kernel interrupt
            # still stops a long-running loop.
            gene_list.append("novel_or_none")

    df['Gene_symbol'] = gene_list

    return df

In [3]:
# load in SMR results for NDDs
ndd_df = pd.read_csv("NDD_SMR_genes.csv")

# load in druggable genome data
druggable_df = pd.read_csv('/../omicSynth/druggable_genome.csv', sep = ',')

# read in DGIdb interactions file
drugs_df = pd.read_csv('/../omicSynth/interactions.tsv', sep = '\t')

# ensure gene_names are all clean:
# treat each name as a string (even if all digits), then strip leading/trailing whitespace
drugs_df['gene_name'] = drugs_df['gene_name'].astype('str').apply(str.strip)

# drop rows whose gene_name was NaN (now the literal string "nan" after astype);
# .copy() gives an independent frame so the column assignments below are not
# chained assignments on a filtered view
drugs_df = drugs_df.query('gene_name != "nan"').copy()

# replace missing drug concept IDs with the placeholder 'none'
drugs_df['drug_concept_id'] = drugs_df['drug_concept_id'].fillna('none')

# derive the chemblid column: keep the part after the first ':' when a prefix is present
drugs_df['chemblid'] = drugs_df['drug_concept_id'].apply(
    lambda x: x.split(':')[1] if ':' in x else x
)

# remove any rows that do not have a chembl id (independent copy, see above)
drugs_df_red = drugs_df.query('chemblid != "none"').copy()

# normalise drug names to lower-case strings
drugs_df_red['drug_claim_primary_name'] = (
    drugs_df_red['drug_claim_primary_name'].astype('str').apply(str.lower)
)

# list of unique gene targets from drug data
thera_genes = list(drugs_df_red['gene_name'].unique())

In [4]:
# load the protein-coding gene list (whitespace-delimited); used to drop non-coding genes
# raw string: '\s' is not a valid escape sequence in a normal string literal
coding = pd.read_csv('/../omicSynth/proteincodinggenes.txt', sep = r'\s+')
coding

Unnamed: 0,Gene
0,ENSG00000198888
1,ENSG00000198763
2,ENSG00000198804
3,ENSG00000198712
4,ENSG00000228253
...,...
22553,ENSG00000284925
22554,ENSG00000285044
22555,ENSG00000284901
22556,ENSG00000284869


In [5]:
# gene names are needed because of the mQTLs; look symbols up against Ensembl release 100
# (gene_rename adds the column in place, so coding2 is the same object as coding)
coding2 = gene_rename(coding, release=100)

In [6]:
# rows that still carry the "novel_or_none" sentinel -- candidates for a biomart lookup
coding_miss = coding2[coding2['Gene_symbol'] == "novel_or_none"]

coding_miss

Unnamed: 0,Gene,Gene_symbol
13,ENSG00000277953,novel_or_none
16,ENSG00000257215,novel_or_none
18,ENSG00000282035,novel_or_none
19,ENSG00000273896,novel_or_none
20,ENSG00000281022,novel_or_none
...,...,...
22553,ENSG00000284925,novel_or_none
22554,ENSG00000285044,novel_or_none
22555,ENSG00000284901,novel_or_none
22556,ENSG00000284869,novel_or_none


In [14]:
# establish db to search from; biomart uses ensembl release 109
server = BiomartServer( "http://useast.ensembl.org/biomart")

data38 = server.databases['ENSEMBL_MART_ENSEMBL']
hsap = server.datasets['hsapiens_gene_ensembl']

# get any possible names from biomart for the IDs that are still unresolved
probes = coding_miss.Gene.unique()

# ENSG ID -> HGNC symbol as returned by biomart ('' when biomart has no symbol)
gene_miss = {}
for probe in probes:
    # one request per gene ID, asking for the ID and its HGNC symbol
    search_fil = {'ensembl_gene_id': probe}
    search_cmd = {'filters': search_fil, 'attributes': ['ensembl_gene_id', 'hgnc_symbol']}
    response = hsap.search(search_cmd)

    try:
        for line in response.iter_lines():
            line = line.decode('utf-8')
            output = line.split("\t")

            # second tab-separated field is the symbol; if several lines come
            # back, the last one wins
            gene_miss[probe] = output[1]
    except Exception:
        # Best-effort: a malformed/undecodable line or a failure while
        # streaming the response marks the probe as having no symbol.
        # `Exception` (not a bare except) so interrupting the kernel still
        # stops this long loop of network requests.
        gene_miss[probe] = 'no_symbol'
    # NOTE(review): a response with zero lines leaves `probe` out of gene_miss
    # entirely -- confirm that is intended.

In [23]:
# column-oriented view of gene_miss: parallel lists of IDs and their symbols
gene_probes_dict = {
    'Gene': list(gene_miss),
    'Gene_symbol': list(gene_miss.values()),
}

In [24]:
# build a two-column frame (Gene, Gene_symbol) from the dictionary so blanks can be filled in
gene_probe_df = pd.DataFrame(gene_probes_dict)
gene_probe_df

Unnamed: 0,Gene,Gene_symbol
0,ENSG00000277953,PRPF31
1,ENSG00000257215,
2,ENSG00000282035,
3,ENSG00000273896,TSEN34
4,ENSG00000281022,MED22
...,...,...
2809,ENSG00000284925,GCSAM
2810,ENSG00000285044,SLC9C1
2811,ENSG00000284901,RUVBL1
2812,ENSG00000284869,EEFSEC


In [31]:
# empty-string symbols get the same "novel_or_none" sentinel used by gene_rename
# NOTE(review): 'no_symbol' entries are left as-is here -- confirm that is intended
gene_probe_df.loc[
    gene_probe_df.Gene_symbol == '',
    'Gene_symbol',
] = 'novel_or_none'

In [38]:
# rebuild the ID -> symbol lookup from the cleaned frame
tmp_list = zip(gene_probe_df.Gene, gene_probe_df.Gene_symbol)
gene_probe_dict_clean = {ensg_id: symbol for ensg_id, symbol in tmp_list}

In [None]:
# rows whose symbol is not the "novel_or_none" sentinel, and their ID -> symbol lookup
coding_good = coding2[coding2['Gene_symbol'] != "novel_or_none"]
good_genes_dict = {
    ensg_id: symbol
    for ensg_id, symbol in zip(coding_good.Gene, coding_good.Gene_symbol)
}

In [42]:
# sanity check: IDs present in both lookups (an empty list means they do not overlap)
list(set(gene_probe_dict_clean) & set(good_genes_dict))

[]

In [41]:
# merge both lookups into one; on a shared key the good_genes_dict value wins
final_probe_dict = dict(gene_probe_dict_clean)
final_probe_dict.update(good_genes_dict)

In [44]:
# (re)write the Gene_symbol column from the merged ID -> symbol lookup
# NOTE(review): IDs absent from final_probe_dict map to NaN, not "novel_or_none" -- confirm intended
coding['Gene_symbol'] = coding.Gene.map(final_probe_dict)

In [46]:
# how many gene IDs share each symbol
coding['Gene_symbol'].value_counts()

novel_or_none    191
KIR3DL2           43
KIR3DL3           42
KIR2DL4           41
KIR2DL1           33
                ... 
HRH1               1
FBXO31             1
ITGAD              1
AC010531.1         1
CMKLR2             1
Name: Gene_symbol, Length: 19772, dtype: int64

In [47]:
# export the new protein-coding list with gene symbols (row index not written)
coding.to_csv('proteincoding_genesym.csv', index=False)

In [3]:
# reload the exported protein-coding list with gene symbols
coding = pd.read_csv('proteincoding_genesym.csv', sep=',')