This notebook converts all gene names used throughout the pipeline to the GENCODE v.34 nomenclature.

We convert: 

1. List of recessive genes

2. s-hets from Cassa et al.

3. s-hets from Weghorn et al. 

4. pLI from gnomAD

5. Roulette 

In [1]:
import pandas as pd
from pybiomart import Dataset

In [2]:
def get_gencode_v34_HGNC_genes():
    """
        Download the gencode v.34 HGNC metadata table and return its unique
        (gene name, HGNC id) pairs.

        Returns a DataFrame with the columns 'gene' and 'HGNC ID'. Rows with a
        missing value in either column are dropped, and the 'HGNC:' prefix is
        removed so that 'HGNC ID' is stored as float.
    """
    gencode_url = "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_34/gencode.v34.metadata.HGNC.gz"

    def strip_prefix(hgnc_id):
        # falsy values are passed through untouched
        return hgnc_id.replace('HGNC:', '') if hgnc_id else hgnc_id

    # the table is read with header=None, so the three columns are named here
    metadata = pd.read_csv(gencode_url, compression='gzip', sep='\t', header=None)
    metadata.columns = ['Ensembl ID', 'gene', 'HGNC ID']

    # keep only the symbol/id pair, once per pair, and only complete rows
    pairs = metadata[['gene', 'HGNC ID']]
    pairs = pairs.drop_duplicates()
    pairs = pairs.dropna()

    # numeric ids; NOTE(review): presumably this mirrors the id format of the
    # GRCh37 BioMart table -- confirm
    numeric_ids = pairs['HGNC ID'].map(strip_prefix).astype(float)

    return pairs.assign(**{'HGNC ID': numeric_ids})

def get_grch37_inhouse_map():
    """
        Build the mapping from GRCh37 gene names (Ensembl BioMart) to
        gencode v.34 gene names, using the HGNC id as key.

        Returns a DataFrame with the columns 'HGNC ID', 'gene_grch37',
        'Chromosome/scaffold name' and 'gene' (the gencode v.34 name).
        The two tables are outer-joined on 'HGNC ID', so a row holds NaN on
        the side where the id is absent.
    """

    # load GRCh37
    dataset = Dataset(name='hsapiens_gene_ensembl',  host='http://grch37.ensembl.org')
    # attributes = dataset.list_attributes() -- look into attributes

    hg37_table = dataset.query(
        attributes=['hgnc_id','chromosome_name', 'hgnc_symbol'])

    # drop scaffolds not mapped to chromosomes; the names are compared as text
    # so that a chromosome name parsed as a number cannot break the filter
    chrom_names = hg37_table['Chromosome/scaffold name'].astype(str)
    not_on_chromosome = (
        chrom_names.str.startswith('CHR_')
        | chrom_names.str.contains('H', regex=False)
        | chrom_names.str.startswith('GL')
    )
    hg37_table = hg37_table[~not_on_chromosome]

    # leave only mapping information
    hg37_table = hg37_table[['HGNC ID', 'HGNC symbol', 'Chromosome/scaffold name']].dropna().drop_duplicates()

    # normalise the key to float (as get_grch38_inhouse_map does): a numeric id
    # is kept as is, an 'HGNC:'-prefixed id loses its prefix, so the merge key
    # always has the same dtype as the gencode v.34 table
    hgnc_ids = hg37_table['HGNC ID'].astype(str).str.replace('HGNC:', '', regex=False).astype(float)
    hg37_table = hg37_table.assign(**{'HGNC ID': hgnc_ids})

    inhouse = get_gencode_v34_HGNC_genes()[['gene', 'HGNC ID']]

    mapping = hg37_table.rename(columns={'HGNC symbol': 'gene_grch37'}).merge(inhouse,  how='outer', on='HGNC ID')

    return mapping


def get_grch38_inhouse_map():
    """
        Build the mapping from GRCh38 gene names (Ensembl BioMart) to
        gencode v.34 gene names, using the HGNC id as key.

        Returns a DataFrame with the columns 'HGNC ID', 'gene_grch38',
        'Chromosome/scaffold name' and 'gene' (the gencode v.34 name).
        The two tables are outer-joined on 'HGNC ID', so a row holds NaN on
        the side where the id is absent.
    """

    # load GRCh38
    dataset = Dataset(name='hsapiens_gene_ensembl',  host='http://ensembl.org')
    # attributes = dataset.list_attributes() -- look into attributes

    hg38_table = dataset.query(
        attributes=['hgnc_id','chromosome_name', 'hgnc_symbol'])

    # drop scaffolds not mapped to chromosomes; the names are compared as text
    # so that a chromosome name parsed as a number cannot break the filter
    chrom_names = hg38_table['Chromosome/scaffold name'].astype(str)
    not_on_chromosome = (
        chrom_names.str.startswith('CHR_')
        | chrom_names.str.contains('H', regex=False)
        | chrom_names.str.startswith('GL')
    )
    hg38_table = hg38_table[~not_on_chromosome]

    # leave only mapping information
    hg38_table = hg38_table[['HGNC ID', 'HGNC symbol', 'Chromosome/scaffold name']].dropna().drop_duplicates()

    # strip the 'HGNC:' prefix and store the key as float, the dtype used by
    # the gencode v.34 table; an id that is already numeric is kept as is
    hgnc_ids = hg38_table['HGNC ID'].astype(str).str.replace('HGNC:', '', regex=False).astype(float)
    hg38_table = hg38_table.assign(**{'HGNC ID': hgnc_ids})

    inhouse = get_gencode_v34_HGNC_genes()[['gene', 'HGNC ID']]

    mapping = hg38_table.rename(columns={'HGNC symbol': 'gene_grch38'}).merge(inhouse,  how='outer', on='HGNC ID')

    return mapping

All of the conversions follow the same procedure:

1. Create a mapping of `GRCh37` gene names from Ensembl (`GRCh38` for Roulette) to `gencode v.34`

2. Read data of interest, that contains gene names in some format 

3. Identify genes from the previous step, that are not in `gencode v.34`

4. Create a correction map from the genes in `step 3` to `gencode v.34`, where possible, by using the mapping from `step 1`

5. Apply this correction map to our data of interest
    

# Gene panels

In [53]:
# Load the original gene panel; it is read with header=None, so the two
# columns are named explicitly.
gene_panels = pd.read_csv(".../450k/regions/gene-panel.txt", header=None)
gene_panels.columns = ['Gene name', 'panel']

n_panel_rows = len(gene_panels)
print("Number of genes:", n_panel_rows)

Number of genes: 1929


In [54]:
# Build the GRCh37 -> gencode v.34 table. The set of gencode v.34 names is
# collected first, before rows with a missing value are discarded.
grch37_gencode_v34 = get_grch37_inhouse_map()
gencode_v34_genes = set(grch37_gencode_v34['gene'].dropna().tolist())
grch37_gencode_v34 = grch37_gencode_v34.dropna()

# panel genes whose name is not a gencode v.34 name
panel_not_in_gencode = ~gene_panels['Gene name'].isin(gencode_v34_genes)
wrong_format_genes = gene_panels.loc[panel_not_in_gencode, 'Gene name'].tolist()

# correction dict: GRCh37 name -> gencode v.34 name, for the genes above only
has_grch37_entry = grch37_gencode_v34['gene_grch37'].isin(wrong_format_genes)
correcting_map = grch37_gencode_v34.loc[has_grch37_entry, ['gene_grch37', 'gene']]
correcting_map = correcting_map.set_index('gene_grch37')['gene'].to_dict()

# report how much of the problem the mapping covers
print("Number of genes not in gencode_34:", len(wrong_format_genes))
print("Number of genes not in gencode_34, that found in GRCh37 :", len(correcting_map))

# one gene is not resolved by the mapping and is added by hand
correcting_map['KIF1BP'] = 'KIFBP'

print("Full correction dictionary:", len(correcting_map))

# rename the panel genes; names without a correction stay unchanged
gene_panels['Gene name'] = gene_panels['Gene name'].map(
    lambda name: correcting_map.get(name, name))

still_missing = ~gene_panels['Gene name'].isin(gencode_v34_genes)
print("Number of genes not in gencode_34:", still_missing.sum())

# write the corrected panel without header and index
gene_panels.to_csv(".../450k/regions/gene-panel-gencode-v34.txt", header=False, index=False, sep=',')


Number of genes not in gencode_34: 29
Number of genes not in gencode_34, that found in GRCh37 : 28
Full correction dictionary: 29
Number of genes not in gencode_34: 0


In [55]:
# Show the first three lines of the corrected gene panel file written above.
! head -3 .../450k/regions/gene-panel-gencode-v34.txt

ADAT3,ID-total
AIMP2,ID-total
ANKLE2,ID-total


# S_hets

## Cassa

In [3]:
# Cassa table: exact duplicate rows and rows with any missing value are removed.
cassa = pd.read_csv(".../450k/selection_cassa/cassa_supp_table_1.txt", sep='\t')
cassa = cassa.drop_duplicates()
cassa = cassa.dropna()
print("Number of genes:", len(cassa))

cassa.head(3)

Number of genes: 15998


Unnamed: 0,gene_symbol,s_het
0,A1BG,0.006794
1,A1CF,0.022313
2,A2M,0.013186


In [4]:
# names in the first column of the uncorrected gene panel
gene_panels = pd.read_csv(".../450k/regions/gene-panel.txt", header=None)
gene_panels = set(gene_panels[0].tolist())

# Cassa rows whose gene is part of the panel, before any renaming
cassa_in_panel = cassa['gene_symbol'].isin(gene_panels)

print("Genes in panel before correction:", len(gene_panels))
print("Genes in cassa before correction", cassa_in_panel.sum())

Genes in panel before correction: 1929
Genes in cassa before correction 1725


In [5]:
# Build the GRCh37 -> gencode v.34 table. The set of gencode v.34 names is
# collected first, before rows with a missing value are discarded.
grch37_gencode_v34 = get_grch37_inhouse_map()
gencode_v34_genes = set(grch37_gencode_v34['gene'].dropna().tolist())
grch37_gencode_v34 = grch37_gencode_v34.dropna()

# Cassa genes whose name is not a gencode v.34 name
cassa_not_in_gencode = ~cassa['gene_symbol'].isin(gencode_v34_genes)
wrong_format_genes = cassa.loc[cassa_not_in_gencode, 'gene_symbol'].tolist()

# correction dict: GRCh37 name -> gencode v.34 name, for the genes above only
has_grch37_entry = grch37_gencode_v34['gene_grch37'].isin(wrong_format_genes)
correcting_map = grch37_gencode_v34.loc[has_grch37_entry, ['gene_grch37', 'gene']]
correcting_map = correcting_map.set_index('gene_grch37')['gene'].to_dict()

# report how much of the problem the mapping covers
print("Number of genes not in gencode_34:", len(wrong_format_genes))
print("Number of genes not in gencode_34, that found in GRCh37 :", len(correcting_map))

# rename the genes; names without a correction stay unchanged
cassa['gene_symbol'] = cassa['gene_symbol'].map(
    lambda name: correcting_map.get(name, name))

still_missing = ~cassa['gene_symbol'].isin(gencode_v34_genes)
print("Number of genes not in gencode_34:", still_missing.sum())

# write the corrected table as a tab-separated file
cassa.to_csv(".../450k/selection_cassa/cassa_supp_table_1_gencode-v34.txt", index=False, sep='\t')

Number of genes not in gencode_34: 890
Number of genes not in gencode_34, that found in GRCh37 : 782
Number of genes not in gencode_34: 108


In [6]:
# names in the first column of the corrected (gencode v.34) gene panel
gene_panels = pd.read_csv(".../450k/regions/gene-panel-gencode-v34.txt", header=None)
gene_panels = set(gene_panels[0].tolist())

# Cassa rows whose gene is part of the panel, after renaming
cassa_in_panel = cassa['gene_symbol'].isin(gene_panels)

print("Genes in panel after correction:", len(gene_panels))
print("Genes in cassa after correction", cassa_in_panel.sum())

Genes in panel after correction: 1929
Genes in cassa after correction 1761


## Weghorn

In [67]:
# Weghorn table, read as is (no row filtering at this point)
weghorn_path = ".../450k/selection_weghorn/weghorn_supp_table1.txt"
weghorn = pd.read_csv(weghorn_path, sep='\t')
print("Number of genes:", len(weghorn))

weghorn.head(3)

Number of genes: 16279


Unnamed: 0,Gene,U,n_NFE,k_NFE,s_het_det,low_det,up_det,s_het_drift,low_drift,up_drift
0,A1BG,1e-06,57612.75,2,0.039562,0.01,0.136,0.028927,0.000889,0.281171
1,A1CF,2e-06,66258.5714,7,0.020096,0.01,0.041,0.013627,0.000889,0.088914
2,A2M,4e-06,55288.75,19,0.012169,0.008,0.019,0.009977,0.000889,0.028117


In [68]:
# names in the first column of the uncorrected gene panel
gene_panels = pd.read_csv(".../450k/regions/gene-panel.txt", header=None)
gene_panels = set(gene_panels[0].tolist())

# Weghorn rows whose gene is part of the panel, before any renaming
weghorn_in_panel = weghorn['Gene'].isin(gene_panels)

print("Genes in panel before correction:", len(gene_panels))
print("Genes in weghorn before correction", weghorn_in_panel.sum())

Genes in panel before correction: 1929
Genes in weghorn before correction 1727


In [69]:
# Build the GRCh37 -> gencode v.34 table. The set of gencode v.34 names is
# collected first, before rows with a missing value are discarded.
grch37_gencode_v34 = get_grch37_inhouse_map()
gencode_v34_genes = set(grch37_gencode_v34['gene'].dropna().tolist())
grch37_gencode_v34 = grch37_gencode_v34.dropna()

# Weghorn genes whose name is not a gencode v.34 name
weghorn_not_in_gencode = ~weghorn['Gene'].isin(gencode_v34_genes)
wrong_format_genes = weghorn.loc[weghorn_not_in_gencode, 'Gene'].tolist()

# correction dict: GRCh37 name -> gencode v.34 name, for the genes above only
has_grch37_entry = grch37_gencode_v34['gene_grch37'].isin(wrong_format_genes)
correcting_map = grch37_gencode_v34.loc[has_grch37_entry, ['gene_grch37', 'gene']]
correcting_map = correcting_map.set_index('gene_grch37')['gene'].to_dict()

# report how much of the problem the mapping covers
print("Number of genes not in gencode_34:", len(wrong_format_genes))
print("Number of genes not in gencode_34, that found in GRCh37 :", len(correcting_map))

# rename the genes; names without a correction stay unchanged
weghorn['Gene'] = weghorn['Gene'].map(
    lambda name: correcting_map.get(name, name))

still_missing = ~weghorn['Gene'].isin(gencode_v34_genes)
print("Number of genes not in gencode_34:", still_missing.sum())

# write the corrected table as a tab-separated file
weghorn.to_csv(".../450k/selection_weghorn/weghorn_supp_table1_gencode-v34.txt", index=False, sep='\t')


Number of genes not in gencode_34: 904
Number of genes not in gencode_34, that found in GRCh37 : 796
Number of genes not in gencode_34: 108


In [70]:
# names in the first column of the corrected (gencode v.34) gene panel
gene_panels = pd.read_csv(".../450k/regions/gene-panel-gencode-v34.txt", header=None)
gene_panels = set(gene_panels[0].tolist())

# Weghorn rows whose gene is part of the panel, after renaming
weghorn_in_panel = weghorn['Gene'].isin(gene_panels)

print("Genes in panel after correction:", len(gene_panels))
print("Genes in weghorn after correction", weghorn_in_panel.sum())

Genes in panel after correction: 1929
Genes in weghorn after correction 1763


In [71]:
# save weghorn in a nice way
weghorn = pd.read_csv(".../450k/selection_weghorn/weghorn_supp_table1_gencode-v34.txt", sep='\t')

# mut-sel balance modenl and drift
weghorn_det = weghorn[['Gene', 's_het_det']].copy()
weghorn_drift = weghorn[['Gene', 's_het_drift']].copy()

# rename columns
weghorn_det.columns = ['gene_symbol', 's_het']
weghorn_drift.columns = ['gene_symbol', 's_het']

# save
weghorn_det.to_csv(".../450k/selection_weghorn/weghorn_det_gencode-v34.txt", sep='\t', index=False)
weghorn_drift.to_csv(".../450k/selection_weghorn/weghorn_drift_gencode-v34.txt", sep='\t', index=False)

## PLI

In [1]:
# Download the gnomAD v2.1.1 per-gene LoF metrics table into the working
# directory, then move it into the selection_pli data folder.
! wget https://storage.googleapis.com/gcp-public-data--gnomad/release/2.1.1/constraint/gnomad.v2.1.1.lof_metrics.by_gene.txt.bgz
! mv ./gnomad.v2.1.1.lof_metrics.by_gene.txt.bgz .../450k/selection_pli/gnomad.v2.1.1.lof_metrics.by_gene.txt.bgz 

--2023-06-05 15:53:12--  https://storage.googleapis.com/gcp-public-data--gnomad/release/2.1.1/constraint/gnomad.v2.1.1.lof_metrics.by_gene.txt.bgz
Resolving storage.googleapis.com (storage.googleapis.com)... 142.250.179.144, 142.250.179.208, 142.251.36.16, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|142.250.179.144|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4609488 (4.4M) [application/octet-stream]
Saving to: ‘gnomad.v2.1.1.lof_metrics.by_gene.txt.bgz’


2023-06-05 15:53:12 (54.3 MB/s) - ‘gnomad.v2.1.1.lof_metrics.by_gene.txt.bgz’ saved [4609488/4609488]



In [72]:
gnomad_filepath = '.../450k/selection_pli/gnomad.v2.1.1.lof_metrics.by_gene.txt.bgz'

# the .bgz file is read with gzip decompression
gnomad = pd.read_csv(gnomad_filepath, compression='gzip', sep='\t')

# keep gene name and pLI for the genes that have a pLI value; pLI is stored
# under the column name 's_het'
# NOTE(review): presumably so the file matches the layout of the s_het tables -- confirm
has_pli = gnomad['pLI'].notna()
gnomad = gnomad.loc[has_pli, ['gene', 'pLI']]
gnomad = gnomad.rename(columns={'gene': 'gene_symbol', 'pLI': 's_het'})

gnomad.head(3)

Unnamed: 0,gene_symbol,s_het
0,MED13,1.0
1,NIPBL,1.0
2,SMC3,1.0


In [73]:
# names in the first column of the uncorrected gene panel
gene_panels = pd.read_csv(".../450k/regions/gene-panel.txt", header=None)
gene_panels = set(gene_panels[0].tolist())

# gnomAD rows whose gene is part of the panel, before any renaming
gnomad_in_panel = gnomad['gene_symbol'].isin(gene_panels)

print("Genes in panel before correction:", len(gene_panels))
print("Genes in gnomAD before correction", gnomad_in_panel.sum())

Genes in panel before correction: 1929
Genes in gnomAD before correction 1884


In [74]:
# Build the GRCh37 -> gencode v.34 table. The set of gencode v.34 names is
# collected first, before rows with a missing value are discarded.
grch37_gencode_v34 = get_grch37_inhouse_map()
gencode_v34_genes = set(grch37_gencode_v34['gene'].dropna().tolist())
grch37_gencode_v34 = grch37_gencode_v34.dropna()

# gnomAD genes whose name is not a gencode v.34 name
gnomad_not_in_gencode = ~gnomad['gene_symbol'].isin(gencode_v34_genes)
wrong_format_genes = gnomad.loc[gnomad_not_in_gencode, 'gene_symbol'].tolist()

# correction dict: GRCh37 name -> gencode v.34 name, for the genes above only
has_grch37_entry = grch37_gencode_v34['gene_grch37'].isin(wrong_format_genes)
correcting_map = grch37_gencode_v34.loc[has_grch37_entry, ['gene_grch37', 'gene']]
correcting_map = correcting_map.set_index('gene_grch37')['gene'].to_dict()

# report how much of the problem the mapping covers
print("Number of genes not in gencode_34:", len(wrong_format_genes))
print("Number of genes not in gencode_34, that found in GRCh37 :", len(correcting_map))

# rename the genes; names without a correction stay unchanged
gnomad['gene_symbol'] = gnomad['gene_symbol'].map(
    lambda name: correcting_map.get(name, name))

still_missing = ~gnomad['gene_symbol'].isin(gencode_v34_genes)
print("Number of genes not in gencode_34:", still_missing.sum())


Number of genes not in gencode_34: 1571
Number of genes not in gencode_34, that found in GRCh37 : 989
Number of genes not in gencode_34: 580


In [75]:
# size of the table versus the number of distinct gene names in it
unique_gene_names = gnomad['gene_symbol'].drop_duplicates()
print("Gnomad shape:", gnomad.shape)
print("Gnomad unique genes:", unique_gene_names.shape)

# collapse repeated gene names to one row each, keeping the maximum value
gnomad = gnomad.groupby('gene_symbol', as_index=False).max()

print("Gnomad shape after correction:", gnomad.shape)

Gnomad shape: (19197, 2)
Gnomad unique genes: (19154,)
Gnomad shape after correction: (19154, 2)


In [76]:
# write the corrected, de-duplicated pLI table as a tab-separated file
gnomad.to_csv(
    ".../450k/selection_pli/gnomad.v2.1.1.PLI_gencode-v34.txt",
    sep='\t',
    index=False,
)

In [77]:
# names in the first column of the corrected (gencode v.34) gene panel
gene_panels = pd.read_csv(".../450k/regions/gene-panel-gencode-v34.txt", header=None)
gene_panels = set(gene_panels[0].tolist())

# gnomAD rows whose gene is part of the panel, after renaming
gnomad_in_panel = gnomad['gene_symbol'].isin(gene_panels)

print("Genes in panel after correction:", len(gene_panels))
print("Genes in gnomAD after correction", gnomad_in_panel.sum())

Genes in panel after correction: 1929
Genes in gnomAD after correction 1919


## Roulette

In [3]:
# Roulette table; 'mean_s_het' is renamed to 's_het'
roulette = pd.read_csv(".../450k/selection_roulette/s_het_roulette.csv", sep=',')
roulette = roulette.rename(columns={'mean_s_het': 's_het'})

# keep gene name and s_het, drop exact duplicate rows and incomplete rows
roulette = roulette[['Gene_symbol', 's_het']]
roulette = roulette.drop_duplicates()
roulette = roulette.dropna()

# one row per gene name: the mean of its remaining s_het values
roulette = roulette.groupby('Gene_symbol').mean()
roulette = roulette.reset_index()

print("Number of genes:", len(roulette))

roulette.head(3)

Number of genes: 17303


Unnamed: 0,Gene_symbol,s_het
0,A1BG,0.010626
1,A1CF,0.032817
2,A2M,0.024557


In [4]:
# names in the first column of the uncorrected gene panel
gene_panels = pd.read_csv(".../450k/regions/gene-panel.txt", header=None)
gene_panels = set(gene_panels[0].tolist())

# Roulette rows whose gene is part of the panel, before any renaming
roulette_in_panel = roulette['Gene_symbol'].isin(gene_panels)

print("Genes in panel before correction:", len(gene_panels))
print("Genes in roulette before correction", roulette_in_panel.sum())

Genes in panel before correction: 1929
Genes in roulette before correction 1853


In [5]:

# Build the GRCh38 -> gencode v.34 table. The set of gencode v.34 names is
# collected first, before rows with a missing value are discarded.
grch38_gencode_v34 = get_grch38_inhouse_map()
gencode_v34_genes = set(grch38_gencode_v34['gene'].dropna().tolist())
grch38_gencode_v34 = grch38_gencode_v34.dropna()

# Roulette genes whose name is not a gencode v.34 name
roulette_not_in_gencode = ~roulette['Gene_symbol'].isin(gencode_v34_genes)
wrong_format_genes = roulette.loc[roulette_not_in_gencode, 'Gene_symbol'].tolist()

# correction dict: GRCh38 name -> gencode v.34 name, for the genes above only
has_grch38_entry = grch38_gencode_v34['gene_grch38'].isin(wrong_format_genes)
correcting_map = grch38_gencode_v34.loc[has_grch38_entry, ['gene_grch38', 'gene']]
correcting_map = correcting_map.set_index('gene_grch38')['gene'].to_dict()

# report how much of the problem the mapping covers
print("Number of genes not in gencode_34:", len(wrong_format_genes))
print("Number of genes not in gencode_34, that found in GRCh38 :", len(correcting_map))

# rename the genes; names without a correction stay unchanged
roulette['Gene_symbol'] = roulette['Gene_symbol'].map(
    lambda name: correcting_map.get(name, name))

still_missing = ~roulette['Gene_symbol'].isin(gencode_v34_genes)
print("Number of genes not in gencode_34:", still_missing.sum())

# rename the gene column to 'gene_symbol' and write the table; the output is
# tab-separated although the file name ends in .csv
roulette = roulette.rename(columns={'Gene_symbol': 'gene_symbol'})
roulette.to_csv(".../450k/selection_roulette/s_het_roulette_gencode-v34.csv", index=False, sep='\t')


Number of genes not in gencode_34: 293
Number of genes not in gencode_34, that found in GRCh38 : 54
Number of genes not in gencode_34: 239


In [6]:
# names in the first column of the corrected (gencode v.34) gene panel
gene_panels = pd.read_csv(".../450k/regions/gene-panel-gencode-v34.txt", header=None)
gene_panels = set(gene_panels[0].tolist())

# Roulette rows whose gene is part of the panel, after renaming
roulette_in_panel = roulette['gene_symbol'].isin(gene_panels)

print("Genes in panel after correction:", len(gene_panels))
print("Genes in roulette after correction", roulette_in_panel.sum())

Genes in panel after correction: 1929
Genes in roulette after correction 1890


### High s-het genes stat for paper

In [55]:
import numpy as np

# corrected gene panel, with the number of rows of each panel attached to
# every one of its genes
gene_panels = pd.read_csv(".../450k/regions/gene-panel-gencode-v34.txt", header=None)
gene_panels.columns = ['gene_symbol', 'panel']
panel_sizes = gene_panels.groupby('panel')['gene_symbol'].transform('count')
gene_panels['n_genes'] = panel_sizes

# panel genes that have a Roulette s_het value; a gene is flagged as
# "high s-het" when its s_het is at least 0.1
roulette_recessive = pd.merge(roulette, gene_panels, how='inner', on='gene_symbol')
roulette_recessive['high_s_het'] = roulette_recessive['s_het'].ge(0.1)

n_recessive = len(roulette_recessive)
print("Number of recessive genes in panel:", n_recessive)
print (f"Number of high s-het genes: {roulette_recessive ['high_s_het'].sum()}")

Number of recessive genes in panel: 1890
Number of high s-het genes: 125


In [56]:
# One row per panel: panel size, number of genes with an s_het value and
# number of high s-het genes, with the largest panel first.
per_panel = roulette_recessive.groupby('panel').agg(
    {'n_genes': 'first', 'gene_symbol': 'count', 'high_s_het': 'sum'})
roulette_recessive = per_panel.reset_index()

roulette_recessive = roulette_recessive.sort_values(by='n_genes', ascending=False)
roulette_recessive = roulette_recessive.rename(
    columns={'gene_symbol': 'n_genes_with_s_het', 'high_s_het': 'n_high_s_het'})

# percentages rounded to one decimal: genes with s_het relative to the panel
# size, and high s-het genes relative to the genes with s_het
share_with_s_het = roulette_recessive['n_genes_with_s_het'] / roulette_recessive['n_genes'] * 100
roulette_recessive['n_genes_with_s_het_percentage'] = np.round(share_with_s_het, 1)

share_high_s_het = roulette_recessive['n_high_s_het'] / roulette_recessive['n_genes_with_s_het'] * 100
roulette_recessive['high_s_het_prcentage'] = np.round(share_high_s_het, 1)

# text columns of the form "<count> (<percentage>%)"
with_s_het_pct = roulette_recessive['n_genes_with_s_het_percentage'].astype(str)
roulette_recessive['number_of_genes_with_s_het'] = (
    roulette_recessive['n_genes_with_s_het'].astype(str) + ' (' + with_s_het_pct + '%)')

high_s_het_pct = roulette_recessive['high_s_het_prcentage'].astype(str)
roulette_recessive['number_of_high_s_het_genes'] = (
    roulette_recessive['n_high_s_het'].astype(str) + ' (' + high_s_het_pct + '%)')

roulette_recessive.to_excel('../../../data/tables/high_s_het_genes_roulette.xlsx', index=False)