This notebook contains the code used for the conversion of all gene names used throughout the pipeline to Gencode v.34. 

We convert: 

1. List of recessive genes

2. s-hets from Cassa et al.

3. s-hets from Weghorn et al. 

4. pLI from gnomAD

In [None]:
import pandas as pd
from pybiomart import Dataset

In [None]:
def get_gencode_v34_HGNC_genes():
    """
        Download the gencode v.34 HGNC metadata table and return its unique
        (gene name, HGNC id) pairs.

        Returns a DataFrame with the columns 'gene' and 'HGNC ID'. Rows with a
        missing value are dropped, and the 'HGNC:' prefix is removed from the id,
        which is then stored as float.
    """
    gencode_url = "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_34/gencode.v34.metadata.HGNC.gz"

    def _strip_prefix(hgnc_id):
        # falsy ids are passed through untouched, everything else loses the prefix
        if not hgnc_id:
            return hgnc_id
        return hgnc_id.replace('HGNC:', '')

    # the file has no header row: three tab-separated columns
    metadata = pd.read_csv(gencode_url, compression='gzip', sep='\t', header=None)
    metadata.columns = ['Ensembl ID', 'gene', 'HGNC ID']

    # one row per (gene, HGNC id) pair, complete rows only
    pairs = metadata[['gene', 'HGNC ID']]
    pairs = pairs.drop_duplicates()
    pairs = pairs.dropna()

    # edit the HGNC field to have the same (prefix-free, numeric) format as in GRCh37
    pairs['HGNC ID'] = pairs['HGNC ID'].apply(_strip_prefix).astype(float)

    return pairs

def get_grch37_inhouse_map():
    """
        Build the table that links GRCh37 gene names from ensembl to gencode v.34
        gene names, joined on the HGNC id.

        Returns the outer merge of both sources: 'gene_grch37' is the GRCh37 symbol and
        'gene' is the gencode v.34 name. Either can be missing when an HGNC id is
        present in only one of the two sources.
    """

    def _is_scaffold(name):
        # names treated as scaffolds rather than chromosomes
        return name.startswith('CHR_') or 'H' in name or name.startswith('GL')

    # query the GRCh37 ensembl biomart
    grch37_mart = Dataset(name='hsapiens_gene_ensembl',  host='http://grch37.ensembl.org')
    # grch37_mart.list_attributes() -- look into available attributes
    hg37_table = grch37_mart.query(attributes=['hgnc_id','chromosome_name', 'hgnc_symbol'])

    # drop scaffolds not mapped to chromosomes
    on_chromosome = ~hg37_table['Chromosome/scaffold name'].apply(_is_scaffold)
    hg37_table = hg37_table[on_chromosome]

    # leave only mapping information: complete, unique rows
    mapping_columns = ['HGNC ID', 'HGNC symbol', 'Chromosome/scaffold name']
    hg37_table = hg37_table[mapping_columns].dropna().drop_duplicates()

    gencode = get_gencode_v34_HGNC_genes()[['gene', 'HGNC ID']]

    hg37_renamed = hg37_table.rename(columns={'HGNC symbol': 'gene_grch37'})
    return hg37_renamed.merge(gencode, how='outer', on='HGNC ID')

All of the conversions follow the same procedure:

1. Create mapping of `GRCh37` gene names from ensembl into `gencode v.34`

2. Read the data of interest, which contains gene names in some format

3. Identify the genes from the previous step that are not in `gencode v.34`

4. Create a correction map from the genes in `step 3` to `gencode v.34`, where possible, using the mapping from `step 1`

5. Apply this correction map to our data of interest
    

# Gene panels

In [None]:
# Read the original gene panel.
# Both paths must be plain strings: a trailing comma after the literal would turn
# the variable into a one-element tuple, which pd.read_csv cannot open.
path_to_gene_data = ".../gene-panel.txt"  # input
path_to_gene_data_gencode_v34 = ".../gene-panel-gencode-v34.txt"  # output

# the panel file has no header row; its two columns are named here
gene_panels = pd.read_csv(path_to_gene_data, header=None)
gene_panels.columns = ['Gene name', 'panel']

print ("Number of genes:", gene_panels.shape[0])

In [None]:
# GRCh37 -> gencode v.34 table; the gencode names are collected before
# rows with a missing value on either side are removed
grch37_gencode_v34 = get_grch37_inhouse_map()
gencode_v34_genes = set(grch37_gencode_v34['gene'].dropna())

grch37_gencode_v34 = grch37_gencode_v34.dropna()

# panel genes whose name is not a gencode v.34 name
not_gencode = ~gene_panels['Gene name'].isin(gencode_v34_genes)
wrong_format_genes = gene_panels.loc[not_gencode, 'Gene name'].tolist()

# GRCh37 name -> gencode v.34 name, restricted to the genes found above
rows_to_fix = grch37_gencode_v34['gene_grch37'].isin(wrong_format_genes)
correcting_map = dict(zip(grch37_gencode_v34.loc[rows_to_fix, 'gene_grch37'],
                          grch37_gencode_v34.loc[rows_to_fix, 'gene']))

# report how much of the panel can be corrected automatically
print("Number of genes not in gencode_34:", len(wrong_format_genes))
print("Number of genes not in gencode_34, that found in GRCh37 :", len(correcting_map))

# one case is added by hand
correcting_map['KIF1BP'] = 'KIFBP'

print("Full correction dictionary:", len(correcting_map))

# rename the panel genes; names without an entry in the map stay as they are
gene_panels['Gene name'] = gene_panels['Gene name'].map(lambda name: correcting_map.get(name, name))

print("Number of genes not in gencode_34:", (~gene_panels['Gene name'].isin(gencode_v34_genes)).sum())

# write the corrected panel (comma-separated, no header, no index)
gene_panels.to_csv(path_to_gene_data_gencode_v34, sep=',', header=False, index=False)


# S_hets

## Cassa

In [None]:
# load the Cassa s-het table, then remove duplicated rows and rows with missing values
cassa = pd.read_csv(".../gene_s_het_cassa_all_genes.txt", sep='\t')
cassa = cassa.drop_duplicates()
cassa = cassa.dropna()
print("Number of genes:", len(cassa))

cassa.head(3)

In [None]:
# gene names (first column) of the panel file before correction
gene_panels = set(pd.read_csv(path_to_gene_data, header=None)[0])

print("Genes in panel before correction:", len(gene_panels))
print("Genes in cassa before correction", cassa['gene_symbol'].isin(gene_panels).sum())

In [None]:
# GRCh37 -> gencode v.34 table; the gencode names are collected before
# rows with a missing value on either side are removed
grch37_gencode_v34 = get_grch37_inhouse_map()
gencode_v34_genes = set(grch37_gencode_v34['gene'].dropna())
grch37_gencode_v34 = grch37_gencode_v34.dropna()

# cassa symbols that are not gencode v.34 names
not_gencode = ~cassa['gene_symbol'].isin(gencode_v34_genes)
wrong_format_genes = cassa.loc[not_gencode, 'gene_symbol'].tolist()

# GRCh37 name -> gencode v.34 name, restricted to the symbols found above
rows_to_fix = grch37_gencode_v34['gene_grch37'].isin(wrong_format_genes)
correcting_map = dict(zip(grch37_gencode_v34.loc[rows_to_fix, 'gene_grch37'],
                          grch37_gencode_v34.loc[rows_to_fix, 'gene']))

# report how many symbols can be corrected
print("Number of genes not in gencode_34:", len(wrong_format_genes))
print("Number of genes not in gencode_34, that found in GRCh37 :", len(correcting_map))

# rename the cassa symbols; symbols without an entry in the map stay as they are
cassa['gene_symbol'] = cassa['gene_symbol'].map(lambda symbol: correcting_map.get(symbol, symbol))

print("Number of genes not in gencode_34:", (~cassa['gene_symbol'].isin(gencode_v34_genes)).sum())

# write the corrected table (tab-separated, no index)
cassa.to_csv(".../gene_s_het_cassa_all_genes_gencode-v34.txt", sep='\t', index=False)

In [None]:
# gene names (first column) of the corrected panel file
gene_panels = set(pd.read_csv(path_to_gene_data_gencode_v34, header=None)[0])

print("Genes in panel after correction:", len(gene_panels))
print("Genes in cassa after correction", cassa['gene_symbol'].isin(gene_panels).sum())

## Weghorn

In [None]:
# load the Weghorn supplementary table as is (tab-separated)
weghorn_path = ".../weghorn_supp_table1.txt"
weghorn = pd.read_csv(weghorn_path, sep='\t')
print("Number of genes:", len(weghorn))

weghorn.head(3)

In [None]:
# gene names (first column) of the panel file before correction
gene_panels = set(pd.read_csv(path_to_gene_data, header=None)[0])

print("Genes in panel before correction:", len(gene_panels))
print("Genes in weghorn before correction", weghorn['Gene'].isin(gene_panels).sum())

In [None]:
# GRCh37 -> gencode v.34 table; the gencode names are collected before
# rows with a missing value on either side are removed
grch37_gencode_v34 = get_grch37_inhouse_map()
gencode_v34_genes = set(grch37_gencode_v34['gene'].dropna())
grch37_gencode_v34 = grch37_gencode_v34.dropna()


# weghorn genes whose name is not a gencode v.34 name
not_gencode = ~weghorn['Gene'].isin(gencode_v34_genes)
wrong_format_genes = weghorn.loc[not_gencode, 'Gene'].tolist()

# GRCh37 name -> gencode v.34 name, restricted to the genes found above
rows_to_fix = grch37_gencode_v34['gene_grch37'].isin(wrong_format_genes)
correcting_map = dict(zip(grch37_gencode_v34.loc[rows_to_fix, 'gene_grch37'],
                          grch37_gencode_v34.loc[rows_to_fix, 'gene']))

# report how many genes can be corrected
print("Number of genes not in gencode_34:", len(wrong_format_genes))
print("Number of genes not in gencode_34, that found in GRCh37 :", len(correcting_map))

# rename the weghorn genes; names without an entry in the map stay as they are
weghorn['Gene'] = weghorn['Gene'].map(lambda name: correcting_map.get(name, name))

print("Number of genes not in gencode_34:", (~weghorn['Gene'].isin(gencode_v34_genes)).sum())

# write the corrected table (tab-separated, no index)
# NOTE(review): this is the only explicit relative path in the notebook; the cell that
# re-reads this file uses the `.../` placeholder -- confirm both point to the same directory
weghorn.to_csv("../450k/selection_weghorn/weghorn_supp_table1_gencode-v34.txt", sep='\t', index=False)


In [None]:
# gene names (first column) of the corrected panel file
gene_panels = set(pd.read_csv(path_to_gene_data_gencode_v34, header=None)[0])

print("Genes in panel after correction:", len(gene_panels))
print("Genes in weghorn after correction", weghorn['Gene'].isin(gene_panels).sum())

In [None]:
# Re-read the corrected Weghorn table and write one two-column file per s-het estimate
weghorn = pd.read_csv(".../weghorn_supp_table1_gencode-v34.txt", sep='\t')

# s_het_det: mutation-selection balance model; s_het_drift: model with drift.
# Both outputs share the layout gene_symbol, s_het.
weghorn_det = weghorn[['Gene', 's_het_det']].rename(
    columns={'Gene': 'gene_symbol', 's_het_det': 's_het'})
weghorn_drift = weghorn[['Gene', 's_het_drift']].rename(
    columns={'Gene': 'gene_symbol', 's_het_drift': 's_het'})

# tab-separated, no index column
weghorn_det.to_csv(".../weghorn_det_gencode-v34.txt", index=False, sep='\t')
weghorn_drift.to_csv(".../weghorn_drift_gencode-v34.txt", index=False, sep='\t')

## PLI

In [None]:
# Download the gnomAD v2.1.1 per-gene LoF metrics table (bgzip-compressed) into the current directory
! wget https://storage.googleapis.com/gcp-public-data--gnomad/release/2.1.1/constraint/gnomad.v2.1.1.lof_metrics.by_gene.txt.bgz
# Move the downloaded file to the data directory it is read from afterwards
! mv ./gnomad.v2.1.1.lof_metrics.by_gene.txt.bgz .../gnomad.v2.1.1.lof_metrics.by_gene.txt.bgz 

In [None]:
gnomad_filepath = '.../gnomad.v2.1.1.lof_metrics.by_gene.txt.bgz'

# Keep the gene name and pLI of every gene that has a pLI value.
# The pLI column is renamed to 's_het', giving the same layout as the s-het tables.
gnomad = (
    pd.read_csv(gnomad_filepath, compression='gzip', sep='\t')
    .loc[:, ['gene', 'pLI']]
    .dropna(subset=['pLI'])
    .rename(columns={'gene': 'gene_symbol', 'pLI': 's_het'})
)

gnomad.head(3)

In [None]:
# gene names (first column) of the panel file before correction
gene_panels = set(pd.read_csv(path_to_gene_data, header=None)[0])

print("Genes in panel before correction:", len(gene_panels))
print("Genes in gnomAD before correction", gnomad['gene_symbol'].isin(gene_panels).sum())

In [None]:
# GRCh37 -> gencode v.34 table; the gencode names are collected before
# rows with a missing value on either side are removed
grch37_gencode_v34 = get_grch37_inhouse_map()
gencode_v34_genes = set(grch37_gencode_v34['gene'].dropna())
grch37_gencode_v34 = grch37_gencode_v34.dropna()


# gnomAD symbols that are not gencode v.34 names
not_gencode = ~gnomad['gene_symbol'].isin(gencode_v34_genes)
wrong_format_genes = gnomad.loc[not_gencode, 'gene_symbol'].tolist()

# GRCh37 name -> gencode v.34 name, restricted to the symbols found above
rows_to_fix = grch37_gencode_v34['gene_grch37'].isin(wrong_format_genes)
correcting_map = dict(zip(grch37_gencode_v34.loc[rows_to_fix, 'gene_grch37'],
                          grch37_gencode_v34.loc[rows_to_fix, 'gene']))

# report how many symbols can be corrected
print("Number of genes not in gencode_34:", len(wrong_format_genes))
print("Number of genes not in gencode_34, that found in GRCh37 :", len(correcting_map))

# rename the gnomAD symbols; symbols without an entry in the map stay as they are
gnomad['gene_symbol'] = gnomad['gene_symbol'].map(lambda symbol: correcting_map.get(symbol, symbol))

print("Number of genes not in gencode_34:", (~gnomad['gene_symbol'].isin(gencode_v34_genes)).sum())


In [None]:
# compare the number of rows with the number of distinct gene symbols
print("Gnomad shape:", gnomad.shape)
print("Gnomad unique genes:", gnomad['gene_symbol'].drop_duplicates().shape)

# collapse repeated gene symbols to a single row each, keeping the maximum value per gene
gnomad = gnomad.groupby('gene_symbol', as_index=False).max()

print("Gnomad shape after correction:", gnomad.shape)

In [None]:
# write the per-gene table (tab-separated, no index column)
gnomad_out_path = ".../gnomad.v2.1.1.PLI_gencode-v34.txt"
gnomad.to_csv(gnomad_out_path, sep='\t', index=False)

In [None]:
# gene names (first column) of the corrected panel file
gene_panels = set(pd.read_csv(path_to_gene_data_gencode_v34, header=None)[0])

print("Genes in panel after correction:", len(gene_panels))
print("Genes in gnomAD after correction", gnomad['gene_symbol'].isin(gene_panels).sum())