In [1]:
import pandas as pd
import requests
import json
import re
from tqdm import tqdm
import os
import pandas as pd
import time
import math
from glob import glob
import uuid

import pyarrow.feather as feather
from tqdm import tqdm
from scipy.stats import rankdata, zscore

In [2]:
sn = "Homo_sapiens"
organism = "../Serialization/Mammalia/%s"%sn
url = 'ftp://ftp.ncbi.nih.gov/gene/DATA/GENE_INFO/{}.gene_info.gz'.format(organism)
file = '{}.gene_info.tsv'.format(organism)


In [3]:
def fetch_save_read(url, file, reader=pd.read_csv, sep='\t', **kwargs):
  ''' Download file from {url}, save it to {file}, and subsequently read it with {reader} using pandas options on {**kwargs}.
  '''
  if not os.path.exists(file):
    if os.path.dirname(file):
      os.makedirs(os.path.dirname(file), exist_ok=True)
    df = reader(url, sep=sep, index_col=None)
    df.to_csv(file, sep=sep, index=False)
  return pd.read_csv(file, sep=sep, **kwargs)


In [4]:
ncbi_gene = fetch_save_read(url, file)
ncbi_gene.head()

Unnamed: 0,#tax_id,GeneID,Symbol,LocusTag,Synonyms,dbXrefs,chromosome,map_location,description,type_of_gene,Symbol_from_nomenclature_authority,Full_name_from_nomenclature_authority,Nomenclature_status,Other_designations,Modification_date,Feature_type
0,9606,1,A1BG,-,A1B|ABG|GAB|HYST2477,MIM:138670|HGNC:HGNC:5|Ensembl:ENSG00000121410...,19,19q13.43,alpha-1-B glycoprotein,protein-coding,A1BG,alpha-1-B glycoprotein,O,alpha-1B-glycoprotein|HEL-S-163pA|epididymis s...,20220805,-
1,9606,2,A2M,-,A2MD|CPAMD5|FWP007|S863-7,MIM:103950|HGNC:HGNC:7|Ensembl:ENSG00000175899...,12,12p13.31,alpha-2-macroglobulin,protein-coding,A2M,alpha-2-macroglobulin,O,alpha-2-macroglobulin|C3 and PZP-like alpha-2-...,20221009,-
2,9606,3,A2MP1,-,A2MP,HGNC:HGNC:8|Ensembl:ENSG00000291190|AllianceGe...,12,12p13.31,alpha-2-macroglobulin pseudogene 1,pseudo,A2MP1,alpha-2-macroglobulin pseudogene 1,O,pregnancy-zone protein pseudogene,20221025,-
3,9606,9,NAT1,-,AAC1|MNAT|NAT-1|NATI,MIM:108345|HGNC:HGNC:7645|Ensembl:ENSG00000171...,8,8p22,N-acetyltransferase 1,protein-coding,NAT1,N-acetyltransferase 1,O,arylamine N-acetyltransferase 1|N-acetyltransf...,20220925,-
4,9606,10,NAT2,-,AAC2|NAT-2|PNAT,MIM:612182|HGNC:HGNC:7646|Ensembl:ENSG00000156...,8,8p22,N-acetyltransferase 2,protein-coding,NAT2,N-acetyltransferase 2,O,arylamine N-acetyltransferase 2|N-acetyltransf...,20221009,-


In [5]:
def maybe_split(record):
    ''' NCBI Stores Nulls as '-' and lists '|' delimited
    '''
    if record in {'', '-'}:
        return set()
    return set(record.split('|'))

def supplement_dbXref_prefix_omitted(ids):
    ''' NCBI Stores external IDS with Foreign:ID while most datasets just use the ID
    '''
    for id in ids:
        # add original id
        yield id
        # also add id *without* prefix
        if ':' in id:
            yield id.split(':', maxsplit=1)[1]

ncbi_gene['All_synonyms'] = [
    set.union(
      maybe_split(gene_info['Symbol']),
      maybe_split(gene_info['Symbol_from_nomenclature_authority']),
      maybe_split(str(gene_info['GeneID'])),
      maybe_split(gene_info['Synonyms']),
      maybe_split(gene_info['Other_designations']),
      maybe_split(gene_info['LocusTag']),
      set(supplement_dbXref_prefix_omitted(maybe_split(gene_info['dbXrefs']))),
    )
    for _, gene_info in ncbi_gene.iterrows()
  ]

ncbi_gene['All_synonyms'] = [
    set.union(
      maybe_split(gene_info['Symbol']),
      maybe_split(gene_info['Symbol_from_nomenclature_authority']),
      maybe_split(str(gene_info['GeneID'])),
      maybe_split(gene_info['Synonyms']),
      maybe_split(gene_info['Other_designations']),
      maybe_split(gene_info['LocusTag']),
      set(supplement_dbXref_prefix_omitted(maybe_split(gene_info['dbXrefs']))),
    )
    for _, gene_info in ncbi_gene.iterrows()
  ]

synonyms, gene_id = zip(*{
    (synonym, gene_info['Symbol'])
    for _, gene_info in ncbi_gene.iterrows()
    for synonym in gene_info['All_synonyms']
  })
ncbi_lookup_syn = pd.Series(gene_id, index=synonyms)
symbols, cap, gene_id = zip(*{
    (gene_info['Symbol'], gene_info['Symbol'].upper(), gene_info['Symbol'])
    for _, gene_info in ncbi_gene.iterrows()
  })
ncbi_lookup_sym = pd.Series(gene_id, index=symbols)
ncbi_lookup_sym_cap = pd.Series(gene_id, index=cap)

index_values = ncbi_lookup_syn.index.value_counts()
ambiguous = index_values[index_values > 1].index
ncbi_lookup_syn_disambiguated = ncbi_lookup_syn[(
(ncbi_lookup_syn.index == ncbi_lookup_syn) | (~ncbi_lookup_syn.index.isin(ambiguous))
)]
all_genes = {}

def gene_lookup(gene):
    gene_id = ncbi_lookup_sym.to_dict().get(gene)
    if gene_id: return str(gene_id)
    gene_id = ncbi_lookup_sym_cap.to_dict().get(gene)
    if gene_id: return str(gene_id)
    return str(ncbi_lookup_syn_disambiguated.to_dict().get(gene))

In [6]:
df = pd.read_csv('data/gwas_catalog_v1.0-associations_e109_r2023-05-20.tsv', sep="\t")
df.head()

  df = pd.read_csv('data/gwas_catalog_v1.0-associations_e109_r2023-05-20.tsv', sep="\t")


Unnamed: 0,DATE ADDED TO CATALOG,PUBMEDID,FIRST AUTHOR,DATE,JOURNAL,LINK,STUDY,DISEASE/TRAIT,INITIAL SAMPLE SIZE,REPLICATION SAMPLE SIZE,...,CONTEXT,INTERGENIC,RISK ALLELE FREQUENCY,P-VALUE,PVALUE_MLOG,P-VALUE (TEXT),OR or BETA,95% CI (TEXT),PLATFORM [SNPS PASSING QC],CNV
0,2017-08-10,28443625,Justice AE,2017-04-26,Nat Commun,www.ncbi.nlm.nih.gov/pubmed/28443625,"Genome-wide meta-analysis of 241,258 adults ac...",BMI (adjusted for smoking behaviour),"98,173 European ancestry women, 64,373 Europea...","21,496 European ancestry women, 24,385 Europea...",...,upstream_gene_variant,1.0,0.7603,1.0000000000000001e-28,28.0,(women),0.0542,[0.045-0.064] kg/m2 decrease,"Affymetrix, Illumina, Perlegen [up to 2800000]...",N
1,2017-08-10,28443625,Justice AE,2017-04-26,Nat Commun,www.ncbi.nlm.nih.gov/pubmed/28443625,"Genome-wide meta-analysis of 241,258 adults ac...",BMI (adjusted for smoking behaviour),"98,173 European ancestry women, 64,373 Europea...","21,496 European ancestry women, 24,385 Europea...",...,intron_variant,0.0,0.3933,2e-06,5.69897,(women),0.02,[0.012-0.028] kg/m2 increase,"Affymetrix, Illumina, Perlegen [up to 2800000]...",N
2,2017-08-10,28443625,Justice AE,2017-04-26,Nat Commun,www.ncbi.nlm.nih.gov/pubmed/28443625,"Genome-wide meta-analysis of 241,258 adults ac...",BMI (adjusted for smoking behaviour),"98,173 European ancestry women, 64,373 Europea...","21,496 European ancestry women, 24,385 Europea...",...,intron_variant,0.0,0.3933,1e-11,11.0,,0.0223,[0.016-0.029] kg/m2 increase,"Affymetrix, Illumina, Perlegen [up to 2800000]...",N
3,2017-08-10,28443625,Justice AE,2017-04-26,Nat Commun,www.ncbi.nlm.nih.gov/pubmed/28443625,"Genome-wide meta-analysis of 241,258 adults ac...",BMI (adjusted for smoking behaviour),"98,173 European ancestry women, 64,373 Europea...","21,496 European ancestry women, 24,385 Europea...",...,intron_variant,0.0,0.3933,1e-07,7.0,(men),0.0254,[0.016-0.035] kg/m2 increase,"Affymetrix, Illumina, Perlegen [up to 2800000]...",N
4,2017-08-10,28443625,Justice AE,2017-04-26,Nat Commun,www.ncbi.nlm.nih.gov/pubmed/28443625,"Genome-wide meta-analysis of 241,258 adults ac...",BMI (adjusted for smoking behaviour),"98,173 European ancestry women, 64,373 Europea...","21,496 European ancestry women, 24,385 Europea...",...,intron_variant,0.0,0.1937,8e-07,6.09691,,0.0231,[0.014-0.032] kg/m2 increase,"Affymetrix, Illumina, Perlegen [up to 2800000]...",N


In [7]:
cols = ['DISEASE/TRAIT', 
       'REPORTED GENE(S)', 'MAPPED_GENE', 'UPSTREAM_GENE_ID',
       'DOWNSTREAM_GENE_ID',  'SNP_GENE_IDS',  'P-VALUE', 'PVALUE_MLOG', 'P-VALUE (TEXT)',
       ]

df = df[cols]

In [8]:
df.head()

Unnamed: 0,DISEASE/TRAIT,REPORTED GENE(S),MAPPED_GENE,UPSTREAM_GENE_ID,DOWNSTREAM_GENE_ID,SNP_GENE_IDS,P-VALUE,PVALUE_MLOG,P-VALUE (TEXT)
0,BMI (adjusted for smoking behaviour),MC4R,RPS3AP49 - RNU4-17P,ENSG00000242060,ENSG00000202468,,1.0000000000000001e-28,28.0,(women)
1,BMI (adjusted for smoking behaviour),MTCH2,MTCH2,,,ENSG00000109919,2e-06,5.69897,(women)
2,BMI (adjusted for smoking behaviour),MTCH2,MTCH2,,,ENSG00000109919,1e-11,11.0,
3,BMI (adjusted for smoking behaviour),MTCH2,MTCH2,,,ENSG00000109919,1e-07,7.0,(men)
4,BMI (adjusted for smoking behaviour),MTIF3,MTIF3,,,ENSG00000122033,8e-07,6.09691,


In [9]:
len(set(df['DISEASE/TRAIT']))

24929

In [10]:
all_genes = {}
for genes in tqdm(df['MAPPED_GENE'].unique()):
    if type(genes) == str:
        for g in genes.split(","):
            gene = g.strip()
            if gene not in all_genes:
                all_genes[gene] = ''

100%|██████████| 44813/44813 [00:00<00:00, 1664476.51it/s]


In [11]:
len(all_genes)

42757

In [12]:
for gene in tqdm(all_genes):
    all_genes[gene] = gene_lookup(gene)

100%|██████████| 42757/42757 [2:39:36<00:00,  4.46it/s]  


In [13]:
with open("output/all_genes.json", 'w') as o:
    o.write(json.dumps(all_genes))

In [14]:
gmt = {}
none_counter = {}
for i, row in tqdm(df.iterrows()):
    disease = row['DISEASE/TRAIT']
    if disease not in gmt:
        gmt[disease] = set()
    genes = row["MAPPED_GENE"]
    if type(genes) == str:
        for g in genes.split(","):
            gene = all_genes[g.strip()]
            if gene != 'None':
                gmt[disease].add(gene)
            else:
                if g.strip() not in none_counter:
                    none_counter[g.strip()] = 0
                none_counter[g.strip()] += 1


519421it [00:08, 59273.67it/s]


In [15]:
len(none_counter)

23858

In [16]:
len(gmt)

24929

In [17]:
import csv

In [18]:
with open('output/GWAS_Catalog_2023.gmt', 'w') as o:
    csv_writer = csv.writer(o, delimiter="\t")
    for k,v in gmt.items():
        if len(v) >= 5:
            row = [k, ''] + list(v)
            csv_writer.writerow(row)