## Mapping Medication Categories to Associated Genes
#### Link to resource paper: https://www.nature.com/articles/s41467-019-09572-5#MOESM1 (Supplementary Data 5)

In [1]:
import pandas as pd
from collections import defaultdict
import json
import os

In [2]:
# Read the tab-separated medication-category / gene association table into a DataFrame
df = pd.read_csv('../input/UK_BioBank_GWAS_Medication_Disease.tsv', sep='\t')

In [3]:
# Preview the first five rows of the freshly loaded table
df.head(n=5)

Unnamed: 0,Medication category,Gene,Entrez ID,P,Pbc
0,A02B,HLA-B,3106,4.3e-12,Yes
1,A02B,UBD,10537,7.6e-11,Yes
2,A02B,OR2H2,7932,1.1e-10,Yes
3,A02B,HIST1H3I,8354,9.4e-10,Yes
4,A02B,HIST1H4J,8363,1e-09,Yes


In [4]:
# Unique ATC codes that still need a human-readable annotation
atc_codes = set(df['Medication category'])
print(atc_codes)

{'C01D', 'N02C', 'N06A', 'C10AA', 'L04', 'A02B', 'N02BA', 'N02BE', 'R06A', 'M05B', 'S01E', 'C02', 'N02A', 'H03A', 'B01A', 'M01A', 'C03', 'A10', 'R03A', 'C08', 'C07', 'C09', 'R03BA'}


#### Annotating each ATC code with proper description from https://www.whocc.no/

In [5]:
# ATC code -> annotated label. Each label keeps the code as its prefix,
# followed by the category description.
atc_lookup = {
    'C07': 'C07 BETA BLOCKING AGENTS',
    'H03A': 'H03A THYROID PREPARATIONS',
    'C02': 'C02 ANTIHYPERTENSIVES',
    'C08': 'C08 CALCIUM CHANNEL BLOCKERS',
    'B01A': 'B01A ANTITHROMBOTIC AGENTS',
    'N02BA': 'N02BA SALICYLIC ACID AND DERIVATIVES',
    'C09': 'C09 AGENTS ACTING ON THE RENIN-ANGIOTENSIN SYSTEM',
    'C03': 'C03 DIURETICS',
    'R06A': 'R06A ANTIHISTAMINES FOR SYSTEMIC USE',
    'C10AA': 'C10AA HMG CoA REDUCTASE INHIBITORS',
    'A10': 'A10 DRUGS USED IN DIABETES',
    'M01A': 'M01A ANTIINFLAMMATORY AND ANTIRHEUMATIC PRODUCTS, NON-STEROIDS',
    'N02C': 'N02C ANTIMIGRAINE PREPARATIONS',
    'S01E': 'S01E ANTIGLAUCOMA PREPARATIONS AND MIOTICS',
    'R03BA': 'R03BA GLUCOCORTICOIDS',
    'L04': 'L04 IMMUNOSUPPRESSANTS',
    'N02A': 'N02A OPIOIDS',
    'M05B': 'M05B DRUGS AFFECTING BONE STRUCTURE AND MINERALIZATION',
    'R03A': 'R03A ADRENERGICS, INHALANTS',
    'N02BE': 'N02BE OTHER ANALGESICS AND ANTIPYRETICS',
    'N06A': 'N06A ANTIDEPRESSANTS',
    'C01D': 'C01D VASODILATORS USED IN CARDIAC DISEASES',
    'A02B': 'A02B DRUGS FOR PEPTIC ULCER AND GASTRO-OESOPHAGEAL REFLUX DISEASE (GORD)',
}

# Series.map(dict) turns every value that is not a dictionary key into NaN, so
# running this cell a second time would blank the whole column (the annotated
# labels are not keys). Looking each value up with itself as the fallback keeps
# the cell idempotent; a code missing from atc_lookup is left as the bare code
# instead of silently becoming NaN.
df['Medication category'] = df['Medication category'].map(
    lambda code: atc_lookup.get(code, code)
)

In [6]:
# Preview the first five rows after annotating the medication categories
df.head(n=5)

Unnamed: 0,Medication category,Gene,Entrez ID,P,Pbc
0,A02B DRUGS FOR PEPTIC ULCER AND GASTRO-OESOPHA...,HLA-B,3106,4.3e-12,Yes
1,A02B DRUGS FOR PEPTIC ULCER AND GASTRO-OESOPHA...,UBD,10537,7.6e-11,Yes
2,A02B DRUGS FOR PEPTIC ULCER AND GASTRO-OESOPHA...,OR2H2,7932,1.1e-10,Yes
3,A02B DRUGS FOR PEPTIC ULCER AND GASTRO-OESOPHA...,HIST1H3I,8354,9.4e-10,Yes
4,A02B DRUGS FOR PEPTIC ULCER AND GASTRO-OESOPHA...,HIST1H4J,8363,1e-09,Yes


In [7]:
# Only keeping genes whose P values were significant after Bonferroni correction (Pbc column).
# .copy() makes the filtered result an independent frame, so the column that a
# later cell adds to `df` is not written into a slice of the loaded table
# (which is what triggers pandas' SettingWithCopyWarning).
df = df[~df["Pbc"].str.contains("No")].copy()

### Ensuring all gene names are valid
#### Lookup table generated from ftp://ftp.ncbi.nih.gov/gene/DATA/GENE_INFO/Mammalia/

In [9]:
# Load only the GeneID and Symbol columns of the gene_info table and turn them
# into a GeneID -> Symbol dictionary
gene_info = pd.read_csv(
    '../input/Homo_sapiens.gene_info',
    sep='\t',
    usecols=['GeneID', 'Symbol'],
)
gene_lookup = gene_info.set_index('GeneID')['Symbol'].to_dict()

In [10]:
# Matching Approved Symbols to the Entrez IDs provided in the data; IDs that are
# absent from gene_lookup come back as NaN.
# assign() returns a new frame instead of inserting a column into `df`, which at
# this point is a filtered selection of the loaded table. That avoids pandas'
# SettingWithCopyWarning and keeps the cell safe to re-run.
df = df.assign(**{'Approved Symbols': df['Entrez ID'].map(gene_lookup)})

In [11]:
# Show every row that has a missing value in any column (entries with
# invalid Entrez IDs / genes)
has_missing = df.isnull().any(axis='columns')
unapproved_symbols = df.loc[has_missing]
unapproved_symbols

Unnamed: 0,Medication category,Gene,Entrez ID,P,Pbc,Approved Symbols
395,C03 DIURETICS,LOC101929490,101929490,6.2e-13,Yes,
1321,C09 AGENTS ACTING ON THE RENIN-ANGIOTENSIN SYSTEM,LOC101929490,101929490,3e-10,Yes,
1464,C09 AGENTS ACTING ON THE RENIN-ANGIOTENSIN SYSTEM,LOC101060022,101060022,2.1e-08,Yes,
2460,H03A THYROID PREPARATIONS,LOC101929490,101929490,8.4e-11,Yes,
3060,N02BE OTHER ANALGESICS AND ANTIPYRETICS,LOC554223,554223,1.8e-10,Yes,
3216,"R03A ADRENERGICS, INHALANTS",LOC101928947,101928947,2.7e-16,Yes,
3489,R03BA GLUCOCORTICOIDS,LOC101928947,101928947,5.8e-12,Yes,


In [12]:
# Discard every row that still has a missing value in any column
# (the entries that could not be matched)
df = df.dropna(axis='index', how='any')

In [13]:
# Preview the first five rows after removing unmatched entries
df.head(n=5)

Unnamed: 0,Medication category,Gene,Entrez ID,P,Pbc,Approved Symbols
0,A02B DRUGS FOR PEPTIC ULCER AND GASTRO-OESOPHA...,HLA-B,3106,4.3e-12,Yes,HLA-B
1,A02B DRUGS FOR PEPTIC ULCER AND GASTRO-OESOPHA...,UBD,10537,7.6e-11,Yes,UBD
2,A02B DRUGS FOR PEPTIC ULCER AND GASTRO-OESOPHA...,OR2H2,7932,1.1e-10,Yes,OR2H2
3,A02B DRUGS FOR PEPTIC ULCER AND GASTRO-OESOPHA...,HIST1H3I,8354,9.4e-10,Yes,H3C11
4,A02B DRUGS FOR PEPTIC ULCER AND GASTRO-OESOPHA...,HIST1H4J,8363,1e-09,Yes,H4C11


### Mapping medication category to genes

In [14]:
# Pair each row's medication category with its approved gene symbol
atc = df['Medication category'].tolist()
genes = df['Approved Symbols'].tolist()
gene_dict = tuple(zip(atc, genes))

# Collect the symbols under their category, in row order
grouped = {}
for category, symbol in gene_dict:
    grouped.setdefault(category, []).append(symbol)

# Keep only the categories that ended up with at least five genes
genesetlibrary = {
    category: symbols
    for category, symbols in grouped.items()
    if len(symbols) >= 5
}

### Counts of unique terms and associations

In [15]:
# Number of distinct genes across all gene sets
len({symbol for members in genesetlibrary.values() for symbol in members})

1212

In [16]:
# Number of unique attributes (terms kept in the library)
len(genesetlibrary.keys())

20

In [17]:
# Number of term-gene associations (gene entries summed over all sets)
sum(len(members) for members in genesetlibrary.values())

2338

### Exporting as GMT

In [18]:
# One line per term: the term, an empty second field, then the term's genes,
# all separated by tabs.
output = [
    '{0}\t\t{1}'.format(term, '\t'.join(members))
    for term, members in genesetlibrary.items()
]
gmt_output = '\n'.join(output)

gmt_path = '../gmts/UK_BioBank_GWAS_Medication_SNP.gmt'
# Create the output folder when it does not exist yet, so the export does not
# fail on a fresh checkout.
os.makedirs(os.path.dirname(gmt_path), exist_ok=True)

# The original looped over gmt_output, which is a single string, and therefore
# wrote the file one character at a time. Writing it in one call produces the
# same bytes, and the context manager closes the handle even if the write fails.
with open(gmt_path, 'w') as dataFile:
    dataFile.write(gmt_output)