In [2]:
import pandas as pd
from difflib import get_close_matches
from sklearn.metrics import roc_curve, auc
from maayanlab_bioinformatics.harmonization import ncbi_genes_lookup
import matplotlib.pyplot as plt
import numpy as np
from time import sleep
import seaborn as sns
from tqdm import tqdm
import json
# Build the gene lookup once for the whole notebook. Later cells call it as
# lookup(SYMBOL) with an upper-cased gene name and drop genes whose result is falsy.
# NOTE(review): presumably maps aliases to official NCBI gene symbols — confirm
# against the maayanlab_bioinformatics documentation.
lookup = ncbi_genes_lookup()

In [3]:
def _as_drug_key(raw_name):
    """Upper-case a drug name and turn every space into a hyphen."""
    return raw_name.upper().replace(' ', '-')


# Reference tables of known inhibitors, one per drug class. Each table keeps
# its own name column ('Drug' for COX/HDAC, 'Name' for CDK).
cox_df = pd.read_csv('data/cox_inhibitor.tsv', sep='\t')
hdac_df = pd.read_csv('data/hdac_inhibitors.csv', index_col=0)
cdk_df = pd.read_csv('data/CDK inhibitor.txt', sep='\t')

# Normalised drug names (upper case, hyphen-separated) as plain lists.
cox_inhibitors = list(map(_as_drug_key, cox_df['Drug'].values))
hdac_inhibitors = list(map(_as_drug_key, hdac_df['Drug'].values))
cdk_inhibitors = list(map(_as_drug_key, cdk_df['Name'].values))

In [4]:
# NOTE(review): this file is read from '../data', while every other input in
# this notebook is read from 'data/' — confirm the path is intentional.
with open('../data/counts_perts.json') as handle:
    counts_perts = json.load(handle)

# Upper-cased keys of counts_perts; used below as the set of known drug names.
all_drugs = list(map(str.upper, counts_perts.keys()))

In [5]:
# Rewrite the CDK-inhibitor GMT file with every gene passed through `lookup`.
# Genes for which `lookup` returns a falsy value are dropped, and the
# description column (second field) is written out empty.
with open('data/cdk_inhibitor.gmt') as fr, open('data/cdk_inhibitor_hu_pc.gmt', 'w') as fw:
    for line in fr:
        fields = line.strip().split('\t')
        term = fields[0]
        if not term:
            # Blank line: previously this produced an empty "\t\t" record.
            continue
        # Call lookup exactly once per gene (it used to be called twice:
        # once for the filter and once for the value).
        mapped = (lookup(symbol.upper()) for symbol in fields[2:])
        genes = [official for official in mapped if official]
        genes_str = '\t'.join(genes)
        fw.write(f"{term}\t\t{genes_str}\n")

In [6]:
# Rewrite the HDAC-inhibitor GMT file with every gene passed through `lookup`.
# Genes for which `lookup` returns a falsy value are dropped, and the
# description column (second field) is written out empty.
with open('data/hdac_inhibitor.gmt') as fr, open('data/hdac_inhibitor_hu_pc.gmt', 'w') as fw:
    for line in fr:
        fields = line.strip().split('\t')
        term = fields[0]
        if not term:
            # Blank line: previously this produced an empty "\t\t" record.
            continue
        # Call lookup exactly once per gene (it used to be called twice:
        # once for the filter and once for the value).
        mapped = (lookup(symbol.upper()) for symbol in fields[2:])
        genes = [official for official in mapped if official]
        genes_str = '\t'.join(genes)
        fw.write(f"{term}\t\t{genes_str}\n")

In [7]:
#for d in list(set(cdk_inhibitors).difference(all_drugs)):
#    closest_matches = get_close_matches(d, all_drugs, n=5)
#    print(d, closest_matches)

In [None]:
# How many inhibitors of each class (COX, HDAC, CDK — in that order) are also
# present in all_drugs.
known_drugs = set(all_drugs)
for inhibitor_names in (cox_inhibitors, hdac_inhibitors, cdk_inhibitors):
    print(len(set(inhibitor_names) & known_drugs))

In [13]:

def query_consensus_l2s2(genes, sortby='pvalue', timeout=300):
    """Run an L2S2 enrichment query and return its consensus drug table.

    Parameters
    ----------
    genes : list of str
        Gene symbols sent as the ``genes`` GraphQL variable.
    sortby : str, default 'pvalue'
        Value sent as the ``sortBy`` GraphQL variable.
    timeout : float or tuple, default 300
        Timeout in seconds handed to ``requests.post`` so that a stalled
        connection cannot block the calling loop forever.

    Returns
    -------
    pandas.DataFrame
        One row per entry of the ``consensus`` list in the response. When the
        list is empty the frame has no columns, so callers should check for
        the ``'drug'`` column before using it.

    Raises
    ------
    Exception
        If the HTTP status is not 200, or if the response body does not
        contain ``data.currentBackground.enrich.consensus`` (for example when
        the server answers with a GraphQL ``errors`` payload).
    """
    # Kept as a local import so this cell does not depend on another cell
    # having imported requests.
    import requests
    query = {
        "operationName": "EnrichmentQuery",
        "variables": {
            "filterTerm": "",
            "filterFda": False,
            "sortBy": sortby,
            "filterKo": False,
            "genes": genes,
            "pvalueLe": .99999
        },
        "query": 'query EnrichmentQuery($genes: [String]!, $filterTerm: String = "", $offset: Int, $first: Int, $filterFda: Boolean = false, $sortBy: String = "", $filterKo: Boolean = false, $pvalueLe: Float = 1.1) {\n  currentBackground {\n    enrich(\n      genes: $genes\n      filterTerm: $filterTerm\n      offset: $offset\n      first: $first\n      filterFda: $filterFda\n      sortby: $sortBy\n      filterKo: $filterKo\n      pvalueLe: $pvalueLe\n    ) {\n     consensus {\n        drug\n        oddsRatio\n        pvalue\n        adjPvalue\n        approved\n        countSignificant\n        countInsignificant\n        countUpSignificant\n        pvalueUp\n        adjPvalueUp\n        oddsRatioUp\n        pvalueDown\n        adjPvalueDown\n        oddsRatioDown\n        __typename\n      }\n      __typename\n    }\n    __typename\n  }\n}\n',
    }
    res = requests.post(
        "http://l2s2.maayanlab.cloud/graphql",
        data=json.dumps(query),
        headers={'Content-Type': 'application/json'},
        timeout=timeout,
    )
    if res.status_code != 200:
        raise Exception("Query failed to run by returning code of {}. {}".format(res.status_code, query))

    payload = res.json()
    try:
        consensus = payload['data']['currentBackground']['enrich']['consensus']
    except (KeyError, TypeError) as exc:
        # GraphQL servers can answer 200 with {"errors": [...], "data": null};
        # surface those messages instead of an opaque subscript error.
        errors = payload.get('errors') if isinstance(payload, dict) else None
        raise Exception(
            "L2S2 response did not contain consensus results. GraphQL errors: {}".format(errors)
        ) from exc
    return pd.DataFrame(consensus)

In [None]:
import os

# Query L2S2 once per gene set in the HDAC-inhibitor GMT file and store each
# consensus table as data/hdac_out/hdac_<term>.tsv.
os.makedirs('data/hdac_out', exist_ok=True)  # to_csv does not create directories

with open('data/hdac_inhibitor.gmt') as f:
    gmt_lines = f.readlines()

for line in tqdm(gmt_lines):
    fields = line.strip().split('\t')
    term = fields[0]
    if not term:
        # Blank line: nothing to query.
        continue
    genes = [g.upper() for g in fields[2:] if g != '']
    out_path = f'data/hdac_out/hdac_{term}.tsv'
    # Skip terms that already have a result file, matching the CDK cell, so a
    # re-run only queries what is missing. Delete the file to force a refresh.
    if os.path.exists(out_path):
        continue
    try:
        rank_df = query_consensus_l2s2(genes, sortby='pvalue')
    except Exception as e:
        print('Error computing for', term, e)
        continue
    if 'drug' not in rank_df.columns:
        # An empty consensus list yields a frame without columns.
        print('No consensus results for', term)
        continue
    rank_df.to_csv(out_path, sep='\t')

In [None]:
import os

# Query L2S2 once per gene set in the CDK-inhibitor GMT file and store each
# consensus table as data/cdk_out/cdk_<term>.tsv.
os.makedirs('data/cdk_out', exist_ok=True)  # to_csv does not create directories

with open('data/cdk_inhibitor.gmt') as f:
    gmt_lines = f.readlines()

for line in tqdm(gmt_lines):
    fields = line.strip().split('\t')
    term = fields[0]
    if not term:
        # Blank line: nothing to query.
        continue
    genes = [g.upper() for g in fields[2:] if g != '']
    out_path = f'data/cdk_out/cdk_{term}.tsv'
    # Terms that already have a result file are not queried again; delete the
    # file to force a refresh.
    if os.path.exists(out_path):
        continue
    try:
        rank_df = query_consensus_l2s2(genes, sortby='pvalue')
    except Exception as e:
        print('Error computing for', term, e)
        continue
    if 'drug' not in rank_df.columns:
        # An empty consensus list yields a frame without columns.
        print('No consensus results for', term)
        continue
    rank_df.to_csv(out_path, sep='\t')

In [None]:
import os

# For each ranking criterion, pool rank-based scores and HDAC-inhibitor labels
# over every result file in data/hdac_out.
ranking_dict = {
    metric: {'scores': [], 'labels': []}
    for metric in ('pvalue', 'oddsRatio', 'pvalueUp', 'pvalueDown', 'oddsRatioUp', 'oddsRatioDown')
}

# Set membership instead of scanning the list once per drug row.
hdac_positive_names = set(hdac_inhibitors)

# Each file is read once and then ranked by every metric (it used to be
# re-read once per metric).
for term in tqdm(os.listdir('data/hdac_out')):
    if not term.endswith('.tsv'):
        # Ignore stray entries such as checkpoint folders or hidden files.
        continue
    consensus_df = pd.read_csv(f'data/hdac_out/{term}', sep='\t')
    # Only drugs with a significant overall p-value are ranked.
    significant_df = consensus_df[consensus_df['pvalue'] < 0.05]
    if significant_df.empty:
        continue
    for metric, pooled in ranking_dict.items():
        # p-value metrics: smaller is better; odds-ratio metrics: larger is better.
        # sort_values returns a new frame, so the filtered slice is never mutated.
        ranked_df = significant_df.sort_values(
            by=metric, ascending='pvalue' in metric
        ).reset_index(drop=True)
        labels = [1 if x.upper() in hdac_positive_names else 0 for x in ranked_df['drug']]
        # Best-ranked drug scores closest to 1, the last-ranked drug scores 0.
        scores = 1 - ((ranked_df.index.values + 1) / len(ranked_df))
        pooled['scores'].extend(list(scores))
        pooled['labels'].extend(labels)
        

In [None]:
# ROC curve per ranking criterion, pooled over all HDAC result files.
fig, ax = plt.subplots(figsize=(8, 6))
for metric in ranking_dict.keys():
    fpr, tpr, _ = roc_curve(ranking_dict[metric]['labels'], ranking_dict[metric]['scores'])
    roc_auc = auc(fpr, tpr)
    ax.plot(fpr, tpr, lw=2, label=f'{metric} (AUC = {roc_auc:.2f})')
ax.plot([0, 1], [0, 1], color='gray', linestyle='--', label='Random')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('HDAC inhibitor recovery ROC (all gene sets)')
ax.grid(alpha=0.3)
# Build the legend last so the 'Random' diagonal is listed too; it used to be
# drawn after the final legend() call and was therefore missing.
ax.legend(loc='lower right')
plt.show()

In [None]:
import os

# For each ranking criterion, pool rank-based scores and HDAC-inhibitor labels
# over the result files in data/hdac_out whose file name contains 'up'.
ranking_dict_up = {
    metric: {'scores': [], 'labels': []}
    for metric in ('pvalue', 'oddsRatio', 'pvalueUp', 'pvalueDown', 'oddsRatioUp', 'oddsRatioDown')
}

# Set membership instead of scanning the list once per drug row.
hdac_positive_names = set(hdac_inhibitors)

# Each file is read once and then ranked by every metric (it used to be
# re-read once per metric).
for term in tqdm(os.listdir('data/hdac_out')):
    # NOTE(review): 'up' is matched anywhere in the file name, so a term whose
    # drug name happens to contain "up" is selected as well — confirm the
    # naming scheme makes this safe.
    if 'up' not in term or not term.endswith('.tsv'):
        continue
    consensus_df = pd.read_csv(f'data/hdac_out/{term}', sep='\t')
    # Only drugs with a significant overall p-value are ranked.
    significant_df = consensus_df[consensus_df['pvalue'] < 0.05]
    if significant_df.empty:
        continue
    for metric, pooled in ranking_dict_up.items():
        # p-value metrics: smaller is better; odds-ratio metrics: larger is better.
        # sort_values returns a new frame, so the filtered slice is never mutated.
        ranked_df = significant_df.sort_values(
            by=metric, ascending='pvalue' in metric
        ).reset_index(drop=True)
        labels = [1 if x.upper() in hdac_positive_names else 0 for x in ranked_df['drug']]
        # Best-ranked drug scores closest to 1, the last-ranked drug scores 0.
        scores = 1 - ((ranked_df.index.values + 1) / len(ranked_df))
        pooled['scores'].extend(list(scores))
        pooled['labels'].extend(labels)

In [None]:
# ROC curve per ranking criterion, pooled over the 'up' HDAC result files.
fig, ax = plt.subplots(figsize=(8, 6))
for m in ranking_dict_up.keys():
    fpr, tpr, _ = roc_curve(ranking_dict_up[m]['labels'], ranking_dict_up[m]['scores'])
    roc_auc = auc(fpr, tpr)
    ax.plot(fpr, tpr, lw=2, label=f'{m} (AUC = {roc_auc:.2f})')
ax.plot([0, 1], [0, 1], color='gray', linestyle='--', label='Random')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title("HDAC inhibitor recovery ROC ('up' gene sets)")
ax.grid(alpha=0.3)
# Build the legend last so the 'Random' diagonal is listed too; it used to be
# drawn after the final legend() call and was therefore missing.
ax.legend(loc='lower right')
plt.show()

In [None]:
import os

# For each ranking criterion, pool rank-based scores and HDAC-inhibitor labels
# over the result files in data/hdac_out whose file name contains 'dn'.
ranking_dict_dn = {
    metric: {'scores': [], 'labels': []}
    for metric in ('pvalue', 'oddsRatio', 'pvalueUp', 'pvalueDown', 'oddsRatioUp', 'oddsRatioDown')
}

# Set membership instead of scanning the list once per drug row.
hdac_positive_names = set(hdac_inhibitors)

# Each file is read once and then ranked by every metric (it used to be
# re-read once per metric).
for term in tqdm(os.listdir('data/hdac_out')):
    # NOTE(review): 'dn' is matched anywhere in the file name, so a term whose
    # drug name happens to contain "dn" is selected as well — confirm the
    # naming scheme makes this safe.
    if 'dn' not in term or not term.endswith('.tsv'):
        continue
    consensus_df = pd.read_csv(f'data/hdac_out/{term}', sep='\t')
    # Only drugs with a significant overall p-value are ranked.
    significant_df = consensus_df[consensus_df['pvalue'] < 0.05]
    if significant_df.empty:
        continue
    # Iterate this cell's own dictionary; the loop used to walk the keys of
    # ranking_dict, which only worked if an earlier cell had been run.
    for metric, pooled in ranking_dict_dn.items():
        # p-value metrics: smaller is better; odds-ratio metrics: larger is better.
        # sort_values returns a new frame, so the filtered slice is never mutated.
        ranked_df = significant_df.sort_values(
            by=metric, ascending='pvalue' in metric
        ).reset_index(drop=True)
        labels = [1 if x.upper() in hdac_positive_names else 0 for x in ranked_df['drug']]
        # Best-ranked drug scores closest to 1, the last-ranked drug scores 0.
        scores = 1 - ((ranked_df.index.values + 1) / len(ranked_df))
        pooled['scores'].extend(list(scores))
        pooled['labels'].extend(labels)


# ROC curve per ranking criterion, pooled over the 'dn' HDAC result files.
fig, ax = plt.subplots(figsize=(8, 6))
for metric in ranking_dict_dn.keys():
    fpr, tpr, _ = roc_curve(ranking_dict_dn[metric]['labels'], ranking_dict_dn[metric]['scores'])
    roc_auc = auc(fpr, tpr)
    ax.plot(fpr, tpr, lw=2, label=f'{metric} (AUC = {roc_auc:.2f})')
ax.plot([0, 1], [0, 1], color='gray', linestyle='--', label='Random')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title("HDAC inhibitor recovery ROC ('dn' gene sets)")
ax.grid(alpha=0.3)
# Build the legend last so the 'Random' diagonal is listed too; it used to be
# drawn after the final legend() call and was therefore missing.
ax.legend(loc='lower right')
plt.show()