In [15]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import argparse
from utils import cfh, logger, Data, bool_ext, checkCorrelations, generate_masks_from_ppi
import os
import argparse
import random
from utils import cfh, logger, Data, bool_ext, checkCorrelations, generate_masks_from_ppi
from biomodels import BioCitrus
import torch
import numpy as np
import sys
from biomodels import weightConstraint
from utils import logger, get_minibatch, evaluate, EarlyStopping, shuffle_data
from tqdm import tqdm
from pathlib import Path
from IPython.display import clear_output
from collections import Counter
import warnings

%load_ext autoreload
%autoreload 2
warnings.filterwarnings("ignore") ##This is bad but temporary

In [16]:
data = Data(
    fGEP_SGA = 'data/CITRUS_GEP_SGAseparated.csv',
    fgene_tf_SGA = 'data/CITRUS_gene_tf_SGAseparated.csv',
    fcancerType_SGA = 'data/CITRUS_canType_SGAseparated.csv',
    fSGA_SGA = 'data/CITRUS_SGA_SGAseparated.csv',
)

In [20]:
np.unique([i.split('_')[1] for i in data.sga_sga.columns]).__len__()

10552

In [97]:
_dual_alterations = np.array([('SM_'+i, 'SCNA_'+i) for i in data.sga_genes]).reshape(-1)
df = pd.DataFrame(columns=_dual_alterations)
df = data.sga_sga.reindex(columns=set(_dual_alterations).union(set(data.sga_sga.columns))).fillna(0)
df = df.astype(int)
df = df[_dual_alterations]


In [99]:
df.shape

(5803, 21104)

In [49]:
data.sga_sga.get('SM_A1BG', np.zeros(data.sga_sga.shape[0]).astype(int)).shape

(5803,)

In [7]:
def pad_list(stag_arr):
    pad = len(max(stag_arr, key=len))
    return np.array([i + [0]*(pad-len(i)) for i in stag_arr])

In [3]:
df = pd.DataFrame(np.where(data.sga_sga.values != 0)).T
df.columns = ['idx', 'indices']

In [4]:
sga = [df[df.idx==i].indices.values.tolist() for i in df.idx.unique()]

In [13]:
pad_list(sga)[[1, 3, 6, 7]]

array([[  63,   74,   78, ...,    0,    0,    0],
       [  49,   53,  112, ...,    0,    0,    0],
       [ 211,  442, 1290, ...,    0,    0,    0],
       [ 106,  113,  144, ...,    0,    0,    0]])

In [None]:
# %load_ext autoreload
# %autoreload 2

In [None]:
parser = argparse.ArgumentParser()

parser.add_argument(
    "--input_dir", 
    help="directory of input files", 
    type=str, 
    default="./data"
)
parser.add_argument(
    "--output_dir",
    help="directory of output files",
    type=str,
    default="./output",
)

parser.add_argument(
    "--algo", 
    help="clustering algorithm to use on the portein-protein network (DPCLUS, MCODE, COACH)", 
    type=str, 
    default='COACH'
)


parser.add_argument(
    "--learning_rate", 
    help="learning rate for Adam", 
    type=float, 
    default=1e-2
)
parser.add_argument(
    "--max_iter", 
    help="maximum number of training iterations", 
    type=int, 
    default=300
)
parser.add_argument(
    "--max_fscore",
    help="Max F1 score to early stop model from training",
    type=float,
    default=0.7
)
parser.add_argument(
    "--batch_size", 
    help="training batch size", 
    type=int, 
    default=100
)
parser.add_argument(
    "--test_batch_size", 
    help="test batch size", 
    type=int, 
    default=100
)
parser.add_argument(
    "--test_inc_size",
    help="increment interval size between log outputs",
    type=int,
    default=256
)
parser.add_argument(
    "--dropout_rate", 
    help="dropout rate", 
    type=float, 
    default=0.1
)

parser.add_argument(
    "--weight_decay", 
    help="coefficient of l2 regularizer", 
    type=float, 
    default=1e-2
)
parser.add_argument(
    "--activation",
    help="activation function used in hidden layer",
    type=str,
    default="tanh",
)
parser.add_argument(
    "--patience", 
    help="earlystopping patience", 
    type=int, 
    default=20
)
parser.add_argument(
    "--mask01",
    help="wether to ignore the float value and convert mask to 01",
    type=bool_ext,
    default=True,
)
parser.add_argument(
    "--gep_normalization", 
    help="how to normalize gep", 
    type=str, 
    default="scaleRow"
)

parser.add_argument(
    "--cancer_type",
    help="whether to use cancer type or not",
    type=bool_ext,
    default=False,
)
parser.add_argument(
    "--train_model",
    help="whether to train model or load model",
    type=bool_ext,
    default=True,
)
parser.add_argument(
    "--dataset_name",
    help="the dataset name loaded and saved",
    type=str,
    default="dataset_CITRUS",
)
parser.add_argument(
    "--tag", 
    help="a tag passed from command line", 
    type=str, 
    default=""
)
parser.add_argument(
    "--run_count", 
    help="the count for training", 
    type=str, 
    default="1"
)

parser.add_argument(
    "--ppi_weights", 
    help="", 
    type=bool_ext, 
    default=False
)

parser.add_argument(
    "--verbose", 
    help="", 
    type=bool_ext, 
    default=False
)

parser.add_argument(
    "--constrain", 
    help="force weight and biases to be strictly non-negative", 
    type=bool_ext, 
    default=False
)

parser.add_argument(
    "--biases", 
    help="enable all nn.Linear biases", 
    type=bool_ext, 
    default=True
)

parser.add_argument(
    "--sparse", 
    help="only use SIGNOR data, resulting in sparser connections", 
    type=bool_ext, 
    default=False
)

args = parser.parse_args([])

data = Data(
    fGEP_SGA = 'data/CITRUS_GEP_SGAseparated.csv',
    fgene_tf_SGA = 'data/CITRUS_gene_tf_SGAseparated.csv',
    fcancerType_SGA = 'data/CITRUS_canType_SGAseparated.csv',
    fSGA_SGA = 'data/CITRUS_SGA_SGAseparated.csv',
    cancer_type='BRCA'
)

train_set, test_set = data.get_train_test()
args.gep_size = train_set['gep'].shape[1]
args.tf_gene = data.gene_tf_sga.values.T
args.can_size = len(np.unique(data.cancer_types))


sga_mask, sga_weights, tf_mask, tf_weights = generate_masks_from_ppi(sga = data.sga_sga, tf = data.gene_tf_sga, clust_algo=args.algo, sparse=args.sparse)


sga_mask = sga_mask
sga_weights = sga_weights.t()
tf_mask = tf_mask.t()
tf_weights = tf_weights


In [None]:
from captum.attr import LayerConductance, LayerActivation, LayerIntegratedGradients
from captum.attr import IntegratedGradients

In [None]:
models = []
for i in range(30):
    
    model = BioCitrus(
        args = args, 
        sga_ppi_mask = sga_mask, 
        ppi_tf_mask = tf_mask, 
        sga_ppi_weights = None, 
        ppi_tf_weights = None,
        enable_bias = args.biases
    )

    model.load_state_dict(torch.load(f'/ix/hosmanbeyoglu/kor11/CITRUS_models/BRCA_{i}.pth', 
                                map_location=torch.device('cpu')))
    
    model.eval()
    
    models.append(model)
    clear_output(wait=True)

In [None]:
X = torch.tensor(test_set['sga'])
Y = test_set['gep']

In [None]:
list(data.gep_sga.columns).index('PIK3CA')

In [3]:
all_attr_scores = np.load('all_attr_scores.npy')

In [4]:
models = np.array(models)[[21, 20, 10, 17, 5, 11, 12, 14, 6, 25]]

NameError: name 'models' is not defined

In [None]:
dff = pd.DataFrame(all_attr_scores[0], 
    columns=data.sga_sga.columns, index=data.gep_sga.columns).sum(0).sort_values(ascending=False)

In [None]:
df = pd.DataFrame([data.sga_sga.sum().loc[dff.index].values, dff.values], 
                  columns=dff.index, index=['alt_freq', 'int_grad']).T

In [None]:
df['int_grad'] = ((df['int_grad'] - df['int_grad'].min())/(df['int_grad'].max()-df['int_grad'].min()))*10000

In [None]:
df['int_grad'].mean()

1.6811278043547329

In [None]:
df['int_grad'] = np.log10(df['int_grad'])
df['alt_freq'] = np.log10(df['alt_freq'])

In [None]:
# plt.figure(figsize=(10, 8))
# ax = sns.scatterplot(data=df, x='alt_freq', y='int_grad')
# plt.xlabel('Log10 Alteration Frequency')
# plt.ylabel('Log10 Integrated Gradient Importance')
# # plt.savefig('frequency_plots.png', dpi=180)
# plt.show()

In [None]:
def tf_activity(model, target_gene):
    lc = LayerConductance(model, model.gep_output_layer)
    ix = list(data.gep_sga.columns).index(target_gene)
    a = lc.attribute(X, n_steps=5, attribute_to_layer_input=False, target=[ix]*len(X))
    
    ig_attr_test_sum = a.detach().numpy().sum(0)
    ig_attr_test_norm_sum = ig_attr_test_sum / np.linalg.norm(ig_attr_test_sum, ord=1)
    
    g = np.array(data.gene_tf_sga.columns)[np.where(ig_attr_test_norm_sum != 0)[0]]
    at = ig_attr_test_norm_sum[np.where(ig_attr_test_norm_sum != 0)[0]]
    
    df = pd.DataFrame([g, at]).T
    df.columns = ['TF', 'score']
    
    return df.sort_values(by='score', ascending=False)

In [21]:
with open('/ihome/hosmanbeyoglu/kor11/tools/CITRUS/COACH_clusters_large.txt', "r") as fh:
        lines = fh.readlines()
        clusterindex_to_genes = {}
        for i, c in enumerate(lines):
            clustlist = c.strip().split(" ")
            if len(c) == 0:
                continue
            clusterindex_to_genes[i] = clustlist 

from collections import defaultdict
gene_to_clusterindices = defaultdict(list) ## 'MAPK1':[0, 75, 129, 373]

## Create mapping between genes and the protein clusters
for c in clusterindex_to_genes.keys():
    for g in clusterindex_to_genes[c]:
        gene_to_clusterindices[g].append(c)  

In [None]:
cancers = ['BLCA', 'BRCA', 'CESC', 'COAD', 'ESCA', 'GBM', 'HNSC', 'KIRC',
       'KIRP', 'LIHC', 'LUAD', 'LUSC', 'PCPG', 'PRAD', 'STAD', 'THCA',
       'UCEC']

In [None]:
# r = []
# for cancer in cancers:
#     m = np.load(f'./metrics/{cancer}_shuffled_metrics.npy')
#     mean_loss, mean_corr = np.around(np.array([x[-1] for x in m]).mean(0), 3)
#     std_loss, std_corr = np.round(np.array([x[-1] for x in m]).std(0), 3)
#     r.append((cancer, f'{mean_corr}+/-{std_corr}', f'{mean_loss}+/-{std_loss}'))
# pd.DataFrame(r, columns = ['cancer type', 'mse', 'pearson'])

Unnamed: 0,cancer type,mse,pearson
0,BLCA,0.429+/-0.013,2.853+/-0.081
1,BRCA,0.435+/-0.016,2.67+/-0.071
2,CESC,0.334+/-0.015,3.423+/-0.107
3,COAD,0.427+/-0.012,3.161+/-0.055
4,ESCA,0.374+/-0.007,3.779+/-0.082
5,GBM,0.286+/-0.012,4.659+/-0.068
6,HNSC,0.439+/-0.014,2.739+/-0.071
7,KIRC,0.359+/-0.022,3.449+/-0.116
8,KIRP,0.275+/-0.01,4.016+/-0.09
9,LIHC,0.389+/-0.014,3.145+/-0.086


In [None]:
data = Data(
    fGEP_SGA = 'data/CITRUS_GEP_SGAseparated.csv',
    fgene_tf_SGA = 'data/CITRUS_gene_tf_SGAseparated.csv',
    fcancerType_SGA = 'data/CITRUS_canType_SGAseparated.csv',
    fSGA_SGA = 'data/CITRUS_SGA_SGAseparated.csv',
    cancer_type='BRCA'
)

train_set, test_set = data.get_train_test()


In [74]:
df = data.sga_sga
X = df.values
X.shape

(720, 11998)

In [75]:
X1 = df[(df['SM_PIK3CA']==1)]
X1.shape

(265, 11998)

In [76]:
X0 = df[(df['SM_PIK3CA']==0)]
X0.shape

(455, 11998)

In [30]:
tf_profiles1 = np.load('tf_profiles_1.npy')
tf_profiles2 = np.load('tf_profiles_2.npy')


In [32]:
r = [tf_profiles1, tf_profiles2]

In [33]:
from scipy.stats import mannwhitneyu

In [34]:
rf = pd.DataFrame([data.gene_tf_sga.columns, 
        [mannwhitneyu(r[0][:, i][:], r[1][:, i]).pvalue * 5541 for i in range(320)]]).T.sort_values(by=1)

rf.columns = ['TF', 'P']
rf['EFFECTSIZE'] = abs(tf_profiles1.mean(0)  - tf_profiles2.mean(0))
print(rf[rf.P < 0.05].shape)
# rf[rf.AdjPvalue < 0.05].sort_values(by='foldchange', ascending=False)
rf = rf[rf.P < 1]

(50, 3)


In [35]:
rf.P = rf.P.astype(np.float)

In [38]:
rf

Unnamed: 0,TF,P,EFFECTSIZE
294,HEY1,1.247146e-108,1.288033e-01
128,TCF12,5.787345e-35,7.084708e-01
121,NFYC,5.657469e-08,1.046785e-07
153,FLI1,1.543887e-06,5.049044e-01
83,ERG,1.681252e-06,0.000000e+00
...,...,...,...
257,ETV1,6.305362e-01,0.000000e+00
47,RUNX1,8.064497e-01,1.074662e-06
154,HIC1,8.492413e-01,4.700601e-03
26,HIVEP1,8.729153e-01,1.254514e-06


In [5]:
import requests
import json 
import pandas as pd

class Enrichr(object):
    
    def __init__(self):
        self.ENRICHR_URL_ADDLIST = 'https://maayanlab.cloud/Enrichr/addList'
        self.ENRICHR_URL = 'https://maayanlab.cloud/Enrichr/enrich'
        self.QUERY_STR = '?userListId=%s&backgroundType=%s'
        self.libraries = [
            'VirusMINT', 
            'GO_Biological_Process_2021', 
            'MSigDB_Hallmark_2020', 
            'KEGG_2021_Human', 
            'Reactome_2016']
        
    
    def _addlist(self, module_number, geneset):
        genes_str = '\n'.join(geneset)
        description = ''
        payload = {
            'list': (None, genes_str),
            'description': (None, description)
        }
        response = requests.post(self.ENRICHR_URL_ADDLIST, files=payload)
        data = json.loads(response.text)
        
        return data['userListId']
    
    def get_enrichment_results(self, geneset, gene_set_library = 'GO_Biological_Process_2021'):
        user_list_id = self._addlist(geneset)
        response = requests.get(
        self.ENRICHR_URL + self.QUERY_STR % (user_list_id, gene_set_library))
        data = json.loads(response.text)
        df = pd.DataFrame(data[gene_set_library])[[1, 2, 3, 4, 5, 6]]
        df.columns = ['Terms', 'Pval', 'OddsRatio', 'Score', 'Genes', 'AdjPval']
        
        return df[round(df.AdjPval, 3) < 0.05].sort_values(by='AdjPval')

In [7]:
model = models[0]
target_gene = 'PIK3CA'

NameError: name 'models' is not defined

In [None]:


lc = LayerConductance(model, model.ppi_layer)
ix = list(data.gep_sga.columns).index(target_gene)
a = lc.attribute(X, n_steps=5, attribute_to_layer_input=False, target=[ix]*len(X))

ig_attr_test_sum = a.detach().numpy().sum(0)
ig_attr_test_norm_sum = ig_attr_test_sum / np.linalg.norm(ig_attr_test_sum, ord=1)

In [6]:
model.ppi_layer.data.shape

NameError: name 'model' is not defined

In [None]:

g = np.array(data.gene_tf_sga.columns)
at = ig_attr_test_norm_sum