In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

In [2]:
# Pathway table read from the 'MCF10A_hallmark_PI3K_Activation' sheet of Supplementary Table S4.
# Later cells use its `Description` column as pathway labels and its `pvalue` column as the
# experimental reference.
# NOTE(review): absolute, user-specific path — this cell only runs where that home directory exists.
hallmark = pd.read_excel('/ihome/hosmanbeyoglu/kor11/tools/CITRUS/FW__MCF10A_wild_type_and_PIK3CA_H1047R_knock-in_cell_lines/Supplementary Table S4.xlsx', 
    sheet_name='MCF10A_hallmark_PI3K_Activation')

In [3]:
from utils import Data, get_ppi_edge_list

# Load the four CITRUS CSV inputs (paths relative to the notebook's working directory)
# through the project's `Data` helper.
# NOTE(review): an identical `Data(...)` call appears again further down the notebook;
# one of the two loads is redundant.
data_csv = Data(
    fGEP_SGA = 'data/CITRUS_GEP_SGAseparated.csv',
    fgene_tf_SGA = 'data/CITRUS_gene_tf_SGAseparated.csv',
    fcancerType_SGA = 'data/CITRUS_canType_SGAseparated.csv',
    fSGA_SGA = 'data/CITRUS_SGA_SGAseparated.csv',
)

In [4]:
# Keep only the first two columns of the dense PPI edge list and label them 'A' and 'B'
# (presumably the two endpoints of each edge — confirm against utils.get_ppi_edge_list).
ppi = pd.DataFrame(get_ppi_edge_list(sparse=False)[:, :2], columns=['A', 'B'])

[38;21m(utils.py : 457) -    DEBUG | Loaded 352251 edges from the SIGNOR and SNAP Networks[0m


In [5]:
# tf_ppi = ppi[ppi.A.isin(data_csv.tf) | ppi.B.isin(data_csv.tf)]
# tfs = pd.DataFrame(data_csv.tf)
# tfs.columns = ['tf']
# tfs['interacts_with'] = tfs.tf.apply(lambda x: set(tf_ppi[(tf_ppi==x).any(axis=1)].values.reshape(-1)))
# tfs = dict(zip(tfs['tf'], tfs['interacts_with']))
# def does_interact(tf, geneset):
#     if tf in geneset or len(tfs.get(tf, set()).intersection(geneset)) > 0:
#         return True            
#     return False

In [6]:
# pbar = tqdm(total=len(hallmark.values))
# hallmark_mask = np.zeros((hallmark.shape[0], len(data_csv.tf)), dtype=int)

# for idx, (pathway, genes) in enumerate(hallmark.values):
#     for idy, tf in enumerate(data_csv.tf):
#         pbar.set_description(f'{pathway[9:]} | ({idy}/{len(data_csv.tf)}) - {tf}')
#         if does_interact(tf, genes.split('/')):
#             hallmark_mask[idx, idy] = 1
    
#     pbar.update(1)
# pbar.close()

# np.save('hallmark_mask.npy', hallmark_mask)

In [7]:
# Reload the mask cached on disk; the commented-out cell above is the code that
# builds it and writes 'hallmark_mask.npy'.
hallmark_mask = np.load('hallmark_mask.npy')

In [8]:
import os
import argparse
from utils import bool_ext, load_dataset, split_dataset, evaluate, checkCorrelations
from models import CITRUS
import pickle
import torch
import numpy as np
import pandas as pd
import warnings 
from pathlib import Path
from tqdm.notebook import tqdm
warnings.filterwarnings("ignore")
from sklearn import metrics

import yaml

# Run configuration is read from args.yaml in the working directory.
with open('args.yaml') as f:
    args_dict = yaml.safe_load(f)

# Use the GPU when one is visible and remember its name; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
    device_name = torch.cuda.get_device_name(0)
else:
    device = device_name = 'cpu'

# No command line is parsed here: the YAML dict is wrapped in a Namespace so that
# downstream code can use attribute access (args.input_dir, args.mask01, ...).
parser = argparse.ArgumentParser()
args = argparse.Namespace(**args_dict)
setattr(args, 'tf_gene', np.load('tf_gene.npy'))

In [10]:
# Build the datasets from the settings carried in `args`.
dataset, dataset_test = load_dataset(
    input_dir=args.input_dir,
    mask01=args.mask01,
    dataset_name=args.dataset_name,
    gep_normalization=args.gep_normalization,
)

# Split `dataset` into two parts; ratio=0.66 is passed straight to split_dataset.
train_set, test_set = split_dataset(dataset, ratio=0.66)

# SECURITY NOTE: pickle.load can execute arbitrary code — only load pickles you trust.
# The context manager closes the file handle, which the previous
# `pickle.load(open(...))` form left open until garbage collection.
# NOTE(review): absolute, user-specific path.
with open("/ihome/hosmanbeyoglu/kor11/tools/CITRUS/data/dataset_CITRUS.pkl", "rb") as pkl_file:
    daata = pickle.load(pkl_file)

# Mapping later used to translate integer cancer codes into names.
cancers = daata['idx2can']


# Restore every saved CITRUS checkpoint found in the output directory.
# sorted() pins the order: Path.iterdir() yields entries in arbitrary,
# filesystem-dependent order, so without it the order of `models` (and of any
# per-model listing such as pval_corr) is not reproducible across runs or machines.
models = []
for m in tqdm(sorted(Path('/ix/hosmanbeyoglu/kor11/output').iterdir())):
    model = CITRUS(args)
    model.build(device=device)
    model.to(device)
    model.load_model(m, device=device)
    model.eval()  # inference only: put the model in evaluation mode
    models.append(model)

  0%|          | 0/10 [00:00<?, ?it/s]

In [11]:
# Show the `pval_corr` attribute of each restored model, in the order of `models`.
[m.pval_corr for m in models]

[0.3228966580749612,
 0.3056089131453084,
 0.3207356899587547,
 0.3023434502141517,
 0.3099308493777216,
 0.3081540533710628,
 0.3011909338855082,
 0.3060411067685497,
 0.3019592781046038,
 0.3128601617130239]

In [12]:
from scipy.stats import ttest_ind
import seaborn as sns
from scipy.stats import spearmanr
import matplotlib.pyplot as plt

In [52]:
# results = pd.DataFrame(np.array([m.performance for m in models]).mean(0), columns = ['corr', 'mse'])
# results['corr_std'] = np.array([m.performance for m in models]).std(0)[:, 0]
# results['mse_std'] = np.array([m.performance for m in models]).std(0)[:, 1]
# results = results[['corr', 'corr_std', 'mse', 'mse_std']]
# results.index = models[0].cancers

In [54]:
# results.to_csv('perf_shuffled_across.csv')

In [13]:
from utils import Data 

# NOTE(review): same `Data(...)` call as near the top of the notebook; kept so this
# cell still works when run on its own.
data_csv = Data(
    fGEP_SGA = 'data/CITRUS_GEP_SGAseparated.csv',
    fgene_tf_SGA = 'data/CITRUS_gene_tf_SGAseparated.csv',
    fcancerType_SGA = 'data/CITRUS_canType_SGAseparated.csv',
    fSGA_SGA = 'data/CITRUS_SGA_SGAseparated.csv',
)

# SECURITY NOTE: pickle.load can execute arbitrary code — only load pickles you trust.
# The context manager closes the file handle, which the previous
# `pickle.load(open(...))` form left open until garbage collection.
with open("/ihome/hosmanbeyoglu/kor11/tools/CITRUS/data/dataset_CITRUS.pkl", "rb") as pkl_file:
    data = pickle.load(pkl_file)

# Two-column frame pairing each entry of data['tmr'] with its entry of data['can'];
# the integer cancer code is then replaced through the idx2can mapping.
df = pd.DataFrame(np.column_stack([data['tmr'], data['can']]), columns=['tmr', 'cancer'])
df['cancer'] = df['cancer'].astype(int).replace(data['idx2can'])

# Lookup table with at least the columns `id` and `idx` (both are used by split_mutants).
xdf = pd.read_parquet('xdf.parquet')

In [52]:
import gzip

def split_mutants(cancer, gene):
    """Split samples into wild-type and SM-only groups for one gene.

    Relies on the notebook globals `df`, `data_csv` and `xdf`.

    Args:
        cancer: value to match against `df.cancer`; any falsy value
            (e.g. None) selects every row of `df`.
        gene: gene symbol; the columns `SM_<gene>` and `SCNA_<gene>` of
            `data_csv.sga_sga` are inspected (presumably somatic-mutation and
            copy-number flags — confirm against the data dictionary).

    Returns:
        Tuple `(idx, idy)` of `xdf.idx` value arrays: `idx` for samples where
        both flags are 0, `idy` for samples where SM is 1 and SCNA is 0.
        Samples with SCNA set are in neither group.
    """
    # Restrict to one cancer type only when one was requested.
    tumours = df.tmr if not cancer else df[df.cancer == cancer].tmr
    sga_rows = data_csv.sga_sga.loc[tumours]

    sm_flag = sga_rows[f'SM_{gene}']
    no_scna = sga_rows[f'SCNA_{gene}'] == 0

    wild_type_ids = sga_rows[(sm_flag == 0) & no_scna].index
    sm_only_ids = sga_rows[(sm_flag == 1) & no_scna].index

    # Translate sample ids into the positional `idx` values stored in xdf.
    return (
        xdf[xdf.id.isin(wild_type_ids)].idx.values,
        xdf[xdf.id.isin(sm_only_ids)].idx.values,
    )
    


# Wild-type (idx) vs. SM-only (idy) sample indices for NFE2L2 across all cancer types.
idx, idy = split_mutants(None, 'NFE2L2')


# # wt = pd.read_parquet('wt.parquet')
# # sm_mut = pd.read_parquet('sm_mut.parquet')

# Context managers guarantee the gzip handles are closed even if np.load raises;
# the previous open/load/close sequence leaked the handle on error.
with gzip.GzipFile('sga.npy.gz', 'r') as f:
    sga = np.load(f)

with gzip.GzipFile('can.npy.gz', 'r') as g:
    can = np.load(g)

# Group sizes, displayed as the cell output.
idx.shape, idy.shape

((4415,), (150,))

In [28]:
# Patient-level clinical table of the hnsc_tcga_pan_can_atlas_2018 study, fetched
# from the cBioPortal datahub. Rows 0-3 are dropped right after loading
# (NOTE(review): presumably the extra header/metadata rows of the cBioPortal
# file format — confirm), then the patient identifier becomes the index.
clinical_from_cbioportal = (
    pd.read_table('https://media.githubusercontent.com/media/cBioPortal/datahub/master/public/hnsc_tcga_pan_can_atlas_2018/data_clinical_patient.txt')
    .drop([0, 1, 2, 3])
    .set_index('#Patient Identifier')
)

# Single-column frame holding the `Subtype` value under the name 'hpv';
# patients without a subtype are discarded.
traits = clinical_from_cbioportal[['Subtype']].copy()
traits.index.name = None
traits.columns = ['hpv']
traits = traits.dropna()

# Encode 'HNSC_HPV-' as 0 and 'HNSC_HPV+' as 1; any other value is left unchanged.
traits['hpv'] = traits['hpv'].replace({'HNSC_HPV-': 0, 'HNSC_HPV+': 1})

In [69]:
# Re-define the two comparison groups by HPV status: idx = patients coded 1 ('HNSC_HPV+'),
# idy = patients coded 0 ('HNSC_HPV-').
# NOTE(review): this overwrites the idx/idy produced by split_mutants(None, 'NFE2L2'),
# so anything computed from idx/idy afterwards refers to the HPV groups.
idx = xdf[xdf.id.isin(traits[traits.hpv==1].index)].idx.values
idy = xdf[xdf.id.isin(traits[traits.hpv==0].index)].idx.values

In [80]:
# Sizes of the two groups currently held in idx and idy.
idx.shape, idy.shape

((60,), (314,))

In [31]:
from tqdm import tqdm

In [53]:
# Ensemble forward pass: sum every model's `pathways=True` output over the two
# sample groups. R accumulates the outputs for the `idx` group and S for the
# `idy` group; both are sums over models, not averages.
R = None
S = None

# no_grad(): pure inference, so no autograd graph needs to be built.
with torch.no_grad():
    for model in tqdm(models):
        model.eval()

        X = torch.from_numpy(sga)[idx]
        C = torch.from_numpy(can[idx])
        # .detach().cpu() instead of .data: works when the output lives on a GPU
        # (Tensor.numpy() raises for CUDA tensors) and is the supported way to
        # leave autograd.
        r = model(X, C, pathways=True).detach().cpu().numpy()

        if R is None:
            R = r
        else:
            R += r

        X = torch.from_numpy(sga)[idy]
        C = torch.from_numpy(can[idy])
        s = model(X, C, pathways=True).detach().cpu().numpy()

        if S is None:
            S = s
        else:
            S += s

100%|██████████| 10/10 [00:19<00:00,  2.00s/it]


In [65]:
# NOTE(review): `dx` is only created in the next cell, so on a fresh top-to-bottom
# run this line raises NameError; it reflects out-of-order execution.
dx.columns

Index(['pvalue'], dtype='object')

In [72]:
# Per-pathway two-sample t-test between the accumulated outputs R and S, labelled
# with the hallmark descriptions and ordered from most to least significant.
# NOTE(review): R and S are whatever the inference loop last produced. In
# top-to-bottom order idx/idy were last set to the HPV groups, so check that the
# file name below matches the comparison actually held in R and S.
dx = (
    pd.DataFrame(ttest_ind(R, S).pvalue, index=hallmark.Description, columns=['pvalue'])
    .sort_values('pvalue')
)

dx.to_csv('./NFE2L2.csv')

In [33]:
# Turn the summed ensemble outputs into per-model averages.
# NOTE(review): this rebinds R and S, so running the cell twice divides twice.
# The p-values below are unaffected, because the t statistic is invariant to
# rescaling both samples by the same positive factor.
R = R / len(models)
S = S / len(models)

# One p-value per column of R/S, i.e. per hallmark row, already in hallmark order.
# The previous version built a DataFrame, sorted it by p-value and then re-indexed
# it back to hallmark order — a round trip that returns extra rows (and breaks
# the correlation below) when a Description label is duplicated.
p_predicted = np.asarray(ttest_ind(R, S).pvalue)
if p_predicted.shape != (len(hallmark),):
    raise ValueError(
        f'expected {len(hallmark)} p-values (one per hallmark row), got shape {p_predicted.shape}'
    )

# Experimental p-values from the supplementary table, in the same row order.
p_exp = hallmark['pvalue'].values
spearmanr(p_predicted, p_exp)

SpearmanrResult(correlation=-0.07645024980002056, pvalue=0.5977177899674204)

In [22]:
# Shapes of the accumulated arrays (presumably samples-in-group x pathways — TODO confirm).
# A leading 0 means the corresponding group (idx or idy) was empty when the loop ran.
R.shape, S.shape

((0, 50), (0, 50))

In [None]:
from statsmodels.stats.multitest import fdrcorrection

In [79]:
# Per-pathway t-test between R and S, sorted by ascending p-value and written to
# 'HPV_analysis.csv'. The file name assumes R/S were computed with the HPV groups
# in idx/idy.
(
    pd.DataFrame(ttest_ind(R, S).pvalue, index=hallmark.Description, columns=['pvalue'])
    .sort_values('pvalue')
    .to_csv('HPV_analysis.csv')
)

In [None]:
# Persist the predicted and experimental p-value vectors for later reuse.
np.save('p_predicted2.npy', p_predicted)
np.save('p_exp2.npy', p_exp)

array([-0.03885234, -0.01304751, -0.0700935 , -0.05409943, -0.04236699,
       -0.06662035, -0.07512558, -0.04664319, -0.06729268, -0.04858711])

array([0.84551702, 0.87585314, 0.86683599, 0.87819618, 0.88397833,
       0.83529452, 0.90377233, 0.88774731, 0.86372933, 0.84456712,
       0.86610128, 0.86751381, 0.79801956, 0.92775074, 0.86453251,
       0.86601414, 0.87669903])

In [73]:
# Read back the table written by `dx.to_csv('./NFE2L2.csv')` to inspect it.
pd.read_csv('NFE2L2.csv')

Unnamed: 0,Description,pvalue
0,HALLMARK_COAGULATION,9.033616e-31
1,HALLMARK_APOPTOSIS,1.538968e-26
2,HALLMARK_DNA_REPAIR,1.135789e-24
3,HALLMARK_APICAL_JUNCTION,1.340319e-20
4,HALLMARK_INTERFERON_GAMMA_RESPONSE,9.466889000000001e-17
5,HALLMARK_BILE_ACID_METABOLISM,6.400703e-16
6,HALLMARK_PANCREAS_BETA_CELLS,1.350776e-15
7,HALLMARK_COMPLEMENT,6.458875e-14
8,HALLMARK_TGF_BETA_SIGNALING,2.144005e-13
9,HALLMARK_P53_PATHWAY,6.742763e-12
