In [10]:

import pandas as pd
import numpy as np
import csv
import os
import hickle as hkl
import improve_utils as iu
import scipy.sparse as sp
import random


In [2]:
Drug_info_file = './data/drug/1.Drug_listMon Jun 24 09_00_55 2019.csv'
Cell_line_info_file = './data/CCLE/Cell_lines_annotations_20181226.txt'
# open to output
Drug_feature_file = './data/drug/drug_graph_feat'
Cancer_response_exp_file = './data/CCLE/GDSC_IC50.csv'
selected_info_common_cell_lines = "./data/CCLE/cellline_list.txt"
celline_feature_folder = "./data/CCLE/omics_data"
# hard-coded. 
PPI_file = "./data/PPI/PPI_network.txt"
selected_info_common_genes = "./data/CCLE/gene_list.txt"

# Max_atoms = PARAMS["max_atoms"]

TCGA_label_set = ["ALL","BLCA","BRCA","DLBC","LIHC","LUAD",
                  "ESCA","GBM","HNSC","KIRC","LAML","LCML","LGG",
                  "LUSC","MM","NB","OV","PAAD","SCLC","SKCM",
                  "STAD","THCA",'COAD/READ','SARC','UCEC','MESO', 'PRAD']

def MetadataGenerate(Drug_info_file, Cell_line_info_file, Drug_feature_file, PPI_file, selected_info_common_cell_lines, selected_info_common_genes, **kwargs):
    with open(selected_info_common_cell_lines) as f:
        common_cell_lines = [item.strip() for item in f.readlines()]

    with open(selected_info_common_genes) as f:
        common_genes = [item.strip() for item in f.readlines()]
    idx_dic={}
    for index, item in enumerate(common_genes):
        idx_dic[item] = index

    ppi_adj_info = [[] for item in common_genes] 
    for line in open(PPI_file).readlines():
        gene1,gene2 = line.split('\t')[0],line.split('\t')[1]
        if idx_dic[gene1]<=idx_dic[gene2]:
            ppi_adj_info[idx_dic[gene1]].append(idx_dic[gene2])
            ppi_adj_info[idx_dic[gene2]].append(idx_dic[gene1])

    reader = csv.reader(open(Drug_info_file,'r'))
    rows = [item for item in reader]
    drugid2pubchemid = {item[0]:item[5] for item in rows if item[5].isdigit()}

    cellline2cancertype ={}
    for line in open(Cell_line_info_file).readlines()[1:]:
        cellline_id = line.split('\t')[1]
        TCGA_label = line.strip().split('\t')[-1]
        cellline2cancertype[cellline_id] = TCGA_label

    drug_pubchem_id_set = []
    drug_feature = {} 
    for each in os.listdir(Drug_feature_file):
        drug_pubchem_id_set.append(each.split('.')[0])
        feat_mat,adj_list,degree_list = hkl.load('%s/%s'%(Drug_feature_file,each))
        drug_feature[each.split('.')[0]] = [feat_mat,adj_list,degree_list]
    assert len(drug_pubchem_id_set)==len(drug_feature.values())

    IC50_df = pd.read_csv(Cancer_response_exp_file,sep=',',header=0,index_col=[0])
    drug_match_list=[item for item in IC50_df.index if item.split(':')[1] in drugid2pubchemid.keys()]
    IC50_df = IC50_df.loc[drug_match_list]
    
    index_name = [drugid2pubchemid[item.split(':')[1]] for item in IC50_df.index if item.split(':')[1] in drugid2pubchemid.keys()]
    IC50_df.index = index_name
    redundant_names = list(set([item for item in IC50_df.index if list(IC50_df.index).count(item)>1]))
    retain_idx = []
    for i in range(len(IC50_df.index)):
        if IC50_df.index[i] not in redundant_names:
            retain_idx.append(i)
    IC50_df = IC50_df.iloc[retain_idx]

    data_idx = [] 
    for i, each_drug in enumerate(IC50_df.index):
        for each_cellline in IC50_df.columns:
            if str(each_drug) in drug_pubchem_id_set and each_cellline in common_cell_lines:
                if not np.isnan(IC50_df.loc[each_drug,each_cellline]) and each_cellline in cellline2cancertype.keys() and cellline2cancertype[each_cellline] in TCGA_label_set:
                    ln_IC50 = float(IC50_df.loc[each_drug,each_cellline])
                    data_idx.append((each_cellline,each_drug,ln_IC50,cellline2cancertype[each_cellline]))
    nb_celllines = len(set([item[0] for item in data_idx]))
    nb_drugs = len(set([item[1] for item in data_idx]))
    print('%d instances across %d cell lines and %d drugs were generated.'%(len(data_idx),nb_celllines,nb_drugs))
    return ppi_adj_info, drug_feature, data_idx 

ppi_adj_info, drug_feature, data_idx = MetadataGenerate(Drug_info_file, Cell_line_info_file, Drug_feature_file, PPI_file, selected_info_common_cell_lines, selected_info_common_genes)



86530 instances across 525 cell lines and 208 drugs were generated.


In [3]:
def MetadataGenerate_version_IMPROVE(path = './data/IMPROVE_CCLE/drug/drug_graph_feat/',
                                     drug_path = './data/IMPROVE_CCLE/drug/drug_graph_feat/',
                                     PPI_file = './data/IMPROVE_CCLE/PPI/PPI_network_new.txt', 
                                     selected_info_common_genes = './data/IMPROVE_CCLE/gene_list.txt'):
    
    df_train = iu.load_single_drug_response_data_v2(source = 'CCLE', split_file_name = 'CCLE_split_0_train.txt', y_col_name='auc1')
    df_test = iu.load_single_drug_response_data_v2(source = 'CCLE', split_file_name = 'CCLE_split_0_test.txt', y_col_name='auc1')
    df_val = iu.load_single_drug_response_data_v2(source = 'CCLE', split_file_name = 'CCLE_split_0_val.txt', y_col_name='auc1')
    
    # Sorting columns and removing source column, to be like the data_idx format
    df_train = df_train[['improve_sample_id', 'improve_chem_id', 'auc1']].values.tolist()
    df_test = df_test[['improve_sample_id', 'improve_chem_id', 'auc1']].values.tolist()
    df_val = df_val[['improve_sample_id', 'improve_chem_id', 'auc1']].values.tolist
    
    drug_feature = {}
    for each in os.listdir(drug_path):
        feat_mat,adj_list,degree_list = hkl.load(drug_path + each)
        # Save the name of the drug "each" as the word for the dictionary
        
        drug_feature[each.split('.')[0]] = [feat_mat,adj_list,degree_list]
        
    # NOTE: I am adding the ppi_adj_info here. However, I am not sure why it is needed. 
    # common_genes = pd.read_csv(selected_info_common_genes, sep = '\t', header = None).values.squeeze().tolist()
    PPI_net = pd.read_csv(PPI_file, sep = '\t', header = None)

    common_genes = pd.concat([PPI_net[0], PPI_net[1]], axis = 0).drop_duplicates().reset_index(drop=True).values.tolist()
    
    # print(common_genes)
    idx_dic = {}
    for index, item in enumerate(common_genes):
        idx_dic[item] = index
    ppi_adj_info = [[] for item in common_genes] 
    # ppi_adj_info = pd.read_csv(ppi_info_file, sep = '\t', header = None)
    # print(ppi_adj_info.shape)
    for line in open(PPI_file).readlines():
        gene1, gene2 = line.split('\t')[0],line.split('\t')[1]
        # print(gene1, gene2)
        if idx_dic[gene1] <= idx_dic[gene2]:
            ppi_adj_info[idx_dic[gene1]].append(idx_dic[gene2])
            ppi_adj_info[idx_dic[gene2]].append(idx_dic[gene1])
    
    return df_train, df_test, df_val, drug_feature, ppi_adj_info

df_train, df_test, df_val, drug_feature, ppi_adj_info = MetadataGenerate_version_IMPROVE()

Response data: (7558, 4)
improve_sample_id    411
improve_chem_id       24
dtype: int64
Response data: (944, 4)
improve_sample_id    373
improve_chem_id       24
dtype: int64
Response data: (944, 4)
improve_sample_id    378
improve_chem_id       24
dtype: int64


# Axuliary Functions for training

Review this, there is an error with the featureextract function. 

In [13]:
# Normalize adjacent matrix D^{-0.5}{T}A^{T}D^{-0.5}
def NormalizeAdj(adj):
    adj = adj + np.eye(adj.shape[0])
    d = sp.diags(np.power(np.array(adj.sum(1)), -0.5).flatten(), 0).toarray()
    a_norm = adj.dot(d).transpose().dot(d)
    return a_norm

def random_adjacency_matrix(n):
    matrix = [[random.randint(0, 1) for i in range(n)] for j in range(n)]
    for i in range(n):
        matrix[i][i] = 0
    for i in range(n):
        for j in range(n):
            matrix[j][i] = matrix[i][j]
    return matrix

def CalculateGraphFeat(feat_mat,adj_list,israndom=False):
    assert feat_mat.shape[0] == len(adj_list)
    feat = np.zeros((Max_atoms,feat_mat.shape[-1]),dtype='float32')
    adj_mat = np.zeros((Max_atoms,Max_atoms),dtype='float32')
    if israndom:
        feat = np.random.rand(Max_atoms,feat_mat.shape[-1])
        adj_mat[feat_mat.shape[0]:,feat_mat.shape[0]:] = random_adjacency_matrix(Max_atoms-feat_mat.shape[0]) 
    feat[:feat_mat.shape[0],:] = feat_mat  
    for i in range(len(adj_list)):
        nodes = adj_list[i]
        for each in nodes:
            adj_mat[i,int(each)] = 1 
    assert np.allclose(adj_mat,adj_mat.T)
    adj_ = adj_mat[:len(adj_list),:len(adj_list)]
    adj_2 = adj_mat[len(adj_list):,len(adj_list):]
    norm_adj_ = NormalizeAdj(adj_)
    norm_adj_2 = NormalizeAdj(adj_2)
    adj_mat[:len(adj_list),:len(adj_list)] = norm_adj_
    adj_mat[len(adj_list):,len(adj_list):] = norm_adj_2
    return [feat,adj_mat]

def CelllineGraphAdjNorm(ppi_adj_info,selected_info_common_genes, **kwargs):
    with open(selected_info_common_genes) as f:
        common_genes = [item.strip() for item in f.readlines()]
    nb_nodes = len(common_genes)
    adj_mat = np.zeros((nb_nodes,nb_nodes),dtype='float32')
    for i in range(len(ppi_adj_info)):
        nodes = ppi_adj_info[i]
        for each in nodes:
            adj_mat[i,each] = 1
    assert np.allclose(adj_mat,adj_mat.T)
    norm_adj = NormalizeAdj(adj_mat)
    return norm_adj 

def FeatureExtract(data_idx,drug_feature, celline_feature_folder, selected_info_common_cell_lines, selected_info_common_genes,israndom=False, **kwargs):
    cancer_type_list = []
    nb_instance = len(data_idx)
    drug_data = [[] for item in range(nb_instance)]
    cell_line_data_feature = [[] for item in range(nb_instance)]
    target = np.zeros(nb_instance,dtype='float32')
    cellline_drug_pair = []
    with open(selected_info_common_cell_lines) as f:
        common_cell_lines = [item.strip() for item in f.readlines()]
    
    with open(selected_info_common_genes) as f:
        common_genes = [item.strip() for item in f.readlines()]
    dic_cell_line_feat = {}
    for each in common_cell_lines:
        dic_cell_line_feat[each] = pd.read_csv('%s/%s.csv'%(celline_feature_folder, each), index_col=0).loc[common_genes].values 
    for idx in range(nb_instance):
        cell_line_id,pubchem_id,ln_IC50,cancer_type = data_idx[idx]
        cellline_drug_tmp = cell_line_id + "_" + pubchem_id
        cellline_drug_pair.append(cellline_drug_tmp)
        cell_line_feat_mat =  dic_cell_line_feat[cell_line_id] 
        feat_mat,adj_list,_ = drug_feature[str(pubchem_id)] 
        drug_data[idx] = CalculateGraphFeat(feat_mat,adj_list,israndom)
        cell_line_data_feature[idx] = cell_line_feat_mat
        target[idx] = ln_IC50
    drug_feat = np.array([item[0] for item in drug_data])
    drug_adj = np.array([item[1] for item in drug_data])
    return drug_feat,drug_adj, np.array(cell_line_data_feature),target

In [12]:
adj = random_adjacency_matrix(3)
a = NormalizeAdj(adj)

AttributeError: 'list' object has no attribute 'shape'