# Decagon - Training

## 1. Data Loading

In [1]:
import scipy.sparse as sp
import pickle


### 1.1 load graph information

In [2]:
# Read the four graph-size counters stored as one 4-item sequence in the pickle.
# NOTE: pickle.load can execute arbitrary code -- only open files produced by
# your own pre-processing run.
with open('data_decagon/graph_num_info.pkl', 'rb') as f:
    num_gene, num_drug, num_edge_type, num_drug_additional_feature = pickle.load(f)

# Print one aligned summary line per counter.
print("Summary: ")
for line_template, count in (
    (" -> Protein Node              : {:d}", num_gene),
    (" -> Drug    Node              : {:d}", num_drug),
    (" -> Drug    Pair  Side  Effect: {:d}", num_edge_type),
    (" -> Single  Drug  Side  Effect: {:d}", num_drug_additional_feature),
):
    print(line_template.format(count))

Summary: 
 -> Protein Node              : 19089
 -> Drug    Node              : 645
 -> Drug    Pair  Side  Effect: 1317
 -> Single  Drug  Side  Effect: 10184


### 1.2 load sparse adjacency matrix

In [3]:
# gene-gene adjacency
gene_adj = sp.load_npz("data_decagon/gene-sparse-adj.npz")

# gene-drug and drug-gene adjacency (kept in two separate files)
gene_drug_adj = sp.load_npz("data_decagon/gene-drug-sparse-adj.npz")
drug_gene_adj = sp.load_npz("data_decagon/drug-gene-sparse-adj.npz")

# drug-drug adjacency: one file per edge type, indexed 0 .. num_edge_type - 1
drug_drug_adj_list = [
    sp.load_npz("data_decagon/drug-sparse-adj/type_" + str(type_idx) + ".npz")
    for type_idx in range(num_edge_type)
]
    

### 1.3 load sparse drug feature matrix

In [4]:
# Load the sparse drug feature matrix that the pre-processing step saved as .npz.
drug_feat_sparse = sp.load_npz("data_decagon/drug-feature-sparse.npz")


## 2. Feature Vector Generation

### 2.1 gene one-hot vector

In [5]:
import decagon.utility.preprocessing as preprocessing

In [6]:
# One-hot gene features: the num_gene x num_gene identity matrix.
gene_feat = sp.identity(num_gene)

# Row count -> gene_nonzero_feat, column count -> gene_num_feat.
# For an identity matrix the row count also equals the number of stored entries.
gene_nonzero_feat = gene_feat.shape[0]
gene_num_feat = gene_feat.shape[1]

# Convert to COO and rebind `gene_feat` to whatever sparse_to_tuple returns
# (from here on `gene_feat` is no longer a scipy matrix).
gene_feat = preprocessing.sparse_to_tuple(gene_feat.tocoo())

### 2.2 drug feature vector with additional features (single side effect)

In [8]:
import numpy as np

In [9]:
# Drug feature vectors were constructed during pre-processing and loaded above
# as `drug_feat_sparse`.
#
# The two counters follow the same convention as the gene cell, where the
# feature matrix is an identity (stored-entry count == row count):
#   * drug_nonzero_feat -> number of stored (non-zero) entries in the matrix
#   * drug_num_feat     -> feature dimensionality, i.e. the number of columns
#
# The column count must not be used as the "non-zero" counter, and counting
# only non-empty columns under-reports the width whenever a feature column is
# all-zero, so both are taken straight from the matrix.
# NOTE(review): assumes the downstream model uses `num_nonzero_feat` as the
# length of the sparse value vector and `num_feat` as the input width -- confirm
# against the model code.
drug_nonzero_feat = drug_feat_sparse.nnz
drug_num_feat = drug_feat_sparse.shape[1]

# Convert to COO and hand the matrix to sparse_to_tuple, as done for the genes.
drug_feat = preprocessing.sparse_to_tuple(drug_feat_sparse.tocoo())


## 3. Data Representation

### 3.1 adjacency matrix

In [10]:
# Adjacency matrices grouped by (row node type, column node type),
# with 0 = gene and 1 = drug. Same-type groups hold every matrix followed by
# an independent transposed copy of each.
adj_mats_orig = {}
adj_mats_orig[(0, 0)] = [gene_adj, gene_adj.transpose(copy=True)]
adj_mats_orig[(0, 1)] = [gene_drug_adj]
adj_mats_orig[(1, 0)] = [drug_gene_adj]
adj_mats_orig[(1, 1)] = [
    *drug_drug_adj_list,
    *(adj.transpose(copy=True) for adj in drug_drug_adj_list),
]

### 3.2 gene & drug degrees

In [11]:
def _column_sums(adj):
    """Return the column sums of `adj` as an ndarray with singleton axes squeezed out."""
    return np.array(adj.sum(axis=0)).squeeze()


# gene degrees from the gene-gene graph
gene_degrees = _column_sums(gene_adj)
# drug degrees, one array per drug-drug edge type
drug_degrees_list = list(map(_column_sums, drug_drug_adj_list))

# Keyed by node type (0 = gene, 1 = drug). Every entry is listed twice, so the
# list lengths line up with the matrices-plus-transposes layout of adj_mats_orig.
# The repeated entries are the same array objects, not copies.
degrees = {
    0: [gene_degrees] * 2,
    1: drug_degrees_list * 2,
}


### 3.3 features & non-zero features

In [12]:
# Per-node-type lookup tables; key 0 = gene, key 1 = drug.
num_feat = dict(enumerate((gene_num_feat, drug_num_feat)))
num_nonzero_feat = dict(enumerate((gene_nonzero_feat, drug_nonzero_feat)))
feat = dict(enumerate((gene_feat, drug_feat)))

### 3.4 edge types

In [13]:
# Shape of every adjacency matrix, grouped by (row type, column type) key.
edge_type2dim = {
    node_pair: [matrix.shape for matrix in matrices]
    for node_pair, matrices in adj_mats_orig.items()
}

# Decoder name per node-type pair: 'bilinear' for every pair involving genes,
# 'dedicom' for drug-drug.
edge_type2decoder = dict.fromkeys([(0, 0), (0, 1), (1, 0)], 'bilinear')
edge_type2decoder[(1, 1)] = 'dedicom'

# Number of matrices per node-type pair: 2 p-p, 1 p-d, 1 d-p, 2*1317 d-d.
# `num_edge_types` (the total across all pairs) is a different quantity from
# `num_edge_type`, which counts only the drug-pair side effects.
edge_types = dict(zip(adj_mats_orig.keys(), map(len, adj_mats_orig.values())))
num_edge_types = sum(edge_types.values())
print("Edge types:", "%d" % num_edge_types)

Edge types: 2638
