# Create Embedding for DeepDTNet (step1)
1. Use the random surfing model to generate the PCO (Probabilistic Co-Occurrence) matrix, which captures graph structure information.
2. From the probabilistic co-occurrence matrix, calculate the shifted positive pointwise mutual information (PPMI) matrix.

In [2]:
import os
import numpy as np
import pandas as pd
import networkx as nx
import torch
import random
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3]:
# ====== Random Seed Initialization ====== #
def seed_everything(seed = 3078):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy, and PyTorch (CPU and all CUDA
    devices), exports ``PYTHONHASHSEED``, and configures cuDNN for
    reproducible behaviour.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode lets cuDNN auto-tune and pick a possibly different
    # convolution algorithm on each run, which defeats the deterministic
    # setting above; it must be off for reproducible results.
    torch.backends.cudnn.benchmark = False
seed_everything()

In [4]:
# Folder with the raw inputs (drug tables and drug-similarity networks),
# resolved against the current working directory.
data_raw_direct = "data/raw/"
data_d = os.path.join(
    os.getcwd(),
    data_raw_direct,
)
data_d  # echo the resolved path as the cell output

'/data/project/james/PZdeepdrug/DeepDTNet/data/raw/'

In [5]:
# Load the drug vocabulary and the drug association tables from data_d.
# The shapes in the trailing comments are the notebook author's notes;
# nothing in this cell checks them.

# Drug list: the only file read without an explicit separator.
drug_dict = pd.read_csv(
    os.path.join(data_d, "drug_dict.txt"), engine='python', header=None
)  # noted as 732 x 1

# Tab-separated, header-less association matrices.
drugDis, drugdrug, drugProt, drugSide = (
    pd.read_csv(os.path.join(data_d, _fname), engine='python', header=None, sep='\t')
    for _fname in (
        "drugDisease.txt",     # noted as 732 x 440
        "drugdrug.txt",        # noted as 732 x 732
        "drugProtein.txt",     # noted as 732 x 1915
        "drugsideEffect.txt",  # noted as 732 x 12904
    )
)

In [6]:
# Load the six drug-similarity networks (tab-separated, no header row).
# Each is noted by the author as a 732 x 732 matrix.
# These six matrices follow the pipeline: RWR > AE > concat > Embedding.
drugSim1, drugSim2, drugSim3, drugSim4, drugSim5, drugSim6 = (
    pd.read_csv(os.path.join(data_d, _fname), engine='python', header=None, sep='\t')
    for _fname in (
        "drugsim1network.txt",  # Chemical similarity
        "drugsim2network.txt",  # Therapeutic
        "drugsim3network.txt",  # Protein sequence
        "drugsim4network.txt",  # Biological process
        "drugsim5network.txt",  # Cellular component
        "drugsim6network.txt",  # Molecular function
    )
)

In [7]:
# Turn each similarity matrix into a graph, one graph per network.
G1, G2, G3, G4, G5, G6 = (
    nx.from_pandas_adjacency(_sim)
    for _sim in (drugSim1, drugSim2, drugSim3, drugSim4, drugSim5, drugSim6)
)

# Compute the Google (random-surfer transition) matrix of every graph with
# damping factor alpha = 0.85 and wrap the result in a DataFrame.
rwr1, rwr2, rwr3, rwr4, rwr5, rwr6 = (
    pd.DataFrame(nx.google_matrix(_graph, alpha = 0.85))
    for _graph in (G1, G2, G3, G4, G5, G6)
)

NetworkX version 3.0.
  rwr1 = nx.google_matrix(G1, alpha = 0.85)
NetworkX version 3.0.
  rwr2 = nx.google_matrix(G2, alpha = 0.85)
NetworkX version 3.0.
  rwr3 = nx.google_matrix(G3, alpha = 0.85)
NetworkX version 3.0.
  rwr4 = nx.google_matrix(G4, alpha = 0.85)
NetworkX version 3.0.
  rwr5 = nx.google_matrix(G5, alpha = 0.85)
NetworkX version 3.0.
  rwr6 = nx.google_matrix(G6, alpha = 0.85)


In [9]:
# Sanity check: total of the row at position 2 of rwr1 (displays ~1.0).
rwr1.iloc[2].sum()

0.9999999999999999

In [10]:
def pmi(df, positive=True):
    """Return the (positive) pointwise mutual information of a count matrix.

    Each cell is ``log(observed / expected)`` where
    ``expected[i, j] = row_total[i] * col_total[j] / grand_total``.

    Args:
        df: Two-dimensional table of non-negative co-occurrence values
            (a pandas DataFrame in this notebook). It is not modified.
        positive: When True (default), negative PMI values are clipped
            to 0, giving the PPMI matrix.

    Returns:
        A new object of the same shape as ``df`` holding the PMI/PPMI values.
        Cells whose logarithm is undefined (zero observed value, or an
        all-zero row/column) are set to 0.
    """
    row_totals = df.sum(axis=1)  # one total per row (summing across columns)
    col_totals = df.sum(axis=0)  # one total per column (summing down rows)
    total = col_totals.sum()
    # expected[i, j] must pair row i's total with column j's total; the
    # argument order matters whenever the matrix is not symmetric.
    expected = np.outer(row_totals, col_totals) / total
    # Silence distracting warnings about log(0) and 0/0:
    with np.errstate(divide='ignore', invalid='ignore'):
        df = np.log(df / expected)
    # Convention: log(0) = 0; also covers NaN from empty rows/columns.
    df[~np.isfinite(df)] = 0.0
    if positive:
        df[df < 0] = 0.0
    return df

In [11]:
# PPMI matrix for each of the six random-walk matrices, in the same order.
pmi1, pmi2, pmi3, pmi4, pmi5, pmi6 = map(
    pmi, (rwr1, rwr2, rwr3, rwr4, rwr5, rwr6)
)

In [12]:
# Folder that receives the PPMI matrices, resolved against the current
# working directory.
data_matrix_direct = "data/matrix/"
data_matrix_d = os.path.join(
    os.getcwd(),
    data_matrix_direct,
)
data_matrix_d  # echo the resolved path as the cell output

'/data/project/james/PZdeepdrug/DeepDTNet/data/matrix/'

In [13]:
# Write every PPMI matrix to data_matrix_d as a tab-separated text file,
# one file per similarity network.
for _ppmi, _filename in (
    (pmi1, 'PPMI_matrix_Chemical.txt'),
    (pmi2, 'PPMI_matrix_Therapeutic.txt'),
    (pmi3, 'PPMI_matrix_ProteinSequence.txt'),
    (pmi4, 'PPMI_matrix_BiologicalProcess.txt'),
    (pmi5, 'PPMI_matrix_CellularComponent.txt'),
    (pmi6, 'PPMI_matrix_MolecularFunction.txt'),
):
    _ppmi.to_csv(os.path.join(data_matrix_d, _filename), sep='\t')