In [1]:
import os
import random
import time

import numpy as np
import torch

from func import out_string_nets, textread, out_moment_emb
from mashup import mashup_multi, load_multi

# Seed every random number generator used in this notebook (PyTorch, the
# stdlib `random` module, and NumPy's global state) with the same value so
# that repeated runs draw the same random numbers.
for _seed_rng in (torch.manual_seed, random.seed, np.random.seed):
    _seed_rng(1)

# Load data

In [2]:
# Dataset selection -- change these two according to your dataset.
org = 'yeast'
net = 'mashup'
# org = ''
# net = ''

# The helper's result is replaced right away by the explicit list below;
# the call is kept in case it has side effects (not visible from here).
string_nets = out_string_nets(net, org)
string_nets = ['coexpression', 'cooccurence', 'neighborhood']
# change string_nets according to your dataset
# string_nets = []

# You can download the sample data by using 'sh get_data_mashup.sh',
# check the format of xxx_adjacency.txt and xxx_genes.txt,
# then put your data under data/networks.

# One adjacency file per selected network, in the same order as string_nets.
network_files = [
    f'../data/networks/{org}/{org}_string_{net_name}_adjacency.txt'
    for net_name in string_nets
]

# Gene list shared by all networks; its length is used as the gene count.
gene_file = f'../data/networks/{org}/{org}_{net}_genes.txt'
genes = textread(gene_file)
ngene = len(genes)

# Gemini

In [5]:
# Run configuration for the Gemini pipeline.
method = 'Gemini'
mixup = True

# torch_thread*num_thread is the total number of threads to be used.
num_thread, torch_thread = 2, 4

# Embedding dimension (hyperparameter): larger ndim has more representation power.
ndim = 400

# Pick the compute device, preferring Apple MPS, then CUDA, then CPU.
# Change this to your desired device.
device = torch.device(
    'mps' if torch.backends.mps.is_available()
    else 'cuda' if torch.cuda.is_available()
    else 'cpu'
)


## Calculate embedding

In [None]:
# Network integration: compute one embedding from all networks using
# random walk with restart (rwr).

# No per-network or per-node weighting is supplied for this first pass.
weights = None
node_weights = None
rwr = 'rwr'

start_time = time.time()
print(f'{method}_{org}_{net}_{ndim}')
print('[Mashup]')

# Base path for the saved embedding; later cells append further tags to it.
embd_name = f'../data/embed/{method}_{org}_{net}_{ndim}'

x = mashup_multi(
    network_files, ngene, ndim, mixup, num_thread, torch_thread, weights,
    node_weights=node_weights, rwr=rwr, device=device,
)


## Calculate embedding for clustering 

In [None]:
from functools import partial
from multiprocessing import Pool
from tqdm import tqdm

# Statistic that the clustering cell consumes. These names are configuration
# and must survive this cell unchanged: later cells build file names from them.
embed_type = 'Qsm4'  # 4th standard moment on Q(rwr)
axis = 1
level = 'network'  # network or node, we averaged on network in our research
average_type = 0  # 0 means on rwr, 1 means on log(rwr), we only implement 0 in this project

# Set to False to run out_moment_emb in-process instead of in a worker pool.
use_pool = True

# np.save does not create missing directories.
embed_dir = '../data/embed'
os.makedirs(embed_dir, exist_ok=True)

data = network_files, average_type, ngene
f = partial(out_moment_emb, data)
num_net = len(network_files)

# Process the networks in batches of num_thread; a fresh pool is opened for
# every batch, exactly as many batches as ceil(num_net / num_thread).
embeds = []
for start_idx in tqdm(range(0, num_net, num_thread)):
    idxs = np.arange(num_net)[start_idx:start_idx + num_thread]
    if use_pool:
        with Pool(processes=num_thread) as pl:
            embed_part = pl.map(f, idxs)
    else:
        embed_part = [f(idx) for idx in idxs]
    embeds.extend(embed_part)

# Each per-network result is indexed by position 0..7 below, so it is assumed
# to hold the statistics in the order Qsm1, Qm1, Qsm2, Qm2, ... Qsm4, Qm4
# (TODO confirm against out_moment_emb).
# Dedicated loop names are used so the configured embed_type/axis above are
# NOT overwritten; previously the loop left embed_type == 'Qm4'.
moment_names = [
    name
    for od in (1, 2, 3, 4)
    for name in (f'Qsm{od}', f'Qm{od}')
]
for slot, moment_name in enumerate(moment_names):
    moment_matrix = np.array([net_embeds[slot] for net_embeds in embeds])
    np.save(
        f'{embed_dir}/{net}_{org}_type{average_type}_' +
        f'{moment_name}{axis}_{level}', moment_matrix)

# Reload the statistic selected by embed_type; `c` is the clustering input.
c = np.load(
    f'{embed_dir}/{net}_{org}_type{average_type}_' +
    f'{embed_type}{axis}_{level}.npy')
c = c[:len(network_files)]
print(c.shape)



## Clustering: Affinity Propagation

In [None]:
cluster = 'ap'
from sklearn.cluster import AffinityPropagation

# Cluster the per-network moment embeddings `c` with affinity propagation.
print('run AffinityPropagation')
clustering = AffinityPropagation(
    damping=0.875, random_state=0).fit(c)
separate = clustering.labels_

print(separate)
print(set(separate))
print(len(set(separate)))

# Map the labels onto consecutive ids 0..k-1. Sorting makes the mapping
# deterministic instead of depending on set iteration order.
num2i = {num: i for i, num in enumerate(sorted(set(separate)))}
separate = [num2i[num] for num in separate]

# Create the output directory (and any missing parent) without failing if it
# already exists.
os.makedirs('../data/separate', exist_ok=True)
np.save(
    f'../data/separate/{net}_{org}_type{average_type}_' +
    f'{embed_type}{axis}_{cluster}_{level}',
    separate)


In [None]:
# Weighting scheme based on cluster size:
#   1 -> each network gets 1 / (number of networks in its cluster)
#   2 -> counts start at 1 instead of 0, and ori_weight / len(network_files)
#        is added to every cluster weight
weight = 1
ori_weight = 0.5
ori_seed = 1
mixup2 = 1.0  # number of sampled pairs per round = round(len(network_files) * mixup2)
mixup = 1     # number of mixup rounds; replaced by the string 'mixup' below
gamma = 0.5

cluster_method = cluster
axis = 1

# Load the cluster labels written by the clustering cell. The path uses
# average_type (not a hard-coded 0) so it always matches what was saved.
separate = np.load(
    f'../data/separate/{net}_{org}_type{average_type}_' +
    f'{embed_type}{axis}_{cluster}_' +
    f'{level}.npy')
num_cluster = len(set(separate))
if weight == 2:
    clus_count = np.ones(num_cluster)
elif weight == 1:
    clus_count = np.zeros(num_cluster)
else:
    raise ValueError(f'unsupported weight setting: {weight!r} (expected 1 or 2)')

separate = separate[:len(network_files)]
if len(separate) != len(network_files):
    raise ValueError(
        f'found {len(separate)} cluster labels for '
        f'{len(network_files)} networks')

# Count how many networks fall into each cluster.
for i in separate:
    clus_count[i] += 1

clus_weight = 1/clus_count
if weight == 2:
    clus_weight = clus_weight + ori_weight/len(network_files)

# One sampling weight per network, looked up from its cluster.
weights = np.array([clus_weight[i] for i in separate], dtype=float)

# Rebuild the output name from its base instead of appending to the previous
# value, so re-running this cell does not keep growing the name. The cluster
# count is used as the `separate` tag: interpolating the whole label array
# produced a file name that is too long once there are many networks.
embd_name = (
    f'../data/embed/{method}_{org}_{net}_{ndim}'
    f'_{embed_type}{axis}_'
    f'separate{num_cluster}_{cluster_method}'
    f'_weight{weight}_{ori_weight}'
)
# The labels are not used below; reset as in the original notebook.
separate = None
print(level)
embd_name += f'_{level}'

# Stays empty when mixup is disabled, so the next cell still runs.
network_files_all = []
if mixup > 0:
    network_pairs_mixup_ = []
    random.seed(1)
    print(ndim)
    # Sampling probabilities proportional to the cluster-based weights.
    p = weights/weights.sum()
    list_of_candidates = np.arange(len(network_files))
    ori_seed = int(np.floor(ori_seed*10000)/10000)
    num_pairs = round(len(network_files)*mixup2)
    for idd in range(mixup):
        # Reseed per round so every round is reproducible on its own.
        np.random.seed(idd+ori_seed)
        network_pairs_mixup = []
        for _ in range(num_pairs):
            # Drawn with replacement, so a network can be paired with itself.
            d0, d1 = np.random.choice(list_of_candidates, 2, p=p)
            network_pairs_mixup.append(
                [network_files[d0], 1, network_files[d1], 1])
        network_pairs_mixup_.append(network_pairs_mixup)
    # From here on `mixup` is a mode string; the next cell passes it to load_multi.
    mixup = 'mixup'
    embd_name += f'_mixup{mixup}_{mixup2}'
    embd_name += f'_gamma{gamma}'
    network_files_all = network_pairs_mixup_
    

In [None]:
# Build the final embedding: one load_multi call per mixup round, with the
# per-round results concatenated along axis 0.
print('Using multiply time mixup to make new embeding')
xs = []
# A dedicated loop name keeps the global `network_files` (the list of
# adjacency paths) intact, so earlier cells can still be re-run afterwards.
for network_pairs in network_files_all:
    xs.append(load_multi(network_pairs, ngene, ndim,
                         mixup, num_thread,
                         torch_thread,
                         weights,
                         node_weights=node_weights,
                         gamma=gamma, device=device))
# With no mixup rounds, `x` keeps the value computed by an earlier cell.
if len(xs) > 0:
    x = np.concatenate(xs, axis=0)

# np.save does not create missing directories.
out_dir = os.path.dirname(embd_name)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)
np.save(embd_name, x)
# x is the embedding we need