In [2]:
import matplotlib.pyplot as plt
from math import isclose
from sklearn.decomposition import PCA
import os
import networkx as nx
import numpy as np
import pandas as pd
import csv
from stellargraph import StellarGraph as sg
from stellargraph.data import EdgeSplitter
from collections import Counter
import multiprocessing
from IPython.display import display, HTML
from sklearn.model_selection import train_test_split

# Load dataset

In [28]:
# Create graph for csv file with edge weights
def parse_ckn_csv(fname):
   g = nx.DiGraph()
   fields = ['from', 'to', 'type', 'reliability']
   with open(fname, newline='') as csvfile:
       reader = csv.DictReader(csvfile, fieldnames=fields, dialect=csv.excel_tab, restkey='rest', )
       for row in reader:
           g.add_edge(row['from'], row['to'], type=row['type'], reliability=row['reliability'])
           if row['type'] == 'binding':
               g.add_edge(row['to'], row['from'], type=row['type'])
   return g

In [75]:
g = parse_ckn_csv("/Users/mmarzi/MLprojects/nib_ckn/data/LKN_ath_v3B_2016-08-30.txt")

In [76]:
# Check for errors in the graph
print(sorted(g.nodes)[:20])
print(sorted(g.nodes)[-20:])
isolated = list(nx.isolates(g))
print(f"isolated nodes={isolated}")
#g.remove_nodes_from(isolated)

['6K1', '6K2', 'AT1G01010', 'AT1G01020', 'AT1G01030', 'AT1G01040', 'AT1G01050', 'AT1G01060', 'AT1G01070', 'AT1G01080', 'AT1G01090', 'AT1G01100', 'AT1G01110', 'AT1G01120', 'AT1G01130', 'AT1G01140', 'AT1G01150', 'AT1G01160', 'AT1G01170', 'AT1G01180']
['MIR858', 'MIR859', 'MIR860', 'MIR861', 'MIR862', 'MIR863', 'MIR864', 'MIR865', 'MIR866', 'MIR867', 'MIR868', 'MIR869', 'MIR870', 'NIa-Pro', 'NIb', 'P1', 'P3', 'P3N-PIPO', 'R-gene', 'VPg']
isolated nodes=[]


In [77]:
# Some general characteristics of the graph
print(f'nodes={g.number_of_nodes()}, edges={g.number_of_edges()}, density: {nx.density(g):.4f}, weak components: {nx.number_weakly_connected_components(g)}')

nodes=20011, edges=94542, density: 0.0002, weak components: 2427


Graph is sparse with a high number of weakly connected components and has a very low density (0-1).

In [94]:
# List of weakly connected components
[len(c) for c in sorted(nx.weakly_connected_components(g), key=len, reverse=True)][:25]

[17550, 8, 7, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1]

In [93]:

from collections import Counter
from pprint import pprint
c_sizes = Counter()
for c in nx.weakly_connected_components(g):
    c_sizes[len(c)] += 1
pprint(sorted(c_sizes.items()))

[(1, 2404), (2, 19), (4, 1), (7, 1), (8, 1), (17550, 1)]


There is only one large weakly connected component that rest are negligible so we will consider only the largest component.

In [113]:
# identify the largest weakly connected component and create a corresponding subgraph
max_c = max(nx.weakly_connected_components(g))
print(f"biggest component size={len(max_c)}")
g = nx.subgraph(g, max_c)

# Some general characteristics of the subgraph
print(f"subgraph nodes={g.number_of_nodes()}, subgraph edges={g.number_of_edges()}, density: {nx.density(g):.4f}")

biggest component size=17550
subgraph nodes=17550, subgraph edges=92056, density: 0.0003


# Graph embeddings

## Node2vec embeddings

In [None]:
# Node2vec ... adapt
import node2vec
import random
random.seed(a=12345)
n2v = node2vec.Node2Vec(g, num_walks=300, walk_length=500, workers=4, quiet=True)

model = n2v.fit(window=10, min_count=3, seed=42)

## Dimensionality reduction and visualisation

In [None]:
#Naredi node2vec (100dim) in naredi umap (preveri če treba prej PCA) da narišemo sliko grafa in dodaš imena vozlišč

# vzameš samo največji graf; poišči weakly connected

# Link prediction

In [34]:
G = StellarGraph.from_networkx(G)

In [35]:
print(G.info())

StellarDiGraph: Directed multigraph
 Nodes: 20011, Edges: 94542

 Node types:
  default: [20011]
    Features: none
    Edge types: default-default->default

 Edge types:
    default-default->default: [94542]
        Weights: all 1 (default)
        Features: none


## Construct splits of the input data

In [11]:
# Define an edge splitter on the original graph:
edge_splitter_test = EdgeSplitter(G)

# Randomly sample a fraction p=0.1 of all positive links, and same number of negative links, from graph, and obtain the
# reduced graph graph_test with the sampled links removed:
G_test, examples_test, labels_test = edge_splitter_test.train_test_split(
    p=0.1, method="global"
)

print(G_test.info())

** Sampled 9454 positive and 9454 negative edges. **
StellarDiGraph: Directed multigraph
 Nodes: 20011, Edges: 85088

 Node types:
  default: [20011]
    Features: none
    Edge types: default-default->default

 Edge types:
    default-default->default: [85088]
        Weights: all 1 (default)
        Features: none


## Node embeddings

### Node2Vec 

In [None]:
# parameters
p = 1.0
q = 1.0
dimensions = 128
num_walks = 10
walk_length = 80
window_size = 10
num_iter = 1
workers = multiprocessing.cpu_count()

## Train and evaluate the link prediction model

### Evaluate the best model using the test set

### Visualise representations of link embeddings