In [36]:
import pandas as pd
from tqdm import tqdm
import json
import os
import umap
import numpy as np
import scipy.sparse as sp
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import f1_score, roc_auc_score, average_precision_score, confusion_matrix


import stellargraph as sg
from stellargraph.mapper import FullBatchNodeGenerator
from stellargraph.layer import GCN

import warnings
import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras import activations, initializers, constraints, regularizers
from tensorflow.keras.layers import Input, Layer, Lambda, Dropout, Reshape, Dense
from tensorflow.keras.callbacks import EarlyStopping

from tensorflow.keras import layers, optimizers, losses, metrics, Model
import matplotlib.pyplot as plt
import seaborn as sns

In [37]:
edges_path = 'git_web_ml/musae_git_edges.csv'
targets_path = 'git_web_ml/musae_git_target.csv'
features_path = 'git_web_ml/musae_git_features.json'

In [38]:
# Read in edges
edges = pd.read_csv(edges_path)
edges.columns = ['source', 'target'] # renaming for StellarGraph compatibility
display(edges.shape, edges)

(289003, 2)

Unnamed: 0,source,target
0,0,23977
1,1,34526
2,1,2370
3,1,14683
4,1,29982
...,...,...
288998,37527,37596
288999,37529,37601
289000,37644,2347
289001,25879,2347


In [39]:
# Read in features
with open(features_path) as json_data:
    features = json.load(json_data)
    
max_feature = np.max([v for v_list in features.values() for v in v_list])
features_matrix = np.zeros(shape = (len(list(features.keys())), max_feature+1))

i = 0
for k, vs in tqdm(features.items()):
    for v in vs:
        features_matrix[i, v] = 1
    i+=1

100%|██████████| 37700/37700 [00:01<00:00, 22719.85it/s]


In [40]:
display(max_feature, features_matrix )

4004

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [41]:
node_features = pd.DataFrame(features_matrix, index = features.keys())
display(node_features.shape, node_features.head(), node_features.dtypes)

(37700, 4005)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3995,3996,3997,3998,3999,4000,4001,4002,4003,4004
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


0       float64
1       float64
2       float64
3       float64
4       float64
         ...   
4000    float64
4001    float64
4002    float64
4003    float64
4004    float64
Length: 4005, dtype: object

In [42]:
# Read in targets
targets = pd.read_csv(targets_path)
targets.index = targets.id.astype(str)
targets = targets.loc[features.keys(), :]
display(targets.shape, targets.head(), targets.ml_target.value_counts(normalize=True))

(37700, 3)

Unnamed: 0_level_0,id,name,ml_target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,Eiryyy,0
1,1,shawflying,0
2,2,JpMCarrilho,1
3,3,SuhwanCha,0
4,4,sunilangadi2,1


0    0.741671
1    0.258329
Name: ml_target, dtype: float64

In [43]:
print(node_features.index)

Index(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
       ...
       '37690', '37691', '37692', '37693', '37694', '37695', '37696', '37697',
       '37698', '37699'],
      dtype='object', length=37700)


In [44]:
print(edges.astype(str).dtypes)
print(edges.dtypes)

source    object
target    object
dtype: object
source    int64
target    int64
dtype: object


In [45]:
G = sg.StellarGraph(node_features, edges.astype(str))

In [46]:
print(G.info())

StellarGraph: Undirected multigraph
 Nodes: 37700, Edges: 289003

 Node types:
  default: [37700]
    Features: float32 vector, length 4005
    Edge types: default-default->default

 Edge types:
    default-default->default: [289003]
        Weights: all 1 (default)
        Features: none


In [47]:
target_encoding = LabelBinarizer()

# 1000 training examples
train_pages, test_pages = train_test_split(targets, train_size=1000)
val_pages, test_pages = train_test_split(test_pages, train_size=500)

train_targets = target_encoding.fit_transform(train_pages['ml_target'])
val_targets = target_encoding.transform(val_pages['ml_target'])
test_targets = target_encoding.transform(test_pages['ml_target'])

In [48]:
# Get the adjacency matrix
A = G.to_adjacency_matrix(weighted=False)

# Add self-connections
A_t = A + sp.diags(np.ones(A.shape[0]) - A.diagonal())

# Degree matrix to the power of -1/2
D_t = sp.diags(np.power(np.array(A.sum(1)), -0.5).flatten(), 0)

# Normalise the Adjacency matrix
A_norm = A.dot(D_t).transpose().dot(D_t).todense()

In [49]:
# Define the function to get these indices
def get_node_indices(G, ids):
    # find the indices of the nodes
    node_ids = np.asarray(ids)
    flat_node_ids = node_ids.reshape(-1)

    flat_node_indices = G.node_ids_to_ilocs(flat_node_ids) # in-built function makes it really easy
    # back to the original shape
    node_indices = flat_node_indices.reshape(1, len(node_ids)) # add 1 extra dimension
    
    return node_indices

# Get indices
train_indices = get_node_indices(G, train_pages.index)
val_indices = get_node_indices(G, val_pages.index)
test_indices = get_node_indices(G, test_pages.index)

# Expand dimensions
features_input = np.expand_dims(features_matrix, 0)
A_input = np.expand_dims(A_norm, 0)

y_train = np.expand_dims(train_targets, 0)
y_val = np.expand_dims(val_targets, 0)
y_test = np.expand_dims(test_targets, 0)

In [50]:
# Initialise the generator
generator = FullBatchNodeGenerator(G, method="gcn")

# Use the .flow method to prepare it for use with GCN
train_gen = generator.flow(train_pages.index, train_targets)
val_gen = generator.flow(val_pages.index, val_targets)
test_gen = generator.flow(test_pages.index, test_targets)

# Build necessary layers
gcn = GCN(
    layer_sizes=[32, 32], activations=["relu", "relu"], generator=generator, dropout=0.5
)

# Access the input and output tensors
x_inp, x_out = gcn.in_out_tensors()

# Pass the output tensor through the dense layer with sigmoid
predictions = layers.Dense(units=train_targets.shape[1], activation="sigmoid")(x_out)

model = Model(inputs=x_inp, outputs=predictions)
model.compile(
    optimizer=optimizers.Adam(lr=0.01),
    loss=losses.binary_crossentropy,
    metrics=["acc"],
)

Using GCN (local pooling) filters...


  super(Adam, self).__init__(name, **kwargs)


In [51]:
# Train the model
history = model.fit(
    train_gen,
    epochs=200,
    validation_data=val_gen,
    verbose=2,
    shuffle=False,  # this should be False, since shuffling data means shuffling the whole graph
    callbacks=[es_callback],
)

new_preds = model.predict(test_gen)

auc, pr, f_score = evaluate_preds(test_targets.ravel(),new_preds[0].ravel())

Epoch 1/200
1/1 - 5s - loss: 0.6868 - acc: 0.7250 - val_loss: 0.6250 - val_acc: 0.7240 - 5s/epoch - 5s/step
Epoch 2/200
1/1 - 4s - loss: 0.6118 - acc: 0.7580 - val_loss: 0.5781 - val_acc: 0.7240 - 4s/epoch - 4s/step
Epoch 3/200
1/1 - 4s - loss: 0.5506 - acc: 0.7580 - val_loss: 0.5634 - val_acc: 0.7240 - 4s/epoch - 4s/step
Epoch 4/200
1/1 - 4s - loss: 0.5152 - acc: 0.7580 - val_loss: 0.5811 - val_acc: 0.7240 - 4s/epoch - 4s/step
Epoch 5/200
1/1 - 4s - loss: 0.5205 - acc: 0.7580 - val_loss: 0.5785 - val_acc: 0.7240 - 4s/epoch - 4s/step
Epoch 6/200
1/1 - 4s - loss: 0.5140 - acc: 0.7580 - val_loss: 0.5521 - val_acc: 0.7240 - 4s/epoch - 4s/step
Epoch 7/200
1/1 - 4s - loss: 0.4902 - acc: 0.7580 - val_loss: 0.5205 - val_acc: 0.7240 - 4s/epoch - 4s/step
Epoch 8/200
1/1 - 4s - loss: 0.4628 - acc: 0.7600 - val_loss: 0.4934 - val_acc: 0.7260 - 4s/epoch - 4s/step
Epoch 9/200
1/1 - 4s - loss: 0.4457 - acc: 0.7610 - val_loss: 0.4741 - val_acc: 0.7520 - 4s/epoch - 4s/step
Epoch 10/200
1/1 - 4s - loss

In [55]:
print(A_input.shape)

(1, 37700, 37700)


In [53]:
# Define the embedding model
embedding_model = Model(inputs=x_inp, outputs=model.layers[-2].output)

# Get indices of all nodes
all_indices = get_node_indices(G, targets.index)

#Get embeddings
emb = embedding_model.predict([features_input, all_indices, A_input])
print(emb.shape)
# Shape: (1, 37700, 32)

# UMAP for visualisation
u = umap.UMAP(random_state=42)
umap_embs = u.fit_transform(emb[0])

#Plot the embeddingsembe
plt.figure(figsize=(20,10))
ax = sns.scatterplot(x = umap_embs[:, 0], y = umap_embs[:, 1], hue = targets['ml_target'])

MemoryError: Unable to allocate 5.29 GiB for an array with shape (1, 37700, 37700) and data type float32