In [1]:
import networkx as nx

Attributes should be stored on nodes, not on edges, but `parse_edgelist`/`read_edgelist` place any parsed attributes on edges.
Plan: keep one file with the edge list and one file with the nodes and their attributes; the two can then be combined easily to create a networkx graph.

In [None]:
# Load the follower network from a comma-separated edge list into a directed
# graph. Node ids are parsed as ints and the third column is stored as a float
# edge attribute named 'weight'.
# The context manager closes the file handle as soon as parsing is done; the
# original bare open() left it open for the lifetime of the kernel.
with open('Datasets/hedden_network_with_followers_verified.csv', "r") as data:
    next(data, None)  # skip the first line in the input file
    DiGraph = nx.DiGraph()

    G = nx.parse_edgelist(data, delimiter=',', create_using=DiGraph,
                          nodetype=int, data=(('weight', float),))

In the meantime, use a dummy directed graph.

In [2]:
import random

# Random directed graph used as a placeholder network.
num_nodes = 1000
prob = 0.001
DummyGraph = nx.erdos_renyi_graph(n=num_nodes, p=prob, directed=True)

# Per-node attributes: "followers" is the number of predecessors (in-degree),
# and "verified" is drawn at random with probability followers / num_nodes.
attr_dict = {}
for node in range(num_nodes):
    follower_total = DummyGraph.in_degree(node)
    is_verified = random.uniform(0, 1) < follower_total / num_nodes
    attr_dict[node] = {"followers": follower_total, "verified": is_verified}

nx.set_node_attributes(DummyGraph, attr_dict)

# Quick check: show node 0's attributes and count the verified nodes.
first_node = DummyGraph.nodes[0]
print(first_node["followers"])
print(first_node["verified"])
verified = nx.get_node_attributes(DummyGraph, "verified")
print("number of verified nodes:", sum(verified.values()))

# Export the graph in GEXF format.
nx.write_gexf(DummyGraph, "test.gexf")

0
False
number of verified nodes: 4


Convert .csv network to .gexf file format

In [3]:
import networkx as nx

# Read the space-delimited edge list into a directed graph and export it as
# GEXF.
# NOTE(review): this path starts at 'cs4352-final-project/' while the GML cells
# read from 'Scraping/...' directly -- confirm which working directory the
# notebook is meant to run from.
G = nx.DiGraph()

# The context manager closes the file once the edges are parsed; the original
# bare open() never released the handle.
with open('cs4352-final-project/Scraping/dejan.csv', "r") as data:
    SoccerGraph = nx.parse_edgelist(data, delimiter=' ', create_using=G)

nx.write_gexf(SoccerGraph, "dejan_gephi.gexf")

Convert .gml network to .gexf file format

In [4]:
# Load the GML graph, relabel its nodes with consecutive integers (the original
# label is kept on each node under the "id" attribute), then export as GEXF.
SoccerGraph = nx.convert_node_labels_to_integers(
    nx.read_gml("Scraping/Dejan-Full-Node-Info.gml"),
    label_attribute="id",
)

nx.write_gexf(SoccerGraph, "dejan_full_node_info.gexf")

Inspect individual node attributes

In [10]:
# Number of nodes in the graph (displayed as the cell's output).
SoccerGraph.number_of_nodes()

685

GCN

Convert the networkx graph into a PyTorch Geometric `Data` object and cast the node-feature matrix `x` to float32 (the labels `y` stay integer Longs).

In [26]:
import torch
from torch_geometric.utils.convert import from_networkx

# Reload the GML graph and relabel nodes with consecutive integers; the
# original label is kept on each node under the "id" attribute.
SoccerGraph = nx.read_gml("Scraping/Dejan-Full-Node-Info.gml")
SoccerGraph = nx.convert_node_labels_to_integers(SoccerGraph, label_attribute="id")

# Rename each node's 'verified' attribute to 'y' before conversion so that it
# ends up as the label tensor `data.y`.
# The loop variable is deliberately NOT called `data`, which is the name of the
# converted dataset created just below.
for _, node_attrs in SoccerGraph.nodes(data=True):
    node_attrs["y"] = node_attrs.pop('verified')

# The five count attributes are stacked column-wise into the feature matrix x.
data = from_networkx(SoccerGraph, group_node_attrs=['follower_count','friends_count','listed_count','favourites_count','statuses_count'])
print(data)

# equivalent of dataset.num_node_features
num_node_features = data.x.shape[1]
num_classes = 2 # verified (y) is binary
print(num_node_features)

# Fix for the 'expected scalar type Long but found Float' error: cast the whole
# feature matrix to float32 in one step. Casting row by row
# (data.x[i] = data.x[i].type(...)) has no effect, because writing into the
# existing tensor converts the values back to that tensor's dtype.
# .to(torch.float32) also keeps the tensor on its current device.
data.x = data.x.to(torch.float32)
print(data.x.dtype)
    
    

Data(edge_index=[2, 7258], y=[685], name=[685], screen_name=[685], protected=[685], geo_enabled=[685], contributors_enabled=[685], is_translator=[685], is_translation_enabled=[685], profile_use_background_image=[685], has_extended_profile=[685], default_profile=[685], default_profile_image=[685], id=[685], x=[685, 5])
5
torch.float32


Split up data into train and test

In [27]:
# Random node-level split: the first `train_ratio` share of a shuffled index
# list becomes the training set, the remainder the test set.
train_ratio = 0.8
num_nodes = data.x.size(0)
num_train = int(num_nodes * train_ratio)
idx = list(range(num_nodes))

import numpy as np
np.random.shuffle(idx)

# Boolean masks with the same shape and device as the label tensor.
train_mask = torch.zeros_like(data.y, dtype=torch.bool)
train_mask[idx[:num_train]] = True
# idx is a permutation of every node index, so the nodes that were not picked
# for training are exactly idx[num_train:].
test_mask = ~train_mask

data.train_mask = train_mask
data.test_mask = test_mask


In [16]:
# dtype sanity checks.
# A tensor has a single dtype shared by all of its rows, so one check on data.x
# is enough; the former per-row loop printed one identical line per node.
# NOTE: once x has been cast to float32 in the conversion cell, this check is
# expected to fire.
if torch.is_floating_point(data.x):
    print("float row")

# Labels must stay integer-typed for the NLL loss used in training.
print(data.y.dtype)
if torch.is_floating_point(data.y):
    print("float")

# remove unnecessary lists: node attributes that are not used as features or
# labels.
unused_node_attrs = (
    'name',
    'screen_name',
    'protected',
    'geo_enabled',
    'contributors_enabled',
    'is_translator',
    'is_translation_enabled',
    'profile_use_background_image',
    'has_extended_profile',
    'default_profile',
    'default_profile_image',
    'id',
)
for attr_name in unused_node_attrs:
    # Skip attributes that are already gone so the cell can be re-run safely.
    if attr_name in data:
        delattr(data, attr_name)
print(data.x)

torch.int64
tensor([[   160,    819,      0,   6881,    251],
        [    83,    937,      0,   4160,    106],
        [   221,    790,      0,   3927,   2038],
        ...,
        [   319,    392,      4,   5284,   4587],
        [   606,   2088,      3, 440321,   6319],
        [   228,    767,      5,  44066,   1007]])


Define GCN

In [28]:
# train partially on a general twitter network, and then finetune for each specific network?

import torch.nn.functional as F
from torch_geometric.nn import GCNConv


class Twitter_GCN(torch.nn.Module):
    """Two-layer graph convolutional network for node classification.

    Architecture: GCNConv -> ReLU -> dropout -> GCNConv -> log-softmax, i.e.
    one hidden layer followed by an output layer.

    Args:
        in_channels: Number of input features per node. Defaults to the
            module-level ``num_node_features`` when omitted.
        hidden_channels: Width of the hidden layer (default 16).
        out_channels: Number of output classes. Defaults to the module-level
            ``num_classes`` when omitted.
    """

    def __init__(self, in_channels=None, hidden_channels=16, out_channels=None):
        super().__init__()
        # Fall back to the notebook-level globals so that the existing call
        # `Twitter_GCN()` keeps building exactly the same model.
        if in_channels is None:
            in_channels = num_node_features
        if out_channels is None:
            out_channels = num_classes

        # one hidden layer followed by the output layer
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def forward(self, data):
        """Return per-node log-probabilities over the classes.

        Args:
            data: Object exposing ``x`` (node features) and ``edge_index``.

        Returns:
            The log-softmax (over dim 1) of the second convolution's output.
        """
        x, edge_index = data.x, data.edge_index

        x = self.conv1(x, edge_index)
        x = F.relu(x)
        # Dropout is only active while the module is in training mode.
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)

        return F.log_softmax(x, dim=1)

# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Model and dataset must live on the same device.
model = Twitter_GCN().to(device)
data = data.to(device)

# Optional sanity check of the dataset: data.validate(raise_on_error=True)

# Adam with L2 regularisation (weight decay).
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=0.01,
    weight_decay=5e-4,
)


Train the model on the training split (evaluation on the test split follows)

In [29]:
# Full-batch training: every epoch runs the model on the whole graph, but only
# the nodes selected by train_mask contribute to the loss.
model.train()

# The training labels do not change between epochs, so select them once.
train_targets = data.y[data.train_mask]

for epoch in range(200):
    optimizer.zero_grad()
    out = model(data)
    loss = F.nll_loss(input=out[data.train_mask], target=train_targets)
    loss.backward()
    optimizer.step()

evaluation

In [30]:
# Evaluate on the held-out nodes selected by test_mask.
model.eval()

# Inference needs no gradients; no_grad() stops autograd from recording the
# forward pass, which saves memory and time.
with torch.no_grad():
    pred = model(data).argmax(dim=1)

correct = (pred[data.test_mask] == data.y[data.test_mask]).sum()
acc = int(correct) / int(data.test_mask.sum())
# NOTE(review): if one class dominates the labels, plain accuracy can look high
# even for a model that always predicts that class -- consider comparing with
# the majority-class rate.
print(f'Accuracy: {acc:.4f}')

Accuracy: 0.9854
