In [2]:
import networkx as nx

attributes should be on nodes, not edges. parse/read_edgelist will place attributes on edges.
one file with edgelist, one file with nodes and their attributes. can combine these easily to create a networkx graph.

In [None]:
data = open('Datasets/hedden_network_with_followers_verified.csv', "r")
next(data, None)  # skip the first line in the input file
DiGraph = nx.DiGraph()

G = nx.parse_edgelist(data, delimiter=',', create_using=DiGraph,
                      nodetype=int, data=(('weight', float),))

in the meantime - use a dummy digraph

In [37]:
import random

num_nodes = 1000
prob = 0.001
DummyGraph = nx.erdos_renyi_graph(n=num_nodes, p=prob, directed=True)
attr_dict = {}
for n in range(num_nodes):
    # get followers
    f = len(DummyGraph.pred[n])
    # simple weighted random assignment of verified status
    v = False
    if f/num_nodes > random.uniform(0,1):
        v = True
    attr_dict[n] = {"followers": f, "verified": v}

nx.set_node_attributes(DummyGraph, attr_dict)

# test
print(DummyGraph.nodes[0]["followers"])
print(DummyGraph.nodes[0]["verified"])
verified = nx.get_node_attributes(DummyGraph, "verified")
print("number of verified nodes:", sum(verified.values()))

nx.write_gexf(DummyGraph, "test.gexf")

1
False
number of verified nodes: 1


Convert .csv network to .gexf file format

In [4]:
import networkx as nx

data = open('cs4352-final-project/Scraping/dejan.csv', "r")
G = nx.DiGraph()

SoccerGraph = nx.parse_edgelist(data, delimiter=' ', create_using=G)
nx.write_gexf(SoccerGraph, "dejan_gephi.gexf")

Convert .gml network to .gexf file format

In [15]:
SoccerGraph = nx.read_gml("Scraping/Dejan-Full-Node-Info.gml")
SoccerGraph = nx.convert_node_labels_to_integers(SoccerGraph, label_attribute="id")

nx.write_gexf(SoccerGraph, "dejan_full_node_info.gexf")

Inspect individual node attributes

In [23]:
SoccerGraph.nodes()[40]

{'follower_count': 1028023,
 'verified': 1,
 'name': 'Rob Perez',
 'screen_name': 'WorldWideWob',
 'protected': 0,
 'friends_count': 1046766,
 'listed_count': 6431,
 'favourites_count': 49205,
 'geo_enabled': 1,
 'statuses_count': 17582,
 'contributors_enabled': 0,
 'is_translator': 0,
 'is_translation_enabled': 0,
 'profile_use_background_image': 1,
 'has_extended_profile': 1,
 'default_profile': 0,
 'default_profile_image': 0,
 'id': '24897626.0'}

GCN

Convert networkx graph to dataset accessible by pytorch, cast all floats to integers (Longs)

In [60]:
from torch_geometric.utils.convert import from_networkx

SoccerGraph = nx.read_gml("Scraping/Dejan-Full-Node-Info.gml")
SoccerGraph = nx.convert_node_labels_to_integers(SoccerGraph, label_attribute="id")

# rename verified to y attribute (verified) to graph before conversion 
for n, data in SoccerGraph.nodes(data=True):
    data["y"] = data.pop('verified')

data = from_networkx(SoccerGraph, group_node_attrs=['follower_count','friends_count','listed_count','favourites_count','statuses_count'])
print(data)

# equivalent of dataset.num_node_features
num_node_features = len(data.x[0])
num_classes = 2 # verified (y) is binary


# TRYING TO FIX 'expected scalar type Long but found Float' error
for row in data.x:
    row = row.type(torch.LongTensor)

Data(edge_index=[2, 7258], y=[685], name=[685], screen_name=[685], protected=[685], geo_enabled=[685], contributors_enabled=[685], is_translator=[685], is_translation_enabled=[685], profile_use_background_image=[685], has_extended_profile=[685], default_profile=[685], default_profile_image=[685], id=[685], x=[685, 5])


Split up data into train and test

In [64]:
# Split the data 
train_ratio = 0.8
num_nodes = data.x.shape[0]
num_train = int(num_nodes * train_ratio)
idx = [i for i in range(num_nodes)]
import numpy as np
np.random.shuffle(idx)
train_mask = torch.full_like(data.y, False, dtype=bool)
train_mask[idx[:num_train]] = True
test_mask = torch.full_like(data.y, False, dtype=bool)
test_mask[idx[num_train:]] = True

# print(train_mask)
# tensor([ True, False, False, False, False])
# print(test_mask)
# tensor([False,  True,  True,  True,  True])

Define GCN

In [62]:
# use complete soccer data with attributes
# 4 layers (?) because our graph won't be that 'deep'
# gnn scalability considerations
# train partially on a general twitter network, and then finetune for each specific network?

import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv


class Twitter_GCN(torch.nn.Module):
    def __init__(self):
        super().__init__()
        # two hidden layers
        self.conv1 = GCNConv(num_node_features, 16)
        self.conv2 = GCNConv(16, num_classes)
        
    def forward(self, data):
        x, edge_index = data.x, data.edge_index

        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)

        return F.log_softmax(x, dim=1)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

data = data.to(device)
data.validate(raise_on_error=True)

model = Twitter_GCN().to(device) 

model.train()

# can test params similar to problem set 3 - test out different numbers of layers, 
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)


In [None]:
# train, test

In [63]:
model.train()
for epoch in range(200):
    optimizer.zero_grad()
    out = model(data)
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()
model.eval()
pred = model(data).argmax(dim=1)
correct = (pred[data.test_mask] == data.y[data.test_mask]).sum()
acc = int(correct) / int(data.test_mask.sum())
print(f'Accuracy: {acc:.4f}')

RuntimeError: expected scalar type Long but found Float