In [2]:
import networkx as nx

Attributes should live on nodes, not edges, but `parse_edgelist`/`read_edgelist` place attributes on edges.
Plan: keep one file with the edge list and one file with the nodes and their attributes; the two can easily be combined to create a networkx graph.

In [None]:
# Load the follower network from a CSV edge list into a directed graph.
# parse_edgelist stores the parsed extra column on the *edges* (as "weight").
DiGraph = nx.DiGraph()

# The context manager guarantees the file handle is closed, even if parsing
# raises; the original left the handle open for the lifetime of the kernel.
with open('Datasets/hedden_network_with_followers_verified.csv', "r") as data:
    next(data, None)  # skip the first line in the input file (presumably a header row - confirm)
    G = nx.parse_edgelist(data, delimiter=',', create_using=DiGraph,
                          nodetype=int, data=(('weight', float),))

In the meantime, use a dummy directed graph.

In [3]:
import random

# Build a random directed graph and attach synthetic "followers" and
# "verified" attributes to every node, then export it for Gephi.
num_nodes = 1000
prob = 0.001
DummyGraph = nx.erdos_renyi_graph(n=num_nodes, p=prob, directed=True)

attr_dict = {}
for node in range(num_nodes):
    # followers = number of incoming edges
    follower_count = DummyGraph.in_degree(node)
    # weighted coin flip: the larger the share of the graph that follows
    # this node, the more likely it is to be marked as verified
    is_verified = follower_count / num_nodes > random.uniform(0, 1)
    attr_dict[node] = {"followers": follower_count, "verified": is_verified}

nx.set_node_attributes(DummyGraph, attr_dict)

# quick check on node 0 and on the overall number of verified nodes
first_node = DummyGraph.nodes[0]
print(first_node["followers"])
print(first_node["verified"])
verified = nx.get_node_attributes(DummyGraph, "verified")
print("number of verified nodes:", sum(verified.values()))

nx.write_gexf(DummyGraph, "test.gexf")

1
False
number of verified nodes: 1


Convert .csv network to .gexf file format

In [4]:
import networkx as nx

# Convert the scraped edge list into a GEXF file that Gephi can open.
G = nx.DiGraph()

# Use a context manager so the input file is closed once parsing finishes
# (the original never closed the handle).
# NOTE(review): the file is named .csv but is split on single spaces - confirm
# that this matches the scraper's output format.
with open('cs4352-final-project/Scraping/dejan.csv', "r") as data:
    SoccerGraph = nx.parse_edgelist(data, delimiter=' ', create_using=G)

nx.write_gexf(SoccerGraph, "dejan_gephi.gexf")

Convert .gml network to .gexf file format

In [5]:
# Read the GML export, relabel the nodes with consecutive integers (the
# previous label is stored in each node's "id" attribute), and write the
# result out as GEXF.
gml_graph = nx.read_gml("Scraping/Dejan-Full-Node-Info.gml")
SoccerGraph = nx.convert_node_labels_to_integers(gml_graph, label_attribute="id")
nx.write_gexf(SoccerGraph, "dejan_full_node_info.gexf")

Inspect individual node attributes

In [6]:
# Number of nodes in the loaded graph (shown as the cell's output).
SoccerGraph.number_of_nodes()

685

## GCN

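Convert the networkx graph to a PyTorch Geometric `Data` object, and cast the node features `x` to floats (leaving them as integers triggers the "expected scalar type Long but found Float" error).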

In [11]:
import torch
from torch_geometric.utils.convert import from_networkx

# Build the PyTorch Geometric dataset from the scraped follower graph.
SoccerGraph = nx.read_gml("Scraping/Dejan-Full-Node-Info.gml")
SoccerGraph = nx.convert_node_labels_to_integers(SoccerGraph, label_attribute="id")

# Rename the 'verified' node attribute to 'y' before conversion so that it
# becomes the label tensor `data.y`. The loop variable is deliberately NOT
# called `data`, to avoid reusing that name for two different kinds of object.
for _, node_attrs in SoccerGraph.nodes(data=True):
    node_attrs["y"] = node_attrs.pop('verified')

# These node attributes are stacked into the feature matrix `data.x`.
feature_attrs = ['follower_count', 'friends_count', 'listed_count',
                 'favourites_count', 'statuses_count']
data = from_networkx(SoccerGraph, group_node_attrs=feature_attrs)
print(data)

# equivalent of dataset.num_node_features: one column per grouped attribute
num_node_features = data.x.shape[1]
num_classes = 2  # verified (y) is binary
print(num_node_features)

# CRUCIAL: cast the features to float. Without this cast the model fails with
# 'expected scalar type Long but found Float'. `.float()` keeps the tensor on
# its current device instead of forcing a CPU tensor type.
data.x = data.x.float()
    
    

Data(edge_index=[2, 7258], y=[685], name=[685], screen_name=[685], protected=[685], geo_enabled=[685], contributors_enabled=[685], is_translator=[685], is_translation_enabled=[685], profile_use_background_image=[685], has_extended_profile=[685], default_profile=[685], default_profile_image=[685], id=[685], x=[685, 5])
5


Split the data into train, validation, and test sets

In [12]:
# Split the nodes into train / validation / test masks.
# ps 3 - train 140, val 500, test 1000
# pytorch docs: 'As a rule of thumb, we use 20% of the training set as the validation set.'
import numpy as np

# Going with 60 training, 20 val, 20 test
train_ratio = 0.6
val_ratio = 0.2
test_ratio = 1 - train_ratio - val_ratio

num_nodes = data.x.shape[0]
num_train = int(num_nodes * train_ratio)
num_val = int(num_nodes * val_ratio)
# The test set takes every node that is left over. (The original computed
# `1 - num_train - num_val`, which is a negative number, not a node count.)
num_test = num_nodes - num_train - num_val

# Shuffle the node indices with a fixed seed so that the split - and therefore
# every accuracy reported further down - is reproducible across kernel restarts.
SPLIT_SEED = 0
idx = np.random.default_rng(SPLIT_SEED).permutation(num_nodes).tolist()

# Boolean masks shaped like data.y (and on the same device): True = node is in the split.
train_mask = torch.zeros_like(data.y, dtype=torch.bool)
train_mask[idx[:num_train]] = True

val_mask = torch.zeros_like(data.y, dtype=torch.bool)
val_mask[idx[num_train:num_train + num_val]] = True

test_mask = torch.zeros_like(data.y, dtype=torch.bool)
test_mask[idx[num_train + num_val:]] = True

# The three masks must partition the node set: expected sizes, no overlap.
assert int(train_mask.sum()) == num_train
assert int(val_mask.sum()) == num_val
assert int(test_mask.sum()) == num_test
assert not (train_mask & val_mask).any()
assert not (train_mask & test_mask).any()
assert not (val_mask & test_mask).any()

data['train_mask'] = train_mask
data['val_mask'] = val_mask
data['test_mask'] = test_mask



In [13]:
# Sanity-check the split: for the train, val and test masks (in that order)
# print "<nodes selected> <nodes not selected>".
# A single loop with a vectorised sum replaces three copy-pasted
# element-by-element Python loops; the printed output is the same.
for mask in (data.train_mask, data.val_mask, data.test_mask):
    count_true = int(mask.sum())
    count_false = mask.numel() - count_true
    print(count_true, count_false)


411 274
137 548
137 548


In [15]:
# remove unnecessary lists
data.__delattr__('name')
data.__delattr__('screen_name')
data.__delattr__('protected')
data.__delattr__('geo_enabled')
data.__delattr__('contributors_enabled')
data.__delattr__('is_translator')
data.__delattr__('is_translation_enabled')
data.__delattr__('profile_use_background_image')
data.__delattr__('has_extended_profile')
data.__delattr__('default_profile')
data.__delattr__('default_profile_image')
data.__delattr__('id')

print(data)

Data(edge_index=[2, 7258], y=[685], x=[685, 5], train_mask=[685], val_mask=[685], test_mask=[685])


Define GCN

In [16]:
# train partially on a general twitter network, and then finetune for each specific network?

import torch.nn.functional as F
from torch_geometric.nn import GCNConv


class Twitter_GCN(torch.nn.Module):
    """Two-layer graph convolutional network for node classification.

    The network has two GCNConv layers (one hidden layer plus the output
    layer), not two hidden layers:
    ``in_channels -> hidden_channels -> out_channels``.

    Args:
        in_channels: Number of input features per node. Defaults to the
            notebook-level ``num_node_features``.
        hidden_channels: Width of the hidden layer (16 by default).
        out_channels: Number of output classes. Defaults to the
            notebook-level ``num_classes``.
    """

    def __init__(self, in_channels=None, hidden_channels=16, out_channels=None):
        super().__init__()
        # Fall back to the notebook globals so that `Twitter_GCN()` builds
        # exactly the same model as before.
        if in_channels is None:
            in_channels = num_node_features
        if out_channels is None:
            out_channels = num_classes
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def forward(self, data):
        """Return per-node log-probabilities over the classes.

        Args:
            data: Object exposing ``x`` (node features) and ``edge_index``.

        Returns:
            Tensor with one row of log-softmax scores per node.
        """
        x, edge_index = data.x, data.edge_index

        x = self.conv1(x, edge_index)
        x = F.relu(x)
        # Dropout is only active in training mode (model.train()).
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)

        # log_softmax output pairs with F.nll_loss in the training loop.
        return F.log_softmax(x, dim=1)

# Run on the GPU when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Move the graph to the chosen device before building the model.
data = data.to(device)

# Optional sanity check of the Data object (left disabled):
# data.validate(raise_on_error=True)

model = Twitter_GCN().to(device)

# Adam with L2 regularisation applied through weight_decay.
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=0.01,
    weight_decay=5e-4,
)


Train the model (evaluation on the test set follows in the next cell)

In [17]:
# Full-batch training: every epoch runs the whole graph through the model,
# but only the nodes selected by train_mask contribute to the loss.
model.train()
train_nodes = data.train_mask
for epoch in range(200):
    optimizer.zero_grad()
    log_probs = model(data)
    loss = F.nll_loss(log_probs[train_nodes], data.y[train_nodes])
    loss.backward()
    optimizer.step()

evaluation

In [18]:
# Evaluate the trained model on the held-out test nodes.
model.eval()  # disables dropout
# No gradients are needed for inference; no_grad avoids building an autograd
# graph for the forward pass.
with torch.no_grad():
    pred = model(data).argmax(dim=1)
correct = (pred[data.test_mask] == data.y[data.test_mask]).sum()
acc = int(correct) / int(data.test_mask.sum())
print(f'Accuracy: {acc:.4f}')

# FROM DATASET IN PROB 3 LOOKS LIKE 20% IS USED TO TRAIN AND 80% TO TEST INSTEAD OF VICE VERSA --
# IS THIS WHY ACCURACY IS SO HIGH??
# NOTE(review): accuracy alone can also look high when one class dominates -
# check the share of verified nodes in data.y before trusting this number.


Accuracy: 0.9708


Switch to evaluating on `val_mask` and tune the hyperparameters, just like in homework 3.

In [22]:
import statistics

class GCN(torch.nn.Module):
    """GCN with a configurable number of 16 -> 16 middle layers.

    The stack is one input conv (num_node_features -> 16), followed by
    ``num_layers`` convs of 16 -> 16, followed by one output conv
    (16 -> num_classes), i.e. ``num_layers + 2`` GCNConv layers in total.
    ``num_node_features`` and ``num_classes`` are read from the notebook's
    globals when the model is constructed.
    """

    def __init__(self, num_layers):
        super().__init__()
        convs = [GCNConv(num_node_features, 16)]
        convs.extend(GCNConv(16, 16) for _ in range(num_layers))
        convs.append(GCNConv(16, num_classes))
        self.layers = torch.nn.ModuleList(convs)

    def forward(self, data):
        """Return per-node log-probabilities for ``data`` (uses x and edge_index)."""
        x, edge_index = data.x, data.edge_index

        last = len(self.layers) - 1
        for depth, conv in enumerate(self.layers):
            x = conv(x, edge_index)
            if depth == last:
                # the output layer gets no ReLU / dropout
                break
            x = F.dropout(F.relu(x), training=self.training)

        return F.log_softmax(x, dim=1)


# Grid search over depth, L2 regularisation and learning rate.
# Each configuration is trained NUM_ITERS times from scratch and scored on the
# validation nodes; the mean and population standard deviation are recorded.
import itertools

num_layers = [2, 3]
l2_regs = [0, 0.0001, 0.001]
learning_rates = [0.001, 0.01, 0.05]

NUM_ITERS = 5

all_trials = []
# product() visits the configurations in the same order as three nested loops
# (learning rate varies fastest).
for n_hidden, l2_reg, learning_rate in itertools.product(num_layers, l2_regs, learning_rates):
    results = []

    for _ in range(NUM_ITERS):
        model = GCN(n_hidden).to(device)
        optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=l2_reg)

        model.train()
        for epoch in range(200):
            optimizer.zero_grad()
            out = model(data)
            loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
            loss.backward()
            optimizer.step()

        model.eval()
        # Inference only: skip autograd bookkeeping.
        with torch.no_grad():
            pred = model(data).argmax(dim=1)
        correct = (pred[data.val_mask] == data.y[data.val_mask]).sum()
        acc = int(correct) / int(data.val_mask.sum())
        print(n_hidden, l2_reg, learning_rate)
        print(f'Accuracy: {acc:.4f}')

        results.append(acc)

    avg = sum(results) / len(results)
    stdev = statistics.pstdev(results)
    all_trials.append(((n_hidden, l2_reg, learning_rate), avg, stdev))

print(("Number of layers", "L2 regularization", "Learning rate"), "Average", "Standard Dev.")
for t in all_trials:
    print(t)

# Do NOT bind the result to the name `max`: that shadows the builtin, so the
# next run of this cell would fail with "'tuple' object is not callable".
best_trial = max(all_trials, key=lambda trial: trial[1])
print(best_trial)

2 0 0.001
Accuracy: 1.0000
2 0 0.001
Accuracy: 0.9708
2 0 0.001
Accuracy: 1.0000
2 0 0.001
Accuracy: 1.0000
2 0 0.001
Accuracy: 0.9489
2 0 0.01
Accuracy: 1.0000
2 0 0.01
Accuracy: 1.0000
2 0 0.01
Accuracy: 1.0000
2 0 0.01
Accuracy: 1.0000


KeyboardInterrupt: 

Fit a linear regression model as a baseline to compare with the GCN.

In [24]:
import numpy as np
from sklearn.linear_model import LinearRegression


# Baseline: ordinary least squares on the raw node features.
# `data` was moved to `device` earlier in the notebook; scikit-learn needs
# host-side arrays, so copy the tensors to CPU numpy explicitly (passing CUDA
# tensors straight to sklearn fails).
features_np = data.x.detach().cpu().numpy()
labels_np = data.y.detach().cpu().numpy()

# NOTE(review): this rebinds `model` (previously the GCN), and the score below
# is an in-sample R^2 over all nodes - it is not directly comparable to the
# GCN's held-out accuracy.
model = LinearRegression().fit(features_np, labels_np)

r_sq = model.score(features_np, labels_np)
print(f"coefficient of determination: {r_sq}")

coefficient of determination: 0.4021911191578005


Can we train a model on one Twitter network and then apply it to other people's networks?