In [1]:
import torch #The torch package contains data structures for multi-dimensional tensors and mathematical operations over these are defined.
import torchvision #The torchvision package consists of popular datasets, model architectures, and common image transformations for computer vision.
import torch.nn as nn
import numpy as np
import torchvision.transforms as transforms
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.nn import SAGEConv
import pandas as pd

In [2]:
edge_dataset=pd.read_csv('large_twitch_edges.csv')

In [3]:
features_dataset=pd.read_csv('large_twitch_features.csv')

In [4]:
edge_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6797557 entries, 0 to 6797556
Data columns (total 2 columns):
 #   Column        Dtype
---  ------        -----
 0   numeric_id_1  int64
 1   numeric_id_2  int64
dtypes: int64(2)
memory usage: 103.7 MB


In [5]:
edge_dataset.head()

Unnamed: 0,numeric_id_1,numeric_id_2
0,98343,141493
1,98343,58736
2,98343,140703
3,98343,151401
4,98343,157118


In [6]:
features_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 168114 entries, 0 to 168113
Data columns (total 9 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   views         168114 non-null  int64 
 1   mature        168114 non-null  int64 
 2   life_time     168114 non-null  int64 
 3   created_at    168114 non-null  object
 4   updated_at    168114 non-null  object
 5   numeric_id    168114 non-null  int64 
 6   dead_account  168114 non-null  int64 
 7   language      168114 non-null  object
 8   affiliate     168114 non-null  int64 
dtypes: int64(6), object(3)
memory usage: 11.5+ MB


In [7]:
features_dataset.head()

Unnamed: 0,views,mature,life_time,created_at,updated_at,numeric_id,dead_account,language,affiliate
0,7879,1,969,2016-02-16,2018-10-12,0,0,EN,1
1,500,0,2699,2011-05-19,2018-10-08,1,0,EN,0
2,382502,1,3149,2010-02-27,2018-10-12,2,0,EN,1
3,386,0,1344,2015-01-26,2018-10-01,3,0,EN,0
4,2486,0,1784,2013-11-22,2018-10-11,4,0,EN,0


In [8]:
features_dataset = features_dataset.set_index('numeric_id')

In [9]:
features_dataset.head()

Unnamed: 0_level_0,views,mature,life_time,created_at,updated_at,dead_account,language,affiliate
numeric_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,7879,1,969,2016-02-16,2018-10-12,0,EN,1
1,500,0,2699,2011-05-19,2018-10-08,0,EN,0
2,382502,1,3149,2010-02-27,2018-10-12,0,EN,1
3,386,0,1344,2015-01-26,2018-10-01,0,EN,0
4,2486,0,1784,2013-11-22,2018-10-11,0,EN,0


In [10]:
edge_dataset.describe()

Unnamed: 0,numeric_id_1,numeric_id_2
count,6797557.0,6797557.0
mean,83828.01,84015.23
std,48205.13,48527.19
min,0.0,0.0
25%,42217.0,42045.0
50%,83546.0,83851.0
75%,125642.0,125957.0
max,168112.0,168113.0


In [11]:
merged_dataset = pd.merge(edge_dataset, features_dataset,left_on='numeric_id_1',right_on='numeric_id')

In [12]:
merged_dataset = merged_dataset.set_index('numeric_id_1')
merged_dataset.head()

Unnamed: 0_level_0,numeric_id_2,views,mature,life_time,created_at,updated_at,dead_account,language,affiliate
numeric_id_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
98343,141493,282,0,2086,2012-12-27,2018-09-13,0,EN,0
98343,58736,282,0,2086,2012-12-27,2018-09-13,0,EN,0
98343,140703,282,0,2086,2012-12-27,2018-09-13,0,EN,0
98343,151401,282,0,2086,2012-12-27,2018-09-13,0,EN,0
98343,157118,282,0,2086,2012-12-27,2018-09-13,0,EN,0


In [13]:
merged_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6797557 entries, 98343 to 27819
Data columns (total 9 columns):
 #   Column        Dtype 
---  ------        ----- 
 0   numeric_id_2  int64 
 1   views         int64 
 2   mature        int64 
 3   life_time     int64 
 4   created_at    object
 5   updated_at    object
 6   dead_account  int64 
 7   language      object
 8   affiliate     int64 
dtypes: int64(6), object(3)
memory usage: 518.6+ MB


In [14]:
merged_dataset.describe()

Unnamed: 0,numeric_id_2,views,mature,life_time,dead_account,affiliate
count,6797557.0,6797557.0,6797557.0,6797557.0,6797557.0,6797557.0
mean,84015.23,16717070.0,0.4857826,1981.736,0.002065448,0.3226697
std,48527.19,50465520.0,0.4997979,756.7212,0.04540024,0.4674976
min,0.0,0.0,0.0,34.0,0.0,0.0
25%,42045.0,20964.0,0.0,1447.0,0.0,0.0
50%,83851.0,437055.0,0.0,1976.0,0.0,0.0
75%,125957.0,6237401.0,1.0,2502.0,0.0,1.0
max,168113.0,384396600.0,1.0,4161.0,1.0,1.0


**Converting Dataset to Pytorch Geometric Data**

In [15]:
features_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 168114 entries, 0 to 168113
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   views         168114 non-null  int64 
 1   mature        168114 non-null  int64 
 2   life_time     168114 non-null  int64 
 3   created_at    168114 non-null  object
 4   updated_at    168114 non-null  object
 5   dead_account  168114 non-null  int64 
 6   language      168114 non-null  object
 7   affiliate     168114 non-null  int64 
dtypes: int64(5), object(3)
memory usage: 11.5+ MB


In [16]:
features_dataset['language'].unique()

array(['EN', 'FR', 'KO', 'JA', 'RU', 'PL', 'DE', 'ES', 'IT', 'PT',
       'OTHER', 'TR', 'ZH', 'SV', 'NL', 'TH', 'CS', 'DA', 'HU', 'FI',
       'NO'], dtype=object)

In [17]:
languages = ['EN', 'FR', 'KO', 'JA', 'RU', 'PL', 'DE', 'ES', 'IT', 'PT',
       'OTHER', 'TR', 'ZH', 'SV', 'NL', 'TH', 'CS', 'DA', 'HU', 'FI',
       'NO']

from sklearn.preprocessing import LabelEncoder

def encode_df(dataframe):
    le = LabelEncoder()
    features_dataset['language'] = le.fit_transform(features_dataset['language'])
    return dataframe

#encode the dataframe
features_dataset = encode_df(features_dataset)
features_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 168114 entries, 0 to 168113
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   views         168114 non-null  int64 
 1   mature        168114 non-null  int64 
 2   life_time     168114 non-null  int64 
 3   created_at    168114 non-null  object
 4   updated_at    168114 non-null  object
 5   dead_account  168114 non-null  int64 
 6   language      168114 non-null  int32 
 7   affiliate     168114 non-null  int64 
dtypes: int32(1), int64(5), object(2)
memory usage: 10.9+ MB


In [18]:
node_features = features_dataset[["views","mature","life_time","created_at","updated_at"]]

In [19]:
node_features[["created_year", "created_month", "created_day"]] = node_features["created_at"].str.split("-", expand = True).astype('int32')
node_features[["updated_year", "updated_month", "updated_day"]] = node_features["updated_at"].str.split("-", expand = True).astype('int32')

node_features = node_features.drop(['created_at','updated_at'],axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  node_features[["created_year", "created_month", "created_day"]] = node_features["created_at"].str.split("-", expand = True).astype('int32')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  node_features[["created_year", "created_month", "created_day"]] = node_features["created_at"].str.split("-", expand = True).astype('int32')


In [20]:
# node_features['created_at'] = pd.to_datetime(node_features['created_at']).astype('int64')/ 10**9
# node_features['updated_at'] = pd.to_datetime(node_features['updated_at']).astype('int64')/ 10**9
node_features.head()

Unnamed: 0_level_0,views,mature,life_time,created_year,created_month,created_day,updated_year,updated_month,updated_day
numeric_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,7879,1,969,2016,2,16,2018,10,12
1,500,0,2699,2011,5,19,2018,10,8
2,382502,1,3149,2010,2,27,2018,10,12
3,386,0,1344,2015,1,26,2018,10,1
4,2486,0,1784,2013,11,22,2018,10,11


In [21]:
node_features = node_features.astype('float32')
node_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 168114 entries, 0 to 168113
Data columns (total 9 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   views          168114 non-null  float32
 1   mature         168114 non-null  float32
 2   life_time      168114 non-null  float32
 3   created_year   168114 non-null  float32
 4   created_month  168114 non-null  float32
 5   created_day    168114 non-null  float32
 6   updated_year   168114 non-null  float32
 7   updated_month  168114 non-null  float32
 8   updated_day    168114 non-null  float32
dtypes: float32(9)
memory usage: 7.1 MB


In [22]:
x =  torch.from_numpy(node_features.to_numpy())
x.shape # [num_nodes x num_features]

torch.Size([168114, 9])

In [23]:
x.dtype

torch.float32

In [24]:
features_dataset[["dead_account","language","affiliate"]] = features_dataset[["dead_account","language","affiliate"]].astype('int64')
features_dataset[["dead_account","language","affiliate"]].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 168114 entries, 0 to 168113
Data columns (total 3 columns):
 #   Column        Non-Null Count   Dtype
---  ------        --------------   -----
 0   dead_account  168114 non-null  int64
 1   language      168114 non-null  int64
 2   affiliate     168114 non-null  int64
dtypes: int64(3)
memory usage: 5.1 MB


In [92]:
# Select node features
labels = features_dataset[["dead_account"]]
labels.head()

# labels = labels.astype(float)

# Convert to numpy
y =  torch.from_numpy(labels.to_numpy())
y.shape # [num_nodes, 1] --> node regression

torch.Size([168114, 1])

In [93]:
y = y.reshape(-1,)

In [94]:
y.shape

torch.Size([168114])

In [95]:
labels.dtypes

dead_account    int64
dtype: object

In [29]:
edge_dataset = edge_dataset.astype('int64')

In [30]:
edge_dataset.dtypes

numeric_id_1    int64
numeric_id_2    int64
dtype: object

In [31]:
edge_dataset = edge_dataset.sort_values(by=['numeric_id_1'])

In [32]:
edge_index = edge_dataset.transpose()

In [33]:
edge_index.dtypes.unique()

array([dtype('int64')], dtype=object)

In [34]:
all_edges =  torch.from_numpy(edge_index.to_numpy()) # [2, num_edges]
print(all_edges.shape)

torch.Size([2, 6797557])


In [35]:
all_edges

tensor([[     0,      0,      0,  ..., 168112, 168112, 168112],
        [ 10464,  59443, 151601,  ...,  77866,  95086,  12740]])

In [36]:
len(features_dataset)

168114

In [37]:
train_arr = np.array([True for i in range(round(len(features_dataset)*0.85))])

In [38]:
train_arr = np.append(train_arr,np.array([False for i in range(int(len(features_dataset)*0.15))]))

In [39]:
train_arr.shape

(168114,)

In [40]:
test_arr = np.array([False for i in range(round(len(features_dataset)*0.55))])
test_arr = np.append(test_arr,np.array([True for i in range(int(len(features_dataset)*0.45))]))

In [41]:
test_arr.shape

(168114,)

In [42]:
val_arr = np.array([False for i in range(round(len(features_dataset)*0.35))])
val_arr = np.append(val_arr,np.array([True for i in range(round(len(features_dataset)*0.35))]))
val_arr = np.append(val_arr,np.array([False for i in range(int(len(features_dataset)*0.30))]))

In [43]:
val_arr.shape

(168114,)

In [44]:
train_mask = torch.from_numpy(train_arr)
test_mask = torch.from_numpy(test_arr)
val_mask = torch.from_numpy(val_arr)

In [96]:
from torch_geometric.data import Data
data = Data(x=x, edge_index=all_edges, y=y)

In [97]:
data

Data(x=[168114, 9], edge_index=[2, 6797557], y=[168114])

In [98]:
data.num_classes = 2
data.train_mask = train_mask
data.test_mask = test_mask
data.val_mask = val_mask

In [99]:
data

Data(x=[168114, 9], edge_index=[2, 6797557], y=[168114], num_classes=2, train_mask=[168114], test_mask=[168114], val_mask=[168114])

In [100]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:1024"

In [50]:
# import torch_geometric
# from torch_geometric.utils.convert import to_networkx
# import networkx as nx
# import matplotlib.pyplot as plt

# plt.figure(figsize=(10, 10))
# twitch_gamers = torch_geometric.data.Data(x=data.x[:500], edge_index=data.edge_index[:500])
# # g = torch_geometric.utils.to_networkx(twitch_gamers, to_undirected=True)
# twitchgraph = to_networkx(twitch_gamers)
# node_labels = data.y[list(twitchgraph.nodes)].numpy()
# nx.draw(g, cmap=plt.get_cmap('Set1'),node_color = node_labels,node_size=75,linewidths=6)

Live Account / Dead Account Classification

In [101]:
print(data)
print("number of graphs:\t\t",len(data))
print("number of classes:\t\t",data.num_classes)
print("number of classes:\t\t",np.unique(data.y))
print("number of node features:\t",data.num_node_features)
print("number of edge features:\t",data.num_edge_features)
print("X shape: ", data.x.shape)
print("Edge shape: ", data.edge_index.shape)
print("Y shape: ", data.y.shape)

Data(x=[168114, 9], edge_index=[2, 6797557], y=[168114], num_classes=2, train_mask=[168114], test_mask=[168114], val_mask=[168114])
number of graphs:		 7
number of classes:		 2
number of classes:		 [0 1]
number of node features:	 9
number of edge features:	 0
X shape:  torch.Size([168114, 9])
Edge shape:  torch.Size([2, 6797557])
Y shape:  torch.Size([168114])


In [102]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class GCN(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = GCNConv(data.num_node_features, 16)
        self.conv2 = GCNConv(16, data.num_classes)   
    
    def forward(self, data):
        # x: Node feature matrix 
        # edge_index: Graph connectivity matrix        
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)
        
        return F.log_softmax(x, dim=1)

model = GCN().to(device)
data = data.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
print("Graph Convolutional Network (GCN):")
GCN()

Graph Convolutional Network (GCN):


GCN(
  (conv1): GCNConv(9, 16)
  (conv2): GCNConv(16, 2)
)

In [103]:
# useful function for computing accuracy
def compute_accuracy(pred_y, y):
    return (pred_y == y).sum()

In [104]:
# train the model
model.train()
losses = []
accuracies = []
epoch_stable = []
for epoch in range(200):
    optimizer.zero_grad()
    out = model(data)
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
    correct = compute_accuracy(out.argmax(dim=1)[data.train_mask], data.y[data.train_mask])
    acc = int(correct) / int(data.train_mask.sum())
    losses.append(loss.item())
    accuracies.append(acc*100)
    loss.backward()
    optimizer.step()
    if (epoch+1) % 10 == 0:
        print('Epoch: {}, Loss: {:.4f}, Training Acc: {:.4f}'.format(epoch+1, loss.item(), acc))

Epoch: 10, Loss: 686322.5625, Training Acc: 0.2865
Epoch: 20, Loss: 105689.7969, Training Acc: 0.9678
Epoch: 30, Loss: 144642.6250, Training Acc: 0.9688
Epoch: 40, Loss: 156741.3594, Training Acc: 0.9689
Epoch: 50, Loss: 142311.3906, Training Acc: 0.9689
Epoch: 60, Loss: 130392.6328, Training Acc: 0.9686
Epoch: 70, Loss: 122234.8438, Training Acc: 0.9688
Epoch: 80, Loss: 102030.7031, Training Acc: 0.9682
Epoch: 90, Loss: 92980.6875, Training Acc: 0.9676
Epoch: 100, Loss: 71958.9141, Training Acc: 0.9672
Epoch: 110, Loss: 61853.7227, Training Acc: 0.9647
Epoch: 120, Loss: 53470.5078, Training Acc: 0.9616
Epoch: 130, Loss: 43364.6719, Training Acc: 0.9567
Epoch: 140, Loss: 35949.3750, Training Acc: 0.9441
Epoch: 150, Loss: 34805.5430, Training Acc: 0.9361
Epoch: 160, Loss: 30102.8027, Training Acc: 0.9280
Epoch: 170, Loss: 29969.6562, Training Acc: 0.9210
Epoch: 180, Loss: 26680.4512, Training Acc: 0.9301
Epoch: 190, Loss: 27657.6406, Training Acc: 0.9304
Epoch: 200, Loss: 25518.0156, Tr

In [105]:
model.eval()
pred = model(data).argmax(dim=1)
correct = compute_accuracy(pred[data.test_mask], data.y[data.test_mask])
acc = int(correct) / int(data.test_mask.sum())
print(f'Accuracy: {acc:.4f}')

Accuracy: 0.9694


In [106]:
torch.cuda.empty_cache()

Language Classification

In [53]:
# Select node features
labels = features_dataset[["language"]]
labels.head()

# labels = labels.astype(float)

# Convert to numpy
y =  torch.from_numpy(labels.to_numpy())
y.shape # [num_nodes, 1] --> node regression

torch.Size([168114, 1])

In [54]:
y = y.reshape(-1,)

In [55]:
y.shape

torch.Size([168114])

In [56]:
labels.dtypes

language    int64
dtype: object

In [58]:
from torch_geometric.data import Data
data = Data(x=x, edge_index=all_edges, y=y)

In [59]:
data

Data(x=[168114, 9], edge_index=[2, 6797557], y=[168114])

In [60]:
data.num_classes = 21
data.train_mask = train_mask
data.test_mask = test_mask
data.val_mask = val_mask

In [61]:
data

Data(x=[168114, 9], edge_index=[2, 6797557], y=[168114], num_classes=21, train_mask=[168114], test_mask=[168114], val_mask=[168114])

In [None]:
print(data)
print("number of graphs:\t\t",len(data))
print("number of classes:\t\t",data.num_classes)
print("number of classes:\t\t",np.unique(data.y))
print("number of node features:\t",data.num_node_features)
print("number of edge features:\t",data.num_edge_features)
print("X shape: ", data.x.shape)
print("Edge shape: ", data.edge_index.shape)
print("Y shape: ", data.y.shape)

In [83]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class GCN(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = GCNConv(data.num_node_features, 16)
        self.conv2 = GCNConv(16, data.num_classes)   
    
    def forward(self, data):
        # x: Node feature matrix 
        # edge_index: Graph connectivity matrix        
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)
        
        return F.log_softmax(x, dim=1)

model = GCN().to(device)
data = data.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
print("Graph Convolutional Network (GCN):")
GCN()

Graph Convolutional Network (GCN):


GCN(
  (conv1): GCNConv(9, 16)
  (conv2): GCNConv(16, 21)
)

In [84]:
# train the model
model.train()
losses = []
accuracies = []
epoch_stable = []
for epoch in range(200):
    optimizer.zero_grad()
    out = model(data)
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
    correct = compute_accuracy(out.argmax(dim=1)[data.train_mask], data.y[data.train_mask])
    acc = int(correct) / int(data.train_mask.sum())
    losses.append(loss.item()/1000)
    accuracies.append(acc*10000)
    loss.backward()
    optimizer.step()
    if (epoch+1) % 10 == 0:
        print('Epoch: {}, Loss: {:.4f}, Training Acc: {:.4f}'.format(epoch+1, loss.item(), acc))

Epoch: 10, Loss: 541530.6250, Training Acc: 0.5419
Epoch: 20, Loss: 371076.1562, Training Acc: 0.5903
Epoch: 30, Loss: 197118.4531, Training Acc: 0.6057
Epoch: 40, Loss: 110943.2422, Training Acc: 0.6042
Epoch: 50, Loss: 54391.0039, Training Acc: 0.5280
Epoch: 60, Loss: 25161.1953, Training Acc: 0.5914
Epoch: 70, Loss: 12145.9326, Training Acc: 0.5759
Epoch: 80, Loss: 3462.8628, Training Acc: 0.5815
Epoch: 90, Loss: 15.3097, Training Acc: 0.1582
Epoch: 100, Loss: 9.2986, Training Acc: 0.1428
Epoch: 110, Loss: 5.8773, Training Acc: 0.7031
Epoch: 120, Loss: 4.5190, Training Acc: 0.7235
Epoch: 130, Loss: 3.7033, Training Acc: 0.7298
Epoch: 140, Loss: 3.2177, Training Acc: 0.7325
Epoch: 150, Loss: 2.8677, Training Acc: 0.7307
Epoch: 160, Loss: 2.4898, Training Acc: 0.7352
Epoch: 170, Loss: 2.2603, Training Acc: 0.7362
Epoch: 180, Loss: 2.1383, Training Acc: 0.7371
Epoch: 190, Loss: 1.9845, Training Acc: 0.7373
Epoch: 200, Loss: 1.8495, Training Acc: 0.7382


In [86]:
model.eval()
pred = model(data).argmax(dim=1)
correct = compute_accuracy(pred[data.test_mask], data.y[data.test_mask])
acc = int(correct) / int(data.test_mask.sum())
print(f'Accuracy: {acc:.4f}')

Accuracy: 0.7401


In [87]:
torch.cuda.empty_cache()

Affiliation Status Identification

In [107]:
# Select node features
labels = features_dataset[["affiliate"]]
labels.head()

# labels = labels.astype(float)

# Convert to numpy
y =  torch.from_numpy(labels.to_numpy())
y.shape # [num_nodes, 1] --> node regression

torch.Size([168114, 1])

In [108]:
y = y.reshape(-1,)

In [109]:
y.shape

torch.Size([168114])

In [111]:
from torch_geometric.data import Data
data = Data(x=x, edge_index=all_edges, y=y)

In [112]:
data

Data(x=[168114, 9], edge_index=[2, 6797557], y=[168114])

In [113]:
data.num_classes = 2
data.train_mask = train_mask
data.test_mask = test_mask
data.val_mask = val_mask

In [114]:
data

Data(x=[168114, 9], edge_index=[2, 6797557], y=[168114], num_classes=2, train_mask=[168114], test_mask=[168114], val_mask=[168114])

In [115]:
print(data)
print("number of graphs:\t\t",len(data))
print("number of classes:\t\t",data.num_classes)
print("number of classes:\t\t",np.unique(data.y))
print("number of node features:\t",data.num_node_features)
print("number of edge features:\t",data.num_edge_features)
print("X shape: ", data.x.shape)
print("Edge shape: ", data.edge_index.shape)
print("Y shape: ", data.y.shape)

Data(x=[168114, 9], edge_index=[2, 6797557], y=[168114], num_classes=2, train_mask=[168114], test_mask=[168114], val_mask=[168114])
number of graphs:		 7
number of classes:		 2
number of classes:		 [0 1]
number of node features:	 9
number of edge features:	 0
X shape:  torch.Size([168114, 9])
Edge shape:  torch.Size([2, 6797557])
Y shape:  torch.Size([168114])


In [121]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class GCN(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = GCNConv(data.num_node_features, 16)
        self.conv2 = GCNConv(16, data.num_classes)   
    
    def forward(self, data):
        # x: Node feature matrix 
        # edge_index: Graph connectivity matrix        
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)
        
        return F.log_softmax(x, dim=1)

model = GCN().to(device)
data = data.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
print("Graph Convolutional Network (GCN):")
GCN()

Graph Convolutional Network (GCN):


GCN(
  (conv1): GCNConv(9, 16)
  (conv2): GCNConv(16, 2)
)

In [122]:
# train the model
model.train()
losses = []
accuracies = []
epoch_stable = []
for epoch in range(600):
    optimizer.zero_grad()
    out = model(data)
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
    correct = compute_accuracy(out.argmax(dim=1)[data.train_mask], data.y[data.train_mask])
    acc = int(correct) / int(data.train_mask.sum())
    losses.append(loss.item()/1000)
    accuracies.append(acc*10000)
    loss.backward()
    optimizer.step()
    if (epoch+1) % 10 == 0:
        print('Epoch: {}, Loss: {:.4f}, Training Acc: {:.4f}'.format(epoch+1, loss.item(), acc))

Epoch: 10, Loss: 527534.9375, Training Acc: 0.5017
Epoch: 20, Loss: 496870.7812, Training Acc: 0.4975
Epoch: 30, Loss: 451187.2812, Training Acc: 0.5018
Epoch: 40, Loss: 295116.0938, Training Acc: 0.4926
Epoch: 50, Loss: 236138.0938, Training Acc: 0.4908
Epoch: 60, Loss: 174689.4062, Training Acc: 0.4985
Epoch: 70, Loss: 108665.3125, Training Acc: 0.4956
Epoch: 80, Loss: 79670.9844, Training Acc: 0.5054
Epoch: 90, Loss: 45223.2695, Training Acc: 0.4937
Epoch: 100, Loss: 18112.8535, Training Acc: 0.4979
Epoch: 110, Loss: 8365.0947, Training Acc: 0.4906
Epoch: 120, Loss: 16982.6348, Training Acc: 0.4958
Epoch: 130, Loss: 15764.5186, Training Acc: 0.5029
Epoch: 140, Loss: 3564.3943, Training Acc: 0.5184
Epoch: 150, Loss: 3276.8066, Training Acc: 0.5141
Epoch: 160, Loss: 13772.0264, Training Acc: 0.5149
Epoch: 170, Loss: 26841.0664, Training Acc: 0.4904
Epoch: 180, Loss: 14900.9424, Training Acc: 0.4919
Epoch: 190, Loss: 19246.4102, Training Acc: 0.4924
Epoch: 200, Loss: 11301.3086, Traini

In [123]:
model.eval()
pred = model(data).argmax(dim=1)
correct = compute_accuracy(pred[data.test_mask], data.y[data.test_mask])
acc = int(correct) / int(data.test_mask.sum())
print(f'Accuracy: {acc:.4f}')

Accuracy: 0.5365
