In [1]:
#!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
#!pip3 install torch-geometric
#!pip list | grep torch
#!pip3 install --upgrade torchvision

### importing libraries 

In [1]:
import torch #The torch package contains data structures for multi-dimensional tensors and mathematical operations over these are defined.
import torchvision #The torchvision package consists of popular datasets, model architectures, and common image transformations for computer vision.
import torch.nn as nn
import numpy as np
import torchvision.transforms as transforms
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.nn import SAGEConv
import pandas as pd

In [2]:
# importing the dataset :-- 
edge_dataset=pd.read_csv('large_twitch_edges.csv')

In [52]:
features_dataset=pd.read_csv('large_twitch_features.csv')

In [53]:
edge_dataset.info()
features_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6797557 entries, 0 to 6797556
Data columns (total 2 columns):
 #   Column        Dtype
---  ------        -----
 0   numeric_id_1  int64
 1   numeric_id_2  int64
dtypes: int64(2)
memory usage: 103.7 MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 168114 entries, 0 to 168113
Data columns (total 9 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   views         168114 non-null  int64 
 1   mature        168114 non-null  int64 
 2   life_time     168114 non-null  int64 
 3   created_at    168114 non-null  object
 4   updated_at    168114 non-null  object
 5   numeric_id    168114 non-null  int64 
 6   dead_account  168114 non-null  int64 
 7   language      168114 non-null  object
 8   affiliate     168114 non-null  int64 
dtypes: int64(6), object(3)
memory usage: 11.5+ MB


In [54]:
edge_dataset.shape
features_dataset.shape

(168114, 9)

### converting life_time column in years 

In [55]:

features_dataset['life_time'] = features_dataset['life_time'] // 365.25
features_dataset['life_time'].value_counts()

4.0     31843
3.0     28895
5.0     28128
2.0     23924
1.0     18249
6.0     17710
7.0      7448
0.0      6597
8.0      2762
9.0      1907
10.0      578
11.0       73
Name: life_time, dtype: int64

In [56]:
features_dataset['life_time'].head()

0    2.0
1    7.0
2    8.0
3    3.0
4    4.0
Name: life_time, dtype: float64

In [57]:
print(max(features_dataset['life_time']))

11.0


In [58]:
features_dataset.head()

Unnamed: 0,views,mature,life_time,created_at,updated_at,numeric_id,dead_account,language,affiliate
0,7879,1,2.0,2016-02-16,2018-10-12,0,0,EN,1
1,500,0,7.0,2011-05-19,2018-10-08,1,0,EN,0
2,382502,1,8.0,2010-02-27,2018-10-12,2,0,EN,1
3,386,0,3.0,2015-01-26,2018-10-01,3,0,EN,0
4,2486,0,4.0,2013-11-22,2018-10-11,4,0,EN,0


In [59]:
# check wether the dataset have null values or not 
features_dataset.isna().sum()

views           0
mature          0
life_time       0
created_at      0
updated_at      0
numeric_id      0
dead_account    0
language        0
affiliate       0
dtype: int64

In [60]:
edge_dataset.isna().sum()

numeric_id_1    0
numeric_id_2    0
dtype: int64

In [61]:
# lets merge the dataset 
merged_dataset = pd.merge(edge_dataset, features_dataset,left_on='numeric_id_1',right_on='numeric_id')

In [13]:
merged_dataset = merged_dataset.set_index('numeric_id_1')
merged_dataset.head()

Unnamed: 0_level_0,numeric_id_2,views,mature,life_time,created_at,updated_at,dead_account,language,affiliate
numeric_id_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
98343,141493,282,0,5.0,2012-12-27,2018-09-13,0,EN,0
98343,58736,282,0,5.0,2012-12-27,2018-09-13,0,EN,0
98343,140703,282,0,5.0,2012-12-27,2018-09-13,0,EN,0
98343,151401,282,0,5.0,2012-12-27,2018-09-13,0,EN,0
98343,157118,282,0,5.0,2012-12-27,2018-09-13,0,EN,0


In [14]:
merged_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6797557 entries, 98343 to 27819
Data columns (total 9 columns):
 #   Column        Dtype  
---  ------        -----  
 0   numeric_id_2  int64  
 1   views         int64  
 2   mature        int64  
 3   life_time     float64
 4   created_at    object 
 5   updated_at    object 
 6   dead_account  int64  
 7   language      object 
 8   affiliate     int64  
dtypes: float64(1), int64(5), object(3)
memory usage: 518.6+ MB


In [15]:
merged_dataset.describe()

Unnamed: 0,numeric_id_2,views,mature,life_time,dead_account,affiliate
count,6797557.0,6797557.0,6797557.0,6797557.0,6797557.0,6797557.0
mean,84015.23,16717070.0,0.4857826,4.923242,0.002065448,0.3226697
std,48527.19,50465520.0,0.4997979,2.100253,0.04540024,0.4674976
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,42045.0,20964.0,0.0,3.0,0.0,0.0
50%,83851.0,437055.0,0.0,5.0,0.0,0.0
75%,125957.0,6237401.0,1.0,6.0,0.0,1.0
max,168113.0,384396600.0,1.0,11.0,1.0,1.0


features_dataset['language'].unique()

In [62]:
languages = ['EN', 'FR', 'KO', 'JA', 'RU', 'PL', 'DE', 'ES', 'IT', 'PT',
       'OTHER', 'TR', 'ZH', 'SV', 'NL', 'TH', 'CS', 'DA', 'HU', 'FI',
       'NO']

from sklearn.preprocessing import LabelEncoder

def encode_df(dataframe):
    le = LabelEncoder()
    features_dataset['language'] = le.fit_transform(features_dataset['language'])
    return dataframe

#encode the dataframe
features_dataset = encode_df(features_dataset)
features_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 168114 entries, 0 to 168113
Data columns (total 9 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   views         168114 non-null  int64  
 1   mature        168114 non-null  int64  
 2   life_time     168114 non-null  float64
 3   created_at    168114 non-null  object 
 4   updated_at    168114 non-null  object 
 5   numeric_id    168114 non-null  int64  
 6   dead_account  168114 non-null  int64  
 7   language      168114 non-null  int32  
 8   affiliate     168114 non-null  int64  
dtypes: float64(1), int32(1), int64(5), object(2)
memory usage: 10.9+ MB


In [164]:
node_features = features_dataset[["views","created_at","updated_at", "dead_account"]]
# node_features = features_dataset[["views"]]

In [165]:
node_features[["created_year", "created_month", "created_day"]] = node_features["created_at"].str.split("-", expand = True).astype('int32')
node_features[["updated_year", "updated_month", "updated_day"]] = node_features["updated_at"].str.split("-", expand = True).astype('int32')

node_features = node_features.drop(['created_at','updated_at'],axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  node_features[["created_year", "created_month", "created_day"]] = node_features["created_at"].str.split("-", expand = True).astype('int32')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  node_features[["created_year", "created_month", "created_day"]] = node_features["created_at"].str.split("-", expand = True).astype('int32')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pa

In [166]:
node_features.head()

Unnamed: 0,views,dead_account,created_year,created_month,created_day,updated_year,updated_month,updated_day
0,7879,0,2016,2,16,2018,10,12
1,500,0,2011,5,19,2018,10,8
2,382502,0,2010,2,27,2018,10,12
3,386,0,2015,1,26,2018,10,1
4,2486,0,2013,11,22,2018,10,11


In [167]:
node_features = node_features.astype('float32')
node_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 168114 entries, 0 to 168113
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   views          168114 non-null  float32
 1   dead_account   168114 non-null  float32
 2   created_year   168114 non-null  float32
 3   created_month  168114 non-null  float32
 4   created_day    168114 non-null  float32
 5   updated_year   168114 non-null  float32
 6   updated_month  168114 non-null  float32
 7   updated_day    168114 non-null  float32
dtypes: float32(8)
memory usage: 5.1 MB


In [168]:
x =  torch.from_numpy(node_features.to_numpy())
x.shape # [num_nodes x num_features]
x.dtype

torch.float32

In [169]:
features_dataset[["dead_account","language","affiliate"]] = features_dataset[["dead_account","language","affiliate"]].astype('int64')
features_dataset[["dead_account","language","affiliate"]].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 168114 entries, 0 to 168113
Data columns (total 3 columns):
 #   Column        Non-Null Count   Dtype
---  ------        --------------   -----
 0   dead_account  168114 non-null  int64
 1   language      168114 non-null  int64
 2   affiliate     168114 non-null  int64
dtypes: int64(3)
memory usage: 3.8 MB


In [170]:
# Select node features
labels = features_dataset[["life_time"]]
labels.head()

# labels = labels.astype(float)

# Convert to numpy
y =  torch.from_numpy(labels.to_numpy())
y.shape # [num_nodes, 1] --> node regression

torch.Size([168114, 1])

In [171]:
y = y.reshape(-1,)
y.shape

torch.Size([168114])

In [172]:
labels.dtypes

life_time    float64
dtype: object

In [173]:
edge_dataset = edge_dataset.astype('int64')

In [174]:
edge_dataset.dtypes

numeric_id_1    int64
numeric_id_2    int64
dtype: object

In [175]:
edge_index = edge_dataset.transpose()

In [176]:
edge_index.dtypes.unique()

array([dtype('int64')], dtype=object)

In [177]:
all_edges =  torch.from_numpy(edge_index.to_numpy()) # [2, num_edges]
print(all_edges.shape)

torch.Size([2, 6797557])


In [178]:
all_edges

tensor([[ 98343,  98343,  98343,  ..., 151702, 118034,  27819],
        [141493,  58736, 140703,  ..., 128281,  38021, 153993]])

In [179]:
len(features_dataset)

168114

In [180]:

train_arr = np.array([True for i in range(round(len(features_dataset)*0.85))])
train_arr = np.append(train_arr,np.array([False for i in range(int(len(features_dataset)*0.15))]))

In [181]:
train_arr.shape

(168114,)

In [182]:

test_arr = np.array([False for i in range(round(len(features_dataset)*0.55))])
test_arr = np.append(test_arr,np.array([True for i in range(int(len(features_dataset)*0.45))]))

In [183]:
test_arr.shape

(168114,)

In [184]:
val_arr = np.array([False for i in range(round(len(features_dataset)*0.35))])
val_arr = np.append(val_arr,np.array([True for i in range(round(len(features_dataset)*0.35))]))
val_arr = np.append(val_arr,np.array([False for i in range(int(len(features_dataset)*0.30))]))

In [185]:
val_arr.shape

(168114,)

In [186]:
train_mask = torch.from_numpy(train_arr)
test_mask = torch.from_numpy(test_arr)
val_mask = torch.from_numpy(val_arr)

In [187]:
print(train_mask)

tensor([ True,  True,  True,  ..., False, False, False])


In [188]:
from torch_geometric.data import Data
data = Data(x=x, edge_index=all_edges, y=y)

In [189]:
data

Data(x=[168114, 8], edge_index=[2, 6797557], y=[168114])

In [190]:
data.num_classes = 12
data.train_mask = train_mask
data.test_mask = test_mask
data.val_mask = val_mask

In [191]:
data

Data(x=[168114, 8], edge_index=[2, 6797557], y=[168114], num_classes=12, train_mask=[168114], test_mask=[168114], val_mask=[168114])

In [192]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:1024"

In [193]:
print(data)
print("number of graphs:\t\t",len(data))
print("number of classes:\t\t",data.num_classes)
print("number of classes:\t\t",np.unique(data.y))
print("number of node features:\t",data.num_node_features)
print("number of edge features:\t",data.num_edge_features)
print("X shape: ", data.x.shape)
print("Edge shape: ", data.edge_index.shape)
print("Y shape: ", data.y.shape)

Data(x=[168114, 8], edge_index=[2, 6797557], y=[168114], num_classes=12, train_mask=[168114], test_mask=[168114], val_mask=[168114])
number of graphs:		 7
number of classes:		 12
number of classes:		 [ 0.  1.  2.  3.  4.  5.  6.  7.  8.  9. 10. 11.]
number of node features:	 8
number of edge features:	 0
X shape:  torch.Size([168114, 8])
Edge shape:  torch.Size([2, 6797557])
Y shape:  torch.Size([168114])


In [200]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class GCN(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = GCNConv(data.num_node_features, 16)
        self.conv2 = GCNConv(16, data.num_classes)   
    
    def forward(self, data):
        # x: Node feature matrix 
        # edge_index: Graph connectivity matrix        
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)
        
        return F.log_softmax(x, dim=1)

model = GCN().to(device)
data = data.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
print("Graph Convolutional Network (GCN):")
GCN()

Graph Convolutional Network (GCN):


GCN(
  (conv1): GCNConv(8, 16)
  (conv2): GCNConv(16, 12)
)

In [201]:
# useful function for computing accuracy
def compute_accuracy(pred_y, y):
    return (pred_y == y).sum()

In [202]:
# # print(torch.unique(data.y))
# # print(969 in data.y)

# # print(data.y.shape)
# print(data.y[data.train_mask])
# print(969 in data.y)

In [203]:
# train the model
model.train()
losses = []
accuracies = []
epoch_stable = []
for epoch in range(200):
    optimizer.zero_grad()
    out = model(data)
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask].long())
    correct = compute_accuracy(out.argmax(dim=1)[data.train_mask], data.y[data.train_mask])
    acc = int(correct) / int(data.train_mask.sum())
    losses.append(loss.item()/1000)
    accuracies.append(acc*10000)
    loss.backward()
    optimizer.step()
    if (epoch+1) % 10 == 0:
        print('Epoch: {}, Loss: {:.4f}, Training Acc: {:.4f}'.format(epoch+1, loss.item(), acc))

Epoch: 10, Loss: 910889.2500, Training Acc: 0.1536
Epoch: 20, Loss: 640078.4375, Training Acc: 0.1202
Epoch: 30, Loss: 326723.8125, Training Acc: 0.1345
Epoch: 40, Loss: 211649.5469, Training Acc: 0.1494
Epoch: 50, Loss: 82388.1172, Training Acc: 0.1390
Epoch: 60, Loss: 33592.7500, Training Acc: 0.1410
Epoch: 70, Loss: 52.2565, Training Acc: 0.0610
Epoch: 80, Loss: 26.5294, Training Acc: 0.0480
Epoch: 90, Loss: 18.0157, Training Acc: 0.1611
Epoch: 100, Loss: 16.7672, Training Acc: 0.1541
Epoch: 110, Loss: 13.5711, Training Acc: 0.1575
Epoch: 120, Loss: 12.3832, Training Acc: 0.1619
Epoch: 130, Loss: 11.4522, Training Acc: 0.1768
Epoch: 140, Loss: 10.6828, Training Acc: 0.1801
Epoch: 150, Loss: 9.9875, Training Acc: 0.1768
Epoch: 160, Loss: 9.9844, Training Acc: 0.1782
Epoch: 170, Loss: 8.9583, Training Acc: 0.1787
Epoch: 180, Loss: 8.4833, Training Acc: 0.1776
Epoch: 190, Loss: 8.2628, Training Acc: 0.1813
Epoch: 200, Loss: 7.5771, Training Acc: 0.1810


In [204]:
model.eval()
pred = model(data).argmax(dim=1)
correct = compute_accuracy(pred[data.test_mask], data.y[data.test_mask])
acc = int(correct) / int(data.test_mask.sum())
print(f'Accuracy: {acc:.4f}')

Accuracy: 0.1895


In [None]:
# Accuracy: 0.1613

## sageCONV 

In [205]:
# useful function for computing accuracy
def compute_accuracy1(pred_y, y):
    return (pred_y == y).sum()

In [206]:
import torch

class SAGE(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super(SAGE, self).__init__()
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, out_channels)
        
    def forward(self, x, edge_index):
        x = F.relu(self.conv1(x, edge_index))
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)

In [207]:
model = SAGE(in_channels=data.num_node_features, hidden_channels=16, out_channels=data.num_classes).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

model.train()
losses = []
accuracies = []

for epoch in range(200):
    optimizer.zero_grad()
    out = model(data.x, data.edge_index)
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask].long())
    correct = compute_accuracy1(out.argmax(dim=1)[data.train_mask], data.y[data.train_mask])
    acc = int(correct) / int(data.train_mask.sum())
    losses.append(loss.item()/1000)
    accuracies.append(acc*10000)
    loss.backward()
    optimizer.step()
    if (epoch+1) % 10 == 0:
        print('Epoch: {}, Loss: {:.4f}, Training Acc: {:.4f}'.format(epoch+1, loss.item(), acc))

Epoch: 10, Loss: 2291334.0000, Training Acc: 0.1229
Epoch: 20, Loss: 995977.8125, Training Acc: 0.1549
Epoch: 30, Loss: 404959.2188, Training Acc: 0.1227
Epoch: 40, Loss: 106589.3984, Training Acc: 0.1246
Epoch: 50, Loss: 21122.2188, Training Acc: 0.0799
Epoch: 60, Loss: 7.4983, Training Acc: 0.1663
Epoch: 70, Loss: 4.3624, Training Acc: 0.1672
Epoch: 80, Loss: 2.9559, Training Acc: 0.1671
Epoch: 90, Loss: 2.6682, Training Acc: 0.1672
Epoch: 100, Loss: 2.5919, Training Acc: 0.1671
Epoch: 110, Loss: 2.5291, Training Acc: 0.1672
Epoch: 120, Loss: 2.4418, Training Acc: 0.1673
Epoch: 130, Loss: 2.3995, Training Acc: 0.1674
Epoch: 140, Loss: 2.3564, Training Acc: 0.1676
Epoch: 150, Loss: 2.3110, Training Acc: 0.1676
Epoch: 160, Loss: 2.2834, Training Acc: 0.1675
Epoch: 170, Loss: 2.2522, Training Acc: 0.1675
Epoch: 180, Loss: 2.2330, Training Acc: 0.1885
Epoch: 190, Loss: 2.2061, Training Acc: 0.1888
Epoch: 200, Loss: 2.1951, Training Acc: 0.1888


In [208]:
## Acuracy : 0.18

In [209]:
data.edge_index

tensor([[ 98343,  98343,  98343,  ..., 151702, 118034,  27819],
        [141493,  58736, 140703,  ..., 128281,  38021, 153993]],
       device='cuda:0')

In [210]:
model.eval()
pred = model(data.x,data.edge_index).argmax(dim=1)
correct = compute_accuracy(pred[data.test_mask], data.y[data.test_mask])
acc = int(correct) / int(data.test_mask.sum())
print(f'Accuracy: {acc:.4f}')

Accuracy: 0.1892
