In [1]:
import torch #The torch package contains data structures for multi-dimensional tensors and mathematical operations over these are defined.
import torchvision #The torchvision package consists of popular datasets, model architectures, and common image transformations for computer vision.
import torch.nn as nn
import numpy as np
import torchvision.transforms as transforms
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.nn import SAGEConv
import pandas as pd

In [2]:
edge_dataset=pd.read_csv('large_twitch_edges.csv')

In [3]:
features_dataset=pd.read_csv('large_twitch_features.csv')

In [4]:
edge_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6797557 entries, 0 to 6797556
Data columns (total 2 columns):
 #   Column        Dtype
---  ------        -----
 0   numeric_id_1  int64
 1   numeric_id_2  int64
dtypes: int64(2)
memory usage: 103.7 MB


In [5]:
edge_dataset.head()

Unnamed: 0,numeric_id_1,numeric_id_2
0,98343,141493
1,98343,58736
2,98343,140703
3,98343,151401
4,98343,157118


In [6]:
features_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 168114 entries, 0 to 168113
Data columns (total 9 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   views         168114 non-null  int64 
 1   mature        168114 non-null  int64 
 2   life_time     168114 non-null  int64 
 3   created_at    168114 non-null  object
 4   updated_at    168114 non-null  object
 5   numeric_id    168114 non-null  int64 
 6   dead_account  168114 non-null  int64 
 7   language      168114 non-null  object
 8   affiliate     168114 non-null  int64 
dtypes: int64(6), object(3)
memory usage: 11.5+ MB


In [7]:
features_dataset.head()

Unnamed: 0,views,mature,life_time,created_at,updated_at,numeric_id,dead_account,language,affiliate
0,7879,1,969,2016-02-16,2018-10-12,0,0,EN,1
1,500,0,2699,2011-05-19,2018-10-08,1,0,EN,0
2,382502,1,3149,2010-02-27,2018-10-12,2,0,EN,1
3,386,0,1344,2015-01-26,2018-10-01,3,0,EN,0
4,2486,0,1784,2013-11-22,2018-10-11,4,0,EN,0


In [8]:
features_dataset = features_dataset.set_index('numeric_id')

In [9]:
features_dataset.head()

Unnamed: 0_level_0,views,mature,life_time,created_at,updated_at,dead_account,language,affiliate
numeric_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,7879,1,969,2016-02-16,2018-10-12,0,EN,1
1,500,0,2699,2011-05-19,2018-10-08,0,EN,0
2,382502,1,3149,2010-02-27,2018-10-12,0,EN,1
3,386,0,1344,2015-01-26,2018-10-01,0,EN,0
4,2486,0,1784,2013-11-22,2018-10-11,0,EN,0


In [10]:
edge_dataset.describe()

Unnamed: 0,numeric_id_1,numeric_id_2
count,6797557.0,6797557.0
mean,83828.01,84015.23
std,48205.13,48527.19
min,0.0,0.0
25%,42217.0,42045.0
50%,83546.0,83851.0
75%,125642.0,125957.0
max,168112.0,168113.0


In [11]:
merged_dataset = pd.merge(edge_dataset, features_dataset,left_on='numeric_id_1',right_on='numeric_id')

In [12]:
merged_dataset = merged_dataset.set_index('numeric_id_1')
merged_dataset.head()

Unnamed: 0_level_0,numeric_id_2,views,mature,life_time,created_at,updated_at,dead_account,language,affiliate
numeric_id_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
98343,141493,282,0,2086,2012-12-27,2018-09-13,0,EN,0
98343,58736,282,0,2086,2012-12-27,2018-09-13,0,EN,0
98343,140703,282,0,2086,2012-12-27,2018-09-13,0,EN,0
98343,151401,282,0,2086,2012-12-27,2018-09-13,0,EN,0
98343,157118,282,0,2086,2012-12-27,2018-09-13,0,EN,0


In [13]:
merged_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6797557 entries, 98343 to 27819
Data columns (total 9 columns):
 #   Column        Dtype 
---  ------        ----- 
 0   numeric_id_2  int64 
 1   views         int64 
 2   mature        int64 
 3   life_time     int64 
 4   created_at    object
 5   updated_at    object
 6   dead_account  int64 
 7   language      object
 8   affiliate     int64 
dtypes: int64(6), object(3)
memory usage: 518.6+ MB


In [14]:
merged_dataset.describe()

Unnamed: 0,numeric_id_2,views,mature,life_time,dead_account,affiliate
count,6797557.0,6797557.0,6797557.0,6797557.0,6797557.0,6797557.0
mean,84015.23,16717070.0,0.4857826,1981.736,0.002065448,0.3226697
std,48527.19,50465520.0,0.4997979,756.7212,0.04540024,0.4674976
min,0.0,0.0,0.0,34.0,0.0,0.0
25%,42045.0,20964.0,0.0,1447.0,0.0,0.0
50%,83851.0,437055.0,0.0,1976.0,0.0,0.0
75%,125957.0,6237401.0,1.0,2502.0,0.0,1.0
max,168113.0,384396600.0,1.0,4161.0,1.0,1.0


**Converting Dataset to Pytorch Geometric Data**

In [15]:
features_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 168114 entries, 0 to 168113
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   views         168114 non-null  int64 
 1   mature        168114 non-null  int64 
 2   life_time     168114 non-null  int64 
 3   created_at    168114 non-null  object
 4   updated_at    168114 non-null  object
 5   dead_account  168114 non-null  int64 
 6   language      168114 non-null  object
 7   affiliate     168114 non-null  int64 
dtypes: int64(5), object(3)
memory usage: 11.5+ MB


In [16]:
features_dataset['language'].unique()

array(['EN', 'FR', 'KO', 'JA', 'RU', 'PL', 'DE', 'ES', 'IT', 'PT',
       'OTHER', 'TR', 'ZH', 'SV', 'NL', 'TH', 'CS', 'DA', 'HU', 'FI',
       'NO'], dtype=object)

In [17]:
languages = ['EN', 'FR', 'KO', 'JA', 'RU', 'PL', 'DE', 'ES', 'IT', 'PT',
       'OTHER', 'TR', 'ZH', 'SV', 'NL', 'TH', 'CS', 'DA', 'HU', 'FI',
       'NO']

from sklearn.preprocessing import LabelEncoder

def encode_df(dataframe):
    le = LabelEncoder()
    features_dataset['language'] = le.fit_transform(features_dataset['language'])
    return dataframe

#encode the dataframe
features_dataset = encode_df(features_dataset)
features_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 168114 entries, 0 to 168113
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   views         168114 non-null  int64 
 1   mature        168114 non-null  int64 
 2   life_time     168114 non-null  int64 
 3   created_at    168114 non-null  object
 4   updated_at    168114 non-null  object
 5   dead_account  168114 non-null  int64 
 6   language      168114 non-null  int32 
 7   affiliate     168114 non-null  int64 
dtypes: int32(1), int64(5), object(2)
memory usage: 10.9+ MB


In [18]:
node_features = features_dataset[["views","mature","life_time","created_at","updated_at"]]

In [19]:
node_features[["created_year", "created_month", "created_day"]] = node_features["created_at"].str.split("-", expand = True).astype('int32')
node_features[["updated_year", "updated_month", "updated_day"]] = node_features["updated_at"].str.split("-", expand = True).astype('int32')

node_features = node_features.drop(['created_at','updated_at'],axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  node_features[["created_year", "created_month", "created_day"]] = node_features["created_at"].str.split("-", expand = True).astype('int32')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  node_features[["created_year", "created_month", "created_day"]] = node_features["created_at"].str.split("-", expand = True).astype('int32')


In [20]:
# node_features['created_at'] = pd.to_datetime(node_features['created_at']).astype('int64')/ 10**9
# node_features['updated_at'] = pd.to_datetime(node_features['updated_at']).astype('int64')/ 10**9
node_features.head()

Unnamed: 0_level_0,views,mature,life_time,created_year,created_month,created_day,updated_year,updated_month,updated_day
numeric_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,7879,1,969,2016,2,16,2018,10,12
1,500,0,2699,2011,5,19,2018,10,8
2,382502,1,3149,2010,2,27,2018,10,12
3,386,0,1344,2015,1,26,2018,10,1
4,2486,0,1784,2013,11,22,2018,10,11


In [21]:
node_features = node_features.astype('float32')
node_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 168114 entries, 0 to 168113
Data columns (total 9 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   views          168114 non-null  float32
 1   mature         168114 non-null  float32
 2   life_time      168114 non-null  float32
 3   created_year   168114 non-null  float32
 4   created_month  168114 non-null  float32
 5   created_day    168114 non-null  float32
 6   updated_year   168114 non-null  float32
 7   updated_month  168114 non-null  float32
 8   updated_day    168114 non-null  float32
dtypes: float32(9)
memory usage: 7.1 MB


In [22]:
x =  torch.from_numpy(node_features.to_numpy())
x.shape # [num_nodes x num_features]

torch.Size([168114, 9])

In [23]:
x.dtype

torch.float32

In [24]:
features_dataset[["dead_account","language","affiliate"]] = features_dataset[["dead_account","language","affiliate"]].astype('int64')
features_dataset[["dead_account","language","affiliate"]].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 168114 entries, 0 to 168113
Data columns (total 3 columns):
 #   Column        Non-Null Count   Dtype
---  ------        --------------   -----
 0   dead_account  168114 non-null  int64
 1   language      168114 non-null  int64
 2   affiliate     168114 non-null  int64
dtypes: int64(3)
memory usage: 5.1 MB


In [25]:
# Select node features
labels = features_dataset[["dead_account"]]
labels.head()

# labels = labels.astype(float)

# Convert to numpy
y =  torch.from_numpy(labels.to_numpy())
y.shape # [num_nodes, 1] --> node regression

torch.Size([168114, 1])

In [26]:
y = y.reshape(-1,)

In [27]:
y.shape

torch.Size([168114])

In [28]:
labels.dtypes

dead_account    int64
dtype: object

In [29]:
edge_dataset = edge_dataset.astype('int64')

In [30]:
edge_dataset.dtypes

numeric_id_1    int64
numeric_id_2    int64
dtype: object

In [31]:
edge_dataset = edge_dataset.sort_values(by=['numeric_id_1'])

In [32]:
edge_index = edge_dataset.transpose()

In [33]:
edge_index.dtypes.unique()

array([dtype('int64')], dtype=object)

In [34]:
all_edges =  torch.from_numpy(edge_index.to_numpy()) # [2, num_edges]
print(all_edges.shape)

torch.Size([2, 6797557])


In [35]:
all_edges

tensor([[     0,      0,      0,  ..., 168112, 168112, 168112],
        [ 10464,  59443, 151601,  ...,  77866,  95086,  12740]])

In [36]:
len(features_dataset)

168114

In [37]:
train_arr = np.array([True for i in range(round(len(features_dataset)*0.85))])

In [38]:
train_arr = np.append(train_arr,np.array([False for i in range(int(len(features_dataset)*0.15))]))

In [39]:
train_arr.shape

(168114,)

In [40]:
test_arr = np.array([False for i in range(round(len(features_dataset)*0.55))])
test_arr = np.append(test_arr,np.array([True for i in range(int(len(features_dataset)*0.45))]))

In [41]:
test_arr.shape

(168114,)

In [42]:
val_arr = np.array([False for i in range(round(len(features_dataset)*0.35))])
val_arr = np.append(val_arr,np.array([True for i in range(round(len(features_dataset)*0.35))]))
val_arr = np.append(val_arr,np.array([False for i in range(int(len(features_dataset)*0.30))]))

In [43]:
val_arr.shape

(168114,)

In [44]:
train_mask = torch.from_numpy(train_arr)
test_mask = torch.from_numpy(test_arr)
val_mask = torch.from_numpy(val_arr)

In [45]:
from torch_geometric.data import Data
data = Data(x=x, edge_index=all_edges, y=y)

In [46]:
data

Data(x=[168114, 9], edge_index=[2, 6797557], y=[168114])

In [47]:
data.num_classes = 2
data.train_mask = train_mask
data.test_mask = test_mask
data.val_mask = val_mask

In [48]:
data

Data(x=[168114, 9], edge_index=[2, 6797557], y=[168114], num_classes=2, train_mask=[168114], test_mask=[168114], val_mask=[168114])

In [49]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = str(1)
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:1024"

**Hidden Layers**

In [50]:
class Net(torch.nn.Module):
    def __init__(self, arg):
        super(Net, self).__init__()
        hidden_layer_dimension = arg
        self.float()
        self.conv1 = GCNConv(data.num_node_features, hidden_layer_dimension)
        self.conv2 = GCNConv(hidden_layer_dimension, data.num_classes)


    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)

        return F.log_softmax(x, dim=1)

In [51]:
torch.cuda.is_available()

True

In [52]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# device='cpu'
hidden_dimensions = [2,4,8,16,64]
accs = []
epoch_stable = []
for i in hidden_dimensions:
    print('For Hidden Dimension = '+str(i))
    model = Net(i).to(device)
    data = data.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
    model.train()
    train_accuracies = []
    for epoch in range(101):
        optimizer.zero_grad()
        out = model(data)
        loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
        loss.backward()
        optimizer.step()

        _, pred = model(data).max(dim=1)
        correct = float (pred[data.train_mask].eq(data.y[data.train_mask]).sum().item())
        acc = correct / data.train_mask.sum().item()
            
        train_accuracies.append(acc)
    
    for i in range(101):
        if train_accuracies[i] == max(train_accuracies):
            epoch_stable.append(i)
            break
            
    _, pred = model(data).max(dim=1)
    correct = float (pred[data.test_mask].eq(data.y[data.test_mask]).sum().item())
    acc = correct / data.test_mask.sum().item()
    accs.append(acc)
    print(acc)
    print()
    
for i in range(len(hidden_dimensions)):
    hidden_dimensions[i] = 'Hidden Dimension = '+str(hidden_dimensions[i])

df_hidden_dims = pd.DataFrame([accs,epoch_stable],columns = hidden_dimensions,index = ['Accuracy','Number of Epochs'])

For Hidden Dimension = 2
0.9647327860834622

For Hidden Dimension = 4
0.9692667644842765

For Hidden Dimension = 8
0.9693989504434839

For Hidden Dimension = 16
0.030601049556516106

For Hidden Dimension = 64
0.9687248020515261



In [53]:
df_hidden_dims

Unnamed: 0,Hidden Dimension = 2,Hidden Dimension = 4,Hidden Dimension = 8,Hidden Dimension = 16,Hidden Dimension = 64
Accuracy,0.964733,0.969267,0.969399,0.030601,0.968725
Number of Epochs,0.0,37.0,13.0,2.0,1.0


In [54]:
torch.cuda.empty_cache()

**Number of Layers**

In [56]:
class Net(torch.nn.Module):
    def __init__(self, num_layers):
        super(Net, self).__init__()
        hidden_layer_dimension = 16
        self.conv1 = GCNConv(data.num_node_features, hidden_layer_dimension)
        self.conv = torch.nn.ModuleList(GCNConv(hidden_layer_dimension, hidden_layer_dimension) for i in range(num_layers))
        self.convn = GCNConv(hidden_layer_dimension, data.num_classes)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        for i in range(len(self.conv)):
            x = self.conv[i](x,edge_index)
            x = F.relu(x)
        x = self.convn(x, edge_index)

        return F.log_softmax(x, dim=1)

In [61]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

number_of_layers = [1,2,4,8]
accs = []
epoch_stable = []
for i in number_of_layers:
    print('For Number of Layers = '+str(i))
    model = Net(i).to(device)
    data = data.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
    model.train()
    train_accuracies = []
    for epoch in range(101):
        optimizer.zero_grad()
        out = model(data)
        loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
        loss.backward()
        optimizer.step()

        _, pred = model(data).max(dim=1)
        correct = float (pred[data.train_mask].eq(data.y[data.train_mask]).sum().item())
        acc = correct / data.train_mask.sum().item()
            
        train_accuracies.append(acc)
    
    for i in range(101):
        if train_accuracies[i] == max(train_accuracies):
            epoch_stable.append(i)
            break
            
    _, pred = model(data).max(dim=1)
    correct = float (pred[data.test_mask].eq(data.y[data.test_mask]).sum().item())
    acc = correct / data.test_mask.sum().item()
    accs.append(acc)
    print(acc)
    print()
    
for i in range(len(number_of_layers)):
    number_of_layers[i] = 'Number of Layers = '+str(number_of_layers[i])

df_num_layers = pd.DataFrame([accs,epoch_stable],columns = number_of_layers,index = ['Accuracy','Number of Epochs'])
torch.cuda.empty_cache()

For Number of Layers = 1
0.9693989504434839

For Number of Layers = 2
0.9692535458883558

For Number of Layers = 4
0.9693989504434839

For Number of Layers = 8
0.9693989504434839



In [62]:
df_num_layers

Unnamed: 0,Number of Layers = 1,Number of Layers = 2,Number of Layers = 4,Number of Layers = 8
Accuracy,0.969399,0.969254,0.969399,0.969399
Number of Epochs,3.0,2.0,2.0,1.0


In [63]:
torch.cuda.empty_cache()

**Learning Rate**

In [66]:
class Net(torch.nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        hidden_layer_dimension = 16
        self.conv1 = GCNConv(data.num_node_features, hidden_layer_dimension)
        self.convn = GCNConv(hidden_layer_dimension, data.num_classes)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.convn(x, edge_index)

        return F.log_softmax(x, dim=1)

In [77]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

learning_rate = [1,0.5,0.1,0.01,0.0001]
accs = []
epoch_stable = []
for i in learning_rate:
    print('For Learning Rate = '+str(i))
    model = Net().to(device)
    data = data.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=i, weight_decay=5e-4)
    model.train()
    train_accuracies = []
    for epoch in range(101):
        optimizer.zero_grad()
        out = model(data)
        loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
        loss.backward()
        optimizer.step()

        _, pred = model(data).max(dim=1)
        correct = float (pred[data.train_mask].eq(data.y[data.train_mask]).sum().item())
        acc = correct / data.train_mask.sum().item()
            
        train_accuracies.append(acc)
    
    for i in range(101):
        if train_accuracies[i] == max(train_accuracies):
            epoch_stable.append(i)
            break
            
    _, pred = model(data).max(dim=1)
    correct = float (pred[data.test_mask].eq(data.y[data.test_mask]).sum().item())
    acc = correct / data.test_mask.sum().item()
    accs.append(acc)
    print(acc)
    print()
    
for i in range(len(learning_rate)):
    learning_rate[i] = 'Learning Rate = '+str(learning_rate[i])

df_learn_rate = pd.DataFrame([accs,epoch_stable],columns = learning_rate,index = ['Accuracy','Number of Epochs'])


For Learning Rate = 1
0.9693989504434839

For Learning Rate = 0.5
0.9693989504434839

For Learning Rate = 0.1
0.9688437694148128

For Learning Rate = 0.01
0.9679184677003608

For Learning Rate = 0.0001
0.9689230809903372



In [78]:
df_learn_rate

Unnamed: 0,Learning Rate = 1,Learning Rate = 0.5,Learning Rate = 0.1,Learning Rate = 0.01,Learning Rate = 0.0001
Accuracy,0.969399,0.969399,0.968844,0.967918,0.968923
Number of Epochs,1.0,0.0,0.0,1.0,42.0


In [79]:
torch.cuda.empty_cache()