In [113]:
import torch #The torch package contains data structures for multi-dimensional tensors and mathematical operations over these are defined.
import torchvision #The torchvision package consists of popular datasets, model architectures, and common image transformations for computer vision.
import torch.nn as nn
import numpy as np
import torchvision.transforms as transforms
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.nn import SAGEConv
import pandas as pd

In [114]:
edge_dataset=pd.read_csv('twitch_gamers_dataset/large_twitch_edges.csv')

In [115]:
features_dataset=pd.read_csv('twitch_gamers_dataset/large_twitch_features.csv')

In [116]:
edge_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6797557 entries, 0 to 6797556
Data columns (total 2 columns):
numeric_id_1    int64
numeric_id_2    int64
dtypes: int64(2)
memory usage: 103.7 MB


In [117]:
edge_dataset.head()

Unnamed: 0,numeric_id_1,numeric_id_2
0,98343,141493
1,98343,58736
2,98343,140703
3,98343,151401
4,98343,157118


In [118]:
features_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 168114 entries, 0 to 168113
Data columns (total 9 columns):
views           168114 non-null int64
mature          168114 non-null int64
life_time       168114 non-null int64
created_at      168114 non-null object
updated_at      168114 non-null object
numeric_id      168114 non-null int64
dead_account    168114 non-null int64
language        168114 non-null object
affiliate       168114 non-null int64
dtypes: int64(6), object(3)
memory usage: 11.5+ MB


In [119]:
features_dataset.head()

Unnamed: 0,views,mature,life_time,created_at,updated_at,numeric_id,dead_account,language,affiliate
0,7879,1,969,2016-02-16,2018-10-12,0,0,EN,1
1,500,0,2699,2011-05-19,2018-10-08,1,0,EN,0
2,382502,1,3149,2010-02-27,2018-10-12,2,0,EN,1
3,386,0,1344,2015-01-26,2018-10-01,3,0,EN,0
4,2486,0,1784,2013-11-22,2018-10-11,4,0,EN,0


In [120]:
features_dataset = features_dataset.set_index('numeric_id')

In [121]:
features_dataset.head()

Unnamed: 0_level_0,views,mature,life_time,created_at,updated_at,dead_account,language,affiliate
numeric_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,7879,1,969,2016-02-16,2018-10-12,0,EN,1
1,500,0,2699,2011-05-19,2018-10-08,0,EN,0
2,382502,1,3149,2010-02-27,2018-10-12,0,EN,1
3,386,0,1344,2015-01-26,2018-10-01,0,EN,0
4,2486,0,1784,2013-11-22,2018-10-11,0,EN,0


In [122]:
edge_dataset.describe()

Unnamed: 0,numeric_id_1,numeric_id_2
count,6797557.0,6797557.0
mean,83828.01,84015.23
std,48205.13,48527.19
min,0.0,0.0
25%,42217.0,42045.0
50%,83546.0,83851.0
75%,125642.0,125957.0
max,168112.0,168113.0


In [123]:
merged_dataset = pd.merge(edge_dataset, features_dataset,left_on='numeric_id_1',right_on='numeric_id')

In [124]:
merged_dataset = merged_dataset.set_index('numeric_id_1')
merged_dataset.head()

Unnamed: 0_level_0,numeric_id_2,views,mature,life_time,created_at,updated_at,dead_account,language,affiliate
numeric_id_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
98343,141493,282,0,2086,2012-12-27,2018-09-13,0,EN,0
98343,58736,282,0,2086,2012-12-27,2018-09-13,0,EN,0
98343,140703,282,0,2086,2012-12-27,2018-09-13,0,EN,0
98343,151401,282,0,2086,2012-12-27,2018-09-13,0,EN,0
98343,157118,282,0,2086,2012-12-27,2018-09-13,0,EN,0


In [125]:
merged_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6797557 entries, 98343 to 27819
Data columns (total 9 columns):
numeric_id_2    int64
views           int64
mature          int64
life_time       int64
created_at      object
updated_at      object
dead_account    int64
language        object
affiliate       int64
dtypes: int64(6), object(3)
memory usage: 518.6+ MB


In [126]:
merged_dataset.describe()

Unnamed: 0,numeric_id_2,views,mature,life_time,dead_account,affiliate
count,6797557.0,6797557.0,6797557.0,6797557.0,6797557.0,6797557.0
mean,84015.23,16717070.0,0.4857826,1981.736,0.002065448,0.3226697
std,48527.19,50465520.0,0.4997979,756.7212,0.04540024,0.4674976
min,0.0,0.0,0.0,34.0,0.0,0.0
25%,42045.0,20964.0,0.0,1447.0,0.0,0.0
50%,83851.0,437055.0,0.0,1976.0,0.0,0.0
75%,125957.0,6237401.0,1.0,2502.0,0.0,1.0
max,168113.0,384396600.0,1.0,4161.0,1.0,1.0


**Converting Dataset to Pytorch Geometric Data**

In [127]:
features_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 168114 entries, 0 to 168113
Data columns (total 8 columns):
views           168114 non-null int64
mature          168114 non-null int64
life_time       168114 non-null int64
created_at      168114 non-null object
updated_at      168114 non-null object
dead_account    168114 non-null int64
language        168114 non-null object
affiliate       168114 non-null int64
dtypes: int64(5), object(3)
memory usage: 11.5+ MB


In [128]:
features_dataset['language'].unique()

array(['EN', 'FR', 'KO', 'JA', 'RU', 'PL', 'DE', 'ES', 'IT', 'PT',
       'OTHER', 'TR', 'ZH', 'SV', 'NL', 'TH', 'CS', 'DA', 'HU', 'FI',
       'NO'], dtype=object)

In [129]:
languages = ['EN', 'FR', 'KO', 'JA', 'RU', 'PL', 'DE', 'ES', 'IT', 'PT',
       'OTHER', 'TR', 'ZH', 'SV', 'NL', 'TH', 'CS', 'DA', 'HU', 'FI',
       'NO']

from sklearn.preprocessing import LabelEncoder

def encode_df(dataframe):
    le = LabelEncoder()
    features_dataset['language'] = le.fit_transform(features_dataset['language'])
    return dataframe

#encode the dataframe
features_dataset = encode_df(features_dataset)
features_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 168114 entries, 0 to 168113
Data columns (total 8 columns):
views           168114 non-null int64
mature          168114 non-null int64
life_time       168114 non-null int64
created_at      168114 non-null object
updated_at      168114 non-null object
dead_account    168114 non-null int64
language        168114 non-null int64
affiliate       168114 non-null int64
dtypes: int64(6), object(2)
memory usage: 11.5+ MB


In [130]:
node_features = features_dataset[["views","life_time", "dead_account","language","affiliate"]]

In [131]:
# node_features[["created_year", "created_month", "created_day"]] = node_features["created_at"].str.split("-", expand = True).astype('int32')
# node_features[["updated_year", "updated_month", "updated_day"]] = node_features["updated_at"].str.split("-", expand = True).astype('int32')

# node_features = node_features.drop(['created_at','updated_at'],axis=1)

In [132]:
# node_features['created_at'] = pd.to_datetime(node_features['created_at']).astype('int64')/ 10**9
# node_features['updated_at'] = pd.to_datetime(node_features['updated_at']).astype('int64')/ 10**9
node_features.head()

Unnamed: 0_level_0,views,life_time,dead_account,language,affiliate
numeric_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,7879,969,0,3,1
1,500,2699,0,3,0
2,382502,3149,0,3,1
3,386,1344,0,3,0
4,2486,1784,0,3,0


In [133]:
node_features = node_features.astype('float32')
node_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 168114 entries, 0 to 168113
Data columns (total 5 columns):
views           168114 non-null float32
life_time       168114 non-null float32
dead_account    168114 non-null float32
language        168114 non-null float32
affiliate       168114 non-null float32
dtypes: float32(5)
memory usage: 4.5 MB


In [134]:
x =  torch.from_numpy(node_features.to_numpy())
x.shape # [num_nodes x num_features]

torch.Size([168114, 5])

In [135]:
x.dtype

torch.float32

In [136]:
# features_dataset.info()

In [137]:
# features_dataset[["dead_account","language","affiliate"]] = features_dataset[["dead_account","language","affiliate"]].astype('int64')
# features_dataset[["dead_account","language","affiliate"]].info()

In [138]:
# Select node features
labels = features_dataset[["mature"]]
labels.head()

# labels = labels.astype(float)

# Convert to numpy
y =  torch.from_numpy(labels.to_numpy())
y.shape # [num_nodes, 1] --> node regression

torch.Size([168114, 1])

In [139]:
y = y.reshape(-1,)

In [140]:
y.shape

torch.Size([168114])

In [141]:
labels.dtypes

mature    int64
dtype: object

In [142]:
edge_dataset = edge_dataset.astype('int64')

In [143]:
edge_dataset.dtypes

numeric_id_1    int64
numeric_id_2    int64
dtype: object

In [144]:
edge_dataset = edge_dataset.sort_values(by=['numeric_id_1'])

In [145]:
edge_index = edge_dataset.transpose()

In [146]:
edge_index.dtypes.unique()

array([dtype('int64')], dtype=object)

In [147]:
all_edges =  torch.from_numpy(edge_index.to_numpy()) # [2, num_edges]
print(all_edges.shape)

torch.Size([2, 6797557])


In [148]:
all_edges

tensor([[     0,      0,      0,  ..., 168112, 168112, 168112],
        [ 10464,  59443, 151601,  ...,  77866,  95086,  12740]])

In [149]:
len(features_dataset)

168114

In [150]:
train_arr = np.array([True for i in range(round(len(features_dataset)*0.85))])

In [151]:
train_arr = np.append(train_arr,np.array([False for i in range(int(len(features_dataset)*0.15))]))

In [152]:
train_arr.shape

(168114,)

In [153]:
test_arr = np.array([False for i in range(round(len(features_dataset)*0.55))])
test_arr = np.append(test_arr,np.array([True for i in range(int(len(features_dataset)*0.45))]))

In [154]:
test_arr.shape

(168114,)

In [155]:
val_arr = np.array([False for i in range(round(len(features_dataset)*0.35))])
val_arr = np.append(val_arr,np.array([True for i in range(round(len(features_dataset)*0.35))]))
val_arr = np.append(val_arr,np.array([False for i in range(int(len(features_dataset)*0.30))]))

In [156]:
val_arr.shape

(168114,)

In [157]:
train_mask = torch.from_numpy(train_arr)
test_mask = torch.from_numpy(test_arr)
val_mask = torch.from_numpy(val_arr)

In [158]:
from torch_geometric.data import Data
data = Data(x=x, edge_index=all_edges, y=y)

In [159]:
data

Data(x=[168114, 5], edge_index=[2, 6797557], y=[168114])

In [160]:
data.num_classes = 2
data.train_mask = train_mask
data.test_mask = test_mask
data.val_mask = val_mask

In [161]:
data

Data(x=[168114, 5], edge_index=[2, 6797557], y=[168114], num_classes=2, train_mask=[168114], test_mask=[168114], val_mask=[168114])

In [162]:
import os
# os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:1024"

In [163]:
# import torch_geometric
# from torch_geometric.utils.convert import to_networkx
# import networkx as nx
# import matplotlib.pyplot as plt

# plt.figure(figsize=(10, 10))
# twitch_gamers = torch_geometric.data.Data(x=data.x[:500], edge_index=data.edge_index[:500])
# # g = torch_geometric.utils.to_networkx(twitch_gamers, to_undirected=True)
# twitchgraph = to_networkx(twitch_gamers)
# node_labels = data.y[list(twitchgraph.nodes)].numpy()
# nx.draw(g, cmap=plt.get_cmap('Set1'),node_color = node_labels,node_size=75,linewidths=6)

Explicit Content Classification

In [164]:
print(data)
print("number of graphs:\t\t",len(data))
print("number of classes:\t\t",data.num_classes)
print("number of classes:\t\t",np.unique(data.y))
print("number of node features:\t",data.num_node_features)
print("number of edge features:\t",data.num_edge_features)
print("X shape: ", data.x.shape)
print("Edge shape: ", data.edge_index.shape)
print("Y shape: ", data.y.shape)

Data(x=[168114, 5], edge_index=[2, 6797557], y=[168114], num_classes=2, train_mask=[168114], test_mask=[168114], val_mask=[168114])
number of graphs:		 7
number of classes:		 2
number of classes:		 [0 1]
number of node features:	 5
number of edge features:	 0
X shape:  torch.Size([168114, 5])
Edge shape:  torch.Size([2, 6797557])
Y shape:  torch.Size([168114])


In [177]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class GCN(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = GCNConv(data.num_node_features, 16)
        self.conv2 = GCNConv(16, 16)
        self.conv3 = GCNConv(16, data.num_classes)   
    
    def forward(self, data):
        # x: Node feature matrix 
        # edge_index: Graph connectivity matrix        
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        
        return F.log_softmax(x, dim=1)

model = GCN().to(device)
data = data.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
print("Graph Convolutional Network (GCN):")
GCN()

Graph Convolutional Network (GCN):


GCN(
  (conv1): GCNConv(5, 16)
  (conv2): GCNConv(16, 16)
  (conv3): GCNConv(16, 2)
)

In [178]:
# useful function for computing accuracy
def compute_accuracy(pred_y, y):
    return (pred_y == y).sum()

In [179]:
# train the model
model.train()
losses = []
accuracies = []
epoch_stable = []
for epoch in range(200):
    optimizer.zero_grad()
    out = model(data)
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
    correct = compute_accuracy(out.argmax(dim=1)[data.train_mask], data.y[data.train_mask])
    acc = int(correct) / int(data.train_mask.sum())
    losses.append(loss.item())
    accuracies.append(acc*100)
    loss.backward()
    optimizer.step()
    if (epoch+1) % 10 == 0:
        print('Epoch: {}, Loss: {:.4f}, Training Acc: {:.4f}'.format(epoch+1, loss.item(), acc))

Epoch: 10, Loss: 2975546.7500, Training Acc: 0.2281
Epoch: 20, Loss: 326282.5312, Training Acc: 0.4962
Epoch: 30, Loss: 8110.2295, Training Acc: 0.5187
Epoch: 40, Loss: 3629.8186, Training Acc: 0.5241
Epoch: 50, Loss: 1446.0750, Training Acc: 0.5269
Epoch: 60, Loss: 495.0789, Training Acc: 0.5288
Epoch: 70, Loss: 510.2564, Training Acc: 0.5288
Epoch: 80, Loss: 346.4532, Training Acc: 0.5289
Epoch: 90, Loss: 464.6475, Training Acc: 0.5288
Epoch: 100, Loss: 461.0890, Training Acc: 0.5291
Epoch: 110, Loss: 400.8167, Training Acc: 0.5290
Epoch: 120, Loss: 463.7697, Training Acc: 0.5290
Epoch: 130, Loss: 419.3738, Training Acc: 0.5291
Epoch: 140, Loss: 329.2858, Training Acc: 0.5289
Epoch: 150, Loss: 423.4977, Training Acc: 0.5290
Epoch: 160, Loss: 397.6691, Training Acc: 0.5288
Epoch: 170, Loss: 383.1092, Training Acc: 0.5290
Epoch: 180, Loss: 231.6026, Training Acc: 0.5294
Epoch: 190, Loss: 144.8299, Training Acc: 0.5295
Epoch: 200, Loss: 235.1077, Training Acc: 0.5293


In [180]:
model.eval()
pred = model(data).argmax(dim=1)
correct = compute_accuracy(pred[data.test_mask], data.y[data.test_mask])
acc = int(correct) / int(data.test_mask.sum())
print(f'Accuracy: {acc:.4f}')

Accuracy: 0.5313


In [181]:
torch.cuda.empty_cache()