Questions:
- Set a seed ✓
- Read data ✓
- Check nulls ✓
- How many users? ✓
- How many categories? ✓
    - How many records does each one have? ✓
- How many posts? ✓
- How many interactions? ✓
- What is the average number of interactions per post? ✓
- Remove "insignificant" connections. ✓
- How representative was the reduction? ✓
- Create a reasonable visualization from the graph (e.g. Gephi)
- Create a mapping from all names to indices (e.g. LabelEncoder). How to get all names? ✓
- Create a mapping from all labels to indices. ✓
- Create a mapping from all nodes to a label index. ✓
- Create a toy model (e.g. the GCN example provided in the documentation). ✓
- Check if the data object was created correctly. 
- Define the embedding dimension.
- Create and save a [Node2Vec](https://pytorch-geometric.readthedocs.io/en/latest/modules/nn.html#torch_geometric.nn.models.Node2Vec) model. ✓
- Create and save a [GCN](https://pytorch-geometric.readthedocs.io/en/latest/modules/nn.html#torch_geometric.nn.conv.GCNConv) model. ✓
- Create and save a [GAT](https://pytorch-geometric.readthedocs.io/en/latest/modules/nn.html#torch_geometric.nn.conv.GATConv) model.
- Create and save a [SAGE](https://pytorch-geometric.readthedocs.io/en/latest/modules/nn.html#torch_geometric.nn.conv.SAGEConv) model.
- Create and save a [GIN](https://pytorch-geometric.readthedocs.io/en/latest/modules/nn.html#torch_geometric.nn.conv.GINConv) model.
- Use a t-SNE and plot the classes with different colors.
- How consistent are the embeddings? 
- Do they group well together?
- Which models produce the best embeddings?
- Which metric will be optimized by the learning models?
- Which model should be used to classify the nodes?
    - If a neural model:
        - Which learning rate? Is it adaptive? 
        - How many epochs? 
        - Which architecture?
        - Present a training error vs. test error analysis chart.
- Which categories reach the greatest performance? 
    - Why?
    - Is there any plausible reason, or maybe a characteristic of a method/family of methods, that helps it perform better in our case? If so, what is it?
- 
   
Resources:
- https://graphreason.github.io/papers/39.pdf (Must Read)
- https://towardsdatascience.com/hands-on-graph-neural-networks-with-pytorch-pytorch-geometric-359487e221a8
- https://pytorch-geometric.readthedocs.io/

In [None]:
import torch
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import networkx as nx
import plotly.graph_objects as go
import os

from torch_geometric.data import Data
from torch.utils.data import DataLoader
from torch_geometric.nn.models import Node2Vec
from torch_geometric.nn import GCNConv, GATConv, SAGEConv, GINConv
from torch.optim import Adam
from torch.nn import NLLLoss
import torch.nn.functional as F
from collections import defaultdict

In [None]:
# Seed the PyTorch and NumPy random number generators with the same fixed value.
torch.manual_seed(0)
np.random.seed(0)

In [None]:
# Read only the username and primary-category columns, then discard
# rows that are exact duplicates of an earlier row.
profiles = pd.read_csv(
    "../data/profiles.csv",
    usecols=["profile_username", "category_1"],
)
profiles = profiles.drop_duplicates()
profiles.tail(1)

In [None]:
# Number of missing values in each column of `profiles`.
profiles.isna().sum()

In [None]:
# Read the post id, post author and commenter columns, dropping repeated rows.
comments = pd.read_csv(
    "../data/comments.csv",
    usecols=["media_short_code", "media_author", "commenter"],
).drop_duplicates()
comments.tail(1)

In [None]:
# Number of missing values in each column of `comments`.
comments.isna().sum()

In [None]:
# Count how many profiles fall into each primary category.
results = profiles.category_1.value_counts()

# Render the category shares as a pie chart.
category_pie = go.Pie(labels=results.index.values, values=results.values)
figure = go.Figure(data=[category_pie], layout_title_text="Percentage of Each Category")

figure.show()

In [None]:
# Give every category an integer id, following the order of `results.index`.
category_to_index = dict(zip(results.index, range(len(results.index))))
# Replace category names by their ids; a value absent from the mapping raises KeyError.
profiles.category_1 = profiles.category_1.map(lambda name: category_to_index[name])
profiles.head(3)

In [None]:
# Users with a profile record, and users that wrote at least one comment.
known_users = profiles["profile_username"].unique().tolist()
followers = comments["commenter"].unique().tolist()
# Size of the union of both groups, before any filtering is applied.
prev_number_of_users = len(set(known_users).union(followers))

print("There are originally {} users".format(prev_number_of_users))

In [None]:
# Distinct posts and total comment rows in the (deduplicated) comments table.
all_posts = len(comments["media_short_code"].unique())
all_interactions = len(comments)
interactions_per_post = np.round(all_interactions / all_posts, 2)
print(
    "There are {} distinct posts and {} interactions. An average of {} interactions per post".format(
        all_posts, all_interactions, interactions_per_post
    )
)

In [None]:
# Commenters must appear in strictly more than this many comment rows to be kept.
RELEVANCE_THRESHOLD = 5
commenter_frequency = comments.commenter.value_counts()
subgraph = [
    name
    for name, count in commenter_frequency.items()
    if count > RELEVANCE_THRESHOLD
]

# Keep only the comments written by the retained commenters.
comments = comments[comments.commenter.isin(subgraph)]

In [None]:
# Recompute the user population after the relevance filter was applied to `comments`.
known_users = profiles.profile_username.unique().tolist()
followers = comments.commenter.unique().tolist()
# Keep the users in a sorted list rather than a bare set: the iteration order
# of a set of strings changes between interpreter sessions (string hash
# randomization), so indices derived from it would not be reproducible even
# with the seeds fixed. `key=str` keeps the sort well-defined if a non-string
# value (e.g. NaN) slipped into either column.
all_users = sorted(set(known_users + followers), key=str)
cur_number_of_users = len(all_users)

print("The new graph drawn from relevance threshold {} has {} users and {} interactions"\
          .format(RELEVANCE_THRESHOLD, cur_number_of_users, len(comments)))

In [None]:
# Scale to a percentage *before* rounding: rounding the fraction first and then
# multiplying by 100 reintroduces floating-point noise (0.57 * 100 prints as
# 56.99999999999999). The result is still rounded to a whole percent.
reduction_percentage = np.round((1 - cur_number_of_users / prev_number_of_users) * 100)
print("The number of users was reduced by ~ {}%".format(reduction_percentage))

In [None]:
# Number the users 0..N-1 in the iteration order of `all_users`.
username_to_index = dict(zip(all_users, range(len(all_users))))
# The indices themselves, in the same order as the mapping's keys.
all_users_indices = list(username_to_index.values())

In [None]:
# Map each profiled username to its category value; for a repeated username
# the last row wins.
user_to_label = dict(zip(profiles["profile_username"], profiles["category_1"]))

In [None]:
# One (author index, commenter index) pair per distinct author/commenter combination.
unique_pairs = comments[['media_author', 'commenter']].drop_duplicates()
interactions = [
    (username_to_index[author], username_to_index[commenter])
    for author, commenter in zip(unique_pairs['media_author'], unique_pairs['commenter'])
]

print("The final graph has {} interactions".format(len(interactions)))

In [None]:
# Build an undirected graph whose nodes are user indices and whose edges are
# the (author, commenter) pairs collected in `interactions`.
graph = nx.Graph()
graph.add_edges_from(interactions)

In [None]:
# Node features: the integer index of each user, as an (N, 1) column.
x = torch.tensor(all_users_indices, dtype=torch.long).view(-1, 1)
# Class labels are stored as int64 because the models below train with NLLLoss,
# which expects integer class indices rather than floats. Iterating
# `username_to_index` (insertion-ordered, index i inserted i-th) keeps y[i]
# aligned with node index i.
# NOTE(review): users without a known category fall back to label 4 - confirm
# that 4 is a dedicated "unknown" class and not one of the real category ids.
y = torch.tensor([user_to_label.get(user, 4) for user in username_to_index], dtype=torch.long)
# NOTE(review): the edge list taken from the undirected graph appears to hold
# each edge once; confirm whether the downstream layers need both directions.
edge_index = torch.tensor(nx.to_pandas_edgelist(graph).values.T, dtype=torch.long)

assert len(x)==len(y), "Input and Output tensor do not have the same dimensions"

In [None]:
# data = Data(x=x, edge_index=edge_index, y=y)

In [None]:
%%script False

class Node2VecModel(torch.nn.Module):
    """Training wrapper around a ``Node2Vec`` embedding model.

    All positional and keyword arguments are forwarded unchanged to
    ``Node2Vec``. An Adam optimizer (lr=0.01) over the wrapped model's
    parameters is created alongside it.
    """

    def __init__(self, *args, **kwargs):
        super(Node2VecModel, self).__init__()
        self.model = Node2Vec(*args, **kwargs)

        self.optimizer = Adam(self.model.parameters(), lr=0.01)

    def forward(self, data):
        """Delegate the call to the wrapped ``Node2Vec`` module."""
        return self.model(data)

    def fit(self, data, epochs=10):
        """Train the wrapped model and print the mean batch loss of every epoch.

        Args:
            data: Object exposing ``num_nodes`` and ``edge_index``.
            epochs: Number of passes over all node ids.
        """
        # Batches of shuffled node ids, 128 per optimisation step.
        data_loader = DataLoader(torch.arange(data.num_nodes), batch_size=128, shuffle=True)
        self.train()  # To set the modules in training state
        for epoch in range(epochs):
            running_loss = 0.0
            for subset in data_loader:
                self.optimizer.zero_grad()
                # NOTE(review): `loss(edge_index, subset)` is the call signature
                # this notebook was written against; confirm it matches the
                # installed torch_geometric version.
                loss = self.model.loss(data.edge_index, subset)
                loss.backward()
                self.optimizer.step()

                running_loss += loss.item()

            # Average over the batches of this epoch (the original referenced
            # an undefined name `loader` here and raised NameError).
            print("The running loss is: {}".format(running_loss / len(data_loader)))

In [None]:
%%script False

# Node2Vec with 128-dimensional embeddings learnt from random walks.
n2v = Node2VecModel(
    data.num_nodes,
    embedding_dim=128,
    walk_length=20,
    context_size=10,
    walks_per_node=10,
)

# Use the GPU when one is available; move both the model and the data there.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
n2v = n2v.to(device)
data = data.to(device)

n2v.fit(data)

In [None]:
%%script False

class GCNModel(torch.nn.Module):
    """Two-layer graph convolutional classifier with its own loss and optimizer.

    Args:
        n_features: Size of each input node feature vector.
        n_hidden_units: Output size of the first convolution.
        n_classes: Number of target classes (output size of the second convolution).
        **kwargs: Accepted for interface compatibility and ignored.
    """

    def __init__(self, n_features, n_hidden_units, n_classes, **kwargs):
        super(GCNModel, self).__init__()
        self.conv1 = GCNConv(n_features, n_hidden_units, cached=True)
        self.conv2 = GCNConv(n_hidden_units, n_classes, cached=True)

        self.loss = NLLLoss()
        self.optimizer = Adam(self.parameters(), lr=0.01, weight_decay=5e-4)

    def forward(self, x, edge_index):
        """Return per-node log-probabilities over the classes."""
        x = F.relu(self.conv1(x, edge_index))
        # Dropout is only active while the module is in training mode.
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)

        return F.log_softmax(x, dim=1)

    def fit(self, data, epochs=10):
        """Run full-batch training and print the loss after every epoch.

        Args:
            data: Object exposing ``x``, ``edge_index``, ``y`` and ``train_mask``.
            epochs: Number of optimisation steps (one per epoch).
        """
        # Put *this* module in training mode (the original called an undefined
        # global `model` here, which raised NameError).
        self.train()
        for epoch in range(epochs):
            self.optimizer.zero_grad()

            outputs = self.forward(data.x, data.edge_index)
            # Only the nodes selected by the training mask contribute to the loss.
            loss = self.loss(outputs[data.train_mask], data.y[data.train_mask])
            loss.backward()

            self.optimizer.step()
            print("The running loss is: {}".format(loss.item()))

In [None]:
%%script False

# GCN with 16 hidden units.
# NOTE(review): `dataset` is not defined in the cells shown here - confirm
# where it comes from before enabling this cell.
gcn = GCNModel(dataset.num_features, 16, dataset.num_classes)

# Use the GPU when one is available; move both the model and the data there.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
gcn = gcn.to(device)
data = data.to(device)

gcn.fit(data)

In [None]:
class GATModel(torch.nn.Module):
    """Two-layer graph attention classifier with its own loss and optimizer.

    Args:
        n_features: Size of each input node feature vector.
        n_hidden_units: Output size of each attention head in the first layer.
        n_classes: Number of target classes.
        heads: Number of attention heads in the first layer (default 8, the
            value that was previously hard-coded).
    """

    def __init__(self, n_features, n_hidden_units, n_classes, heads=8):
        super(GATModel, self).__init__()
        self.conv1 = GATConv(n_features, n_hidden_units, heads=heads, dropout=0.6)
        # The first layer concatenates its heads, so the second layer receives
        # n_hidden_units * heads features. The original used n_hidden_units**2,
        # which is only correct when n_hidden_units happens to equal 8.
        self.conv2 = GATConv(n_hidden_units * heads, n_classes, heads=1, concat=True, dropout=0.6)

        self.loss = NLLLoss()
        self.optimizer = Adam(self.parameters(), lr=0.01, weight_decay=5e-4)

    def forward(self, x, edge_index):
        """Return per-node log-probabilities over the classes."""
        # Dropout (p=0.6) is applied before each layer, in training mode only.
        x = F.dropout(x, p=0.6, training=self.training)
        x = F.elu(self.conv1(x, edge_index))
        x = F.dropout(x, p=0.6, training=self.training)
        x = self.conv2(x, edge_index)

        return F.log_softmax(x, dim=1)

    def fit(self, data, epochs=10):
        """Run full-batch training and print the loss after every epoch.

        Args:
            data: Object exposing ``x``, ``edge_index``, ``y`` and ``train_mask``.
            epochs: Number of optimisation steps (one per epoch).
        """
        self.train()
        for epoch in range(epochs):
            self.optimizer.zero_grad()

            outputs = self.forward(data.x, data.edge_index)
            # Only the nodes selected by the training mask contribute to the loss.
            loss = self.loss(outputs[data.train_mask], data.y[data.train_mask])
            loss.backward()

            self.optimizer.step()
            print("The running loss is: {}".format(loss.item()))

In [None]:
%%script False

# GAT with 8 hidden units per attention head.
# NOTE(review): `dataset` is not defined in the cells shown here - confirm
# where it comes from before enabling this cell.
gat = GATModel(dataset.num_features, 8, dataset.num_classes)

# Use the GPU when one is available; move both the model and the data there.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
gat = gat.to(device)
data = data.to(device)

gat.fit(data)