# Dataset Creation

In [None]:
import torch
import numpy as np
from gensim.models import Word2Vec
from torch_geometric.data import Data

## Example Cora

## Creating our own Data Object

 - Build a graph out of the corpus:
 - node: a word of the vocabulary
 - feature: the word's embedding
 - label: the word's part-of-speech (POS) tag
 - edge: between nodes whose words are neighbours in the text

In [4]:
# Toy corpus: four short sentences that share a small vocabulary
corpus = [
    "This is the first document",
    "This document is the second document",
    "And this is the third one",
    "Is this the first document",
]

### Feature Vectors

In [5]:
# Tokenize the corpus: lowercase every document and split it on any run of
# whitespace. A bare split() (unlike split(" ")) never produces empty-string
# tokens when a document contains doubled, leading or trailing spaces.
tokens = [document.lower().split() for document in corpus]

# Train a skip-gram (sg=1) Word2Vec model; min_count=1 keeps every word.
# workers=1 plus an explicit seed (1 is gensim's default value) keeps the
# learned vectors repeatable between runs, as gensim's reproducibility note asks.
word2vec = Word2Vec(
    sentences=tokens,
    vector_size=9,
    window=2,
    min_count=1,
    sg=1,
    workers=1,
    seed=1,
)
w2v_embedding = word2vec.wv

# Create word-to-index mapping: a word's index is its position in the vocabulary
word_to_idx = {word: idx for idx, word in enumerate(w2v_embedding.index_to_key)}

# Prepare node embeddings: row i is the vector of the word whose index is i.
# The vectors are stacked as one NumPy array instead of nested Python lists.
embeddings = torch.tensor(
    np.stack([w2v_embedding[word] for word in word_to_idx]),
    dtype=torch.float,
)

In [7]:
word_to_idx

{'document': 0,
 'the': 1,
 'is': 2,
 'this': 3,
 'first': 4,
 'one': 5,
 'third': 6,
 'and': 7,
 'second': 8}

In [6]:
embeddings

tensor([[-0.0060,  0.0026,  0.0567,  0.1001, -0.1034, -0.0791,  0.0718,  0.0997,
         -0.0557],
        [-0.0418,  0.0820, -0.0170, -0.0504,  0.0728, -0.0540, -0.0202,  0.0320,
          0.0110],
        [-0.0921, -0.1050,  0.0812,  0.0563,  0.0751,  0.0085,  0.0706, -0.0378,
         -0.0105],
        [ 0.0641, -0.0836, -0.0437, -0.0835, -0.0103,  0.1060, -0.0813, -0.0259,
         -0.0215],
        [ 0.0897, -0.0659,  0.0005, -0.0528, -0.1067,  0.0556, -0.0973, -0.0488,
         -0.0004],
        [-0.0033, -0.0851,  0.1068,  0.0554,  0.1026, -0.0906,  0.0500, -0.0460,
          0.0092],
        [ 0.0944, -0.0496,  0.0502, -0.0754, -0.0394,  0.1044, -0.0175,  0.0036,
         -0.0460],
        [-0.0854, -0.0168,  0.0274, -0.0099,  0.0615, -0.0305,  0.0251,  0.0606,
          0.0927],
        [-0.0162, -0.1023,  0.0486,  0.0064,  0.0827, -0.0090, -0.0293, -0.0973,
         -0.0095]])

### Edge_Index

In [9]:
# Prepare edge indices based on the corpus: one directed edge (word -> next word)
# for every pair of adjacent tokens inside a document.
#
# The (source, target) pairs are sorted as units. Sorting the two rows of the
# [2, E] tensor independently (edges.sort(dim=1)) would pair each source with an
# unrelated target and create edges that never occur in the text.
edge_pairs = sorted(
    (word_to_idx[src], word_to_idx[dst])
    for doc in tokens
    for src, dst in zip(doc, doc[1:])
)

# Shape [2, num_edges]: row 0 holds source node indices, row 1 target indices.
# reshape(-1, 2) keeps the result two-dimensional even if there are no edges.
edges = torch.tensor(edge_pairs, dtype=torch.long).reshape(-1, 2).t().contiguous()

### Labels
 - DT: Determiner (e.g., "the", "this")
 - VBZ: Verb, 3rd person singular present (e.g., "is")
 - NN: Noun, singular (e.g., "document")
 - JJ: Adjective (e.g., "first", "second", "third")
 - CC: Coordinating conjunction (e.g., "and")

In [10]:
word_to_idx

{'document': 0,
 'the': 1,
 'is': 2,
 'this': 3,
 'first': 4,
 'one': 5,
 'third': 6,
 'and': 7,
 'second': 8}

In [11]:
# Define labels (example)
# POS tag -> integer class id
label_to_id = {
    "DT": 0, "JJ": 1, "CC": 2, "NN": 3, "VBZ": 4,
}

# POS tag of every node, listed in node-index order (the word_to_idx order
# displayed above) so that labels[i] is the tag of the word whose index is i.
# If the vocabulary order changes, this table must be reordered to match.
word_to_pos = {
    "document": "NN",   # 0
    "the": "DT",        # 1
    "is": "VBZ",        # 2
    "this": "DT",       # 3
    "first": "JJ",      # 4
    "one": "NN",        # 5
    "third": "JJ",      # 6
    "and": "CC",        # 7
    "second": "JJ",     # 8
}
labels = torch.tensor(
    [label_to_id[tag] for tag in word_to_pos.values()],
    dtype=torch.long,
)

In [13]:

# Bundle node features, connectivity and node labels into one
# PyTorch Geometric Data object
data = Data(
    x=embeddings,
    edge_index=edges,
    y=labels,
)

In [15]:
data

Data(x=[9, 9], edge_index=[2, 18], y=[9])

# GNN Training

In [16]:
import torch
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import SAGEConv
from torch_geometric.loader import DataLoader
from torch.optim import Adam

# GraphSAGE Model Definition
class GraphSAGE(torch.nn.Module):
    """Two-layer GraphSAGE network that outputs one score vector per node.

    Args:
        in_channels: Size of each input node feature vector.
        out_channels: Size of each output vector (one score per class).
        hidden_channels: Width of the intermediate layer. Defaults to 16,
            the value that was previously hard-coded.
    """

    def __init__(self, in_channels, out_channels, hidden_channels=16):
        super().__init__()
        # Two stacked SAGEConv layers: in -> hidden -> out
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, out_channels)

    def forward(self, data):
        """Compute raw class scores for every node.

        Args:
            data: Object exposing ``x`` (node features) and ``edge_index``.

        Returns:
            The output of the second SAGEConv layer; no softmax is applied.
        """
        x, edge_index = data.x, data.edge_index
        # First GraphSAGE layer followed by a ReLU non-linearity
        x = F.relu(self.conv1(x, edge_index))
        # Second GraphSAGE layer maps the hidden representation to class scores
        x = self.conv2(x, edge_index)
        return x

In [17]:
# Hyperparameters
in_channels = data.x.size(1)     # length of each node's feature vector
out_channels = len(label_to_id)  # one output score per label id
learning_rate = 0.01
epochs = 100

# Seed PyTorch's RNG before the model is built so the randomly initialised
# weights (and therefore the training result) are the same on every run.
torch.manual_seed(42)

# Create the model
model = GraphSAGE(in_channels, out_channels)
optimizer = Adam(model.parameters(), lr=learning_rate)
loss_fn = torch.nn.CrossEntropyLoss()

In [18]:
# Nodes labelled -1 are treated as unlabelled. They are excluded from the loss
# and, so that both numbers describe the same nodes, from the accuracy as well.
# The mask does not change during training, so it is computed once.
labeled_mask = data.y != -1

# Full-batch training: every epoch is one optimisation step on the whole graph.
model.train()
for epoch in range(epochs):
    optimizer.zero_grad()
    out = model(data)
    loss = loss_fn(out[labeled_mask], data.y[labeled_mask])
    loss.backward()
    optimizer.step()

# Evaluation runs on the same graph the model was trained on, so the number
# printed below is a training accuracy, not a held-out estimate.
model.eval()
with torch.no_grad():
    out = model(data)
    pred = out.argmax(dim=1)  # Predicted labels are the ones with the highest score
    correct = (pred[labeled_mask] == data.y[labeled_mask]).sum().item()
    num_labeled = int(labeled_mask.sum())
    # Guard against a graph without any labelled node
    accuracy = correct / num_labeled if num_labeled else 0.0
    print(f'Accuracy: {accuracy:.4f}')

Accuracy: 1.0000
