## Pipeline 4: Word Embeddings + GNN

In [1]:
# import package
import numpy as np
import pandas as pd
import torch
import evaluate
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import pipeline
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


# Dataset

In [2]:
# Load the tab-separated train / validation / test splits in one pass.
train_df, val_df, test_df = (
    pd.read_csv(csv_path, sep='\t', encoding='utf-8')
    for csv_path in (
        'dataset/train_embedding.csv',
        'dataset/val_embedding.csv',
        'dataset/test_embedding.csv',
    )
)

# Report the shape and the first rows of every split.
print(f"Training data shape: {train_df.shape}", train_df.head(), sep='\n')
print(f"Validation data shape: {val_df.shape}", val_df.head(), sep='\n')
print(f"Testing data shape: {test_df.shape}", test_df.head(), sep='\n')   # no labels

Training data shape: (3988, 2)
                                           embedding  label
0  [101, 3220, 7158, 5708, 1010, 2040, 2038, 3130...      0
1  [101, 11977, 2086, 3283, 1010, 1037, 5637, 215...      0
2  [101, 1996, 2388, 999, 1997, 2035, 6550, 2003,...      1
3  [101, 1996, 3951, 2162, 2006, 2308, 4247, 1447...      1
4  [101, 2004, 4202, 9170, 4455, 2041, 1996, 5223...      0
Validation data shape: (998, 2)
                                           embedding  label
0  [101, 2577, 10805, 18856, 7828, 3240, 1006, 21...      1
1  [101, 2079, 2017, 2514, 2009, 1999, 2115, 3093...      0
2  [101, 15147, 1996, 2548, 2155, 5935, 2023, 285...      0
3  [101, 5074, 9932, 4244, 1010, 2280, 4419, 2739...      0
4  [101, 2137, 7642, 2775, 4424, 6905, 2099, 1998...      0
Testing data shape: (1247, 2)
   id                                          embedding
0   2  [101, 1996, 2418, 9458, 3601, 2982, 5103, 2001...
1   3  [101, 1996, 4164, 1010, 2112, 1997, 1523, 1996...
2   4  [101, 191

In [3]:
# Transform the serialized `embedding` column into lists of ints.
import ast


def parse_embedding(raw):
    """Return ``raw`` as a list of ints.

    Args:
        raw: Either the string form read from the CSV (a Python list literal
            such as ``"[101, 3220]"``) or an already-parsed sequence of numbers.

    Returns:
        A new ``list`` of ``int``.

    Accepting already-parsed input keeps this cell idempotent: the columns are
    overwritten in place below, so a second run would otherwise hand a list to
    ``ast.literal_eval`` and fail.
    """
    if isinstance(raw, str):
        # literal_eval only evaluates Python literals, never arbitrary code.
        raw = ast.literal_eval(raw)
    return [int(token) for token in raw]


# Before conversion
print(train_df['embedding'][0])
print(type(train_df['embedding'][0]))  # str on the first run of this cell

# Parse and cast in a single pass per split.
for split_df in (train_df, val_df, test_df):
    split_df['embedding'] = split_df['embedding'].apply(parse_embedding)

# After conversion
print(train_df['embedding'][0])
print(type(train_df['embedding'][0]))  # list of integers



[101, 3220, 7158, 5708, 1010, 2040, 2038, 3130, 2042, 16875, 2055, 2010, 9415, 6905, 1998, 5983, 8761, 1010, 2003, 2085, 3098, 2039, 2055, 2010, 13798, 1012, 1996, 2756, 1011, 2095, 1011, 2214, 2567, 1997, 10457, 13334, 2102, 3337, 2632, 2819, 4172, 5708, 1056, 28394, 3064, 2006, 5095, 2305, 1037, 2146, 2330, 3661, 1999, 2029, 2002, 28049, 2010, 8432, 2000, 2119, 2273, 1998, 2308, 2144, 2002, 2001, 2410, 1012, 1000, 2045, 1521, 1055, 2242, 1045, 1521, 1040, 2066, 2000, 2360, 2008, 1045, 2514, 2003, 2590, 2005, 2870, 1998, 2026, 4767, 2008, 2038, 2042, 15243, 2006, 2026, 3108, 2005, 3053, 2431, 1997, 2026, 2166, 1010, 1000, 2002, 2626, 1012, 1000, 2023, 2987, 1521, 1056, 3288, 2033, 9467, 1010, 2074, 1037, 3635, 1998, 10859, 1045, 2031, 2218, 3031, 2005, 1037, 2146, 2051, 2008, 1045, 2052, 2066, 4196, 2125, 2033, 1012, 1000, 2002, 7607, 1010, 1000, 1045, 3473, 2039, 1999, 2023, 4024, 3068, 2012, 1037, 2200, 2402, 2287, 1998, 2043, 1045, 2001, 2105, 2410, 2086, 2214, 1045, 2318, 2000, 24

In [4]:

# Sanity check per split: first parsed embedding, its Python type (list of
# integers expected), and the number of rows.
print(train_df['embedding'][0], type(train_df['embedding'][0]), len(train_df), sep='\n')

print(val_df['embedding'][0], type(val_df['embedding'][0]), len(val_df), sep='\n')

print(test_df['embedding'][0], type(test_df['embedding'][0]), len(test_df), sep='\n')

[101, 3220, 7158, 5708, 1010, 2040, 2038, 3130, 2042, 16875, 2055, 2010, 9415, 6905, 1998, 5983, 8761, 1010, 2003, 2085, 3098, 2039, 2055, 2010, 13798, 1012, 1996, 2756, 1011, 2095, 1011, 2214, 2567, 1997, 10457, 13334, 2102, 3337, 2632, 2819, 4172, 5708, 1056, 28394, 3064, 2006, 5095, 2305, 1037, 2146, 2330, 3661, 1999, 2029, 2002, 28049, 2010, 8432, 2000, 2119, 2273, 1998, 2308, 2144, 2002, 2001, 2410, 1012, 1000, 2045, 1521, 1055, 2242, 1045, 1521, 1040, 2066, 2000, 2360, 2008, 1045, 2514, 2003, 2590, 2005, 2870, 1998, 2026, 4767, 2008, 2038, 2042, 15243, 2006, 2026, 3108, 2005, 3053, 2431, 1997, 2026, 2166, 1010, 1000, 2002, 2626, 1012, 1000, 2023, 2987, 1521, 1056, 3288, 2033, 9467, 1010, 2074, 1037, 3635, 1998, 10859, 1045, 2031, 2218, 3031, 2005, 1037, 2146, 2051, 2008, 1045, 2052, 2066, 4196, 2125, 2033, 1012, 1000, 2002, 7607, 1010, 1000, 1045, 3473, 2039, 1999, 2023, 4024, 3068, 2012, 1037, 2200, 2402, 2287, 1998, 2043, 1045, 2001, 2105, 2410, 2086, 2214, 1045, 2318, 2000, 24

In [5]:
import os

# Publish the installed torch version as the TORCH environment variable; the
# optional install commands below interpolate it as ${TORCH} in the wheel URL.
os.environ.update(TORCH=torch.__version__)
print(os.environ['TORCH'])
# %pip install -q torch-scatter -f https://data.pyg.org/whl/torch-${TORCH}.html
# %pip install -q torch-sparse -f https://data.pyg.org/whl/torch-${TORCH}.html
# %pip install -q git+https://github.com/pyg-team/pytorch_geometric.git
# %pip install -q torch-cluster -f https://data.pyg.org/whl/torch-${TORCH}.html

2.3.0+cu118


In [6]:
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.data import Data
# Column index of each split, one per line (the test split carries `id` instead of `label`).
print(train_df.columns, val_df.columns, test_df.columns, sep='\n')

Index(['embedding', 'label'], dtype='object')
Index(['embedding', 'label'], dtype='object')
Index(['id', 'embedding'], dtype='object')


In [8]:
# Build PyTorch Geometric data objects
def create_data(df, edge_index=None):
    """Pack a dataframe of per-row feature vectors into a single PyG ``Data`` graph.

    Every row of ``df`` becomes one node; its feature vector is the row's
    ``embedding`` list cast to float32.

    Args:
        df: DataFrame with an ``embedding`` column holding equal-length numeric
            lists (``torch.tensor`` rejects ragged input) and, optionally, an
            integer ``label`` column.
        edge_index: Optional ``[2, num_edges]`` tensor or nested list of
            source/target node indices. When omitted, the notebook's original
            placeholder is used: a bidirectional triangle over nodes 0, 1 and 2.

    Returns:
        ``Data(x=..., edge_index=..., y=...)``. ``y`` is ``None`` when ``df``
        has no ``label`` column (e.g. the unlabeled test split).

    Raises:
        ValueError: if a caller-supplied ``edge_index`` is not of shape
            ``[2, num_edges]`` or refers to a node index outside ``df``.

    NOTE(review): the placeholder graph is NOT fully connected for more than
    three rows; every other node stays isolated, so the convolution has no
    neighbours to aggregate for them. Pass a real adjacency for meaningful
    message passing.
    NOTE(review): the printed samples of ``embedding`` look like integer token
    ids; casting ids to float treats them as continuous features. Confirm that
    this is intended.
    """
    features = torch.tensor(df['embedding'].tolist(), dtype=torch.float32)
    labels = None
    if 'label' in df.columns:
        labels = torch.tensor(df['label'].tolist(), dtype=torch.long)
    num_nodes = features.size(0)

    if edge_index is None:
        edge_index = torch.tensor([
            [0, 1, 2, 1, 0, 2],  # source node of each edge
            [1, 2, 0, 0, 2, 1]   # target node of each edge
        ], dtype=torch.long)
        # Drop placeholder edges that point past the last node so frames with
        # fewer than three rows still yield a valid graph.
        in_range = (edge_index < num_nodes).all(dim=0)
        edge_index = edge_index[:, in_range]
    else:
        edge_index = torch.as_tensor(edge_index, dtype=torch.long)
        if edge_index.dim() != 2 or edge_index.size(0) != 2:
            raise ValueError(
                f"edge_index must have shape [2, num_edges], got {tuple(edge_index.shape)}"
            )
        if edge_index.numel() and (
            int(edge_index.min()) < 0 or int(edge_index.max()) >= num_nodes
        ):
            raise ValueError(
                f"edge_index refers to nodes outside the range [0, {num_nodes})"
            )

    return Data(x=features, edge_index=edge_index, y=labels)

train_data = create_data(train_df)
val_data = create_data(val_df)


In [9]:
## model
class GCN(torch.nn.Module):
    """Two-layer graph convolutional network that emits per-node log-probabilities.

    Args:
        input_dim: Size of each node's input feature vector.
        hidden_dim: Output size of the first graph convolution.
        output_dim: Output size of the second graph convolution (one per class).
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, output_dim)

    def forward(self, data):
        """Run both convolutions over ``data.x`` / ``data.edge_index``.

        Returns the log-softmax over dimension 1 of the second layer's output.
        """
        hidden = F.relu(self.conv1(data.x, data.edge_index))
        logits = self.conv2(hidden, data.edge_index)
        return F.log_softmax(logits, dim=1)

# Model hyperparameters: feature width comes from the training graph, the
# number of outputs from the distinct labels in the training frame.
input_dim, hidden_dim = train_data.num_node_features, 16
output_dim = len(train_df['label'].unique())

# Instantiate the model
model = GCN(input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim)

In [12]:
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=0.01,
    weight_decay=5e-4,
)

def train():
    """Run one full-graph optimisation step on ``train_data``.

    Returns:
        The negative log-likelihood loss of this step as a Python float.
    """
    model.train()
    optimizer.zero_grad()
    log_probs = model(train_data)
    step_loss = F.nll_loss(log_probs, train_data.y)
    step_loss.backward()
    optimizer.step()
    return step_loss.item()

num_epochs = 2000
for epoch in range(num_epochs):
    loss = train()
    # Only report on every 20th epoch (1-based), i.e. epochs 20, 40, ...
    if epoch % 20 != 19:
        continue
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss:.4f}')

Epoch [20/2000], Loss: 0.6732
Epoch [40/2000], Loss: 0.6732
Epoch [60/2000], Loss: 0.6732
Epoch [80/2000], Loss: 0.6732
Epoch [100/2000], Loss: 0.6732
Epoch [120/2000], Loss: 0.6732
Epoch [140/2000], Loss: 0.6732
Epoch [160/2000], Loss: 0.6732
Epoch [180/2000], Loss: 0.6732
Epoch [200/2000], Loss: 0.6732
Epoch [220/2000], Loss: 0.6732
Epoch [240/2000], Loss: 0.6732
Epoch [260/2000], Loss: 0.6732
Epoch [280/2000], Loss: 0.6732
Epoch [300/2000], Loss: 0.6732
Epoch [320/2000], Loss: 0.6732
Epoch [340/2000], Loss: 0.6732
Epoch [360/2000], Loss: 0.6732
Epoch [380/2000], Loss: 0.6732
Epoch [400/2000], Loss: 0.6732
Epoch [420/2000], Loss: 0.6732
Epoch [440/2000], Loss: 0.6732
Epoch [460/2000], Loss: 0.6732
Epoch [480/2000], Loss: 0.6732
Epoch [500/2000], Loss: 0.6732
Epoch [520/2000], Loss: 0.6732
Epoch [540/2000], Loss: 0.6732
Epoch [560/2000], Loss: 0.6732
Epoch [580/2000], Loss: 0.6732
Epoch [600/2000], Loss: 0.6732
Epoch [620/2000], Loss: 0.6732
Epoch [640/2000], Loss: 0.6732
Epoch [660/2

In [13]:
def test(data):
    """Return the accuracy of the global ``model`` on ``data``.

    Args:
        data: Graph object exposing the inputs ``model`` consumes and the
            ground-truth class ids in ``data.y``.

    Returns:
        Fraction (0.0 to 1.0) of entries in ``data.y`` matched by the
        predicted class, i.e. the index of the largest model output along
        dimension 1.
    """
    model.eval()
    # Evaluation needs no gradients; skipping autograd bookkeeping saves
    # memory and time without changing the predictions.
    with torch.no_grad():
        pred = model(data).argmax(dim=1)
    correct = pred.eq(data.y).sum().item()
    accuracy = correct / len(data.y)
    return accuracy

# Score the trained model on both labeled splits and report as percentages.
train_acc, val_acc = test(train_data), test(val_data)

print(
    f'Training Accuracy: {train_acc * 100:.2f}%',
    f'Validation Accuracy: {val_acc * 100:.2f}%',
    sep='\n',
)

Training Accuracy: 59.95%
Validation Accuracy: 58.22%
