## Pipeline 4: Word Embeddings + GNN

In [1]:
# import package
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset

# Dataset

In [2]:
# load data
train_df = pd.read_csv('dataset/train_embedding.csv', sep='\t', encoding='utf-8')
val_df = pd.read_csv('dataset/val_embedding.csv', sep='\t', encoding='utf-8')
test_df = pd.read_csv('dataset/test_embedding.csv', sep='\t', encoding='utf-8')
print(f"Training data shape: {train_df.shape}")
print(train_df.head())
print(f"Validation data shape: {val_df.shape}")
print(val_df.head())
print(f"Testing data shape: {test_df.shape}")
print(test_df.head())   # no labels

Training data shape: (3988, 2)
                                           embedding  label
0  [101, 3220, 7158, 5708, 1010, 2040, 2038, 3130...      0
1  [101, 11977, 2086, 3283, 1010, 1037, 5637, 215...      0
2  [101, 1996, 2388, 999, 1997, 2035, 6550, 2003,...      1
3  [101, 1996, 3951, 2162, 2006, 2308, 4247, 1447...      1
4  [101, 2004, 4202, 9170, 4455, 2041, 1996, 5223...      0
Validation data shape: (998, 2)
                                           embedding  label
0  [101, 2577, 10805, 18856, 7828, 3240, 1006, 21...      1
1  [101, 2079, 2017, 2514, 2009, 1999, 2115, 3093...      0
2  [101, 15147, 1996, 2548, 2155, 5935, 2023, 285...      0
3  [101, 5074, 9932, 4244, 1010, 2280, 4419, 2739...      0
4  [101, 2137, 7642, 2775, 4424, 6905, 2099, 1998...      0
Testing data shape: (1247, 2)
   id                                          embedding
0   2  [101, 1996, 2418, 9458, 3601, 2982, 5103, 2001...
1   3  [101, 1996, 4164, 1010, 2112, 1997, 1523, 1996...
2   4  [101, 191

In [3]:
# transfrom embedding to list of int
import ast

# read
print(train_df['embedding'][0])   
print(type(train_df['embedding'][0])) # string

# convert the embeddings to list
train_df['embedding'] = train_df['embedding'].apply(lambda x: ast.literal_eval(x))
val_df['embedding'] = val_df['embedding'].apply(lambda x: ast.literal_eval(x))
test_df['embedding'] = test_df['embedding'].apply(lambda x: ast.literal_eval(x))

# convert the embeddings to list of integers
train_df['embedding'] = train_df['embedding'].apply(lambda x: list(map(int, x)))
val_df['embedding'] = val_df['embedding'].apply(lambda x: list(map(int, x)))
test_df['embedding'] = test_df['embedding'].apply(lambda x: list(map(int, x)))

print(train_df['embedding'][0])
print(type(train_df['embedding'][0])) # list of integers



[101, 3220, 7158, 5708, 1010, 2040, 2038, 3130, 2042, 16875, 2055, 2010, 9415, 6905, 1998, 5983, 8761, 1010, 2003, 2085, 3098, 2039, 2055, 2010, 13798, 1012, 1996, 2756, 1011, 2095, 1011, 2214, 2567, 1997, 10457, 13334, 2102, 3337, 2632, 2819, 4172, 5708, 1056, 28394, 3064, 2006, 5095, 2305, 1037, 2146, 2330, 3661, 1999, 2029, 2002, 28049, 2010, 8432, 2000, 2119, 2273, 1998, 2308, 2144, 2002, 2001, 2410, 1012, 1000, 2045, 1521, 1055, 2242, 1045, 1521, 1040, 2066, 2000, 2360, 2008, 1045, 2514, 2003, 2590, 2005, 2870, 1998, 2026, 4767, 2008, 2038, 2042, 15243, 2006, 2026, 3108, 2005, 3053, 2431, 1997, 2026, 2166, 1010, 1000, 2002, 2626, 1012, 1000, 2023, 2987, 1521, 1056, 3288, 2033, 9467, 1010, 2074, 1037, 3635, 1998, 10859, 1045, 2031, 2218, 3031, 2005, 1037, 2146, 2051, 2008, 1045, 2052, 2066, 4196, 2125, 2033, 1012, 1000, 2002, 7607, 1010, 1000, 1045, 3473, 2039, 1999, 2023, 4024, 3068, 2012, 1037, 2200, 2402, 2287, 1998, 2043, 1045, 2001, 2105, 2410, 2086, 2214, 1045, 2318, 2000, 24

In [4]:

print(train_df['embedding'][0])
print(type(train_df['embedding'][0])) # list of integers
print(len(train_df))

print(val_df['embedding'][0])
print(type(val_df['embedding'][0])) # list of integers
print(len(val_df))

print(test_df['embedding'][0])
print(type(test_df['embedding'][0])) # list of integers
print(len(test_df))

[101, 3220, 7158, 5708, 1010, 2040, 2038, 3130, 2042, 16875, 2055, 2010, 9415, 6905, 1998, 5983, 8761, 1010, 2003, 2085, 3098, 2039, 2055, 2010, 13798, 1012, 1996, 2756, 1011, 2095, 1011, 2214, 2567, 1997, 10457, 13334, 2102, 3337, 2632, 2819, 4172, 5708, 1056, 28394, 3064, 2006, 5095, 2305, 1037, 2146, 2330, 3661, 1999, 2029, 2002, 28049, 2010, 8432, 2000, 2119, 2273, 1998, 2308, 2144, 2002, 2001, 2410, 1012, 1000, 2045, 1521, 1055, 2242, 1045, 1521, 1040, 2066, 2000, 2360, 2008, 1045, 2514, 2003, 2590, 2005, 2870, 1998, 2026, 4767, 2008, 2038, 2042, 15243, 2006, 2026, 3108, 2005, 3053, 2431, 1997, 2026, 2166, 1010, 1000, 2002, 2626, 1012, 1000, 2023, 2987, 1521, 1056, 3288, 2033, 9467, 1010, 2074, 1037, 3635, 1998, 10859, 1045, 2031, 2218, 3031, 2005, 1037, 2146, 2051, 2008, 1045, 2052, 2066, 4196, 2125, 2033, 1012, 1000, 2002, 7607, 1010, 1000, 1045, 3473, 2039, 1999, 2023, 4024, 3068, 2012, 1037, 2200, 2402, 2287, 1998, 2043, 1045, 2001, 2105, 2410, 2086, 2214, 1045, 2318, 2000, 24

# Install package for PyTorch Geometric

In [5]:
import os
os.environ['TORCH'] = torch.__version__
print(torch.__version__)
%pip install -q torch-scatter -f https://data.pyg.org/whl/torch-${TORCH}.html
%pip install -q torch-sparse -f https://data.pyg.org/whl/torch-${TORCH}.html
%pip install -q git+https://github.com/pyg-team/pytorch_geometric.git
%pip install -q torch-cluster -f https://data.pyg.org/whl/torch-${TORCH}.html

2.3.0+cu121
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [6]:
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.data import Data
print(train_df.columns)
print(val_df.columns)
print(test_df.columns)

Index(['embedding', 'label'], dtype='object')
Index(['embedding', 'label'], dtype='object')
Index(['id', 'embedding'], dtype='object')


# Caculate Cosine Similarity for training embeddings

In [None]:
# Convert the embeddings to PyTorch tensors
train_df['embedding'] = train_df['embedding'].apply(lambda x: torch.tensor(x))

# Assuming train_df is already defined and has 'embedding' column with PyTorch tensors
embeddings = torch.stack(train_df['embedding'].tolist())  # Convert list of tensors to a tensor

# Convert tensor embedding to float
embeddings = embeddings.float()

print(f"Shape of the embeddings tensor: {embeddings.shape}")

# Normalize the embeddings to have unit length
norm_embeddings = F.normalize(embeddings, p=2, dim=1)

# Calculate cosine similarity
cosine_sim = torch.mm(norm_embeddings, norm_embeddings.t())

print(f"Shape of the cosine similarity DataFrame: {cosine_sim.shape}")
print(cosine_sim)

# Define Adjencency Matrix

In [26]:
# Flatten the upper triangle of the cosine similarity matrix
cosine_sim_scalar = cosine_sim.flatten()
# print(cosine_sim_scalar)
cosine_sim_scalar = cosine_sim_scalar[cosine_sim_scalar != 1]  # Remove the diagonal elements
# print(cosine_sim_scalar)

In [27]:
# Calculate the Median of the cosine similarity matrix

stats_max = torch.max(cosine_sim_scalar).item()
stats_min = torch.min(cosine_sim_scalar).item()
stats_mean = torch.mean(cosine_sim_scalar).item()
stats_median = torch.median(cosine_sim_scalar).item()
stats_std = torch.std(cosine_sim_scalar).item()

print(f"Max: {stats_max}")
print(f"Min: {stats_min}")
print(f"Mean: {stats_mean}")
print(f"Median: {stats_median}")
print(f"Standard Deviation: {stats_std}")

Max: 1.0000009536743164
Min: 0.009328627958893776
Mean: 0.3206745982170105
Median: 0.34415799379348755
Standard Deviation: 0.09867545962333679


In [31]:
A = pd.DataFrame(cosine_sim.numpy())

# for each row, if the value bigger than Median, then 1, else 0
A = (A > stats_median).astype(int)

# and make sure diagonal to be 0
np.fill_diagonal(A.values, 0)

In [32]:
print(f"Adjency Matrix: ")
print(A)

Adjency Matrix: 
      0     1     2     3     4     5     6     7     8     9     ...  3978  \
0        0     1     0     1     0     1     0     1     1     1  ...     1   
1        1     0     0     1     0     1     1     1     1     0  ...     1   
2        0     0     0     0     1     0     0     1     0     1  ...     1   
3        1     1     0     0     0     1     1     1     1     1  ...     1   
4        0     0     1     0     0     1     0     0     0     0  ...     0   
...    ...   ...   ...   ...   ...   ...   ...   ...   ...   ...  ...   ...   
3983     1     1     0     1     0     1     1     1     1     1  ...     1   
3984     1     1     0     1     0     1     1     1     1     1  ...     1   
3985     0     1     0     1     1     1     1     1     1     1  ...     1   
3986     0     1     1     1     0     0     0     1     1     0  ...     1   
3987     0     1     1     0     0     0     1     1     0     1  ...     1   

      3979  3980  3981  3982  3983

In [37]:
A_list = torch.tensor(A.values)
print(A_list)
print(A_list.shape)
print(type(A_list))
print(A_list.dtype)

tensor([[0, 1, 0,  ..., 0, 0, 0],
        [1, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 0, 1, 1],
        ...,
        [0, 1, 0,  ..., 0, 1, 0],
        [0, 1, 1,  ..., 1, 0, 0],
        [0, 1, 1,  ..., 0, 0, 0]])
torch.Size([3988, 3988])
<class 'torch.Tensor'>
torch.int64


In [46]:
# print(train_df['embedding'])
print(np.array(train_df['embedding'].tolist()))
print(train_df['embedding'].shape)

[[  101  3220  7158 ...     0     0     0]
 [  101 11977  2086 ...  2265  2018   102]
 [  101  1996  2388 ...     0     0     0]
 ...
 [  101  1996  5119 ...  9826 17540   102]
 [  101  2909  2520 ...  2004 19592   102]
 [  101  7986  2019 ...     0     0     0]]
(3988,)


In [None]:
def create_data(df):
    X = torch.tensor(np.array(df['embedding'].tolist()), dtype=torch.float32)
    Y = torch.tensor(np.array(df['label'].tolist()), dtype=torch.int64)
    
    edge_index = A_list
    
    return Data(x=X, edge_index=edge_index, y=Y)

train_data = create_data(train_df)
val_data = create_data(val_df)

In [51]:
print(f"train: {train_data}")
print(f"val: {val_data}")

train: Data(x=[3988, 512], edge_index=[3988, 3988], y=[3988])
val: Data(x=[998, 512], edge_index=[3988, 3988], y=[998])


# Build Graph

In [65]:
import networkx as nx
import matplotlib.pyplot as plt
from scipy.sparse import coo_matrix


def build_graph(data):
    '''
    data: PyTorch Geometric Data object
    - x: Node features
    - edge_index: Adjenct matrix [x.size(0), x.size(0)]
    - y: Node labels
    '''
    x, edge_index, y = data.x, data.edge_index, data.y
    

    # initialize a undirected graph
    G = nx.Graph()


    # add nodes
    for i in range(x.size(0)):
        G.add_node(i, label=y[i].item())

    # convert adjency matrix to COO format
    coo = coo_matrix(edge_index.numpy())

    # add edges
    for i, j in zip(coo.row, coo.col):
        G.add_edge(i, j)

    return G
    

In [66]:
train_G = build_graph(train_data)

# Plot graph

In [None]:
# the figure is too big, cost time

# plt.figure(figsize=(10, 10))
# nx.draw(train_G, with_labels=True)
# plt.show()

# Calculate the Density

In [71]:
# from collections import Counter
# import tqdm

# def graph_describe(G):
#     # print("Number of nodes:",G.number_of_nodes())
#     # print("Number of edges:",G.number_of_edges())
#     density=2*G.number_of_edges()/(G.number_of_nodes()*(G.number_of_nodes()-1))
#     # print(f"density = {round(density,round_digit)}")
#     # print("Degree:")
#     # 获取节点的度数
#     degrees = [degree for node, degree in G.degree()]
#     # 计算统计指标
#     mean_degree = np.mean(degrees)
#     median_degree = np.median(degrees)
#     std_degree = np.std(degrees)
#     max_degree = np.max(degrees)
#     min_degree = np.min(degrees)
#     # # 打印结果
#     # print("Mean Degree:", round(mean_degree,round_digit))
#     # print("Median Degree:", round(median_degree,round_digit))
#     # print("Standard Deviation of Degree:", round(std_degree,round_digit))
#     # print("Max Degree:", round(max_degree,round_digit))
#     # print("Min Degree:", round(min_degree,round_digit))

#     edge_consistency={}
#     fraud_node_set=set()
#     for edge in tqdm(G.edges(),desc="Check edge consistent..."):
#         node1_class=G.nodes[edge[0]]['label']
#         node2_class=G.nodes[edge[1]]['label']

#         if node1_class==node2_class:
#             edge_consistency[edge]=1
#         else:
#             edge_consistency[edge]=0

#     value_counts=Counter(edge_consistency.values())
#     # print(value_counts)
#     ratio=value_counts[0]/G.number_of_edges()
#     # print(f"The ratio of heterogeneous edges is:{round(ratio,round_digit)}")
#     print("Graph Descriptive Analysis:")
#     print(f"Number of nodes: {G.number_of_nodes()}")
#     print(f"Number of edges: {G.number_of_edges()}")
#     print(f"Density: {round(density, 4)}")
#     print(f"Mean Degree: {round(mean_degree, 4)}")
#     print(f"Median Degree: {round(median_degree, 4)}")
#     print(f"Standard Deviation of Degree: {round(std_degree, 4)}")
#     print(f"Max Degree: {round(max_degree, 4)}")
#     print(f"Min Degree: {round(min_degree, 4)}")
#     print(f"The ratio of heterogeneous edges is: {round(ratio, 4)}")
    
#     return G.number_of_nodes(),G.number_of_edges(),density,mean_degree,median_degree,std_degree,max_degree,min_degree,ratio

In [76]:
print(train_G.number_of_nodes())
print(train_G.number_of_edges())

3988
3974233


# Build GCN

In [None]:
## model
class GCN(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, output_dim)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)

# 模型参数
input_dim = train_data.num_node_features
hidden_dim = 16
output_dim = len(train_df['label'].unique())

# 创建模型实例
model = GCN(input_dim, hidden_dim, output_dim)

In [None]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

def train():
    model.train()
    optimizer.zero_grad()
    out = model(train_data)
    loss = F.nll_loss(out, train_data.y)
    loss.backward()
    optimizer.step()
    return loss.item()

num_epochs = 2000
for epoch in range(num_epochs):
    loss = train()
    if (epoch+1) % 20 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss:.4f}')

In [None]:
def test(data):
    model.eval()
    _, pred = model(data).max(dim=1)
    correct = pred.eq(data.y).sum().item()
    accuracy = correct / len(data.y)
    return accuracy

train_acc = test(train_data)
val_acc = test(val_data)

print(f'Training Accuracy: {train_acc * 100:.2f}%')
print(f'Validation Accuracy: {val_acc * 100:.2f}%')