In [1]:
import torch
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
from torch_geometric.data import DataLoader
import penman
from transition_amr_parser.parse import AMRParser

  from .autonotebook import tqdm as notebook_tqdm


In [2]:

parser = AMRParser.from_pretrained('AMR3-structbart-L')

def AMR(text):
    # Download and save a model named AMR3.0 to cache
    tokens, positions = parser.tokenize(text)
    
    # Use parse_sentence() for single sentences or parse_sentences() for a batch
    annotations, machines = parser.parse_sentence(tokens)
    
    # Print Penman notation
    return annotations




load from cache /home/ai4learning/.cache/torch/DATA/AMR3.0/models/amr3.0-structured-bart-large-neur-al/seed42/checkpoint_wiki.smatch_top5-avg.pt
| [en] dictionary: 46088 types
| [actions_nopos] dictionary: 16544 types
----------loading pretrained bart.large model ----------


URLError: <urlopen error [Errno 110] Connection timed out>

In [11]:
def parse_amr_to_graph(amr_text):
    """
    将 AMR 图字符串解析为 PyTorch Geometric 数据对象。
    """
    # 使用 penman 解析 AMR
    graph = penman.decode(amr_text)
    
    # 提取节点
    nodes = []
    for instance in graph.instances():
        node_id, concept = instance[0], instance[1]
        nodes.append((node_id, concept))

    # 提取边
    edges = []
    for edge in graph.edges():
        source, role, target = edge
        edges.append((source, target, role))

    # 构造节点索引映射
    node_to_idx = {node[0]: i for i, node in enumerate(nodes)}

    # 转化为 PyTorch Geometric 格式
    edge_index = torch.tensor([[node_to_idx[src], node_to_idx[tgt]] for src, tgt, _ in edges], dtype=torch.long).t()

    # 边的特征（关系类型编码为整数）
    edge_attr = [role for _, _, role in edges]
    edge_attr = torch.tensor([hash(role) % 1000 for role in edge_attr], dtype=torch.long)  # 简单的哈希映射

    # 节点特征（用概念字符串的哈希值表示）
    x = torch.tensor([[hash(concept) % 1000] for _, concept in nodes], dtype=torch.float)

    return Data(x=x, edge_index=edge_index, edge_attr=edge_attr)

In [16]:
graphs=[]
amr_texts=["I have a question","I want to konw how to do it"]
graphs.append(parse_amr_to_graph(amr_text) for amr_text in amr_texts)
data_list = [parse_amr_to_graph(amr_text) for amr_text in amr_texts]




DecodeError: 
  line 1
    I have a question
    ^
DecodeError: Expected: LPAREN

In [17]:
graphs_gen = (parse_amr_to_graph(amr_text) for amr_text in amr_texts)

# 通过 list 强制执行生成器
try:
    graphs_list = list(graphs_gen)
    print("Generator executed successfully:", graphs_list)
except Exception as e:
    print("Error occurred during generator execution:", e)


Error occurred during generator execution: 
  line 1
    I have a question
    ^
DecodeError: Expected: LPAREN


In [14]:
batch_size = 2
loader = DataLoader(graphs, batch_size=batch_size, shuffle=True)
for batch in loader:
    print("Batch Data:")
    print("Node Features (x):", batch.x)
    print("Edge Index (edge_index):", batch.edge_index)
    print("Edge Attributes (edge_attr):", batch.edge_attr)
    print("Batch Mapping:", batch.batch) 

TypeError: DataLoader found invalid type: '<class 'generator'>'

In [8]:
dataloader = DataLoader(graphs, batch_size=16, shuffle=True)
for batch in dataloader:
    # 节点特征矩阵、边索引、批次信息
    x = batch.x
    edge_index = batch.edge_index
    batch_info = batch.batch  # 用于全局池化时标记图的归属
    print(x)
    print(batch_info)
    print(edge_index)


[<Graph object (top=q) at 125240369641264>, <Graph object (top=c) at 125238959291408>]
tensor([], dtype=torch.int64)
None




In [23]:
import penman
import numpy as np
def graph_struct(amr_text):
    graph = penman.decode(amr_text)
    
    # 提取节点和边
    nodes = [inst[0] for inst in graph.instances()]  # 获取所有节点ID
    edges = graph.edges()  # 获取所有边（关系）
    num_nodes = len(nodes)
    adj_matrix = np.zeros((num_nodes, num_nodes))
    x = torch.eye(num_nodes)  # 独热向量表示的节点特征

    node_to_index = {node: i for i, node in enumerate(nodes)}  # 为每个节点分配索引
    for edge in edges:
        src, rel, tgt = edge
        src_idx = node_to_index[src]
        tgt_idx = node_to_index[tgt]
        adj_matrix[src_idx, tgt_idx] = 1 
    
    # 定义边索引
    edge_index = torch.tensor([[node_to_index[src], node_to_index[tgt]] for src, _, tgt in edges]).t().contiguous()
    
    # 构造图数据
    data = Data(x=x, edge_index=edge_index)


In [None]:
from torch_geometric.data import Data, Batch

# 创建 PyTorch Geometric 数据对象
graph_data_list

# 动态批量化
batched_graph = Batch.from_data_list(graph_data_list)

print("Batched Graph:", batched_graph)


In [25]:
def amr_to_graph(amr_text, feature_dim=16):
    """
    将 AMR 文本解析为 PyTorch Geometric 的图数据格式，并统一节点特征维度。
    """
    # 使用 Penman 解析 AMR 文本
    graph = penman.decode(amr_text)
    
    # 提取节点和边
    nodes = [instance[0] for instance in graph.instances()]  # 获取节点ID
    edges = graph.edges()  # 获取边信息（src, rel, tgt）
    
    # 为每个节点创建固定维度的特征（随机初始化或零向量）
    num_nodes = len(nodes)
    node_features = torch.zeros((num_nodes, feature_dim))  # 这里为零初始化
    # 可选：为每个节点分配唯一特征（如词嵌入、随机值）
    node_features = torch.rand((num_nodes, feature_dim))  # 随机初始化
    
    # 将边转换为 PyTorch Geometric 的 edge_index 格式
    node_to_index = {node: i for i, node in enumerate(nodes)}
    edge_index = []
    for src, rel, tgt in edges:
        if src in node_to_index and tgt in node_to_index:
            edge_index.append([node_to_index[src], node_to_index[tgt]])
    
    # 转换为 PyTorch 张量
    edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous() if edge_index else torch.empty((2, 0), dtype=torch.long)
    
    # 返回 PyTorch Geometric 的 Data 对象
    return Data(x=node_features, edge_index=edge_index)
# 示例 AMR 文本列表



Batched Graph:
DataBatch(x=[12, 16], edge_index=[2, 10], batch=[12], ptr=[4])


In [26]:
amr_texts=[]
amr_texts.append(AMR("I have a question"))
amr_texts.append(AMR("I want to konw how to do it"))


# 将 AMR 文本解析为 PyTorch Geometric 图数据
feature_dim = 16  # 固定特征维度
graph_list = [amr_to_graph(amr_text, feature_dim=feature_dim) for amr_text in amr_texts]

# 使用 PyTorch Geometric 的 Batch 进行批处理
batched_graph = Batch.from_data_list(graph_list)

# 打印批处理结果
print("Batched Graph:")
print(batched_graph)

Running on batch size: 1
1


decoding: 100%|██████████| 1/1 [00:00<00:00,  6.20it/s]


Running on batch size: 1
1


decoding: 100%|██████████| 1/1 [00:00<00:00,  3.66it/s]

Batched Graph:
DataBatch(x=[8, 16], edge_index=[2, 8], batch=[8], ptr=[3])





In [27]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, global_mean_pool

class GraphEncoder(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(GraphEncoder, self).__init__()
        # 图卷积层
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, hidden_dim)
        # 全局池化后映射到最终嵌入维度
        self.fc = torch.nn.Linear(hidden_dim, output_dim)

    def forward(self, data):
        """
        输入批处理后的图数据 (Batch 对象)，输出图嵌入。
        """
        x, edge_index, batch = data.x, data.edge_index, data.batch
        
        # 图卷积层 1
        x = self.conv1(x, edge_index)
        x = F.relu(x)  # 激活函数
        # 图卷积层 2
        x = self.conv2(x, edge_index)
        x = F.relu(x)
        
        # 图级别嵌入 (使用全局池化)
        x = global_mean_pool(x, batch)  # 维度为 [num_graphs, hidden_dim]
        
        # 最终嵌入映射
        x = self.fc(x)  # 维度为 [num_graphs, output_dim]
        return x

# 定义模型参数
input_dim = 16  # 输入特征维度，与之前 amr_to_graph 一致
hidden_dim = 32  # 隐藏层维度
output_dim = 64  # 输出嵌入维度（最终图表示）

# 初始化模型
model = GraphEncoder(input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim)

# 假设批处理后的图数据 batched_graph
output = model(batched_graph)  # 输出图嵌入

print("Encoded Graph Embeddings:", output.shape)  # [num_graphs, output_dim]


Encoded Graph Embeddings: torch.Size([2, 64])


In [10]:
from torch_geometric.data import Data, Batch
from torch_geometric.loader import DataLoader
import torch

# 创建 teacher 图和 student 图的不同数据
teacher_graph1 = Data(x=torch.tensor([[1], [2], [3]]), edge_index=torch.tensor([[0, 1, 2], [1, 2, 0]]))
student_graph1 = Data(x=torch.tensor([[4], [5]]), edge_index=torch.tensor([[0, 1], [1, 0]]))

teacher_graph2 = Data(x=torch.tensor([[6], [7]]), edge_index=torch.tensor([[0, 1], [1, 0]]))
student_graph2 = Data(x=torch.tensor([[8], [9], [10]]), edge_index=torch.tensor([[0, 2], [2, 1]]))

teacher_graph3 = Data(x=torch.tensor([[11], [12], [13], [14]]), edge_index=torch.tensor([[0, 1, 2, 3], [1, 2, 3, 0]]))
student_graph3 = Data(x=torch.tensor([[15], [16], [17]]), edge_index=torch.tensor([[0, 1, 2], [1, 2, 0]]))

# 创建 (teacher, student) 图对列表
graph_pairs = [
    (teacher_graph1, student_graph1),
    (teacher_graph2, student_graph2),
    (teacher_graph3, student_graph3),
]

# 自定义 collate_fn 函数
def custom_collate_fn(batch):
    # 提取 teacher 和 student 图
    teacher_graphs = [pair[0] for pair in batch]
    student_graphs = [pair[1] for pair in batch]

    # 合并为 Batch
    teacher_batch = Batch.from_data_list(teacher_graphs)
    student_batch = Batch.from_data_list(student_graphs)

    # 返回字典
    return {"teacher": teacher_batch, "student": student_batch}

# 创建 DataLoader，batch_size=2
loader = DataLoader(graph_pairs, batch_size=2, collate_fn=custom_collate_fn)

# 遍历 DataLoader，输出 teacher 和 student 的批次
for batch in loader:
    print("Teacher batch:")
    print(batch[0])
    print("Student batch:")
    print(batch[1])
    print("-" * 50)


Teacher batch:
DataBatch(x=[5, 1], edge_index=[2, 5], batch=[5], ptr=[3])
Student batch:
DataBatch(x=[5, 1], edge_index=[2, 4], batch=[5], ptr=[3])
--------------------------------------------------
Teacher batch:
DataBatch(x=[4, 1], edge_index=[2, 4], batch=[4], ptr=[2])
Student batch:
DataBatch(x=[3, 1], edge_index=[2, 3], batch=[3], ptr=[2])
--------------------------------------------------
