In [1]:
import pandas as pd

# Read 'qm9_demo.csv' into a DataFrame; the bare last expression makes the
# notebook render the first five rows as the cell output.
df_qm9 = pd.read_csv('qm9_demo.csv')
df_qm9.head()

Unnamed: 0,file_name,mol_len,atom_coords,vibrationalfrequence,smiles_basic,smiles_stereo,inchi_basic,inchi_stereo,A,B,...,homo,lumo,gap,R2,zpve,Uo,U,H,G,Cv
0,dsgdb9nsd_000001.xyz,5,C\t-0.0126981359\t 1.0858041578\t 0.0080009958...,1341.307\t1341.3284\t1341.365\t1562.6731\t1562...,C,C,1S/CH4/h1H4,1S/CH4/h1H4,157.7118,157.70997,...,-0.3877,0.1171,0.5048,35.3641,0.044749,-40.47893,-40.476062,-40.475117,-40.498597,6.469
1,dsgdb9nsd_000002.xyz,4,N\t-0.0404260543\t 1.0241077531\t 0.0625637998...,1103.8733\t1684.1158\t1684.3072\t3458.7145\t35...,N,N,1S/H3N/h1H3,1S/H3N/h1H3,293.60975,293.54111,...,-0.257,0.0829,0.3399,26.1563,0.034358,-56.525887,-56.523026,-56.522082,-56.544961,6.316
2,dsgdb9nsd_000003.xyz,3,O\t-0.0343604951\t 0.9775395708\t 0.0076015923...,1671.4222\t3803.6305\t3907.698,O,O,1S/H2O/h1H2,1S/H2O/h1H2,799.58812,437.90386,...,-0.2928,0.0687,0.3615,19.0002,0.021375,-76.404702,-76.401867,-76.400922,-76.422349,6.002
3,dsgdb9nsd_000004.xyz,4,C\t 0.5995394918\t 0.\t 1.\t-0.207019\nC\t-0.5...,549.7648\t549.7648\t795.2713\t795.2713\t2078.1...,C#C,C#C,1S/C2H2/c1-2/h1-2H,1S/C2H2/c1-2/h1-2H,0.0,35.610036,...,-0.2845,0.0506,0.3351,59.5248,0.026841,-77.308427,-77.305527,-77.304583,-77.327429,8.574
4,dsgdb9nsd_000005.xyz,3,C\t-0.0133239314\t 1.1324657151\t 0.0082758861...,799.0101\t799.0101\t2198.4393\t3490.3686,C#N,C#N,1S/CHN/c1-2/h1H,1S/CHN/c1-2/h1H,0.0,44.593883,...,-0.3604,0.0191,0.3796,48.7476,0.016601,-93.411888,-93.40937,-93.408425,-93.431246,6.278


In [None]:
from utils.data import QM9DataExtractor

# Use the QM9DataExtractor class (imported from utils.data)
extractor = QM9DataExtractor('qm9_demo.csv')

# Fetch the coordinate data of the first molecule via its integer index
coords_data = extractor.extract_atom_coords(0)
print("First molecule coordinates:")
print(coords_data)

In [None]:
# Test the lookup with a SMILES string that actually exists in the dataset.
# NOTE: this rebinds coords_data, so later cells operate on this molecule.
coords_data = extractor.extract_atom_coords("C")
print("\nCoordinates for molecule with SMILES 'C':")
print(coords_data)

In [None]:
import numpy as np
# 将原子位置信息转换为 图神经网络 可用的格式
from typing import Dict, List, Tuple

def mol_to_graph(mol_data: Dict, distance_threshold: float = 1.8) -> Dict:
    """
    Convert one molecule record into a graph dictionary usable by a GNN.

    Every atom becomes a node. Two atoms are connected when their Euclidean
    distance is strictly below ``distance_threshold``; each connection is
    stored as two directed edges (i -> j and j -> i) carrying the same
    features.

    Args:
        mol_data: Mapping that provides
            'atoms'       - sequence of element symbols, one per atom;
            'coordinates' - indexable per-atom positions whose rows are numpy
                            arrays (row subtraction and ``.tolist()`` are
                            used), e.g. an ndarray of shape (num_atoms, dims);
            'charges'     - indexable per-atom numeric values;
            'num_atoms'   - number of atoms used for edge construction;
            'smiles'      - copied unchanged into the result.
        distance_threshold: Cut-off for creating an edge, in the same length
            unit as the coordinates (the default 1.8 was chosen for
            Angstrom). Defaults to the previously hard-coded value.

    Returns:
        Dict with the keys
            'num_nodes'        - ``num_atoms``;
            'node_features'    - float32 array, one row per atom:
                                 one-hot over [H, C, N, O, F], then the
                                 charge, then the coordinates;
            'edges'            - int64 array of shape (num_edges, 2);
            'edge_features'    - float32 array of shape (num_edges, 2)
                                 holding [distance, 1 / distance];
            'adjacency_matrix' - float32 array (num_atoms, num_atoms) of 0/1;
            'coordinates'      - the input coordinates object, unchanged;
            'smiles'           - the input SMILES value, unchanged.

    Raises:
        ValueError: If two atoms share exactly the same position, because the
            inverse-distance edge feature would be infinite.
    """

    # 1. Unpack the molecule record.
    atoms = mol_data['atoms']
    coordinates = mol_data['coordinates']
    charges = mol_data['charges']
    num_atoms = mol_data['num_atoms']

    # 2. Node features: one-hot element type + charge + coordinates.
    atom_types = ['H', 'C', 'N', 'O', 'F']  # element types commonly found in QM9
    node_features = []

    for i, atom in enumerate(atoms):
        # An element outside atom_types keeps an all-zero one-hot vector.
        atom_feature = [0] * len(atom_types)
        if atom in atom_types:
            atom_feature[atom_types.index(atom)] = 1

        atom_feature.append(charges[i])
        atom_feature.extend(coordinates[i].tolist())

        node_features.append(atom_feature)

    node_features = np.array(node_features, dtype=np.float32)

    # 3. Edges: connect every atom pair closer than the distance threshold.
    edges = []
    edge_features = []

    for i in range(num_atoms):
        for j in range(i + 1, num_atoms):
            dist = np.linalg.norm(coordinates[i] - coordinates[j])

            if dist < distance_threshold:
                if dist == 0:
                    # 1 / dist would silently become inf and end up in the
                    # edge features; fail loudly on the corrupt geometry.
                    raise ValueError(
                        f"atoms {i} and {j} have identical coordinates; "
                        "cannot compute the inverse-distance edge feature"
                    )

                # Undirected graph: store both directions with equal features.
                edge_feat = [dist, 1.0 / dist]
                edges.append([i, j])
                edges.append([j, i])
                edge_features.append(edge_feat)
                edge_features.append(edge_feat)

    # Convert to arrays; keep a well-defined (0, 2) shape when there are no edges.
    if edges:
        edges = np.array(edges, dtype=np.int64)
        edge_features = np.array(edge_features, dtype=np.float32)
    else:
        edges = np.array([], dtype=np.int64).reshape(0, 2)
        edge_features = np.array([], dtype=np.float32).reshape(0, 2)

    # 4. Dense adjacency matrix (optional; some GNN frameworks expect it).
    adjacency_matrix = np.zeros((num_atoms, num_atoms), dtype=np.float32)
    adjacency_matrix[edges[:, 0], edges[:, 1]] = 1

    # 5. Pack everything into the result dictionary.
    graph_data = {
        'num_nodes': num_atoms,
        'node_features': node_features,        # shape: (num_atoms, feature_dim)
        'edges': edges,                        # shape: (num_edges, 2)
        'edge_features': edge_features,        # shape: (num_edges, edge_feat_dim)
        'adjacency_matrix': adjacency_matrix,  # shape: (num_atoms, num_atoms)
        'coordinates': coordinates,            # original coordinates, untouched
        'smiles': mol_data['smiles']           # original SMILES string
    }

    return graph_data

In [None]:
def print_graph_info(graph_data: Dict):
    """
    Print a short, human-readable summary of one graph dictionary.

    Reads the keys 'num_nodes', 'edges', 'node_features', 'edge_features'
    and 'adjacency_matrix'. The feature entries must expose ``.shape`` and
    support slicing. Shows the node/edge counts, the feature shapes, the
    first two node-feature rows, the first ten edges and the full adjacency
    matrix. Returns nothing.
    """
    print("=== 图数据信息 ===")
    print(f"节点数量: {graph_data['num_nodes']}")

    edge_list = graph_data['edges']
    print(f"边数量: {len(edge_list)}")

    node_feats = graph_data['node_features']
    print(f"节点特征维度: {node_feats.shape}")

    edge_feats = graph_data['edge_features']
    print(f"边特征维度: {edge_feats.shape}")

    # Previews are truncated so large molecules do not flood the output.
    print("\n节点特征示例 (前2个节点):")
    print(node_feats[:2])

    print("\n边连接 (前10条):")
    print(edge_list[:10])

    print("\n邻接矩阵:")
    print(graph_data['adjacency_matrix'])

In [None]:
# Convert the most recently extracted molecule (coords_data, assigned in an
# earlier cell) into a graph and print its summary.
graph_data = mol_to_graph(coords_data)
print_graph_info(graph_data)

In [None]:
# Convert every molecule exposed by the extractor (all rows of extractor.df) into graph data

def batch_convert_to_graphs(extractor: 'QM9DataExtractor') -> List[Dict]:
    """
    Convert every molecule reachable through ``extractor`` into graph data.

    Molecules are addressed by integer position ``0 .. len(extractor.df) - 1``.
    Each one is converted with ``mol_to_graph`` and then annotated with
    'mol_index' (its position) and 'properties' (taken from
    ``extractor.get_molecule_info(i)['properties_data']['properties']``).
    This is a best-effort batch: a molecule that raises at any step is
    reported on stdout and skipped.

    Args:
        extractor: QM9DataExtractor instance; its ``df`` attribute and its
            ``extract_atom_coords`` / ``get_molecule_info`` methods are used.

    Returns:
        List of graph dictionaries, one per successfully converted molecule.
    """
    total = len(extractor.df)
    converted = []

    print(f"开始转换 {total} 个分子为图数据...")

    for position in range(total):
        try:
            # Extract the raw molecule record and turn it into a graph.
            graph = mol_to_graph(extractor.extract_atom_coords(position))
            graph['mol_index'] = position

            # Attach the molecule's property values.
            info = extractor.get_molecule_info(position)
            graph['properties'] = info['properties_data']['properties']

            converted.append(graph)

            # Report progress for the first molecule and then every tenth one.
            processed = position + 1
            if position == 0 or processed % 10 == 0:
                print(f"已处理: {processed}/{total} 分子")

        except Exception as err:
            # Keep going: one bad molecule must not abort the whole batch.
            print(f"处理分子 {position} 时出错: {err}")

    print(f"批量转换完成! 成功转换 {len(converted)} 个分子")
    return converted

# Run the batch conversion
all_graphs = batch_convert_to_graphs(extractor)

# Show summary statistics of the conversion
print("\n=== 转换结果统计 ===")
print(f"总分子数: {len(all_graphs)}")

if all_graphs:
    # Distribution of node and edge counts across the converted graphs
    node_counts = [g['num_nodes'] for g in all_graphs]
    edge_counts = [len(g['edges']) for g in all_graphs]

    print(f"节点数 - 最小: {min(node_counts)}, 最大: {max(node_counts)}, 平均: {np.mean(node_counts):.1f}")
    print(f"边数 - 最小: {min(edge_counts)}, 最大: {max(edge_counts)}, 平均: {np.mean(edge_counts):.1f}")

    # Show the details of the first converted graph.
    # NOTE: all_graphs[0] is the first *successful* conversion, which is
    # molecule 0 only if that molecule did not fail.
    print(f"\n=== 示例图数据 (分子 0) ===")
    print_graph_info(all_graphs[0])

    # Show the attached molecular properties
    print(f"\n分子属性: {list(all_graphs[0]['properties'].keys())}")
    print(f"HOMO: {all_graphs[0]['properties']['homo']:.4f}")
    print(f"LUMO: {all_graphs[0]['properties']['lumo']:.4f}")

In [None]:
# 保存图数据到文件
import pickle
import json

def save_graph_data(graph_list: List[Dict], save_path: str = 'qm9_graph_data.pkl'):
    """
    Write a list of graph dictionaries to a pickle file.

    Args:
        graph_list: Graph dictionaries to serialize.
        save_path: Destination file; it is opened in binary write mode, so an
            existing file is overwritten.
    """
    print(f"保存图数据到 {save_path}...")

    with open(save_path, 'wb') as out_file:
        # Same default protocol and options as pickle.dump(graph_list, out_file).
        pickle.Pickler(out_file).dump(graph_list)

    num_saved = len(graph_list)
    print(f"已保存 {num_saved} 个分子的图数据")

def load_graph_data(load_path: str = 'qm9_graph_data.pkl') -> List[Dict]:
    """
    Read a list of graph dictionaries back from a pickle file.

    Args:
        load_path: Path of the pickle file to read.

    Returns:
        The unpickled object (a list of graph dictionaries when the file was
        produced by ``save_graph_data``).
    """
    print(f"从 {load_path} 加载图数据...")

    with open(load_path, 'rb') as in_file:
        # SECURITY: unpickling can execute arbitrary code. Only load files
        # that this notebook (or another trusted source) produced.
        loaded_graphs = pickle.Unpickler(in_file).load()

    print(f"已加载 {len(loaded_graphs)} 个分子的图数据")
    return loaded_graphs

# Persist the converted graphs
save_graph_data(all_graphs, 'qm9_demo_graphs.pkl')


In [None]:
# 分析图数据统计信息
def analyze_graph_statistics(graph_list: List[Dict]):
    """
    Print summary statistics for a list of graph dictionaries.

    Reports the number of molecules, min / max / mean / standard deviation of
    the node and edge counts, a tally of atom symbols found in each graph's
    SMILES string and, when the first graph has a 'properties' entry, the
    mean and standard deviation of every property.

    Atom tally: SMILES is tokenized so that 'Cl' and 'Br' count as single
    atoms and aromatic atoms written in lowercase (b, c, n, o, p, s) count
    under their uppercase symbol. Hydrogens are counted only where the SMILES
    spells them out (e.g. inside brackets), and other two-letter elements in
    brackets (e.g. '[Na+]', '[se]') are still tallied by one letter only.

    Args:
        graph_list: Graph dictionaries providing 'num_nodes', 'edges' and
            'smiles'; 'properties' is optional. Property names are taken from
            the first graph and looked up in every graph; np.mean / np.std
            are applied to the values, so they are expected to be numeric.

    Returns:
        None. All results are printed.
    """
    import re  # local import: only the SMILES tokenizer below needs it

    if not graph_list:
        print("没有图数据可分析")
        return

    # Basic statistics
    num_molecules = len(graph_list)
    node_counts = [g['num_nodes'] for g in graph_list]
    edge_counts = [len(g['edges']) for g in graph_list]

    print("=== 图数据详细统计 ===")
    print(f"分子总数: {num_molecules}")
    print("\n节点统计:")
    print(f"  最小节点数: {min(node_counts)}")
    print(f"  最大节点数: {max(node_counts)}")
    print(f"  平均节点数: {np.mean(node_counts):.2f}")
    print(f"  节点数标准差: {np.std(node_counts):.2f}")

    print("\n边统计:")
    print(f"  最小边数: {min(edge_counts)}")
    print(f"  最大边数: {max(edge_counts)}")
    print(f"  平均边数: {np.mean(edge_counts):.2f}")
    print(f"  边数标准差: {np.std(edge_counts):.2f}")

    # Atom-type tally from SMILES. The two-letter halogens come first in the
    # alternation so that 'Cl' is never split into 'C' + 'l'.
    atom_token = re.compile(r'Cl|Br|[A-Z]|[bcnops]')
    atom_type_counts = {}
    for graph in graph_list:
        for token in atom_token.findall(graph['smiles']):
            # Aromatic atoms are lowercase in SMILES; fold them onto the element.
            symbol = token.upper() if token.islower() else token
            atom_type_counts[symbol] = atom_type_counts.get(symbol, 0) + 1

    print("\n原子类型分布:")
    for atom, count in sorted(atom_type_counts.items()):
        print(f"  {atom}: {count}")

    # Property statistics
    if 'properties' in graph_list[0]:
        print("\n分子属性统计:")
        properties = graph_list[0]['properties'].keys()
        for prop in properties:
            values = [g['properties'][prop] for g in graph_list]
            print(f"  {prop}: 平均={np.mean(values):.4f}, 标准差={np.std(values):.4f}")

# Run the statistical analysis on the converted graphs
analyze_graph_statistics(all_graphs)


In [None]:
# 创建简化版图数据 (用于深度学习模型)
def create_simplified_graphs(graph_list: List[Dict]) -> List[Dict]:
    """
    Build trimmed copies of the graph dictionaries for model training.

    Only the entries 'node_features', 'edges', 'edge_features', 'num_nodes',
    'properties', 'smiles' and 'mol_index' are kept. The copies are shallow:
    values are the same objects as in the input graphs.

    Args:
        graph_list: Full graph dictionaries; each must contain all of the
            keys listed above.

    Returns:
        List of trimmed graph dictionaries, in the same order as the input.
    """
    kept_keys = (
        'node_features',
        'edges',
        'edge_features',
        'num_nodes',
        'properties',
        'smiles',
        'mol_index',
    )

    simplified_graphs = [
        {key: full_graph[key] for key in kept_keys}
        for full_graph in graph_list
    ]

    print(f"创建了 {len(simplified_graphs)} 个简化图数据")
    return simplified_graphs

# Build the simplified graphs
simplified_graphs = create_simplified_graphs(all_graphs)

# Persist the simplified graphs
save_graph_data(simplified_graphs, 'qm9_demo_simplified_graphs.pkl')

print("\n=== 数据转换完成 ===")
print("已生成以下文件:")
print("1. qm9_demo_graphs.pkl - 完整图数据")
print("2. qm9_demo_simplified_graphs.pkl - 简化图数据")
print("\n图数据现在可以用于训练图神经网络模型!")
