In [1]:
from transformers import LlamaForCausalLM, AutoTokenizer
import os
import argparse
import torch
import warnings
import lightning as L
import lightning.pytorch.callbacks as plc
from lightning.pytorch.loggers import CSVLogger
from data_provider.data_module import QM9DataModule, QM9LMDataModule, GeomDrugsLMDataModule
from data_provider.geom_drugs_jodo_dm import GeomDrugsJODODM
from model.llm_pl import LLMPL


  from .autonotebook import tqdm as notebook_tqdm
You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message.


In [None]:
from data_provider.data_module import QM9LMDataset

def inspect_one_molecule():
    """Load the GEOM-QM9 language-model dataset and print a tour of its first sample.

    The function builds a tokenizer, constructs ``QM9LMDataset`` from a fixed
    root directory, fetches item 0 and prints the sample followed by a few of
    its attributes (``pos``, ``selfies``, ``y``, ``edge_index``, ``rdmol``).

    Returns:
        None. All results are written to stdout.

    NOTE(review): ``args`` is read as a global and is not defined in any cell
    visible in this notebook, so this only runs when the kernel already holds
    such a variable.
    NOTE(review): the tokenizer created here is never handed to the dataset,
    and the recorded output of this cell ends in
    ``AttributeError: 'NoneType' object has no attribute 'get_vocab'``.
    Presumably the dataset needs a tokenizer argument -- confirm against
    ``QM9LMDataset.__init__`` before relying on this cell.
    """
    # --- configuration ---
    data_root = '/mnt/rna01/liuzhiyuan/zyliu/nai/NExT-Mol/data/GEOM-QM9'
    tokenizer = LLMPL.init_tokenizer(args)  # currently unused, see NOTE(review) above

    print("--- 正在   初始化 QM9LMDataset ---")
    # Only the dataset root is supplied; every other constructor argument
    # keeps its default.
    qm9_dataset = QM9LMDataset(root=data_root)
    print("数据集加载成功！\n")

    print("--- 取出数据集中的第一个分子 (idx=0) ---")
    # Plain indexing dispatches to the dataset's __getitem__.
    sample = qm9_dataset[0]

    print("成功获取一个Data对象！\n")

    print("--- 开始'解剖'这个Data对象 ---")
    # Show the sample's own printed representation first.
    print(sample)

    print("\n--- 访问具体属性 ---")

    # 1. 3D coordinates
    pos_shape = sample.pos.shape
    print(f"属性 'pos' (3D坐标) 的形状: {pos_shape}")
    print("这代表了 [原子数量, 3(x,y,z)]\n")

    # 2. 1D string representation
    selfies_text = sample.selfies
    print(f"属性 'selfies' (1D蓝图): {selfies_text}\n")

    # 3. Target properties
    target_shape = sample.y.shape
    print(f"属性 'y' (19个化学属性) 的形状: {target_shape}")
    first_target = sample.y[0, 0].item()
    print(f"第一个属性值(偶极矩): {first_target:.4f}\n")

    # 4. Graph connectivity
    edge_shape = sample.edge_index.shape
    print(f"属性 'edge_index' (图连接性) 的形状: {edge_shape}")
    print("这代表了 [2, 化学键数量 * 2]\n")

    # 5. RDKit object
    rdmol_type = type(sample.rdmol)
    print(f"属性 'rdmol' 的类型: {rdmol_type}")
# Run the inspection as soon as this cell executes.
inspect_one_molecule()

--- 正在初始化 QM9LMDataset ---


AttributeError: 'NoneType' object has no attribute 'get_vocab'

In [4]:
from transformers import AutoTokenizer

def haha(model_name="acharkq/MoLlama"):
    """Load a tokenizer and print a step-by-step walkthrough of how it batches SELFIES.

    The walkthrough prints the tokenizer object, its BOS/EOS/PAD tokens, the
    vocabulary size and a few bracketed tokens, then tokenizes a small list of
    SELFIES strings with padding and prints the resulting ``input_ids`` and
    ``attention_mask``.

    Args:
        model_name: Hugging Face model id (or local path) whose tokenizer is
            loaded. Defaults to the checkpoint this notebook studies.

    Returns:
        None. Everything is reported on stdout.

    Note:
        Loading the tokenizer may require network access to the Hugging Face Hub.
    """
    print(f"--- 正在加载与 '{model_name}' 配套的 Tokenizer ---\n")

    # 1. Reproduce what the project's init_tokenizer is understood to do
    #    (NOTE(review): confirm against LLMPL.init_tokenizer).
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.add_bos_token = True
    tokenizer.add_eos_token = True

    print("--- Tokenizer 对象本身 ---")
    print(tokenizer)
    print("\n")

    print("--- 查看特殊词元 (Special Tokens) ---")
    print(f"起始符 (BOS): '{tokenizer.bos_token}'  (ID: {tokenizer.bos_token_id})")
    print(f"结束符 (EOS): '{tokenizer.eos_token}'  (ID: {tokenizer.eos_token_id})")
    print(f"填充符 (PAD): '{tokenizer.pad_token}'  (ID: {tokenizer.pad_token_id})")
    print("\n")

    print("--- 查看部分'化学词汇表' (Vocabulary) ---")
    # Full token -> id mapping.
    vocab = tokenizer.get_vocab()
    print(f"词汇表总大小: {len(vocab)} 个词元")

    # Show a handful of chemistry-related tokens, skipping any that are absent.
    print("一些化学片段示例:")
    for token in ['[C]', '[=C]', '[Branch1]', '[Ring1]', '[O]', '[N]']:
        if token in vocab:
            print(f"  - '{token}'  --> ID: {vocab[token]}")
    print("\n")

    print("--- 完整演示：从SELFIES到Batch输入 ---")
    # A small batch of SELFIES strings with different lengths, so that padding
    # is actually exercised.
    selfies_list = [
        '[C][C][O]',                         # 3 SELFIES symbols
        '[C][=Branch1][C][=O]',              # 4 SELFIES symbols
        '[F][C][F]'                          # 3 SELFIES symbols
    ]
    print(f"原始输入的SELFIES列表: {selfies_list}\n")

    # Tokenize the whole list at once; padding=True pads every row to the
    # longest sequence in the batch.
    batch = tokenizer(
        selfies_list,
        padding=True,
        return_tensors='pt'
    )

    print("Tokenizer处理后的最终batch形态 (字典):")
    print(batch)
    print("\n")

    print("--- 逐一解析batch内容 ---")
    input_ids = batch['input_ids']
    attention_mask = batch['attention_mask']

    print(f"input_ids (形状: {input_ids.shape}):\n{input_ids}\n")
    # Report the ids the tokenizer really uses instead of hard-coding them:
    # the recorded tokenizer repr maps id 0 to '<s>' and id 1 to '<pad>', so a
    # fixed "BOS=1 / EOS=2" explanation would be misleading.
    print(
        "解读: 这是转换后的数字ID。每一行是一个分子。注意所有行的长度都一样（被填充到了最长序列的长度），"
        f"并且开头是BOS的ID({tokenizer.bos_token_id})，真实序列的结尾是EOS的ID({tokenizer.eos_token_id})，"
        f"较短的序列在其后用PAD的ID({tokenizer.pad_token_id})填充。"
    )

    print(f"attention_mask (形状: {attention_mask.shape}):\n{attention_mask}\n")
    print("解读: 值为1代表是真实数据，值为0代表是填充的。模型会根据这个来忽略填充部分。")
# Run the tokenizer walkthrough as soon as this cell executes.
haha()

--- 正在加载与 'acharkq/MoLlama' 配套的 Tokenizer ---



You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message.


--- Tokenizer 对象本身 ---
LlamaTokenizerFast(name_or_path='acharkq/MoLlama', vocab_size=4, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '<s>', 'unk_token': '<unk>', 'pad_token': '<pad>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("[CH1-1]", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	5: AddedToken("[=S@@]", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	6: AddedToken("[/F]", rstrip=False, lstri

In [1]:
import pickle
import torch
import os

# File inspected when the caller does not supply a path (kept from the original cell).
_DEFAULT_POCKET_PKL = '/mnt/rna01/liuzhiyuan/zyliu/nai/NExT-Mol/data/sbdd/crossdocked_pocket/1A1C_MALDO_2_433_0/1m4n_A_rec_1m7y_ppg_lig_tt_min_0_pocket10.pkl'


def _extract_embedding(data):
    """Return the embedding found in the unpickled object, or None if there is none."""
    if isinstance(data, torch.Tensor):
        # Case A: the file is the tensor itself.
        print("文件内容直接是一个 PyTorch Tensor。")
        return data
    if isinstance(data, dict):
        # Case B: the file is a dict that may carry the embedding under a known key.
        print(f"文件内容是一个字典，包含的键 (Keys): {list(data.keys())}")
        if 'pdb_embedding' in data:
            print("在字典中找到了键 'pdb_embedding'。")
            return data['pdb_embedding']
        print("在字典中未找到预期的 'pdb_embedding' 键。")
        return None
    print("文件内容既不是Tensor也不是字典，请检查具体内容。")
    return None


def _describe_embedding(embedding_tensor):
    """Print dtype, shape and a rank-dependent interpretation of the embedding."""
    print("\n--- 口袋嵌入(Embedding)详细信息 ---")
    print(f"  - 数据类型 (dtype): {embedding_tensor.dtype}")
    print(f"  - 形状 (Shape): {embedding_tensor.shape}")

    rank = len(embedding_tensor.shape)
    if rank == 2:
        num_residues, feature_dim = embedding_tensor.shape
        print(f"  - 解读: 这代表了口袋中的 {num_residues} 个实体（很可能是氨基酸残基），")
        print(f"           每个实体被一个 {feature_dim} 维的向量所描述。")
    elif rank == 1:
        feature_dim = embedding_tensor.shape[0]
        print(f"  - 解读: 这是一个单一的、长度为 {feature_dim} 的向量，")
        print(f"           它代表了整个口袋的整体特征（可能是通过平均池化得到的）。")
    else:
        # Previously any other rank produced no interpretation line at all.
        print(f"  - 解读: 该张量有 {rank} 个维度，不属于预期的 1 维或 2 维情况，请手动检查。")


def inspect_sbd_pkl_file(pkl_file_path=_DEFAULT_POCKET_PKL):
    """Unpickle a pocket-embedding file and print a summary of what it contains.

    Args:
        pkl_file_path: Path of the pickle file to inspect. Defaults to the
            CrossDocked pocket file this notebook was written against.

    Returns:
        None. The report is written to stdout. A missing file, or any error
        while reading or analysing the file, is reported as a message rather
        than raised.

    Warning:
        ``pickle.load`` can execute arbitrary code embedded in the file. Only
        point this function at files from a trusted source.
    """
    print(f"--- 正在检查文件: {os.path.basename(pkl_file_path)} ---")

    if not os.path.exists(pkl_file_path):
        print(f"[错误] 文件未找到: {pkl_file_path}")
        return

    try:
        with open(pkl_file_path, 'rb') as f:
            # SECURITY: unpickling runs code from the file; trusted inputs only.
            data = pickle.load(f)

        print("\n--- 文件内容分析 ---")
        print(f"加载出的数据顶层类型是: {type(data)}")

        embedding_tensor = _extract_embedding(data)
        if embedding_tensor is not None:
            _describe_embedding(embedding_tensor)

    except Exception as e:
        # Deliberate best-effort: this is an exploratory inspection helper, so
        # any failure is reported to the reader instead of aborting the cell.
        print(f"读取或解析文件时出错: {e}")

# Runs the inspection when the cell is executed (the recorded output below shows
# it did); the guard keeps the call from firing if this code is imported as a module.
if __name__ == '__main__':
    inspect_sbd_pkl_file()

--- 正在检查文件: 1m4n_A_rec_1m7y_ppg_lig_tt_min_0_pocket10.pkl ---

--- 文件内容分析 ---
加载出的数据顶层类型是: <class 'torch.Tensor'>
文件内容直接是一个 PyTorch Tensor。

--- 口袋嵌入(Embedding)详细信息 ---
  - 数据类型 (dtype): torch.float32
  - 形状 (Shape): torch.Size([65, 1536])
  - 解读: 这代表了口袋中的 65 个实体（很可能是氨基酸残基），
           每个实体被一个 1536 维的向量所描述。


In [1]:
import pickle

# Path of the index pickle under the crossdocked_pocket dataset directory.
index_file = "/data/share/liuzhiyuan/nai/NExT-Mol/datasets/sbdd/crossdocked_pocket/index.pkl"

# Open and load the index.
# SECURITY: pickle.load can execute arbitrary code from the file; only load
# files that come from a trusted source.
with open(index_file, 'rb') as f:
    index_data = pickle.load(f)

# Report the container type and size, then preview at most five entries.
print(type(index_data))
print(len(index_data))
if isinstance(index_data, dict):
    # A dict cannot be sliced, so preview its first five (key, value) pairs.
    print(list(index_data.items())[:5])
else:
    print(index_data[:5])

<class 'list'>
184087
[('1B57_HUMAN_25_300_0/3vri_A_rec_3vrj_1kx_lig_tt_min_0_pocket10.pdb', '1B57_HUMAN_25_300_0/3vri_A_rec_3vrj_1kx_lig_tt_min_0.sdf', '1B57_HUMAN_25_300_0/3vri_A_rec.pdb', 0.524867), ('1B57_HUMAN_25_300_0/3upr_C_rec_5u98_1kx_lig_tt_min_0_pocket10.pdb', '1B57_HUMAN_25_300_0/3upr_C_rec_5u98_1kx_lig_tt_min_0.sdf', '1B57_HUMAN_25_300_0/3upr_C_rec.pdb', 0.402512), ('1B57_HUMAN_25_300_0/5u98_D_rec_5u98_1kx_lig_tt_min_0_pocket10.pdb', '1B57_HUMAN_25_300_0/5u98_D_rec_5u98_1kx_lig_tt_min_0.sdf', '1B57_HUMAN_25_300_0/5u98_D_rec.pdb', 0.367042), ('1B57_HUMAN_25_300_0/3upr_C_rec_3upr_1kx_lig_tt_docked_3_pocket10.pdb', '1B57_HUMAN_25_300_0/3upr_C_rec_3upr_1kx_lig_tt_docked_3.sdf', '1B57_HUMAN_25_300_0/3upr_C_rec.pdb', 0.319764), ('1B57_HUMAN_25_300_0/3upr_C_rec_3vri_1kx_lig_tt_min_0_pocket10.pdb', '1B57_HUMAN_25_300_0/3upr_C_rec_3vri_1kx_lig_tt_min_0.sdf', '1B57_HUMAN_25_300_0/3upr_C_rec.pdb', 0.360588)]
