In [2]:
import sys 
import torch
sys.path.insert(1, 'MolT5/baselines')

import numpy as np
import dataloader
import json

from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.cluster import MiniBatchKMeans
from transformers import AutoTokenizer, AutoModelForCausalLM

  from .autonotebook import tqdm as notebook_tqdm


In [2]:


# Synthetic stand-in for the pair embeddings: a random tensor of shape
# (1000, 20, 20, 64) moved to the GPU, bound to `pairs` so later cells can
# be exercised without the real embedding file.
dummy_emb = torch.rand(1000, 20, 20, 64).to('cuda')
pairs = dummy_emb

In [3]:
# Load the pair embeddings from disk; this rebinds `pairs`.
# weights_only=True restricts unpickling to tensors and plain containers.
pairs = torch.load('../burov/unimol_embeddings_project/geom_train_pair_embeddings.pt', weights_only=True)

In [5]:
# Plan an (almost) even split of `pairs` into n_chunks pieces.
n_chunks = 10
N = total = len(pairs)

# Floor division gives the regular chunk size (the author's note says 43647);
# the leftover items are folded into the final chunk.
chunk_size, leftover = divmod(total, n_chunks)
last_chunk_size = chunk_size + leftover


In [9]:
# Write `pairs` to disk as exactly `n_chunks` files (N and n_chunks come from
# the planning cell above).
#
# The previous `range(0, N, chunk_size)` loop with a hard-coded chunk size
# emitted an extra, tiny trailing file whenever N was not a multiple of the
# chunk size; that disagrees with a "9 equal chunks + 1 slightly larger chunk"
# layout. Here the last chunk absorbs the remainder instead.
chunk_size = N // n_chunks
for chunk_idx in range(n_chunks):
    start = chunk_idx * chunk_size
    stop = N if chunk_idx == n_chunks - 1 else start + chunk_size
    chunk = pairs[start:stop]
    if torch.is_tensor(chunk):
        # A tensor slice is a view; torch.save would serialize the whole
        # underlying storage, so every chunk file would be full-sized.
        chunk = chunk.clone()
    torch.save(chunk, f"/home/user12/ebeddings/geom_train_pair_embeddings{chunk_idx:02d}.pt")

In [4]:
import torch
from torch.utils.data import Dataset
import glob

class ChunkedDataset(Dataset):
    """Map-style dataset over several chunk files written with ``torch.save``.

    Only one chunk is kept in memory at a time: accessing an index that lives
    in a different chunk than the previous access reloads that chunk from disk.

    Args:
        chunk_paths: Paths of the chunk files, in dataset order.
        sizes: Number of items in each chunk; ``sizes[i]`` must describe
            ``chunk_paths[i]`` (this is not verified against the files).
    """

    def __init__(self, chunk_paths, sizes):
        self.chunk_paths = chunk_paths
        self.sizes = sizes
        # cumulative[i] is the global index of the first item of chunk i;
        # cumulative[-1] is the total number of items.
        self.cumulative = [0]
        for s in sizes:
            self.cumulative.append(self.cumulative[-1] + s)

        # Single-chunk cache: the loaded chunk and its position in chunk_paths.
        self.current_chunk = None
        self.current_chunk_idx = -1

    def __len__(self):
        """Return the total number of items across all chunks."""
        return self.cumulative[-1]

    def __getitem__(self, idx):
        """Return the item at global index ``idx``.

        Raises:
            IndexError: if ``idx`` is negative or not smaller than ``len(self)``.
        """
        from bisect import bisect_right

        if not 0 <= idx < self.cumulative[-1]:
            raise IndexError("Index out of bounds")

        # Binary search for the chunk whose [start, end) range contains idx.
        i = bisect_right(self.cumulative, idx) - 1
        local_idx = idx - self.cumulative[i]
        if self.current_chunk_idx != i:
            # weights_only=True matches the other loads in this notebook and
            # avoids executing arbitrary pickled code.
            self.current_chunk = torch.load(self.chunk_paths[i], weights_only=True)
            self.current_chunk_idx = i
        return self.current_chunk[local_idx]

# Example initialisation.
# Chunk files are picked up in sorted filename order; the glob matches every
# .pt file in the directory.
# NOTE(review): `sizes` is hard-coded (9 x 43647 + 1 x 43648) and must match
# the number and length of the files actually found — confirm against the
# cell that wrote the chunks.
chunk_paths = sorted(glob.glob("/home/user12/ebeddings/*.pt"))
sizes = [43647] * 9 + [43648]
dataset = ChunkedDataset(chunk_paths, sizes)


In [None]:
# Smoke test: fetch one item (global index 4) through the chunked dataset.
dataset[4]

In [None]:
import os

# torch.save takes the object to serialize first and the destination second;
# the original call passed only the path and would raise TypeError.
# '~' is not expanded by torch.save/open, so resolve it explicitly.
# NOTE(review): assumes `pairs` is the object meant to be written (the file
# name matches the embeddings it was loaded from) — confirm.
torch.save(pairs, os.path.expanduser('~/geom_train_pair_embeddings.pt'))

In [11]:
# Load the GEOM training split; later cells read the 'positions', 'one_hot'
# and 'name' entries of each item.
geom = torch.load('./mydatasets/geom_train.pt', weights_only=True)

In [17]:
# Find the molecule with the most atoms: `arg` is its index in `geom`,
# `s` its atom count (first occurrence wins on ties; "0 0" for empty input).
# The per-molecule count uses its own name so it no longer overwrites the
# global `N` (the dataset length used by the chunking cells).
s = 0
arg = 0
for i, mol in enumerate(geom):
    n_atoms = len(mol['positions'])
    if n_atoms > s:
        s = n_atoms
        arg = i
print(arg, s)

18361 90


In [18]:
# Display the record of the largest molecule found in the previous cell.
geom[arg]

{'name': 'Cc1c2oc3c(C)ccc(C(=O)NC4C(=O)NC(C(C)C)C(=O)N5CCCC5C(=O)N(C)CC(=O)N(C)C(C(C)C)C(=O)OC4C)c3nc-2c(C(=O)NC2C(=O)NC(C(C)C)C(=O)N3CCCC3C(=O)N(C)CC(=O)N(C)C(C(C)C)C(=O)OC2C)c(N)c1=O',
 'one_hot': tensor([[1., 0., 0., 0., 0., 0., 0., 0., 0.],
         [1., 0., 0., 0., 0., 0., 0., 0., 0.],
         [1., 0., 0., 0., 0., 0., 0., 0., 0.],
         [1., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 1., 0., 0., 0., 0., 0., 0.],
         [1., 0., 0., 0., 0., 0., 0., 0., 0.],
         [1., 0., 0., 0., 0., 0., 0., 0., 0.],
         [1., 0., 0., 0., 0., 0., 0., 0., 0.],
         [1., 0., 0., 0., 0., 0., 0., 0., 0.],
         [1., 0., 0., 0., 0., 0., 0., 0., 0.],
         [1., 0., 0., 0., 0., 0., 0., 0., 0.],
         [1., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 1., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 1., 0., 0., 0., 0., 0., 0.],
         [1., 0., 0., 0., 0., 0., 0., 0., 0.],
         [1., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 1., 0., 0., 0., 0., 0., 0., 0.],
   

In [3]:
# Rebind `geom` to the reduced "cut" dataset.
# NOTE(review): a later cell loads the same file name from './mydatasets/' —
# confirm which directory is correct.
geom = torch.load('./datasets/geom_cut.pt', weights_only=True)

In [None]:


# Fit a 128-centre codebook over 64-d pair vectors with incremental k-means.
k         = 128
batchsize = 10_000           # max number of 64-d vectors taken from one molecule
max_pairs = 10_000_000       # stop once this many vectors have been used for fitting

kmeans = MiniBatchKMeans(n_clusters=k,
                         batch_size=batchsize,
                         init_size=k*3,        # can be raised for a more stable init
                         verbose=0,
                         random_state=42)

# Seeded generator so the per-molecule sub-sampling is reproducible too
# (random_state above only seeds scikit-learn, not torch.randperm).
sampler = torch.Generator().manual_seed(42)

seen = 0
pending, pending_rows = [], 0                 # vectors buffered before the first fit
for mol in tqdm(pairs):                       # each item is reshaped to (-1, 64)
    vecs = mol.reshape(-1, 64)
    if vecs.size(0) == 0:                     # nothing to learn from an empty molecule
        continue
    # --- randomly keep at most `batchsize` vectors ---
    if vecs.size(0) > batchsize:
        idx = torch.randperm(vecs.size(0), generator=sampler)[:batchsize]
        vecs = vecs[idx]

    pending.append(vecs.cpu())
    pending_rows += vecs.size(0)
    # The very first partial_fit initialises the centres and needs at least
    # `k` samples; keep buffering small molecules until that is satisfied.
    if pending_rows < k and not hasattr(kmeans, "cluster_centers_"):
        continue

    batch = pending[0] if len(pending) == 1 else torch.cat(pending)
    kmeans.partial_fit(batch.numpy())         # fitting happens on CPU, piece by piece
    seen += pending_rows
    pending, pending_rows = [], 0
    if seen >= max_pairs:                     # enough examples — stop
        break

if not hasattr(kmeans, "cluster_centers_"):
    raise ValueError(f"need at least {k} pair vectors to initialise {k} clusters, got {pending_rows}")

centers = torch.tensor(kmeans.cluster_centers_)   # (k, 64) codebook


def vec2tok(v: torch.Tensor) -> str:
    idx = torch.cdist(v[None], centers).argmin().item()
    return f"<p{idx:03d}>"


  from .autonotebook import tqdm as notebook_tqdm
100%|██████████| 1000/1000 [00:02<00:00, 409.94it/s]


In [8]:
def vec2tok(v: torch.Tensor) -> str | list[str]:
    """
    v : Tensor (64,)       → '<p042>'
        Tensor (M,64)      → ['<p042>', '<p118>', ...] длиной M
    """
    v = v.to(centers)                       # убедимся, что на том же девайсе

    if v.ndim == 1:                         # одиночный вектор
        idx = torch.cdist(v[None], centers).argmin().item()
        return f"<p{idx:03d}>"

    # батч M×64  → M индексов
    idx = torch.cdist(v, centers).argmin(dim=1).tolist()   # List[int] длиной M
    return [f"<p{i:03d}>" for i in idx]                    # List[str]


In [6]:
# Smoke test of the batched path: 40 random 64-d vectors, show the first 4 tokens.
# NOTE: this rebinds the notebook-level name `vecs`.
vecs = torch.randn(40, 64)
print(vec2tok(vecs)[:4])
# Illustrative shape of the result only — the actual tokens depend on the
# random input and the fitted centres:
# ['<p042>', '<p118>', '<p031>', '<p007>']


['<p061>', '<p047>', '<p094>', '<p102>']


In [None]:
# Load the base chat model and its tokenizer, then add the 128 pair tokens.
# NOTE(review): trust_remote_code=True lets the tokenizer repo run its own
# code — confirm it is actually required for this checkpoint.
model_name = "Qwen/Qwen1.5-1.8B-Chat"   # or another model
tokenizer  = AutoTokenizer.from_pretrained(model_name, use_fast=True, trust_remote_code=True)
model      = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)

# 128 new "pair tokens", named like the strings produced by vec2tok
new_tokens = [f"<p{i:03d}>" for i in range(128)]

num_added = tokenizer.add_tokens(new_tokens, special_tokens=False)
print("Добавили:", num_added)      # should be 128
# Grow the model's embedding matrix to the new vocabulary size; as the last
# expression, its return value is the cell output.
model.resize_token_embeddings(len(tokenizer))   # 🔑 extend the embedding



Добавили: 128


Embedding(151774, 2048)

In [12]:
# Persist the extended tokenizer to the local directory "tokenizer_pair128".
tokenizer.save_pretrained("tokenizer_pair128")

('tokenizer_pair128/tokenizer_config.json',
 'tokenizer_pair128/special_tokens_map.json',
 'tokenizer_pair128/chat_template.jinja',
 'tokenizer_pair128/vocab.json',
 'tokenizer_pair128/merges.txt',
 'tokenizer_pair128/added_tokens.json',
 'tokenizer_pair128/tokenizer.json')

In [9]:
# Build one random (N, N, 64) placeholder pair tensor per molecule, where N is
# the molecule's atom count. Tensors stay on the CPU, exactly as before: the
# original `pair.to('cuda')` discarded its result, so it only cost a wasted
# GPU copy per molecule. The prompt-building cell moves each tensor to its
# device when it is used.
pairs = []
for mol in tqdm(geom):
    n_atoms = len(mol['positions'])
    pairs.append(torch.rand(n_atoms, n_atoms, 64))

  0%|          | 0/10000 [00:00<?, ?it/s]

100%|██████████| 10000/10000 [00:02<00:00, 3379.26it/s]


In [None]:
import json
from tqdm import tqdm

# Write one chat-format training example per molecule to bond_prompts.jsonl:
# the user turn lists every atom pair within 2 Å with its pair token, the
# assistant turn is the JSON list of bond labels for those pairs.
SYSTEM = ("You are a chemist. For each atom pair within 2 Å classify the "
          "bond type. Labels: 0 no-bond, 1 single, 2 double, 3 triple, 4 aromatic. "
          "Return ONLY JSON list [{\"pair\":[i,j],\"label\":n}].")

# Index of the set entry in a bond_orders row -> label used in the prompt.
# NOTE(review): indices 4-9 are all mapped to label 1 — confirm this is intended.
ORDER2LABEL = {0:1, 1:2, 2:3, 3:4, 4:1, 5:1, 6:1, 7:1, 8:1, 9:1}
# Element symbol for each one_hot column.
ALLOWED = ['C', 'O', 'N', 'F', 'S', 'Cl', 'Br', 'I', 'P']

dev = 'cuda'
centers_gpu = centers.to(dev)
with open("bond_prompts.jsonl", "w") as fout:

    # zip() has no length, so give tqdm the total explicitly.
    for mol, pair in tqdm(zip(geom, pairs), total=min(len(geom), len(pairs))):
        xyz   = mol["positions"].to(dev, non_blocking=True)      # (N,3)
        types = mol["one_hot"].argmax(-1)                        # stays on the CPU
        pair  = pair.to(dev, non_blocking=True)                  # (N,N,64)

        dmat  = torch.cdist(xyz, xyz)                            # on `dev`
        # NOTE(review): the diagonal (i == j, distance 0) also satisfies the
        # threshold, so self-pairs are emitted — confirm this is wanted.
        i_idx, j_idx = (dmat<=2.0).nonzero(as_tuple=True)        # also on `dev`

        if not len(i_idx):  continue

        # Convert the index tensors to Python ints once; they are reused for
        # the prompt lines, the labels and the JSON answer.
        ij_pairs = list(zip(i_idx.tolist(), j_idx.tolist()))

        # Match the codebook's dtype/device, as vec2tok does, so cdist cannot
        # fail on a dtype mismatch.
        vecs   = pair[i_idx, j_idx].to(centers_gpu)              # (M,64)
        idx    = torch.cdist(vecs, centers_gpu).argmin(dim=1)    # (M,)
        toks   = [f"<p{i:03d}>" for i in idx.tolist()]           # List[str], one token per pair

        lines = [f"[{i},{j}]={tok}"
                for (i,j),tok in zip(ij_pairs, toks)]

        # ---------- labels -----------------------------------------------------
        # (u, v) -> index of the non-zero entry of that edge's bond_orders row.
        edge2order = {(int(u),int(v)): bo.nonzero(as_tuple=True)[0].item()
                    for (u,v), bo in zip(mol["edge_index"], mol["bond_orders"])}
        # Make the lookup symmetric.
        edge2order |= {(v,u):o for (u,v),o in edge2order.items()}

        labels = [0 if (i,j) not in edge2order else ORDER2LABEL[edge2order[(i,j)]]
                for i,j in ij_pairs]

        assistant = json.dumps([{"pair":[i,j],"label":l}
                                for (i,j),l in zip(ij_pairs, labels)],
                            separators=(",",":"))

        atom_line = "Atoms (index→type): " + \
                    ", ".join(f"{idx}:{ALLOWED[types[idx]]}"
                            for idx in range(len(types)))

        prompt = {"messages":[
            {"role":"system","content":SYSTEM},
            {"role":"user",
            "content":"Pairs within 2 Å:\n" + "\n".join(lines) + "\n\n" + atom_line},
            {"role":"assistant","content":assistant}
        ]}
        fout.write(json.dumps(prompt)+"\n")



In [49]:
import json
import torch
from tqdm import tqdm

# Index of the set entry in a bond_orders row -> label written to the file.
ORDER2LABEL = {0:1, 1:2, 2:3, 3:4, 4:1, 5:1, 6:1, 7:1, 8:1, 9:1}
# Element symbol for each one_hot column.
ALLOWED = ['C', 'O', 'N', 'F', 'S', 'Cl', 'Br', 'I', 'P']

dev = 'cpu'


# One JSON line per molecule: "input" holds the task text, the atom pairs
# within 2 Å and the atom types; "output" holds one label per pair.
with open("bond_prompts_final.jsonl", "w") as fout:
    for mol in tqdm(geom[:10000]):
        xyz = mol["positions"].to(dev)                      # (N, 3)
        types = mol["one_hot"].argmax(dim=-1).cpu()         # (N,)
        dmat = torch.cdist(xyz, xyz)                        # (N, N)

        i_idx, j_idx = torch.nonzero(dmat <= 2.0, as_tuple=True)
        if len(i_idx) == 0:
            continue

        # Bond-order lookup: directed edges first, then mirrored so that
        # (u, v) and (v, u) both resolve.
        edge2order = {
            (int(u), int(v)): bo.nonzero(as_tuple=True)[0].item()
            for (u, v), bo in zip(mol["edge_index"], mol["bond_orders"])
        }
        edge2order.update({(v, u): o for (u, v), o in edge2order.items()})

        # Pairs and their labels (0 when the pair is not an edge).
        ij_pairs = list(zip(i_idx.tolist(), j_idx.tolist()))
        labels = [
            ORDER2LABEL[edge2order[p]] if p in edge2order else 0
            for p in ij_pairs
        ]

        # Text blocks
        input_lines = "\n".join(f"{i} {j}" for i, j in ij_pairs)
        output_lines = "\n".join(map(str, labels))
        atom_types = " ".join(f"{i}:{ALLOWED[types[i]]}" for i in range(len(types)))

        # Task statement
        prompt_text = "".join([
            "You are a chemist. For each atom pair within 2 Å classify the bond type:\n",
            input_lines,
            "\n\nAtoms: ",
            atom_types,
        ])

        record = {"input": prompt_text, "output": output_lines}
        fout.write(json.dumps(record) + "\n")


100%|██████████| 10000/10000 [01:51<00:00, 89.48it/s]


In [8]:
import json
# Read only the first JSONL record and print its "output" field
# (the newline-separated labels).
with open('/home/user12/prompts2/prompts_short.jsonl') as f:
    p = f.readline()
print(json.loads(p)['output'])

0
1
1
0
1
1
1
0
2
1
2
0
1
0
1
1
0
4
4
4
0
4
4
0
4
4
0
4
4
0
4
4
4
0
1
0
1
1
0
4
4
4
0
4
4
0
4
4
0
4
1
4
0
4
4
4
0
1
0


In [None]:

# run_parallel is not defined in the visible part of this notebook.
# NOTE(review): the recorded output is
# "AttributeError: Can't pickle local object 'run_parallel.<locals>.<lambda>'",
# i.e. a lambda defined inside run_parallel could not be pickled — presumably
# it is handed to worker processes; confirm and replace it with a
# module-level function.
run_parallel(geom, num_workers=8)

  0%|          | 0/8 [00:00<?, ?it/s]


AttributeError: Can't pickle local object 'run_parallel.<locals>.<lambda>'

In [29]:
# Make sure vec2tok() returns the WHOLE token string for a single vector
# (and a list of whole strings for a batch).
def vec2tok(v):
    """Map pair vector(s) to the token of the nearest k-means centre.

    Uses the module-level ``centers`` tensor as the codebook.

    Args:
        v: a single vector of shape (64,) or a batch of shape (M, 64).

    Returns:
        ``'<pNNN>'`` (e.g. ``'<p042>'``) for a single vector, or a list of M
        such strings for a batch.
    """
    # cdist needs both operands on the same device with the same dtype.
    v = v.to(centers)

    if v.ndim == 1:
        idx = torch.cdist(v.unsqueeze(0), centers).argmin().item()
        return f"<p{idx:03d}>"           # -> '<p042>'

    # Batch: one nearest-centre index per row, consistent with the earlier
    # batch-capable definition that this one replaces.
    idx = torch.cdist(v, centers).argmin(dim=1).tolist()
    return [f"<p{i:03d}>" for i in idx]

# Build `lines` without stray "\n" characters, then join them.
# Relies on i_idx, j_idx and toks left over from an earlier cell.
# NOTE(review): the recorded output pairs each index with a single character
# ('<', 'p', '2', ...), so `toks` was one string rather than a list of
# tokens when this ran — rebuild `toks` as a list (one token per pair) first.
lines = [f"[{int(i)},{int(j)}]={tok}"      # tok is expected to be a complete '<p042>'
         for (i, j), tok in zip(zip(i_idx, j_idx), toks)]

user_text = "Pairs within 2 Å:\n" + "\n".join(lines)
user_text

'Pairs within 2 Å:\n[0,1]=<\n[1,0]=p\n[1,2]=2\n[1,5]=2\n[2,1]=9\n[2,3]=2\n[3,2]=>'

In [None]:
# Sketch only, not runnable as written: the right-hand side is a literal
# Ellipsis placeholder and emb2tok is not defined in the visible notebook.
# NOTE(review): "\\n" is a literal backslash followed by 'n', not a newline.
pairs_ij, vecs = ... # selection of pairs within 2 Å
lines = [f"[{i},{j}]={emb2tok(vecs[k])}"
for k,(i,j) in enumerate(pairs_ij)]
user = "Pairs within 2 Å:\\n" + "\\n".join(lines)

In [2]:
import random

# Pick one random record from the generated prompts file; the context
# manager closes the file handle, which the bare open() call left dangling.
with open("bond_prompts.jsonl") as fin:
    line = random.choice(fin.readlines())

In [None]:
# Parse and display the randomly chosen record.
json.loads(line)

In [3]:
# Load the GEOM test split. weights_only=True matches the other dataset loads
# in this notebook and avoids unpickling arbitrary objects.
geom_test = torch.load('./mydatasets/geom_test.pt', weights_only=True)

In [4]:
# Number of items in the test split.
len(geom_test)

1000

In [10]:
# Load the peptide dataset and (re)load the reduced GEOM dataset.
# NOTE(review): the peptide file is loaded without weights_only=True, unlike
# the other loads — confirm whether it can be loaded in that safer mode.
# NOTE(review): geom_cut.pt is read from './mydatasets/' here but from
# './datasets/' in an earlier cell — confirm the intended location.
pept = torch.load('/home/user12/burov/pept/peptides_no_h_2.pt')
geom = torch.load('./mydatasets/geom_cut.pt', weights_only=True)

In [17]:
# Inspect the edge list and bond orders of the first peptide record.
# NOTE: the key here is 'edge_indices', whereas the GEOM records are read
# with 'edge_index' in the prompt-building cells.
pept[0]['edge_indices'], pept[0]['bond_orders']

([tensor([[ 0,  1],
          [ 1,  2],
          [ 2,  3],
          [ 3,  4],
          [ 4,  5],
          [ 4,  6],
          [ 6,  7],
          [ 7,  8],
          [ 8,  9],
          [ 8, 10],
          [10, 11],
          [ 7, 12],
          [ 3, 13],
          [ 1, 14],
          [ 0,  2],
          [ 0,  3],
          [ 0,  4],
          [ 0,  5],
          [ 0,  6],
          [ 0,  7],
          [ 0,  8],
          [ 0,  9],
          [ 0, 10],
          [ 0, 11],
          [ 0, 12],
          [ 0, 13],
          [ 0, 14],
          [ 1,  3],
          [ 1,  4],
          [ 1,  5],
          [ 1,  6],
          [ 1,  7],
          [ 1,  8],
          [ 1,  9],
          [ 1, 10],
          [ 1, 11],
          [ 1, 12],
          [ 1, 13],
          [ 2,  4],
          [ 2,  5],
          [ 2,  6],
          [ 2,  7],
          [ 2,  8],
          [ 2,  9],
          [ 2, 10],
          [ 2, 11],
          [ 2, 12],
          [ 2, 13],
          [ 2, 14],
          [ 3,  5],
