In [35]:
from Bio import PDB

parser = PDB.PDBParser()
io = PDB.PDBIO()
struct = parser.get_structure('1a0q','/Users/jihyeonje/Downloads/PDBBind_processed/1a0q/1a0q_protein_processed.pdb')

for model in struct:
    for chain in model:
        for residue in chain:
            for atom in residue:
                x,y,z = atom.get_coord()

In [105]:
import torch
from Bio.PDB import *
import numpy as np

def load_coords_from_pdb(
    pdb,
    atoms,
    method="raw",
    also_bfactors=False,
    normalize_bfactors=True,
):
    """
    Returns array of shape (1, n_res, len(atoms), 3)
    """

    coords = []
    bfactors = []
    if method == "raw":  # Raw numpy implementation, faster than biopdb
        # Indexing into PDB format, allowing XXXX.XXX
        coords_in_pdb = [slice(30, 38), slice(38, 46), slice(46, 54)]
        # Indexing into PDB format, allowing XXX.XX
        bfactor_in_pdb = slice(60, 66)

        with open(pdb, "r") as f:
            resi_prev = 1
            counter = 0
            for l in f:
                l_split = l.rstrip("\n").split()
                if len(l_split) > 0 and l_split[0] == "ATOM" and l_split[2] in atoms:
                    resi = l_split[5]
                    if resi == resi_prev:
                        counter += 1
                    else:
                        counter = 0
                    if counter < len(atoms):
                        xyz = [
                            np.array(l[s].strip()).astype(float) for s in coords_in_pdb
                        ]
                        coords.append(xyz)
                        if also_bfactors:
                            bfactor = np.array(l[bfactor_in_pdb].strip()).astype(float)
                            bfactors.append(bfactor)
                    resi_prev = resi
            coords = torch.Tensor(np.array(coords)).view(1, -1, len(atoms), 3)

    return coords


def get_dmap(pdb, atoms, batched=True, out="torch", device=None):
    """
        Returns a n-dim array of shape (bs, 1, n, n)

    """
    coords = load_coords_from_pdb(pdb, atoms=atoms).view(1, -1, 3)
    coords = coords.contiguous()
    dmaps = torch.cdist(coords, coords).unsqueeze(1)
    return dmaps.detach().cpu().numpy()[0][0]
    

In [202]:
directory = '/Users/jihyeonje/Downloads/PDBBind_processed/'

ligpaths = []
protpaths = []

# iterate over files in
# that directory
for dir in os.listdir(directory):
    if dir !='.DS_Store':
        foldr = os.path.join(directory, dir)
    for i in os.listdir(foldr):
        if i.endswith('.sdf'):
            ligpaths.append(os.path.join(foldr, i))
        elif i.endswith('.pdb'):
            protpaths.append(os.path.join(foldr, i))

In [219]:
len(full_dist)

1596

In [220]:
np.array(full_dist).shape

(1596, 1596)

In [225]:
pth = '/Users/jihyeonje/Downloads/PDBBind_processed/1mx1/1mx1_protein_processed.pdb'
bkbone_atms = ['CA', 'C', 'N']
full_dist = get_dmap(pth, bkbone_atms)
img2 = np.zeros((len(full_dist),len(full_dist),3))
img2[:,:,0] = full_dist
img2[:,:,1] = full_dist
img2[:,:,2] = full_dist

In [214]:
np.array(full_dist.append(get_dmap(pth, a))).shape

()

In [204]:
for i in protpaths[:99]:
    full_dist = []
    bkbone_atms = [['CA'], ['C'], ['N']]
    for a in bkbone_atms:
        full_dist.append(get_dmap(i, a))
    print(i)
    print(np.asarray(full_dist).shape)

/Users/jihyeonje/Downloads/PDBBind_processed/6ugp/6ugp_protein_processed.pdb
(3, 257, 257)
/Users/jihyeonje/Downloads/PDBBind_processed/4hvb/4hvb_protein_processed.pdb
(3, 830, 830)
/Users/jihyeonje/Downloads/PDBBind_processed/2qki/2qki_protein_processed.pdb
(3, 631, 631)
/Users/jihyeonje/Downloads/PDBBind_processed/4rdn/4rdn_protein_processed.pdb
(3, 149, 149)
/Users/jihyeonje/Downloads/PDBBind_processed/3bwk/3bwk_protein_processed.pdb
(3, 476, 476)
/Users/jihyeonje/Downloads/PDBBind_processed/1bnm/1bnm_protein_processed.pdb
(3, 257, 257)
/Users/jihyeonje/Downloads/PDBBind_processed/4mo4/4mo4_protein_processed.pdb
(3, 345, 345)
/Users/jihyeonje/Downloads/PDBBind_processed/5yib/5yib_protein_processed.pdb
(3, 328, 328)
/Users/jihyeonje/Downloads/PDBBind_processed/3wzj/3wzj_protein_processed.pdb
(3, 257, 257)
/Users/jihyeonje/Downloads/PDBBind_processed/5o4s/5o4s_protein_processed.pdb
(3, 112, 112)
/Users/jihyeonje/Downloads/PDBBind_processed/4y7r/4y7r_protein_processed.pdb
(3, 304, 304)

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (3,) + inhomogeneous part.

In [156]:

sdf_path = '/Users/jihyeonje/Downloads/PDBBind_processed/1a0q/1a0q_ligand.sdf'
suppl = Chem.SDMolSupplier(sdf_path)
smi = Chem.MolToSmiles(suppl[0])


In [186]:
ligtxts = []
for p in ligpaths:
    suppl = Chem.SDMolSupplier(p, sanitize=False)
    smi = Chem.MolToSmiles(suppl[0])

    ligtxts.append(smi)

In [160]:
import json
from PIL import Image

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from rdkit import Chem

import clip
from transformers import CLIPProcessor, CLIPModel

In [159]:
json_path = '/Users/jihyeonje/Downloads/archive/train_data.json'
image_path = '/Users/jihyeonje/Downloads/archive/images/train/'

with open(json_path, 'r') as f:
    input_data = []
    c = 0
    for line in f:
        obj = json.loads(line)
        input_data.append(obj)
        c+=1
        if c>49: break

In [162]:
# Load the CLIP model and processor
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")


# Choose computation device
device = "cuda:0" if torch.cuda.is_available() else "cpu" 


In [199]:
# Load pre-trained CLIP model
model, preprocess = clip.load("ViT-B/32", device=device, jit=False)

# Define a custom dataset
class image_title_dataset():
    def __init__(self, list_image_path,list_txt):
        # Initialize image paths and corresponding texts
        self.image_path = list_image_path
        # Tokenize text using CLIP's tokenizer
        self.title  = clip.tokenize(list_txt, context_length=77, truncate=True)

    def __len__(self):
        return len(self.title)

    def __getitem__(self, idx):
        # Preprocess image using CLIP's preprocessing function
        full_dist = []
        bkbone_atms = [['CA'], ['C'], ['N']]
        for a in bkbone_atms:
            full_dist.append(get_dmap(self.image_path[idx], a))
        
        transform = T.ToPILImage()
        image = preprocess(transform(torch.tensor(full_dist)))
        title = self.title[idx]
        return image, title


In [None]:
full_dist = []
bkbone_atms = [['CA'], ['C'], ['N']]
for a in bkbone_atms:
    full_dist.append(get_dmap(pdb_path, a))
np.asarray(full_dist).shape

In [None]:
import torchvision
import torchvision.transforms as T


In [131]:
# import required module
import os
# assign directory
directory = '/Users/jihyeonje/Downloads/PDBBind_processed/'

ligpaths = []
protpaths = []

# iterate over files in
# that directory
for dir in os.listdir(directory):
    if dir !='.DS_Store':
        foldr = os.path.join(directory, dir)
    for i in os.listdir(foldr):
        if i.endswith('.sdf'):
            ligpaths.append(os.path.join(foldr, i))
        elif i.endswith('.pdb'):
            protpaths.append(os.path.join(foldr, i))


In [200]:

dataset = image_title_dataset(protpaths[:49], ligtxts[:49])
train_dataloader = DataLoader(dataset, batch_size=10, shuffle=True) #Define your own dataloader

# Function to convert model's parameters to FP32 format
def convert_models_to_fp32(model): 
    for p in model.parameters(): 
        p.data = p.data.float() 
        p.grad.data = p.grad.data.float() 


In [197]:

if device == "cpu":
  model.float()

# Prepare the optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=5e-5,betas=(0.9,0.98),eps=1e-6,weight_decay=0.2) # the lr is smaller, more safe for fine tuning to new dataset


# Specify the loss function
loss_img = nn.CrossEntropyLoss()
loss_txt = nn.CrossEntropyLoss()

In [201]:
from tqdm import tqdm
# Train the model
num_epochs = 1
for epoch in range(num_epochs):
    pbar = tqdm(train_dataloader, total=len(train_dataloader))
    for batch in pbar:
        optimizer.zero_grad()

        images,texts = batch 
        
        images= images.to(device)
        texts = texts.to(device)

        # Forward pass
        logits_per_image, logits_per_text = model(images, texts)

        # Compute loss
        ground_truth = torch.arange(len(images),dtype=torch.long,device=device)
        total_loss = (loss_img(logits_per_image,ground_truth) + loss_txt(logits_per_text,ground_truth))/2

        # Backward pass
        total_loss.backward()
        if device == "cpu":
            optimizer.step()
        else : 
            convert_models_to_fp32(model)
            optimizer.step()
            clip.model.convert_weights(model)

        pbar.set_description(f"Epoch {epoch}/{num_epochs}, Loss: {total_loss.item():.4f}")

Epoch 0/1, Loss: 2.4980: 100%|██████████| 5/5 [00:18<00:00,  3.73s/it]


In [9]:
model

CLIP(
  (visual): VisionTransformer(
    (conv1): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32), bias=False)
    (ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (transformer): Transformer(
      (resblocks): Sequential(
        (0): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        (1): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          