In [None]:
!sinfo -O Nodehost,Gres:.30,GresUsed:.45
!salloc -N 1 --cpus-per-task=4 -p CS177h --gres=gpu:TeslaM4024GB:1
!salloc -N 1 --cpus-per-task=8 -p CS177h --gres=gpu:TeslaM4024GB:2
!salloc -N 1 --cpus-per-task=12 -p CS177h --gres=gpu:TeslaM4024GB:3
!salloc -N 1 --cpus-per-task=16 -p CS177h --gres=gpu:TeslaM4024GB:4
!jupyter-lab --no-brows --ip=0.0.0.0 --port=7774 

In [1]:
import os
import sys
from pathlib import Path

# Root of the project, relative to the notebook's working directory.
# Alternative location (kept for reference):
#   Path() / "Project" / "ShanghaiTech-CS177H-MSA-Scoring"
PROJECT_PATH = Path("perl5", "project")

# Put the project directory on the import path so its modules can be imported.
sys.path.append(str(PROJECT_PATH))

from tqdm import tqdm
import numpy as np
import torch
# Fix every random number generator used here (NumPy, PyTorch CPU, PyTorch CUDA)
# to the same seed so repeated runs start from the same state.
SEED = 42
np.random.seed(SEED)
for _seed_torch in (torch.manual_seed, torch.cuda.manual_seed, torch.cuda.manual_seed_all):
    _seed_torch(SEED)

# cuDNN: turn off the benchmark autotuner and request deterministic algorithms.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

import esm

# Input/output locations, all resolved relative to PROJECT_PATH.
# NOTE(review): "EMBDEDDINGS" is misspelled; the name is kept as-is because
# other cells reference it.
DATASET_PATH     = PROJECT_PATH.joinpath("dataset", "CASP14_fm")
MODEL_PATH       = PROJECT_PATH.joinpath("model")
EMBDEDDINGS_PATH = PROJECT_PATH.joinpath("embeddings")
TRANSFORMER_PATH = PROJECT_PATH.joinpath("esm_msa1b_t12_100M_UR50S.pt")

# Hyperparameters
# Depth limit handed to greedy_subsampling for each MSA.
MAX_DEPTH = 256

In [2]:
# Show the GPU(s) visible to this kernel and their current memory usage.
!nvidia-smi

Mon Dec 12 16:51:59 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.74       Driver Version: 470.74       CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA TITAN V      On   | 00000000:18:00.0 Off |                  N/A |
| 28%   28C    P8    24W / 250W |      0MiB / 12066MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
# Load the MSA Transformer and its alphabet.
# Use the local checkpoint when it actually exists on disk; otherwise fall back
# to esm's pretrained loader. The previous truthiness check on TRANSFORMER_PATH
# could never take the fallback, because a Path object is always truthy.
if TRANSFORMER_PATH and os.path.isfile(TRANSFORMER_PATH):
    msa_transformer, msa_alphabet = esm.pretrained.load_model_and_alphabet_local(TRANSFORMER_PATH)
else:
    msa_transformer, msa_alphabet = esm.pretrained.esm_msa1b_t12_100M_UR50S()

# Inference only: switch to eval mode and move the model to the GPU.
msa_transformer = msa_transformer.eval().cuda()
# Callable that turns MSA data into the token batch consumed by the model.
msa_batch_converter = msa_alphabet.get_batch_converter()

In [4]:
# Load MSA data: collect every .a3m file of the train and test splits.
MSA_SUFFIX = '.a3m'


def _list_msas(directory):
    """Return one ``{'path', 'name'}`` record per ``.a3m`` file in ``directory``.

    ``path`` is ``directory / filename`` and ``name`` is the filename without the
    ``.a3m`` suffix. Entries are sorted by filename because ``os.listdir`` returns
    them in arbitrary order, which would make the processing order differ
    between runs and machines.
    """
    return [
        {'path': directory / filename, 'name': filename[:-len(MSA_SUFFIX)]}
        for filename in sorted(os.listdir(directory))
        if filename.endswith(MSA_SUFFIX)
    ]


train_dataset_path = DATASET_PATH / "train_filtered"
test_dataset_path  = DATASET_PATH / "test_filtered"
train_msas = _list_msas(train_dataset_path)
test_msas  = _list_msas(test_dataset_path)
msas = train_msas + test_msas

print(f"# of MSAs in dataset: {len(msas)}")

# of MSAs in dataset: 2850


In [6]:
from dataset import read_msa
from subsampling import greedy_subsampling

# Extract one embedding per MSA with the MSA Transformer and store it as
# EMBDEDDINGS_PATH / "<name>.pt". `padding_size` tracks the largest first
# dimension seen across all embeddings.

# torch.save does not create missing directories, so make sure the target exists.
EMBDEDDINGS_PATH.mkdir(parents=True, exist_ok=True)

padding_size = 0

for msa in tqdm(msas):
    msa_name, msa_path = msa['name'], msa['path']
    embedding_path = EMBDEDDINGS_PATH / (msa_name + ".pt")

    if embedding_path.exists():
        # Resume support: an interrupted run does not recompute finished MSAs.
        # The saved tensor is loaded only to keep `padding_size` correct.
        # NOTE: clear the embeddings directory after changing MAX_DEPTH or the
        # model, otherwise stale embeddings are reused.
        embedding = torch.load(embedding_path, map_location="cpu")
    else:
        msa_data = greedy_subsampling(read_msa(msa_path), MAX_DEPTH)
        _, _, msa_batch_tokens = msa_batch_converter(msa_data)

        with torch.no_grad():
            # Release cached GPU memory before each forward pass.
            torch.cuda.empty_cache()
            results = msa_transformer(msa_batch_tokens.cuda(non_blocking = True), repr_layers=[12])

        # Layer-12 representation, first batch element, index 0 on the second axis
        # (presumably the first row of the MSA - TODO confirm), copied to the CPU.
        embedding = results['representations'][12][0, 0, :, :].cpu().clone()

        # Write to a temporary file first and rename afterwards, so an
        # interruption can never leave a truncated "<name>.pt" behind.
        partial_path = EMBDEDDINGS_PATH / (msa_name + ".pt.tmp")
        torch.save(embedding, partial_path)
        partial_path.replace(embedding_path)

    padding_size = max(padding_size, embedding.shape[0])

print(f"Shape of a feature map after padding: {padding_size} x {msa_transformer.args.embed_dim}")

 47%|██████████████████████████████████████████████████████████████████▍                                                                          | 682/1447 [1:05:54<57:34,  4.52s/it]

In [6]:
# Count the embedding files (*.pt) currently present in the output directory.
n_extracted = sum(1 for entry in os.listdir(EMBDEDDINGS_PATH) if entry.endswith('.pt'))
print(f"# of embedding extracted: {n_extracted}")

# of embedding extracted: 2850
