In [1]:
# from datetime import datetime
from IPython.display import display, Markdown
from datetime import datetime

# Render the notebook header (title, author, creation date, date of this run) as Markdown.
todays_date = datetime.now().date().isoformat()

header_lines = (
    "# Notebook - Antiberty embeddings generation VH_VL seqs - OVA & RBD",
    "Author: Lena Erlach",
    "Created: 2024-03-07",
    f"Last modified: {todays_date}",
)
for header_line in header_lines:
    display(Markdown(header_line))

# Notebook - Antiberty embeddings generation VH_VL seqs - OVA & RBD

Author: Lena Erlach

Created: 2024-03-07

Last modified: 2024-03-21

In [1]:
from IPython.display import display, Markdown
from datetime import datetime

import os
import sys
import torch
import tqdm
import pickle
import pandas as pd
import numpy as np
from antiberty import AntiBERTyRunner
import configparser

sys.path.append("../../src/")
import utils_nb as utils

  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()


### Custom functions

In [2]:
def generate_Antiberty_Seq_embedding(
    seq_HL, name, antiberty, out_folder="embeddings/", save_plm=True
):
    """
    Generate variable-length AntiBERTy embeddings for one antibody and optionally pickle them.

    The heavy chain is always embedded; the light chain is embedded only when it
    is not missing (i.e. ``pd.isna(seq_HL[1])`` is False). The first and last row
    of every embedding returned by the model are stripped (presumably the special
    start/end tokens -- TODO confirm against the AntiBERTy tokenizer).

    params:
    seq_HL: list of 2 str, sequences of the heavy and light chains; the light chain may be NaN/None for VH-only entries
    name: str, seq_id of the sequence, used as file-name prefix
    antiberty: loaded AntiBERTyRunner() object
    out_folder: str, path to the folder where the embeddings should be saved (created with parents if missing)
    save_plm: bool, if True write "<name>_H.p" (and "<name>_L.p" when a light chain is present) into out_folder

    returns:
    embeddings: list [heavy, light] of numpy arrays (light is np.nan for VH-only entries), or None if the embedding step itself failed
    ids_to_drop: list holding `name` if embedding or saving raised an exception, otherwise an empty list
    """

    ids_to_drop = []
    # Bound up front so the final return is valid even when embedding raises.
    embeddings = None

    VH_only = bool(pd.isna(seq_HL[1]))

    try:
        # embed the sequences
        if VH_only:
            # embed() iterates over its argument, so a single sequence has to be
            # wrapped in a list; a bare string would be split into one
            # "sequence" per residue.
            embeddings_r = antiberty.embed([seq_HL[0]])
            embeddings = [embeddings_r[0][1:-1, :].cpu().numpy(), np.nan]
        else:
            embeddings_r = antiberty.embed(seq_HL)
            embeddings = [
                embeddings_r[0][1:-1, :].cpu().numpy(),
                embeddings_r[1][1:-1, :].cpu().numpy(),
            ]

        if save_plm:
            # makedirs also creates missing parent folders and tolerates an existing one
            os.makedirs(out_folder, exist_ok=True)

            # the heavy chain is always saved, the light chain only if it exists
            chains_to_save = [("H", embeddings[0])]
            if not VH_only:
                chains_to_save.append(("L", embeddings[1]))

            for chain_type, embedding in chains_to_save:
                file_name = "{}_{}.p".format(name, chain_type)
                with open(os.path.join(out_folder, file_name), "wb") as fh:
                    pickle.dump(embedding, fh)

    except Exception as err:
        # Best-effort: record the failing id and carry on, but say what went wrong.
        # `Exception` (not a bare except) lets KeyboardInterrupt stop a long run.
        ids_to_drop.append(name)
        print("except: {} ({}: {})".format(name, type(err).__name__, err))

    return embeddings, ids_to_drop

### Data preparation - OVA

In [9]:
# setup parser for the config file (cf. ../../example_config_file.txt)
CONFIG_PATH = "../../config_file.txt"
config = configparser.ConfigParser()
# ConfigParser.read() skips unreadable files silently and returns the list of
# files it parsed; fail here with a clear message instead of a later KeyError.
if not config.read(CONFIG_PATH):
    raise FileNotFoundError("Config file not found: {}".format(CONFIG_PATH))
ROOT_DIR = config["ROOT"]["ROOT_DIR"]

# Set input path Sequences
seq_df_inputPath = os.path.join(ROOT_DIR, config["PATHS"]["SEQ_DF"])


seq_col = "VDJ_VJ_aaSeq"  # column name of the sequence to filter for (VDJ_VJ_aaSeq, VDJ_aaSeq, ...)


# Set input path CamSol measure
camsol_inputPath = os.path.join(
    ROOT_DIR, "data/raw/CamSol/CamSol_intrinsic2023-10-06_VDJ_VJ_aaSeq.txt"
)

# embedding paths for VH_VL embeddings
out_folder = os.path.join(ROOT_DIR, "data/processed/embeddings/Antiberty")


##### Setup the GPU support:
cuda_dev_num = 4  # index of the GPU to use when CUDA is available
if torch.cuda.is_available():
    dev = "cuda:{}".format(cuda_dev_num)
else:
    dev = "cpu"
device = torch.device(dev)
# torch.cuda.set_device() only accepts CUDA devices, so skip it on the CPU fallback
if device.type == "cuda":
    torch.cuda.set_device(device)

In [4]:
# load preprocessed dataframe
seq_df = pd.read_csv(seq_df_inputPath)

# Keep complete sequences only, then drop the row with index label 192, which was
# also ignored in the ESM embeddings. The drop is chained (no inplace=True) so it
# never mutates a filtered slice of the frame that was just read.
# NOTE(review): the previous comment said "129" while the code drops label 192 -- confirm which is intended.
seq_df = (
    seq_df[seq_df.seq_complete == True]
    .drop(192)
    .reset_index(drop=True)
)

# get indices/names and sequences as lists
names = seq_df.seq_id.tolist()
seqs_H = seq_df.VDJ_aaSeq.tolist()
seqs_L = seq_df.VJ_aaSeq.tolist()
# one [heavy, light] pair per row
seqs_HL = [[seq_H, seq_L] for seq_H, seq_L in zip(seqs_H, seqs_L)]

### Embed sequences with antiberty

In [5]:
emb_dict = {}
ids_to_drop = []
# load model
antiberty = AntiBERTyRunner()

# generate embeddings (and pickle them into out_folder)
for seq, name in tqdm.tqdm(zip(seqs_HL, names), total=len(seqs_HL)):
    embeddings, ids_dropped = generate_Antiberty_Seq_embedding(
        seq_HL=seq, name=name, antiberty=antiberty, out_folder=out_folder, save_plm=True
    )
    emb_dict[name] = embeddings
    # ids_dropped is itself a list (empty on success); extend keeps ids_to_drop
    # a flat list of seq_ids instead of a list of mostly-empty lists
    ids_to_drop.extend(ids_dropped)


# mean over embeddings, in the order given by `names`
embeddings = [emb_dict[s] for s in names]
embeddings_m = utils.mean_over_HL(embeddings)

100%|██████████| 3621/3621 [01:04<00:00, 56.44it/s]


In [7]:
### Load the saved embeddings back from disk
input_folder = out_folder

#### Load the pickled VH_VL embeddings
embeddings_loaded = utils.load_pickle_embeddings_VH_VL(
    names=names,
    inputPath=input_folder,
    embedding_type="var",
    file_suffix="",
    verbose=False,
)
# Average the embeddings that were just loaded. The cell previously passed
# `embeddings` (the in-memory result of the generation cell), which left
# `embeddings_loaded` unused and failed on a fresh kernel.
# NOTE(review): assumes load_pickle_embeddings_VH_VL returns the structure
# mean_over_HL expects -- confirm against utils_nb.
embeddings_m = utils.mean_over_HL(embeddings_loaded)

### Load VH embeddings

In [7]:
# Load the embeddings stored with the "_H" file suffix for every name
embeddings_raw = utils.load_pickle_embeddings(names, out_folder, file_suffix="_H")

# Average each embedding along its first axis, then collect the results in one array
per_seq_means = [emb.mean(0) for emb in embeddings_raw]
embeddings_m = np.array(per_seq_means)
embeddings_m.shape

### Data preparation - RBD

In [10]:
# setup parser for the config file
CONFIG_PATH = "../../config_file_RBD.txt"
config = configparser.ConfigParser()
# ConfigParser.read() skips unreadable files silently and returns the list of
# files it parsed; fail here with a clear message instead of a later KeyError.
if not config.read(CONFIG_PATH):
    raise FileNotFoundError("Config file not found: {}".format(CONFIG_PATH))
ROOT_DIR = config["ROOT"]["ROOT_DIR"]

# Set input path Sequences
seq_df_inputPath = os.path.join(ROOT_DIR, config["PATHS"]["SEQ_DF"])


seq_col = "VDJ_VJ_aaSeq"  # column name of the sequence to filter for (VDJ_VJ_aaSeq, VDJ_aaSeq, ...)


# embedding paths for VH_VL embeddings
out_folder = os.path.join(ROOT_DIR, "data/processed/embeddings/RBD/Antiberty")


##### Setup the GPU support:
cuda_dev_num = 4  # index of the GPU to use when CUDA is available
if torch.cuda.is_available():
    dev = "cuda:{}".format(cuda_dev_num)
else:
    dev = "cpu"
device = torch.device(dev)
# torch.cuda.set_device() only accepts CUDA devices, so skip it on the CPU fallback
if device.type == "cuda":
    torch.cuda.set_device(device)

In [8]:
# Read the preprocessed sequence table and keep only the rows flagged as complete
seq_df = pd.read_csv(seq_df_inputPath)
seq_df = seq_df.loc[seq_df["seq_complete"] == True].reset_index(drop=True)

# Pull the ids and the two chain sequences out as plain Python lists
names = seq_df["seq_id"].tolist()
seqs_H = seq_df["VDJ_aaSeq"].tolist()
seqs_L = seq_df["VJ_aaSeq"].tolist()
# One [heavy, light] pair per row
seqs_HL = [[heavy, light] for heavy, light in zip(seqs_H, seqs_L)]

In [12]:
emb_dict = {}
ids_to_drop = []
# load model
antiberty = AntiBERTyRunner()

# generate embeddings (and pickle them into out_folder)
for seq, name in tqdm.tqdm(zip(seqs_HL, names), total=len(seqs_HL)):
    embeddings, ids_dropped = generate_Antiberty_Seq_embedding(
        seq_HL=seq, name=name, antiberty=antiberty, out_folder=out_folder, save_plm=True
    )
    emb_dict[name] = embeddings
    # ids_dropped is itself a list (empty on success); extend keeps ids_to_drop
    # a flat list of seq_ids instead of a list of mostly-empty lists
    ids_to_drop.extend(ids_dropped)


# mean over embeddings, in the order given by `names`
embeddings = [emb_dict[s] for s in names]
embeddings_m = utils.mean_over_HL(embeddings)

100%|██████████| 3593/3593 [01:43<00:00, 34.77it/s]
