# **IgLM**: Generative language modeling for antibody design

Official notebook for [IgLM](https://www.biorxiv.org/content/10.1101/2021.12.13.472419v2), a generative language model for antibody sequence generation and infilling.  The code, data, and weights for this work are made available for non-commercial use. For commercial inquiries, please contact `dmalon11[at]jhu.edu`.

In [None]:
#@title Configure generation settings then press `Runtime` -> `Run all`

# Configuration cell: defines the run identifier, the output directory and
# every user-tunable option. The `#@param` / `#@markdown` annotations are Colab
# form directives, so each annotated assignment must stay on a single line.

import os
import sys
import uuid

# "<major>.<minor>" of the running interpreter, e.g. "3.10".
python_version = f"{sys.version_info.major}.{sys.version_info.minor}"

# Random six-character hex id for this run; it doubles as the output directory
# name and as the prefix of the generated file names.
job_name = uuid.uuid4().hex[:6]
pred_dir = job_name
os.makedirs(pred_dir, exist_ok=True)

# Conditioning options for the language model.
species = 'human' #@param ["human", "mouse", "camel", "rat", "rabbit", "rhesus"]
chain = 'heavy' #@param ["heavy", "light"]

#@markdown IgLM can be used to generate full-length antibody sequences or infill regions of existing sequences.
mode = 'generate' #@param ["generate", "infill"]
num_sequences = 20 #@param {type:"integer"}

#@markdown Predict structures with IgFold.
predict_structures = True #@param {type:"boolean"}

#@markdown **Full-length generation**

#@markdown Initial residues may be provided to prompt sequence generation. This is useful to ensure full-length sequences are generated.
# `prompt` is only consulted when `use_prompt` is True and mode is "generate".
use_prompt = False #@param {type:"boolean"}
prompt = "EVQ" #@param {type:"string"}

#@markdown **Sequence infilling**

#@markdown For sequence infilling, such as CDR loop generation, a parent sequence must be provided with specific residue indices to infill.

# Only used when mode is "infill". The two indices are handed to IgLM as
# `infill_range=(infill_start, infill_end)`.
# NOTE(review): presumably 0-based with an exclusive end, like a Python slice;
# confirm against the IgLM documentation.
parent_sequence = "EVQLVESGGGLVQPGGSLRLSCAASGFNIKEYYMHWVRQAPGKGLEWVGLIDPEQGNTIYDPKFQDRATISADNSKNTAYLQMNSLRAEDTAVYYCARDTAAYFDYWGQGTLVTVS" #@param {type:"string"}
infill_start = 98 #@param {type:"integer"}
infill_end = 106 #@param {type:"integer"}

print(f"Running job {job_name}")

In [None]:
#@title Install dependencies

# Installs IgLM (and optionally IgFold) once per runtime. A marker file is
# written after a successful install so re-running the cell skips the work.
# The marker is only written when every pip command exits with status 0;
# a failed install is therefore retried on the next run instead of being
# silently recorded as done.

PYTHON_VERSION = python_version

# Pinned CUDA 11.3 build of PyTorch, repeated on each pip call so that
# installing iglm/igfold does not pull in a different torch version.
torch_string = "torch==1.11.0+cu113 torchvision==0.12.0+cu113 -f https://download.pytorch.org/whl/torch_stable.html"


def _run_install_commands(commands):
  """Run every shell command in order; return True only if all succeeded.

  All commands are attempted even after a failure (best effort), and each
  failing command is reported on stdout.
  """
  all_ok = True
  for command in commands:
    status = os.system(command)
    if status != 0:
      print(f"WARNING: command failed (status {status}): {command}")
      all_ok = False
  return all_ok


def _write_marker(path):
  """Create an empty marker file recording that an install completed."""
  with open(path, "a"):
    pass


if not os.path.isfile("IGLM_READY"):
  print("installing iglm...")
  iglm_ok = _run_install_commands([
      f"pip3 install {torch_string}",
      f"pip install 'iglm' {torch_string}",
      "pip install -q --no-warn-conflicts 'py3Dmol>=2.0.1' matplotlib seaborn",
  ])
  if iglm_ok:
    _write_marker("IGLM_READY")
  else:
    print("WARNING: iglm installation did not complete; it will be retried on the next run.")

if predict_structures and not os.path.isfile("IGFOLD_READY"):
  print("installing igfold...")
  igfold_ok = _run_install_commands([
      f"pip install 'igfold>=0.3.0' {torch_string}",
  ])
  if igfold_ok:
    _write_marker("IGFOLD_READY")
  else:
    print("WARNING: igfold installation did not complete; it will be retried on the next run.")

In [None]:
#@title Generate sequences

# Runs IgLM in the selected mode, prints the resulting sequences and writes
# them to <pred_dir>/all_sequences.fasta. Defines `generated_seqs` and
# `chain_id`, which the structure-prediction cell reads.

from iglm import IgLM
from tqdm import tqdm

# Conditioning tags passed to IgLM, keyed by the form selections.
species_tokens = {
    "human": "[HUMAN]",
    "mouse": "[MOUSE]",
    "camel": "[CAMEL]",
    "rat": "[RAT]",
    "rabbit": "[RABBIT]",
    "rhesus": "[RHESUS]",
}
chain_tokens = {
    "heavy": "[HEAVY]",
    "light": "[LIGHT]",
}

species_token = species_tokens[species]
chain_token = chain_tokens[chain]

iglm = IgLM()
if mode == "generate":
  # Work on a cell-local name instead of overwriting the form variable
  # `prompt`: overwriting it with None made a second run of this cell fail
  # (len(None)) when `use_prompt` was enabled.
  prompt_sequence = None
  if use_prompt and prompt:
    # Whitespace-only input is treated the same as no prompt.
    prompt_sequence = prompt.strip().upper() or None
  generated_seqs = iglm.generate(
      chain_token,
      species_token,
      prompt_sequence=prompt_sequence,
      num_to_generate=num_sequences,
  )
elif mode == "infill":
  generated_seqs = iglm.infill(
      parent_sequence,
      chain_token,
      species_token,
      infill_range=(infill_start, infill_end),
      num_to_generate=num_sequences,
  )
else:
  # Fail here with a clear message rather than with a NameError on
  # `generated_seqs` further down.
  raise ValueError(f"Unknown mode {mode!r}; expected 'generate' or 'infill'")

# Zero-padding width for sequence indices.
# NOTE(review): this is num_sequences // 10, not the number of digits; it is
# kept unchanged because the PDB file names are built with the same expression.
index_width = num_sequences // 10

print("\nSequences")
for i, sequence in enumerate(generated_seqs):
  print(f"{str(i).zfill(index_width)}\t{sequence}")

fasta_file = os.path.join(pred_dir, "all_sequences.fasta")
# Single-letter chain label used as the key when folding each sequence.
chain_id = "H" if chain == "heavy" else "L"
with open(fasta_file, "w") as f:
  for i, sequence in enumerate(generated_seqs):
    f.write(f">{job_name}_{str(i).zfill(index_width)}\n{sequence}\n")

In [None]:
#@title Predict structures

# Folds every generated sequence with IgFold and writes one PDB per sequence
# into `pred_dir`. Relies on `generated_seqs`, `chain_id` and `tqdm` from the
# sequence-generation cell.

if predict_structures:
  import contextlib

  # Everything IgFold prints to stdout (import, model loading, folding) is
  # discarded; the tqdm progress bar is the only visible output.
  with open(os.devnull, 'w') as devnull:
    with contextlib.redirect_stdout(devnull):
      from igfold import IgFoldRunner

      igfold = IgFoldRunner(num_models=1)

      # Use the actual number of sequences for the progress-bar total.
      for i, sequence in tqdm(enumerate(generated_seqs), total=len(generated_seqs)):
        pdb_file = os.path.join(pred_dir, f"{job_name}_{str(i).zfill(num_sequences // 10)}.pdb")
        igfold.fold(
            pdb_file,
            sequences={chain_id: sequence},
            do_refine=False,
            do_renum=False,
        )

        # The original cell deleted a .fasta file next to each PDB after
        # folding (presumably written by IgFold; not verified here). Swap only
        # the extension, use a cell-local name so the notebook-level
        # `fasta_file` is not overwritten, and delete without a shell.
        sidecar_fasta = os.path.splitext(pdb_file)[0] + ".fasta"
        if os.path.isfile(sidecar_fasta):
          os.remove(sidecar_fasta)
else:
  print("Not predicting structures")

In [None]:
#@title Download results

#@markdown Download zip file containing generated sequences (and structures). If download fails, results are also accessible from file explorer on the left panel of the notebook.

# Packs the whole output directory into "<job_name>.result.zip" in the current
# working directory and triggers a browser download of it.

import locale
import shutil

from google.colab import files

# Kept from the original cell.
# NOTE(review): presumably a workaround for Colab shell commands that require
# a UTF-8 locale; confirm whether it is still needed.
locale.getpreferredencoding = lambda: "UTF-8"

# Build the archive in Python instead of `!zip ... &> /dev/null`, so a failure
# raises here instead of being discarded. Entries are stored under
# "<pred_dir>/...", matching the layout the zip command produced, and an
# existing archive of the same name is replaced.
shutil.make_archive(f"{job_name}.result", "zip", root_dir=".", base_dir=pred_dir)
files.download(f"{job_name}.result.zip")