In [1]:
import os

# All paths below (data/, models/, output/) are relative to the repository root,
# one level above this notebook. Only step up when the data directory is not
# already reachable, so re-running this cell does not climb further up the tree.
if not os.path.isdir("data/pdbs"):
    os.chdir("../")

In [2]:
import os
import sys
import pandas as pd

import antifold.main as antifold

# Enable IPython's autoreload extension; mode 2 reloads imported modules before
# each cell execution, so edits to the antifold source are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

#### Sample downloaded antibody in Notebook

In [3]:
# Download an Antibody PDB
!wget https://files.rcsb.org/download/6Y1L.pdb -O data/pdbs/6Y1L.pdb

# Define the PDB and chains in DataFrame
pdb_dir = "data/pdbs"
df_csv_pdbs = pd.DataFrame({
    "pdb": ["6Y1L"],
    "Hchain": ["H"],
    "Lchain": ["L"],
})

df_csv_pdbs

--2023-10-04 16:34:55--  https://files.rcsb.org/download/6Y1L.pdb
Løser files.rcsb.org (files.rcsb.org)... 128.6.159.245, 132.249.213.241, 128.6.159.100, ...
Tilslutter files.rcsb.org (files.rcsb.org)|128.6.159.245|:443... forbundet.
HTTP forespørgsel sendt, afventer svar... 200 OK
Længde: uspecificeret [application/octet-stream]
Gemmer til: 'data/pdbs/6Y1L.pdb'

data/pdbs/6Y1L.pdb      [   <=>              ] 632,81K   757KB/s    in 0,8s    

2023-10-04 16:34:56 (757 KB/s) - 'data/pdbs/6Y1L.pdb' gemt [648000]



Unnamed: 0,pdb,Hchain,Lchain
0,6Y1L,H,L


In [10]:
# Load the model checkpoint from the local models/ directory
model = antifold.load_IF1_model("models/model.pt")
# Fix the seed (42) before any prediction or sampling is run.
# NOTE(review): presumably seeds all RNGs used by antifold — confirm in antifold.main
antifold.seed_everything(42)

# Build the dataset and dataloader from the chain table (df_csv_pdbs) and the
# PDB directory defined above, one structure per batch
dataset, dataloader = antifold.get_dataset_dataloader(
    df_csv_pdbs, pdb_dir, batch_size=1
)

# Run the model over the dataloader -> list of predictions
predictions_list = antifold.dataset_dataloader_to_predictions_list(
    model, dataset, dataloader, batch_size=1
)
# Convert the predictions into a list of DataFrames (df_probs), indexed per structure below
df_probs_list = antifold.predictions_list_to_df_probs_list(
    predictions_list, dataset, dataloader
)

[2023-10-04 16:37:53,175] Loading checkpoint from models/model.pt...
[2023-10-04 16:37:53,703] Loaded model to cpu.
[2023-10-04 16:37:53,849] Reading in DataFrame
[2023-10-04 16:37:53,851] Populating 1 PDBs from                      pdb Hchain Lchain            pdb_path
data/pdbs/6Y1L.pdb  6Y1L      H      L  data/pdbs/6Y1L.pdb
[2023-10-04 16:37:53,996] Predicting batch 1/1: PDBs 1-1 out of 1 total


In [191]:
# Select the DataFrame of the first structure (only one PDB was loaded here)
df_probs = df_probs_list[0]
# Keep the DataFrame's `name` attribute; it is used later to build the output FASTA filename.
# NOTE(review): presumably the PDB identifier attached by antifold — confirm
_pdb = df_probs.name

# Display: one row per residue (predicted / original amino acid, PDB position
# and chain) followed by one score column per amino acid
df_probs

Unnamed: 0,aa_pred,aa_orig,pdb_res,pdb_posins,pdb_chain,A,C,D,E,F,...,N,P,Q,R,S,T,V,W,Y,X
0,M,V,V,2,H,0.486675,-1.113065,-0.727330,-0.741292,-1.155323,...,-0.867767,0.571745,-0.996771,-0.742380,0.493624,0.416953,1.612888,-2.565651,-1.718953,-8.538675
1,Q,Q,Q,3,H,-2.017024,-4.271091,-0.593897,3.192484,-4.352402,...,1.972357,-4.768906,7.693972,3.299491,-1.335231,1.297512,0.384801,-5.161894,-2.718271,-18.216274
2,L,L,L,4,H,-0.825191,0.238605,-4.453015,0.761889,2.582538,...,-4.164991,-1.140410,1.002587,-1.798775,-3.989313,-4.316673,1.494307,-0.147141,-0.531081,-17.760847
3,Q,Q,Q,5,H,-0.712811,-4.975933,-1.825593,2.407209,-4.805994,...,0.885712,-11.411376,7.806160,3.045987,-0.277990,0.095043,3.021247,-3.329882,-4.493088,-25.328556
4,E,E,E,6,H,1.144224,-0.765152,1.680552,11.288973,-6.852089,...,-0.219154,-0.429449,6.471401,-1.455817,-1.167288,-0.529764,0.692641,-4.192106,-6.145962,-13.564919
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
435,E,E,E,215,L,1.638605,-1.960570,2.277725,4.981476,-2.797938,...,1.228500,-3.078181,2.826369,-1.103499,0.812021,0.826898,1.288635,-3.017800,-3.057463,-21.369175
436,F,C,C,216,L,1.332236,-0.348085,-2.454163,-0.765010,3.746364,...,-2.187080,-5.527626,-1.866386,-2.965662,1.166681,0.846272,2.524161,0.374343,0.460232,-14.798580
437,V,I,I,217,L,0.562388,-3.469298,-3.206690,1.033772,-1.329298,...,-0.867768,-8.379012,-0.728974,0.611854,0.530798,3.349487,3.751574,-3.134093,-3.978726,-18.034561
438,G,D,D,218,L,2.982358,-1.252914,2.150190,0.804460,-1.107985,...,1.003707,-0.140552,1.056424,-0.660122,2.000783,-1.521522,-0.915143,-2.357224,-1.602839,-13.196360


In [192]:
def visualize_mutations(orig, mut, chain):
    """Print a position-by-position comparison of an original and a mutated chain.

    Three tab-separated lines are printed, followed by a blank line:
    a mismatch track ("X" where the residues differ, "_" where they agree),
    the original sequence, and the mutated sequence.

    Args:
        orig: Original residues, one single-letter code per position
            (string, list, or array-like of strings).
        mut: Mutated residues, same length and type conventions as ``orig``.
        chain: Chain label shown in the line prefixes, e.g. "H" or "L".

    Raises:
        ValueError: If ``orig`` and ``mut`` differ in length, since the
            positions could then not be aligned.
    """
    if len(orig) != len(mut):
        raise ValueError(
            f"Sequence lengths differ for chain {chain}: {len(orig)} vs {len(mut)}"
        )

    # Compare residue by residue so plain strings and lists work as well as arrays
    mismatches = "".join(
        "X" if res_orig != res_mut else "_" for res_orig, res_mut in zip(orig, mut)
    )
    print(f"Mismatches:\t{mismatches}")
    # No trailing ")" after the sequence (the original f-string printed a stray one)
    print(f"Original {chain}:\t{''.join(orig)}")
    print(f"Mutated {chain}:\t{''.join(mut)}")
    print()

In [193]:
# Mutate positions 112/113/114/115 in CDR3, and all of CDR1 and CDR2
positions = [112, 113, 114, 115]
imgt_regions = ["CDR1", "CDR2", positions]

# Sample regions with a temperature of 0.50
H_mut, L_mut, mutation_df = antifold.sample_new_sequences_CDR_HL(
    df_probs,                       # DataFrame with per-residue predictions
    t=0.50,                         # Sampling temperature
    imgt_regions=imgt_regions,      # Regions / positions to sample
    return_mutation_df=True,        # Also return a DataFrame with the mutations only
    limit_expected_variation=False, # Whether to only mutate as many positions as are expected from the temperature (disabled here)
    verbose=True,
)

# Visualize mutations: compare the original and sampled sequences per chain
H_orig, L_orig = antifold.get_df_seqs_HL(df_probs)
visualize_mutations(H_orig, H_mut, chain="H")
visualize_mutations(L_orig, L_mut, chain="L")

# Write the sampled heavy/light sequences to a FASTA file named after the structure
antifold.write_HL_sequences(f"output/{_pdb}_mutated__1.fasta", H_mut, L_mut)

# Mutations only dataframe
mutation_df

Sampled 0 / 52 new CDR residues vs top predicted
Sampled 5 / 52 new CDR residues vs original
Mismatches:	_________________________X__X_X_______________________________________________________________________________________________________________________________________________________________________________________________
Original H:	VQLQESGPGLVKPSETLSLTCAVSGYSISSGYYWGWIRQPPGKGLEWIGSIYHSGSTYYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCAGLTQSSHNDANWGQGTLVTVSSASTKGPSVFPLAPSSKSTSGGTAALGCLVKDYFPEPVTVSWNSGALTSGVHTFPAVLQSSGLYSLSSVVTVPSSSLGTQTYICNVNHKPSNTKVDKKVEPKSCL)
Mutated H:	VQLQESGPGLVKPSETLSLTCAVSGASITSSYYWGWIRQPPGKGLEWIGSIYHSGSTYYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCAGLTQSSHNDANWGQGTLVTVSSASTKGPSVFPLAPSSKSTSGGTAALGCLVKDYFPEPVTVSWNSGALTSGVHTFPAVLQSSGLYSLSSVVTVPSSSLGTQTYICNVNHKPSNTKVDKKVEPKSCL

Mismatches:	_______________________________________________________________________________________________________________XX____________________________________________________________________________

Unnamed: 0,aa_orig,aa_sampled,aa_pred,pdb_posins,pdb_chain
25,Y,A,A,27,H
28,S,T,T,30,H
30,G,S,S,32,H
333,A,V,V,113,L
334,A,G,G,114,L


#### Predict using script

In [7]:
# Predict and save results for every structure listed in a CSV file

# Load model. The loader is called through the imported `antifold` module with an
# explicit checkpoint path: neither a bare `load_IF1_model` nor an `args`
# namespace is defined in this notebook, so the previous call failed on a fresh kernel.
model = antifold.load_IF1_model("models/model.pt")

# Predict with CSV on folder of solved (SAbDab) structures;
# one prediction CSV per structure is written to out_dir
_ = antifold.predict_and_save(
    model=model,
    csv_pdbs="data/example_pdbs.csv",
    pdb_dir="data/pdbs",
    out_dir="output/",
    batch_size=1,
)

[2023-10-04 15:45:10,590] Loading checkpoint from models/model.pt...
[2023-10-04 15:45:11,330] Loaded model to cpu.
[2023-10-04 15:45:11,401] 
Predicting PDBs from CSV file: data/example_pdbs.csv
[2023-10-04 15:45:11,402] Saving prediction CSVs to output/
[2023-10-04 15:45:11,407] Populating 3 PDBs from data/example_pdbs.csv
[2023-10-04 15:45:11,579] Predicting batch 1/3: PDBs 1-1 out of 3 total
[2023-10-04 15:45:23,577] Predicting batch 2/3: PDBs 2-2 out of 3 total
[2023-10-04 15:45:34,868] Predicting batch 3/3: PDBs 3-3 out of 3 total
[2023-10-04 15:45:41,909] Saving 3 CSVs to output/
[2023-10-04 15:45:41,909] Writing predictions for 6y1l_imgt to output//6y1l_imgt.csv
[2023-10-04 15:45:41,923] Writing predictions for 8ee8_imgt to output//8ee8_imgt.csv
[2023-10-04 15:45:41,936] Writing predictions for C143_immunebuilder to output//C143_immunebuilder.csv
