## Generate Protein Graphs Using AlphaFold

In [None]:
import os
import sys
import tensorflow as tf
import jax
import pandas as pd
import json
import matplotlib.pyplot as plt
import numpy as np
import re
import csv
from tqdm import tqdm

from tensorflow.python.ops.array_ops import sequence_mask
from matplotlib.cbook import sanitize_sequence

# Record which backend JAX will run on: "cpu" when the first local device is a
# CPU, "gpu" for any other platform.
DEVICE = "cpu" if jax.local_devices()[0].platform == 'cpu' else "gpu"
if DEVICE == "gpu":
  print('Running on GPU')
  # Optional: hide the GPU from TensorFlow (left disabled here)
  # tf.config.set_visible_devices([], 'GPU')
else:
  print("WARNING: no GPU detected, will be using CPU")

# Make the local AlphaFold and ColabFold checkouts importable. The paths are
# relative, so they resolve against the current working directory; each one is
# appended only if it is not already on sys.path.
for _repo_dir in ('alphafold', 'ColabFold/beta'):
  if _repo_dir not in sys.path:
    sys.path.append(_repo_dir)

import colabfold as cf
import ColabFold.beta.colabfold_alphafold as cf_af

# Put the helper binaries and scripts under tmp/ on the executable search path.
# PATH may be absent from the environment, so read it with a default instead of
# indexing, and join entries with os.pathsep (";" on Windows, ":" elsewhere)
# rather than a hard-coded ":". The substring check keeps the cell idempotent
# when it is re-run.
_search_path = os.environ.get('PATH', '')
if "tmp/bin" not in _search_path:
  _entries = [_search_path] if _search_path else []
  os.environ['PATH'] = os.pathsep.join(_entries + ["tmp/bin", "tmp/scripts"])

# Ranking by pTMscore is only possible when use_ptm is enabled; otherwise warn
# and fall back to ranking by pLDDT.
# NOTE(review): use_ptm and rank_by are not assigned in any cell shown in this
# notebook; they must already be defined when this cell runs.
if not use_ptm:
  if rank_by == "pTMscore":
    print("WARNING: models will be ranked by pLDDT, 'use_ptm' is needed to compute pTMscore")
    rank_by = "pLDDT"

In [None]:
# CSV listing the targets to fold; machine-specific absolute path, edit to
# match your own checkout.
target_csv = r"D:\Edward\YALE\CPSC\552\immunoai\data\hadrup_viral_data_csv.csv"

In [None]:
# Run AlphaFold (through the ColabFold helpers) once for every row of the
# target CSV.
#
# Columns read from each row: 1 = peptide (2 holds the mutated peptide) and
# 3 = the sequence it is paired with.
# NOTE(review): there is no header handling; if the CSV starts with a header
# row it is processed like any other row — confirm against the file.
# NOTE(review): jobname, homooligomer, IN_COLAB, the MSA settings, the model
# options and rank_by/show_images are not assigned in any cell shown in this
# notebook; they must already be defined when this cell runs.

# Location of the AlphaFold parameter files, shared by every call below.
alphafold_params_dir = 'D:\\Edward\\YALE\\CPSC\\552\\immunoai\\alphafold\\alphafold\\data'

# newline="" is what the csv module asks for so that quoted fields containing
# line breaks are parsed correctly.
with open(target_csv, "r", newline="") as f:
  reader = csv.reader(f)
  for line in tqdm(reader):
    if not line:
      # csv.reader yields an empty list for a blank line; skip it instead of
      # failing on line[1].
      continue

    peptide = line[1] # line[2] for mut_pep
    # Sequence and peptide are joined with ":" (presumably ColabFold's chain
    # separator — TODO confirm); the ":" is stripped again for the job name.
    sequence = line[3] + ":" + peptide

    # The job name is suffixed with (at most) the last 100 characters of the
    # joined sequence.
    I = cf_af.prep_inputs(sequence, jobname + sequence[-100:].replace(":", ""),
                          homooligomer, clean=IN_COLAB)
    mod_I = cf_af.prep_msa(I, msa_method, add_custom_msa, msa_format,
                           pair_mode, pair_cov, pair_qid, TMP_DIR="tmp")

    feature_dict = cf_af.prep_feats(mod_I, clean=IN_COLAB)

    # prep model options
    opt = {"N": len(feature_dict["msa"]),
           "L": len(feature_dict["residue_index"]),
           "use_ptm": use_ptm,
           "use_turbo": use_turbo,
           "num_relax": num_relax,
           "max_recycles": max_recycles,
           "tol": 0.0,
           "num_ensemble": num_ensemble,
           "max_msa_clusters": max_msa_clusters,
           "max_extra_msa": max_extra_msa,
           "is_training": is_training}

    if use_turbo:
      if "runner" in globals():
        # only recompile if options changed; params_loc is passed here as
        # well so that a rebuilt runner uses the same parameter directory as
        # a fresh one instead of the library default
        runner = cf_af.prep_model_runner(opt, old_runner=runner,
                                         params_loc=alphafold_params_dir)
      else:
        runner = cf_af.prep_model_runner(opt, params_loc=alphafold_params_dir)
    else:
      runner = None

    # run alphafold
    cf_af.run_alphafold(feature_dict, opt, runner, num_models, num_samples, subsample_msa,
                        rank_by=rank_by, show_images=show_images,
                        params_loc=alphafold_params_dir)


## Run Inference on Generated Proteins

In [None]:
import os
import argparse
import torch
import numpy as np
from dgl.dataloading import GraphDataLoader

from data import ImmunoPredInferDataset, collate, SplitDataset
from models.mapping import model_map
from procedures import inference

In [None]:
## Settings for the inference run -- fill in with your own values.
# NOTE: "$ROOT" in the paths below is a literal placeholder; nothing in this
# notebook expands it, so replace it with a real directory.

# model parameters
model_name = "HybridModelv2"
model_dir = "$ROOT/results/PropIEDB_PropCancer_ImmunoCancer/"
model_filename = "HybridModelv2.pt"

# Dataset parameters
feature_size = 23
coord_size = 3
full_sequence = True

# Data paths
graph_dir = "$ROOT/data/graph_pyg_IEDB/"
property_dir = "$ROOT/data/complete_score_Mprops_1_2_smoothed_sasa_v2.txt"
hla_path = "$ROOT/data/HLA_27_seqs_csv.csv"

# Use the GPU when PyTorch can see one, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Attribute-style settings object handed to the dataset and to inference().
# It is built directly instead of through an argparse parser: no parser is
# defined in this notebook, and parsing would also pick up the notebook
# kernel's own command line. `model` and `full_sequence` are set here because
# the following cells read config.model and config.full_sequence.
# NOTE(review): the project code may read further config fields that are not
# visible from this notebook; add them here if so.
config = argparse.Namespace(
  model=model_name,
  model_dir=model_dir,
  feature_size=feature_size,
  coord_size=coord_size,
  full_sequence=full_sequence,
)

In [None]:
# Build the inference dataset from the graph, property and HLA inputs and wrap
# it in the "test" split view used for evaluation.
dataset_ft = SplitDataset(
  ImmunoPredInferDataset(
    config,
    graph_directory=graph_dir,
    property_path=property_dir,
    hla_path=hla_path,
  ),
  "test",
  binary=True,
  full=config.full_sequence,
)

# One graph per batch, in dataset order.
test_loader = GraphDataLoader(
  dataset_ft,
  batch_size=1,
  shuffle=False,
  collate_fn=collate,
)

In [None]:
# Locate the saved checkpoint and report where it is being loaded from.
model_path = os.path.join(model_dir, model_filename)
print(f'SAVED MODEL PATH: {model_path}')

# The VAE input width depends on whether full sequences are used.
# NOTE(review): 283 / 11 look like sequence lengths and 21 like an alphabet
# size — confirm against the model definition.
if config.full_sequence:
  input_dim = 283 * 21
else:
  input_dim = 11 * 21

# Instantiate the architecture named in the config, restore its trained
# weights (keeping the existing head) and move it to the selected device.
model = model_map[config.model](vae_input_dim=input_dim, device=device)
model.load_trained(model_path, new_head=False, map_location=device)
model.to(device)

In [None]:
# Run the model over the test loader, keeping the raw predictions.
test_stats = inference(config, model, test_loader, device, return_raw_preds=True)

# Sequences belonging to the evaluated split, selected by the split's indices.
sequences = dataset_ft.raw_full_sequence[np.array(dataset_ft.indices)]

# Write one tab-separated row per sample (predicted probability, true label,
# sequence) into the model directory.
predictions_path = f"{config.model_dir}/predictions_PPI.txt"
prediction_table = np.stack(
  [test_stats["predicted_probs"], test_stats["true_targets"], sequences],
  axis=1,
)
np.savetxt(
  predictions_path,
  prediction_table,
  delimiter="\t",
  fmt="%s",
  header="Predicted Immunogenicity\tTrue Immunogenicity\tSequence",
  comments="",
)
print('DONE')