In [1]:
# Mount Google Drive at /content/drive; later cells read inputs from and write
# results to paths under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Install Biopython into the kernel's environment; a later cell imports
# `SeqIO` from `Bio` to read and rewrite the FASTA output.
%pip install biopython

In [None]:
!wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
!chmod +x Miniconda3-latest-Linux-x86_64.sh
!./Miniconda3-latest-Linux-x86_64.sh -b -f -p /usr/local
!git clone https://github.com/dauparas/ProteinMPNN
%cd ./ProteinMPNN
!conda create --name mpnn -y

In [None]:
%%bash
# Activate the `mpnn` conda environment and install PyTorch, torchvision,
# torchaudio and cudatoolkit 11.3 into it from the `pytorch` channel.
source activate mpnn
conda install pytorch torchvision torchaudio cudatoolkit=11.3 -c pytorch -y

In [None]:
import os
import shutil
import glob
import json
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import uuid
from datetime import datetime
import re
import torch
from time import time

# Running metadata table for the fixed-motif ProteinMPNN runs: read the CSV
# when it already exists, otherwise start from an empty frame. Nothing is
# written to disk here.
meta_data_filepath = "/content/drive/MyDrive/Generative_Models/conditional_generation/metadata_mpnn_fixed.csv"

metadata_found = os.path.exists(meta_data_filepath)
all_metadata_df = pd.read_csv(meta_data_filepath) if metadata_found else pd.DataFrame()
print("Existing generation metadata read in." if metadata_found else "Created mpnn metadata dataframe")

Existing generation metadata read in.


In [None]:
import os
import pandas as pd

# Walk the conditional-generation folder and collect every CSV whose file name
# contains "generation_metadata".
root_dir = "/content/drive/MyDrive/Generative_Models/conditional_generation/"
paths = [
  os.path.join(dirpath, filename)
  for dirpath, _dirnames, filenames in os.walk(root_dir)
  for filename in filenames
  if "generation_metadata" in filename
]

# pd.concat([]) fails with an opaque "No objects to concatenate"; report the
# real cause (Drive not mounted / wrong folder) instead.
if not paths:
  raise FileNotFoundError(
      f"No 'generation_metadata' files found under {root_dir}; is Google Drive mounted?"
  )

# Read each metadata file and record the directory it came from, so that later
# cells can rebuild full paths to the files listed in it.
all_dfs = []
for file_path in paths:
  df = pd.read_csv(file_path)
  df["dir_path"] = os.path.dirname(file_path)
  all_dfs.append(df)
gen_meta = pd.concat(all_dfs, ignore_index=True)

In [7]:
# Keep only backbone-generation rows that have an entity id, then reshape the
# table into the inputs needed downstream: model, full path and file name of
# each generated structure, and its entity id.
# (Optional extra filter that was previously toggled by hand:
#  gen_meta['model'].isin(['framediff']).)
#
# NOTE: this cell rebinds `gen_meta` and drops the `task` column, so it can only
# be run once per execution of the cell that builds `gen_meta`.
backbone_rows = (
    gen_meta['entity_id'].notnull()
    # na=False makes the treatment of rows with a missing task explicit.
    & gen_meta['task'].str.contains('backbone', na=False)
)
gen_meta = (
    gen_meta.loc[backbone_rows]
    # .assign on the filtered frame avoids writing a new column into a slice
    # of the original (SettingWithCopyWarning).
    .assign(input_file_path=lambda frame: frame.dir_path + '/' + frame.output_file_name)
    .rename(columns={"output_file_name": "input_file_name"})
    [['model', 'input_file_path', 'input_file_name', "entity_id"]]
)


In [None]:
# Metadata table for the stand-alone ("solo") ProteinMPNN redesign runs.
# Default to an empty frame and replace it with the saved CSV when one exists;
# the empty frame itself is not written to disk here.
meta_data_filepath = "/content/drive/MyDrive/Generative_Models/conditional_generation/metadata_mpnn_solo.csv"

all_metadata_df = pd.DataFrame()
status_message = "Created mpnn metadata dataframe"
if os.path.exists(meta_data_filepath):
  all_metadata_df = pd.read_csv(meta_data_filepath)
  status_message = "Existing generation metadata read in."
print(status_message)

Created mpnn metadata dataframe


In [None]:
from Bio import SeqIO
import time
num_designs = 100

mpnn_alphabet = 'ACDEFGHIKLMNPQRSTVWYX'
mpnn_alphabet_dict = {'A': 0,'C': 1,'D': 2,'E': 3,'F': 4,'G': 5,'H': 6,'I': 7,'K': 8,'L': 9,'M': 10,'N': 11,'P': 12,'Q': 13,'R': 14,'S': 15,'T': 16,'V': 17,'W': 18,'Y': 19,'X': 20}
chain_length = 184
#chain_length = 237

pdb_file= "/content/drive/MyDrive/Generative_Models/conditional_generation/tev_monomer.pdb"
output_dir = "/content/drive/MyDrive/Generative_Models/conditional_generation/MPNN_solo_redesigns"
output_fasta = output_dir + "/seqs/tev.fa"
os.makedirs(output_dir, exist_ok=True)
#make the json which biases the motif positions completely
bias_per_residue = np.zeros([chain_length, 21])
residues = list(range(24,50)) + list(range(90,125))
#"24/A25-50/40/A91-125/59/0 C1-184"
motif_res = "QSENSCTHFPGYLPNMLRDLRDAFSRLEEVMPQAENQDPDIKAHVISLGENLNTLRLRLRR"
#residues = list(range(27,33)) + list(range(46,51)) + list(range(139,152)) + list(range(167,179)) + list(range(211,221))
#"27/A28-33/13/A47-51/88/A140-152/15/A168-179/32/A212-221/16"
#motif_res = "GHTTSLHLFRRFWKHWIQTKDGQCHSASNFTNTNNYWGGHKVFMVK"
motif_res = [mpnn_alphabet_dict[x] for x in motif_res]
for j, res in enumerate(residues):
  for aa in mpnn_alphabet:
    if mpnn_alphabet_dict[aa] == motif_res[j]:
      bias_per_residue[res,mpnn_alphabet_dict[aa]] = 100
    else:
      bias_per_residue[res,mpnn_alphabet_dict[aa]] = -100
bias_by_res_dict = {}
bias_by_res_dict["A"] = bias_per_residue.tolist()
motif_bias_dict = {}
motif_bias_dict[pdb_file.split("/")[-1].split(".")[0]] = bias_by_res_dict
with open('/content/ProteinMPNN/motif_bias.jsonl', 'w') as f:
  json.dump(motif_bias_dict, f)

meta_data_entry = pd.Series()
meta_data_entry["gen_model"] = "NA"
meta_data_entry["model"] = "ProteinMPNN"
meta_data_entry["input_file_path"] = "NA"
meta_data_entry["task"] = "Sequence Redesign (fixed motif)"
meta_data_entry["Timestamp"] = str(datetime.now())
meta_data_entry['gpu'] = 'T4 GPU'
meta_data_entry['output_file_path'] = output_fasta
meta_data_entry['num_designs'] = num_designs

start_time = time.time()
if "comp" in pdb_file:
  mpnn_command = f"""
    source activate mpnn
    python ./protein_mpnn_run.py --pdb_path {pdb_file} --pdb_path_chains A --out_folder {output_dir} --num_seq_per_target {num_designs} --bias_by_res_jsonl '/content/ProteinMPNN/motif_bias.jsonl' --sampling_temp "0.1" --seed 0 --batch_size 1 --ca_only
    """
else:
  mpnn_command = f"""
    source activate mpnn
    python ./protein_mpnn_run.py --pdb_path {pdb_file} --out_folder {output_dir} --num_seq_per_target {num_designs} --bias_by_res_jsonl '/content/ProteinMPNN/motif_bias.jsonl'  --sampling_temp "0.1" --seed 0 --batch_size 1 --ca_only
    """
!{mpnn_command}
end_time = time.time()
meta_data_entry['wall_time_task'] = str(end_time-start_time) + " Seconds"

try:
  records = list(SeqIO.parse(output_fasta, "fasta"))
  for pair in [pair.split('=') for pair in records[0].description.split(', ')[1:]]:
        meta_data_entry[pair[0]] = pair[1]
  for i, record in enumerate(records):
    #record.id = row['model'] + "_il10_fixed_" + row["entity_id"]+"_design"+str(i)
    record.id = "mpnn_solo_il10_fixed_" + str(uuid.uuid4())
    record.name = "mpnn_solo_il10_fixed_" + str(uuid.uuid4())

  SeqIO.write(records, output_fasta, "fasta")
  all_metadata_df = pd.concat([all_metadata_df,pd.DataFrame(meta_data_entry).T], ignore_index=True)
  all_metadata_df.to_csv(meta_data_filepath, index=False)
except FileNotFoundError:
  print(f"The file {output_fasta} does not exist.")




