In [None]:
# Install Biopython into the kernel's environment; it provides the `Bio`
# package whose `SeqIO` module is imported later to read/write FASTA files.
%pip install biopython

In [None]:
# Download the Miniconda installer, make it executable and run it with
# /usr/local as the install prefix (-p).
!wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
!chmod +x Miniconda3-latest-Linux-x86_64.sh
!./Miniconda3-latest-Linux-x86_64.sh -b -f -p /usr/local
# Fetch the ProteinMPNN sources and make the checkout the working directory;
# later cells invoke ./protein_mpnn_run.py relative to it.
# NOTE(review): the clone target and the %cd are relative, so re-running this
# cell after the %cd operates inside the existing checkout — run it once per
# session.
!git clone https://github.com/dauparas/ProteinMPNN
%cd ./ProteinMPNN
# Create the (initially empty) conda environment named `mpnn`.
!conda create --name mpnn -y

In [None]:
%%bash
# Activate the `mpnn` conda environment inside this bash cell, then install
# pytorch, torchvision, torchaudio and cudatoolkit 11.3 from the `pytorch`
# channel without prompting (-y).
source activate mpnn
conda install pytorch torchvision torchaudio cudatoolkit=11.3 -c pytorch -y

In [None]:
# Mount Google Drive at /content/drive; every data path used below lives
# under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

import os
import shutil
import glob
import json
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import uuid
from datetime import datetime
import re
import torch
from time import time

# CSV on the mounted Drive that accumulates one row per ProteinMPNN redesign.
meta_data_filepath = "/content/drive/MyDrive/Generative_Models/utilities/metadata_mpnn.csv"

# Resume from the existing log when there is one; otherwise start from an
# empty in-memory frame (nothing is written to disk at this point).
if not os.path.exists(meta_data_filepath):
  all_metadata_df = pd.DataFrame()
  status_message = "Created MPNN metadata dataframe"
else:
  all_metadata_df = pd.read_csv(meta_data_filepath)
  status_message = "Existing MPNN metadata read in."
print(status_message)

In [5]:
import os
import pandas as pd

# Root of the unconditional-generation outputs on the mounted Drive.
root_dir = "/content/drive/MyDrive/Generative_Models/unconditional_generation/"

# Every file below root_dir whose name contains "generation_metadata".
paths = [
  os.path.join(folder, name)
  for folder, _subdirs, names in os.walk(root_dir)
  for name in names
  if "generation_metadata" in name
]

# Load each metadata CSV and tag its rows with the directory the CSV sits in
# (everything before the last "/" of its path).
all_dfs = [
  pd.read_csv(csv_path).assign(dir_path=csv_path.rpartition("/")[0])
  for csv_path in paths
]

# One combined table with a fresh 0..n-1 index.
gen_meta = pd.concat(all_dfs, ignore_index=True)

In [6]:
# Keep only backbone-generation rows that carry an entity id.
# `na=False` makes a missing `task` count as "not a backbone task" explicitly,
# and `.copy()` gives an independent frame so the column assignments below do
# not write into a slice of the previous frame (SettingWithCopyWarning).
is_backbone_task = gen_meta['task'].str.contains('backbone', na=False)
gen_meta = gen_meta[gen_meta['entity_id'].notnull() & is_backbone_task].copy()

# Length = first run of digits found in `conditions`, stored as an integer
# column. The raw string avoids the invalid "\d" escape of a plain literal.
# As before, `.astype(int)` raises if a row's `conditions` has no digits.
gen_meta['length'] = gen_meta['conditions'].str.extract(r'(\d+)')[0].astype(int)

# Full path of the generated structure, used as input by the redesign step.
gen_meta['input_file_path'] = gen_meta['dir_path'] + '/' + gen_meta['output_file_name']
gen_meta = gen_meta.rename(columns={"output_file_name": "input_file_name"})
gen_meta = gen_meta[['model', 'length', 'input_file_path', 'input_file_name', 'entity_id']]

# Shuffle the rows, then order by length. The shuffle is unseeded, so the
# order of rows that share a length is not reproducible between runs.
gen_meta = gen_meta.sample(frac=1)
gen_meta = gen_meta.sort_values(by=['length'], ascending=True)

In [7]:
# foldingdiff outputs requested at more than 128 residues are actually only
# 128 long, so those rows are dropped.
is_oversized_foldingdiff = (gen_meta['model'] == 'foldingdiff') & (gen_meta['length'] > 128)
gen_meta = gen_meta.loc[~is_oversized_foldingdiff]

In [11]:
from Bio import SeqIO
import time
num_designs = 10

for index, row in gen_meta.iterrows():
  length = row["length"]
  pdb_file= row["input_file_path"]
  output_dir = "/content/drive/MyDrive/Generative_Models/utilities/MPNN_backbone/len" + str(length) + "/" + row["model"]
  if not os.path.exists(output_dir):
    os.makedirs(output_dir)
  output_fasta = output_dir + "/seqs/" +row["input_file_name"].split(".")[0]+".fa"

  if (not all_metadata_df.empty) and (any(all_metadata_df['output_file_path'].str.contains(output_fasta))): continue #file has already been redesigned
  if length ==100: continue # already did these in a batch, will need to rename
  if length > 200: continue
  os.makedirs(output_dir, exist_ok=True)
  print('\n')
  print('\n')
  print(pdb_file)
  print("out to... " + output_fasta)
  print('\n')
  print('\n')

  meta_data_entry = row.copy()[["model","input_file_path","entity_id"]]
  meta_data_entry["gen_model"] = row["model"]
  meta_data_entry["model"] = "ProteinMPNN"
  meta_data_entry["task"] = "Sequence Redesign"
  meta_data_entry["Timestamp"] = str(datetime.now())
  meta_data_entry['gpu'] = 'T4 GPU'
  meta_data_entry['length'] = length
  meta_data_entry['output_file_path'] = output_fasta
  meta_data_entry['num_designs'] = num_designs

  start_time = time.time()
  mpnn_command = f"""
    source activate mpnn
    python ./protein_mpnn_run.py --pdb_path {pdb_file} --out_folder {output_dir} --num_seq_per_target {num_designs} --sampling_temp "0.1" --seed 0 --batch_size 1 --ca_only
    """
  !{mpnn_command}
  end_time = time.time()
  meta_data_entry['wall_time_task'] = str(end_time-start_time) + " Seconds"

  try:
    records = list(SeqIO.parse(output_fasta, "fasta"))
    for pair in [pair.split('=') for pair in records[0].description.split(', ')[1:]]:
          meta_data_entry[pair[0]] = pair[1]
    for i, record in enumerate(records):
      record.id = row["input_file_name"].split(".")[0] + "_design"+str(i)
      record.name = row['input_file_name']

    SeqIO.write(records, output_fasta, "fasta")
    all_metadata_df = pd.concat([all_metadata_df,pd.DataFrame(meta_data_entry).T], ignore_index=True)
    all_metadata_df.to_csv(meta_data_filepath, index=False)
  except FileNotFoundError:
    print(f"The file {output_fasta} does not exist.")

