In [None]:
# Mount Google Drive at /content/drive. The metadata CSV, the cached length
# distribution and the generated PDB files used later in this notebook all
# live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
import json
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import uuid
from datetime import datetime
import re
import torch
import shutil

# CSV on Drive holding one row of metadata per generated structure.
meta_data_filepath = "/content/drive/MyDrive/Generative_Models/unconditional_generation/protpardelle_unconditional/generation_metadata_protpardelle.csv"

if not os.path.exists(meta_data_filepath):
  # Nothing recorded yet: start from an empty frame. The file itself is not
  # written here.
  all_metadata_df = pd.DataFrame()
  print("Created generation metadata dataframe")
else:
  # Pick up the records of earlier runs.
  all_metadata_df = pd.read_csv(meta_data_filepath)
  print("Existing generation metadata read in.")


# JSON cache on Drive of the target length distribution, stored as a list of
# (length, number_of_samples) pairs.
len_dist_filepath = "/content/drive/MyDrive/Generative_Models/unconditional_generation/protpardelle_unconditional/uniref50_length_dist_protpardelle.json"

if not os.path.exists(len_dist_filepath):
  # No cached distribution yet: draw one from the UniProt sequence-size
  # statistics and store it on Drive.
  #https://www.uniprot.org/uniprotkb/statistics#sequence-size
  bins = np.array([13,51,101,151,201,251,301,351,401,451,501,551,601,651,701,751,801,851,901,951,1001,1101,1201,1301,1401,1501,1601,1701,1801,1901,2001,2101,2201,2301,2401,2501,34350])
  swissprot_reviewed = np.array([0,9968,43534,59796,59574,58452,52413,52846,45901,37706,30572,22287,15830,13156,9403,7870,5700,4889,5301,4109,3007,4124,2897,2207,2070,1675,834,642,587,503,395,272,386,340,234,195,1462])
  # Recorded alongside the reviewed counts; not used in the sampling below.
  TrEMBL_unreviewed = np.array([0,2668805,19825275,24705701,23838128,23462438,23225451,21389271,16814580,14287105,11501843,8283150,6266068,4715059,3755005,3186452,2687314,2166878,1843669,1457871,1153537,1975953,1398765,961048,664766,517536,390552,300984,236895,210921,180246,138808,122833,102865,82441,71548,527646])

  # Empirical CDF of the reviewed counts at the bin positions ...
  ecdf = swissprot_reviewed.cumsum() / swissprot_reviewed.sum()
  # ... linearly interpolated onto every integer length
  # (shortest protein in uniprot is 14 res, longest is 34350 res).
  x = np.arange(14, 34351)
  ecdf = np.interp(x, bins, ecdf)

  # Inverse-transform sampling: map uniform random numbers through the CDF.
  num_samples = 11000
  random_values = np.random.rand(num_samples)
  sampled_lengths = np.interp(random_values, ecdf, x)
  sampled_lengths = np.round(sampled_lengths).astype(int)
  # Keep at most ten thousand sequences of up to 1000 residues.
  sampled_lengths = sampled_lengths[sampled_lengths <= 1000][:10000]

  # One histogram bin per integer length (edges 14..1001); the counts become
  # the number of structures to generate at each length.
  hist_values, bin_edges, patches = plt.hist(sampled_lengths, bins=x[:988], alpha=0.7, label='Sampled Values')
  plt.xlabel('X-axis label')
  plt.ylabel('Frequency')
  plt.legend()
  plt.show()

  # Pair every left bin edge with its count; zip stops at the shorter input,
  # so the final right-hand edge is dropped.
  uniprot_length_dist = [(int(edge), int(count)) for edge, count in zip(bin_edges, hist_values)]
  with open(len_dist_filepath, "w") as f:
    json.dump(uniprot_length_dist, f)
else:
  # Reuse the distribution stored by an earlier run.
  with open(len_dist_filepath, "r") as f:
    uniprot_length_dist = json.load(f)
  print("Loaded length distribution from drive")


Existing generation metadata read in.
Loaded length distribution from drive


In [None]:
%%bash
# Install the Python packages needed to run Protpardelle in this runtime.
pip install torch transformers einops tqdm wandb rotary-embedding-torch biopython scipy torchtyping dm-tree matplotlib seaborn black ipython

# Clone the repositories into /content, and only when they are missing.
# `git clone` exits non-zero if the target directory already exists, which
# would make this %%bash cell fail whenever it is re-run.
cd /content
[ -d protpardelle ] || git clone https://github.com/ProteinDesignLab/protpardelle
[ -d ProteinMPNN ] || git clone https://github.com/dauparas/ProteinMPNN.git

In [None]:
%cd protpardelle

/content/protpardelle


In [None]:
cleanup_command = """
for file in /content/protpardelle/samples/*_samp*.pdb
  do
  bn=$(basename "$file")
  mv $file /content/drive/MyDrive/Generative_Models/unconditional_generation/protpardelle_unconditional/$bn
  done
#mv /content/protpardelle/samples/readme.txt /content/drive/MyDrive/Generative_Models/unconditional_generation/protpardelle_unconditional/"${bn%%_*}"_readme.txt
rm -r /content/protpardelle/samples
"""

for length, batch_size in uniprot_length_dist:
  if all_metadata_df.loc[all_metadata_df.conditions == "length = " + str(length),:].shape[0] >= batch_size: continue
  generation_command = "python draw_samples.py --type allatom --minlen {} --maxlen {} --steplen 1 --perlen {}".format(length,length+1, batch_size)
  meta_data = {}
  meta_data['batch_id'] = str(uuid.uuid4())
  meta_data['batch_size'] = str(batch_size)
  meta_data['Timestamp'] = str(datetime.now())
  meta_data['model'] = 'Protpardelle'
  meta_data['task'] = 'all_atom_pdb_generation'
  meta_data['conditions'] = 'length = ' + str(length)
  meta_data['gpu'] = 'T4 GPU'

  !{generation_command}
  with open("/content/protpardelle/samples/readme.txt", 'r') as file:
    content = file.read()
    total_samples = int(re.search(r'Total samples drawn: (\d+)', content).group(1))
    total_job_time_match = re.search(r'Total job time: (\d+\.\d+) seconds', content)
    total_job_time = float(total_job_time_match.group(1)) if total_job_time_match else None
  #protpardelle already includes walltime capture so we don't need to do that ourselves
  meta_data['wall_time_batch'] = str(total_job_time) + " Seconds"
  meta_data['wall_time_task'] = str(total_job_time/batch_size) + " Seconds (inferred)"

  for filename in os.listdir("/content/protpardelle/samples"):
      if filename.endswith(".pdb") and "samp" in filename:
        meta_data['entity_id'] = str(uuid.uuid4())
        new_filename = 'Protpardelle_len' + str(length) + '_' + meta_data['entity_id'] + '.pdb'
        meta_data['output_file_name'] = new_filename
        shutil.move(f"/content/protpardelle/samples/{filename}", f"/content/protpardelle/samples/{new_filename}")
        metadata_entry = pd.Series(meta_data)
        all_metadata_df = all_metadata_df.append(metadata_entry, ignore_index=True)
  all_metadata_df.to_csv(meta_data_filepath, index=False)
  print("Metadata saved. Cleaning up....")
  !{cleanup_command}
  torch.cuda.empty_cache()

In [None]:
#depreciated VVVVVVVVVVVVVVVVVVVVVVVVVVVV