In [None]:
# Mount Google Drive at /content/drive; the metadata CSV, the cached length
# distribution and the generated PDB files used below all live under
# /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

import time
import os
import json
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import uuid
from datetime import datetime
import re
import torch

# Running log of every generated structure (one row per output PDB file).
# The generation loop reads it to decide which lengths are already done and
# rewrites it after every batch.
meta_data_filepath = "/content/drive/MyDrive/Generative_Models/unconditional_generation/foldingdiff_unconditional/generation_metadata_foldingdiff.csv"

# Columns written by the generation loop. Declaring them up front means an
# empty table still has a `conditions` column, so the "already generated?"
# lookup works on the very first run instead of raising AttributeError.
metadata_columns = [
  "batch_id",
  "batch_size",
  "Timestamp",
  "model",
  "task",
  "conditions",
  "gpu",
  "wall_time_batch",
  "wall_time_task",
  "entity_id",
  "output_file_name",
]

if os.path.exists(meta_data_filepath):
  all_metadata_df = pd.read_csv(meta_data_filepath)
  print("Existing generation metadata read in.")
else:
  # Nothing is written to disk here; the CSV is first created when the
  # generation loop saves its first batch.
  all_metadata_df = pd.DataFrame(columns=metadata_columns)
  print("No existing generation metadata found; starting with an empty table.")


# Generation plan: a list of [sequence_length, number_of_structures] pairs.
# It is sampled once from the UniProt sequence-length statistics and cached on
# Drive so that every session works from the same plan.
len_dist_filepath = "/content/drive/MyDrive/Generative_Models/unconditional_generation/foldingdiff_unconditional/uniref50_length_dist_foldingdiff.json"

if os.path.exists(len_dist_filepath):
  with open(len_dist_filepath, "r") as f:
    uniprot_length_dist = json.load(f)
  print("Loaded length distribution from drive")
else:
  # Source: https://www.uniprot.org/uniprotkb/statistics#sequence-size
  # Shortest protein in UniProt is 14 residues, longest is 34350 residues.

  # Bin edges and per-bin sequence counts; the leading 13 / 0 pair anchors the
  # cumulative distribution at zero just below the shortest sequence.
  bins = np.array([13,51,101,151,201,251,301,351,401,451,501,551,601,651,701,751,801,851,901,951,1001,1101,1201,1301,1401,1501,1601,1701,1801,1901,2001,2101,2201,2301,2401,2501,34350])
  swissprot_reviewed = np.array([0,9968,43534,59796,59574,58452,52413,52846,45901,37706,30572,22287,15830,13156,9403,7870,5700,4889,5301,4109,3007,4124,2897,2207,2070,1675,834,642,587,503,395,272,386,340,234,195,1462])
  # Not used below (the plan is built from the Swiss-Prot counts only).
  TrEMBL_unreviewed = np.array([0,2668805,19825275,24705701,23838128,23462438,23225451,21389271,16814580,14287105,11501843,8283150,6266068,4715059,3755005,3186452,2687314,2166878,1843669,1457871,1153537,1975953,1398765,961048,664766,517536,390552,300984,236895,210921,180246,138808,122833,102865,82441,71548,527646])

  # Empirical CDF at the bin edges, then linearly interpolated to every
  # integer length so it can be inverted per residue count.
  ecdf = np.cumsum(swissprot_reviewed) / np.sum(swissprot_reviewed)

  x = np.arange(14, 34350+1)
  ecdf = np.interp(x, bins, ecdf)

  # Inverse-transform sampling from the empirical CDF. The generator is seeded
  # so the cached plan can be regenerated exactly if the JSON file is lost.
  rng = np.random.default_rng(0)
  num_samples = 11000     # draws per round
  target_count = 10000    # ten thousand sequences ...
  max_length = 1000       # ... up to 1000 residues in length

  # Keep drawing until enough lengths survive the max_length filter, rather
  # than assuming a single round always leaves at least target_count values.
  sampled_lengths = np.empty(0, dtype=int)
  while sampled_lengths.size < target_count:
    random_values = rng.random(num_samples)
    drawn_lengths = np.round(np.interp(random_values, ecdf, x)).astype(int)
    sampled_lengths = np.concatenate([sampled_lengths, drawn_lengths[drawn_lengths <= max_length]])
  sampled_lengths = sampled_lengths[:target_count]

  # One unit-width bin per length: edges 14, 15, ..., max_length + 1.
  hist_values, bin_edges, patches = plt.hist(sampled_lengths, bins=np.arange(14, max_length + 2), alpha=0.7, label='Sampled lengths')
  plt.xlabel('Sequence length (residues)')
  plt.ylabel('Number of sampled sequences')
  plt.legend()
  plt.show()

  # Pair each bin's left edge (the length) with its count; the final right
  # edge has no bin of its own and is dropped.
  uniprot_length_dist = [[int(edge), int(value)] for edge, value in zip(bin_edges[:-1], hist_values)]
  with open(len_dist_filepath, "w") as f:
    json.dump(uniprot_length_dist, f)



Mounted at /content/drive
Existing generation metadata read in.
Loaded length distribution from drive


In [None]:
# Install Miniconda into /usr/local, clone FoldingDiff and build its conda
# environment from the repository's environment.yml.
# NOTE(review): both the "latest" Miniconda installer and the clone are
# unpinned, so a rerun may pick up different versions — consider pinning.
!wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
!chmod +x Miniconda3-latest-Linux-x86_64.sh
!./Miniconda3-latest-Linux-x86_64.sh -b -f -p /usr/local
!git clone https://github.com/microsoft/foldingdiff
# %cd changes the notebook's working directory, so the relative paths used in
# later cells (./bin/sample.py, ./output, ./) resolve inside the clone.
%cd foldingdiff
!conda env create -f environment.yml -y

In [None]:
%%bash
# Inside the foldingdiff conda environment: add ipykernel and install the
# package found in the current directory in editable mode.
# Assumes the working directory is the foldingdiff checkout (set by the
# preceding cell's %cd).
source activate foldingdiff
conda install ipykernel -y
pip install -e ./

In [None]:
# Shell snippet executed after each batch via `!{cleanup_command}`: moves every
# .pdb in the local FoldingDiff output folder to the Drive results folder, then
# deletes the whole local output directory.
# NOTE(review): nothing stops the `rm -rf` from running after a failed `mv`
# (e.g. Drive not mounted), which would discard that batch — confirm acceptable.
cleanup_command = """
for file in /content/foldingdiff/output/sampled_pdb/*.pdb
  do
  bn=$(basename "$file")
  mv $file /content/drive/MyDrive/Generative_Models/unconditional_generation/foldingdiff_unconditional/$bn
  done
rm -rf /content/foldingdiff/output
"""

for length, batch_size in uniprot_length_dist:
  if length > 128: break #While FoldingDiff will run with lengths >128, it just defaults to 128.
  if all_metadata_df.loc[all_metadata_df.conditions == "length = " + str(length),:].shape[0] >= batch_size: continue
  generation_command = """
  source activate foldingdiff
  python -O ./bin/sample.py -o ./output -l {} {} -n {} -b 512 --device cuda:0
  """.format(int(length),int(length)+1, int(batch_size))
  meta_data = {}
  meta_data['batch_id'] = str(uuid.uuid4())
  meta_data['batch_size'] = str(int(batch_size))
  meta_data['Timestamp'] = str(datetime.now())
  meta_data['model'] = 'FoldingDiff'
  meta_data['task'] = 'backbone_pdb_generation'
  meta_data['conditions'] = 'length = ' + str(int(length))
  meta_data['gpu'] = 'T4 GPU'
  start_time = time.time()
  !{generation_command}
  end_time = time.time()
  total_job_time = end_time - start_time
  meta_data['wall_time_batch'] = str(total_job_time) + " Seconds"
  meta_data['wall_time_task'] = str(total_job_time/batch_size) + " Seconds (inferred)"
  for i, filename in enumerate(os.listdir("/content/foldingdiff/output/sampled_pdb")):
      if filename.endswith(".pdb"):
        meta_data['entity_id'] = str(uuid.uuid4())
        new_filename = "FoldingDiff_len"+str(length)+"_" + meta_data['entity_id'] + ".pdb"
        os.rename("/content/foldingdiff/output/sampled_pdb/"+filename,"/content/foldingdiff/output/sampled_pdb/"+new_filename)
        meta_data['output_file_name'] = new_filename
        metadata_entry = pd.Series(meta_data)
        all_metadata_df = all_metadata_df.append(metadata_entry, ignore_index=True)
  all_metadata_df.to_csv(meta_data_filepath, index=False)
  print("Metadata saved. Cleaning up....")
  !{cleanup_command}
  torch.cuda.empty_cache()