In [None]:
from google.colab import drive
drive.mount('/content/drive')
!pip install biopython

In [None]:
pip install git+https://github.com/HeliXonProtein/OmegaFold.git

In [None]:
# Throwaway omegafold call on an input path that does not exist. Per the original
# author's note, its purpose is to make omegafold download its weights etc. up front,
# before the real folding runs.
# NOTE(review): the original note says to do this "before mounting to drive", but
# drive.mount() is called in the first cell of this notebook - confirm the intended order.
!omegafold /thisdoesnotexist.fa /content/output

In [4]:
import os
import shutil
import glob
import json
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import uuid
from datetime import datetime
import re
import torch
import time
from Bio import SeqIO

# CSV log with one row per OmegaFold batch. The folding loop sums the logged
# 'batch_size' values per input file to work out where to resume.
meta_data_filepath = "/content/drive/MyDrive/Generative_Models/utilities/metadata_omegafold.csv"

if os.path.exists(meta_data_filepath):
  all_metadata_df = pd.read_csv(meta_data_filepath)
  print("Existing generation metadata read in.")
else:
  # Start with the logged columns already present: a column-less DataFrame would
  # raise KeyError as soon as the folding loop filters on 'input_file_path'.
  all_metadata_df = pd.DataFrame(columns=[
    "model",
    "task",
    "input_file_path",
    "output_dir_path",
    "Timestamp",
    "gpu",
    "batch_id",
    "batch_size",
    "wall_time_batch",
    "wall_time_task",
  ])
  # Nothing is written to disk here; the CSV is first saved by the folding loop.
  print("Created omegafold metadata dataframe")

Existing generation metadata read in.


In [5]:
def count_records_fasta(fasta_file):
  """Return the number of FASTA records in `fasta_file`.

  Records are counted while streaming through the parser, so the whole file is
  never held in memory at once.
  """
  return sum(1 for _ in SeqIO.parse(fasta_file, "fasta"))

In [6]:
def digest_large_fasta(fasta_file,num_processed, max_num_records=100, tmp_dir="/content/tmp_files"):
  """Split the unprocessed tail of a FASTA file into smaller temporary FASTA files.

  Args:
    fasta_file: path of the FASTA file to split.
    num_processed: number of leading records to skip (already handled).
    max_num_records: maximum number of records written to each chunk file.
    tmp_dir: scratch directory for the chunk files. It is deleted and recreated
      on every call, so chunks from a previous call do not survive.

  Returns:
    List of chunk file paths, in record order. Empty if nothing is left to process.
  """
  # Reset the same directory the chunks are written to. The paths are absolute,
  # so this does not depend on the notebook's current working directory.
  shutil.rmtree(tmp_dir, ignore_errors=True)
  os.makedirs(tmp_dir, exist_ok=True)

  # int() because the caller passes a pandas/numpy sum, which may not be a plain int.
  remaining = list(SeqIO.parse(fasta_file, "fasta"))[int(num_processed):]

  # Chunk names reuse the input file name up to its first dot, plus a running index.
  stem = fasta_file.split('/')[-1].split('.')[0]
  tmp_fastas = []
  for chunk_index, start in enumerate(range(0, len(remaining), max_num_records)):
    tmp_path = os.path.join(tmp_dir, f"{stem}_{chunk_index}.fa")
    with open(tmp_path, "w") as f:
      SeqIO.write(remaining[start:start + max_num_records], f, "fasta")
    tmp_fastas.append(tmp_path)
  return tmp_fastas


In [None]:
for length in range(14,200):
  input_file_path = f"/content/drive/MyDrive/Generative_Models/utilities/fold_inputs/all_len{length}.fa"
  #input_file_path = f"/content/drive/MyDrive/Generative_Models/utilities/refold_inputs/all_len{length}.fa"
  output_dir_path = "/content/drive/MyDrive/Generative_Models/utilities/omegafold/" + input_file_path.split('/')[-1].split('.')[0]
  num_records = count_records_fasta(input_file_path)
  num_processed = all_metadata_df.loc[all_metadata_df['input_file_path'] == input_file_path, 'batch_size'].sum()
  if num_processed < num_records:
    print(input_file_path)
    print(f"{num_records} sequences.")
    print(f"{num_processed} already processed.")
    tmp_fastas = digest_large_fasta(input_file_path, num_processed)
    for tmp_fasta in tmp_fastas:
      meta_data_entry = pd.Series()
      meta_data_entry["model"] = "omegafold"
      meta_data_entry["task"] = "Structure Prediction"
      meta_data_entry["input_file_path"] = input_file_path
      meta_data_entry["output_dir_path"] = output_dir_path
      meta_data_entry["Timestamp"] = str(datetime.now())
      meta_data_entry['gpu'] = 'T4 GPU'
      meta_data_entry["batch_id"] = str(uuid.uuid4())
      meta_data_entry['batch_size'] = count_records_fasta(tmp_fasta)
      print(f"{meta_data_entry['batch_size']} sequences in this batch ({tmp_fasta})")
      print("-------------------------------------------------")
      start_time = time.time()
      !omegafold {tmp_fasta} {output_dir_path}
      end_time = time.time()
      total_job_time = end_time - start_time
      meta_data_entry['wall_time_batch'] = str(total_job_time) + " Seconds"
      meta_data_entry['wall_time_task'] = str(total_job_time/meta_data_entry['batch_size']) + " Seconds (inferred)"
      all_metadata_df = pd.concat([all_metadata_df,pd.DataFrame(meta_data_entry).T], ignore_index=True)
      all_metadata_df.to_csv(meta_data_filepath, index=False)
  else:
    print(f"All entries in {input_file_path} already processed.")