In [1]:
%pip install biopython

Collecting biopython
  Downloading biopython-1.83-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: biopython
Successfully installed biopython-1.83


In [43]:
# Colab-only: mount Google Drive at /content/drive so the absolute
# /content/drive/MyDrive/... paths used throughout this notebook resolve.
from google.colab import drive
drive.mount('/content/drive')

import os
import shutil
import glob
import json
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import uuid
from datetime import datetime
import re
import torch
from time import time

# CSV holding the metadata of previous ProteinMPNN "solo" runs.
meta_data_filepath = "/content/drive/MyDrive/Generative_Models/IL10_scaffolding/metadata_mpnn_solo.csv"

if os.path.exists(meta_data_filepath):
  all_metadata_df = pd.read_csv(meta_data_filepath)
  print("Existing generation metadata read in.")
else:
  # Without this branch `all_metadata_df` would stay undefined and the first
  # later cell that touches it would fail with a confusing NameError.
  all_metadata_df = pd.DataFrame()
  print(f"No generation metadata found at {meta_data_filepath}; using an empty table.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Existing generation metadata read in.


In [3]:
import os
import Bio
from Bio import SeqIO
from Bio.Seq import Seq


# IUPAC extended protein alphabet: the 20 standard residues plus the
# ambiguity / rare codes B, X, Z, J, U and O. Upper case only.
_PROTEIN_LETTERS = frozenset("ACDEFGHIKLMNPQRSTVWYBXZJUO")


def is_valid_protein(sequence):
  """Return True if ``sequence`` looks like a protein sequence.

  The value is converted with ``str()`` and accepted only when it is
  non-empty and every character is an upper-case IUPAC (extended) amino-acid
  letter. Stringified missing values such as ``"nan"`` or ``"None"`` are
  therefore rejected, as are digits, whitespace, gaps and stop symbols.

  Constructing a ``Bio.Seq.Seq`` does not validate its letters, so the
  previous ``try: Seq(...)`` check accepted every input.

  Args:
    sequence: Candidate sequence; any object, converted with ``str()``.

  Returns:
    bool: True when the string form is a non-empty run of valid residues.
  """
  residues = str(sequence)
  return bool(residues) and _PROTEIN_LETTERS.issuperset(residues)


root_dir = "/content/drive/MyDrive/Generative_Models/IL10_scaffolding/evodiff_il10"

# Collect every file below root_dir whose name contains
# "generation_metadata_evodiff_tev". os.walk yields entries in arbitrary
# order, so the paths are sorted to make the row order of gen_meta (and of
# anything written from it) reproducible between runs.
paths = sorted(
    os.path.join(dirpath, filename)
    for dirpath, _dirnames, filenames in os.walk(root_dir)
    for filename in filenames
    if "generation_metadata_evodiff_tev" in filename
)
if not paths:
  # pd.concat([]) would otherwise fail with "No objects to concatenate".
  raise FileNotFoundError(
      f"No 'generation_metadata_evodiff_tev' files found under {root_dir}"
  )

import pandas as pd
all_dfs = []
for file_path in paths:
  df = pd.read_csv(file_path)
  # Remember which directory each metadata row came from.
  df["dir_path"] = os.path.dirname(file_path)
  all_dfs.append(df)
gen_meta = pd.concat(all_dfs, ignore_index=True)

In [4]:
# Keep only sequence-generation rows that carry an entity id, and only the
# columns needed downstream. The explicit copy means the assignments below
# operate on an independent frame rather than a slice of the loaded one.
is_generation_row = gen_meta['entity_id'].notnull() & (gen_meta['task'] == 'sequence_generation')
gen_meta = gen_meta.loc[is_generation_row, ['model', 'generated_sequence', 'entity_id', 'dir_path']].copy()

# A missing sequence would otherwise be stringified to the text "nan" below.
gen_meta = gen_meta.dropna(subset=['generated_sequence'])

#LLM outputs sometimes have artifacts. Correct these.
# Strip all whitespace and digits from each generated sequence.
gen_meta['generated_sequence'] = gen_meta['generated_sequence'].apply(lambda x: re.sub(r'[\s\d]+', '', str(x)))

# Drop the rows whose cleaned sequence fails validation. The earlier version
# assigned the filtered DataFrame into the 'generated_sequence' column, which
# removed no rows.
is_valid_sequence = gen_meta['generated_sequence'].apply(is_valid_protein).astype(bool)
gen_meta = gen_meta.loc[is_valid_sequence, :]


In [10]:
gen_meta

Unnamed: 0,model,generated_sequence,entity_id,dir_path
0,evodiff_OA_DM_640M,EKILKNLTRPVTERKSANRNSIRTDKPGHTTSLSFNVQTFTASVRM...,b19d0b47-4bd4-4741-a3d7-333058b06375,/content/drive/MyDrive/Generative_Models/IL10_...
1,evodiff_OA_DM_640M,TNSSHLGPTSDGSAGTSSTSILPQSTPGHTTSLRATIWGFIQRLGV...,69786323-615a-43bb-a03c-97dc7d222322,/content/drive/MyDrive/Generative_Models/IL10_...
2,evodiff_OA_DM_640M,MKANADPTASFTEYLKFAEDLLRLLCPGHTTSLDFPFTPDVIASES...,2329206d-2faa-4732-8c58-2b3551e046a1,/content/drive/MyDrive/Generative_Models/IL10_...
3,evodiff_OA_DM_640M,MFFTALTTLFLGLSVLFVVVGAPSPAHGHTTSLSTRDCHRHADGHA...,c1f1aa0d-ef75-4f35-85d4-b978eca8a855,/content/drive/MyDrive/Generative_Models/IL10_...
4,evodiff_OA_DM_640M,MTALITFRGPNGVVNCRLLREHTFPIGGHTTSLVTLTCITAALVPA...,024770c5-8dae-45f1-ba7c-c60953543ab5,/content/drive/MyDrive/Generative_Models/IL10_...
...,...,...,...,...
95,evodiff_OA_DM_640M,MQRGREAFIHQCLALLPGSESCRCLWEGHTTSLRELKYSVLIPRSL...,4814db58-7093-4f80-9711-1aaaee4ce751,/content/drive/MyDrive/Generative_Models/IL10_...
96,evodiff_OA_DM_640M,IECGRVRSVSCFVFVNLLLDDPFVIVIGHTTSLVFWGVKYSVFDHH...,e8d3d9ed-d7bc-44e6-8a1c-32d75c88d163,/content/drive/MyDrive/Generative_Models/IL10_...
97,evodiff_OA_DM_640M,IPNPHVRLPSQRRRNGAVFLGGQPSGLGHTTSLESTDALHEHQASV...,62e4bc85-a0a2-4871-bb00-2cb1a29048e8,/content/drive/MyDrive/Generative_Models/IL10_...
98,evodiff_OA_DM_640M,SGRSSQKDCSTPTLLRRLGFKFFCVSQGHTTSLPPKKKTNAECGSQ...,64a5fbbd-1520-46a5-af76-1f6ee923fda4,/content/drive/MyDrive/Generative_Models/IL10_...


In [11]:
# Distinct lengths among the generated sequences (sanity check).
gen_meta['generated_sequence'].str.len().unique()

array([237])

In [26]:

from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

# Build one FASTA record per generated sequence, identified as
# "evodiff_tev_<entity_id>". Sequences are wrapped in Seq because that is the
# type SeqRecord documents for `seq`. name/description are left empty so the
# FASTA header consists of the id only.
# Rows without a sequence are skipped: Seq() cannot be built from NaN.
_rows_with_sequence = gen_meta.dropna(subset=['generated_sequence'])
records = [
  SeqRecord(
      seq=Seq(sequence),
      id="evodiff_tev_" + entity_id,
      description="",
      name="",
  )
  for entity_id, sequence in zip(
      _rows_with_sequence['entity_id'], _rows_with_sequence['generated_sequence']
  )
]




In [29]:
# Write the records currently held in `records` to a single FASTA file.
tev_fasta_path = '/content/drive/MyDrive/Generative_Models/IL10_scaffolding/evodiff_il10/generated_seqs_tev.fa'
with open(tev_fasta_path, 'w') as fasta_handle:
    SeqIO.write(records, fasta_handle, 'fasta')

In [44]:
# `test` is a second name for the loaded metadata table (no copy is made).
test = all_metadata_df
n_rows, n_cols = test.shape
print((n_rows, n_cols))
# Disabled length-based filtering, kept for reference:
#test['length'] = None
#test.loc[(test.model == 'ProteinMPNN'),'length'] = test.loc[(test.model == 'ProteinMPNN'),'input_file_path'].str.extract(r'(\d+)').astype(int).iloc[:,0]
#test['length'] = test['length'].astype(int)
#test = test.loc[test.length == 100,:]
#print(test.shape)


(2, 9)


In [45]:
test


Unnamed: 0,gen_model,model,input_file_path,task,Timestamp,gpu,output_file_path,num_designs,wall_time_task
0,,ProteinMPNN,,Sequence Redesign (fixed motif),2024-06-24 01:44:48.418461,T4 GPU,/content/drive/MyDrive/Generative_Models/IL10_...,100,172.54332041740417 Seconds
1,,ProteinMPNN,,Sequence Redesign (fixed motif),2024-06-24 02:03:04.429074,T4 GPU,/content/drive/MyDrive/Generative_Models/IL10_...,100,145.08001136779785 Seconds


In [20]:
# Keep only the runs whose input path mentions "tev".
# Missing paths are treated as non-matching: a NaN in the mask makes boolean
# indexing fail, and an all-NaN column has no `.str` accessor at all.
has_tev_input = test['input_file_path'].fillna("").astype(str).str.contains("tev", regex=False)
test = test.loc[has_tev_input, :]


In [37]:
# Spot-check one output path; 400 is an index label, not a position.
test.loc[400, 'output_file_path']

'/content/drive/MyDrive/Generative_Models/IL10_scaffolding/rfdiffusion_il10/MPNN_redesigns_fixed/seqs/rfdiffusion_tev_mono_54c8fa44-a588-4cb2-a158-05ae85f9dd76.fa'

In [32]:
test

Unnamed: 0,model,input_file_path,entity_id,gen_model,task,Timestamp,gpu,output_file_path,num_designs,wall_time_task,score,global_score,fixed_chains,designed_chains,CA_model_name,git_hash,seed
0,ProteinMPNN,/content/drive/MyDrive/Generative_Models/IL10_...,06cc55b4-c549-45f5-b5d0-38f999f290cb,RFdiffusion_150it,Sequence Redesign (fixed motif),2024-06-23 05:34:07.561228,T4 GPU,/content/drive/MyDrive/Generative_Models/IL10_...,10,17.789294242858887 Seconds,2.8223,2.8223,[],['A'],v_48_020,8907e6671bfbfc92303b5f79c4b5e6ce47cdef57,65
1,ProteinMPNN,/content/drive/MyDrive/Generative_Models/IL10_...,d858977c-980f-41f6-abdc-10bcb2331680,RFdiffusion_150it,Sequence Redesign (fixed motif),2024-06-23 05:34:25.400879,T4 GPU,/content/drive/MyDrive/Generative_Models/IL10_...,10,18.18731689453125 Seconds,2.8456,2.8456,[],['A'],v_48_020,8907e6671bfbfc92303b5f79c4b5e6ce47cdef57,201
2,ProteinMPNN,/content/drive/MyDrive/Generative_Models/IL10_...,96031dd0-9b58-425e-b0db-a7bd0347cf87,RFdiffusion_150it,Sequence Redesign (fixed motif),2024-06-23 05:34:43.620223,T4 GPU,/content/drive/MyDrive/Generative_Models/IL10_...,10,22.244973182678223 Seconds,2.8954,2.8954,[],['A'],v_48_020,8907e6671bfbfc92303b5f79c4b5e6ce47cdef57,215
3,ProteinMPNN,/content/drive/MyDrive/Generative_Models/IL10_...,c44462fd-e20b-4f78-b147-af279a30af71,RFdiffusion_150it,Sequence Redesign (fixed motif),2024-06-23 05:35:05.899811,T4 GPU,/content/drive/MyDrive/Generative_Models/IL10_...,10,17.56941056251526 Seconds,2.8847,2.8847,[],['A'],v_48_020,8907e6671bfbfc92303b5f79c4b5e6ce47cdef57,958
4,ProteinMPNN,/content/drive/MyDrive/Generative_Models/IL10_...,9fb6f4cf-c2cd-4144-9ca4-4b7aca9fa989,RFdiffusion_150it,Sequence Redesign (fixed motif),2024-06-23 05:35:23.504224,T4 GPU,/content/drive/MyDrive/Generative_Models/IL10_...,10,17.479437112808228 Seconds,2.8726,2.8726,[],['A'],v_48_020,8907e6671bfbfc92303b5f79c4b5e6ce47cdef57,773
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
407,ProteinMPNN,/content/drive/MyDrive/Generative_Models/IL10_...,80b873d2-5b71-4a19-bef9-6cf8a59bd513,RFdiffusion_150it,Sequence Redesign (fixed motif),2024-06-23 08:21:01.197611,T4 GPU,/content/drive/MyDrive/Generative_Models/IL10_...,10,20.498369693756104 Seconds,3.0553,3.0553,[],['A'],v_48_020,8907e6671bfbfc92303b5f79c4b5e6ce47cdef57,782
408,ProteinMPNN,/content/drive/MyDrive/Generative_Models/IL10_...,4f9c5216-de0b-4cec-abb0-fa4158a28c6d,RFdiffusion_150it,Sequence Redesign (fixed motif),2024-06-23 08:21:21.752074,T4 GPU,/content/drive/MyDrive/Generative_Models/IL10_...,10,20.97646975517273 Seconds,2.9466,2.9466,[],['A'],v_48_020,8907e6671bfbfc92303b5f79c4b5e6ce47cdef57,967
409,ProteinMPNN,/content/drive/MyDrive/Generative_Models/IL10_...,c25c7794-91e3-4741-8ad6-b5430a0d5021,RFdiffusion_150it,Sequence Redesign (fixed motif),2024-06-23 08:21:42.767216,T4 GPU,/content/drive/MyDrive/Generative_Models/IL10_...,10,19.674657583236694 Seconds,2.9820,2.9820,[],['A'],v_48_020,8907e6671bfbfc92303b5f79c4b5e6ce47cdef57,654
410,ProteinMPNN,/content/drive/MyDrive/Generative_Models/IL10_...,df133588-3ef7-4cf2-b2dc-3fc749fe84bd,RFdiffusion_150it,Sequence Redesign (fixed motif),2024-06-23 08:22:02.482719,T4 GPU,/content/drive/MyDrive/Generative_Models/IL10_...,10,21.584928512573242 Seconds,2.9290,2.9290,[],['A'],v_48_020,8907e6671bfbfc92303b5f79c4b5e6ce47cdef57,713


In [38]:
# Number of distinct entity ids recorded for each generating model.
entity_ids_by_gen_model = test.groupby('gen_model')['entity_id']
entity_ids_by_gen_model.nunique()

gen_model
RFdiffusion_150it    412
Name: entity_id, dtype: int64

In [39]:
# When several rows share an output file, keep only the last one listed.
is_superseded = test.duplicated(subset='output_file_path', keep='last')
test = test.loc[~is_superseded]


In [41]:
test

Unnamed: 0,model,input_file_path,entity_id,gen_model,task,Timestamp,gpu,output_file_path,num_designs,wall_time_task,score,global_score,fixed_chains,designed_chains,CA_model_name,git_hash,seed
0,ProteinMPNN,/content/drive/MyDrive/Generative_Models/IL10_...,06cc55b4-c549-45f5-b5d0-38f999f290cb,RFdiffusion_150it,Sequence Redesign (fixed motif),2024-06-23 05:34:07.561228,T4 GPU,/content/drive/MyDrive/Generative_Models/IL10_...,10,17.789294242858887 Seconds,2.8223,2.8223,[],['A'],v_48_020,8907e6671bfbfc92303b5f79c4b5e6ce47cdef57,65
1,ProteinMPNN,/content/drive/MyDrive/Generative_Models/IL10_...,d858977c-980f-41f6-abdc-10bcb2331680,RFdiffusion_150it,Sequence Redesign (fixed motif),2024-06-23 05:34:25.400879,T4 GPU,/content/drive/MyDrive/Generative_Models/IL10_...,10,18.18731689453125 Seconds,2.8456,2.8456,[],['A'],v_48_020,8907e6671bfbfc92303b5f79c4b5e6ce47cdef57,201
2,ProteinMPNN,/content/drive/MyDrive/Generative_Models/IL10_...,96031dd0-9b58-425e-b0db-a7bd0347cf87,RFdiffusion_150it,Sequence Redesign (fixed motif),2024-06-23 05:34:43.620223,T4 GPU,/content/drive/MyDrive/Generative_Models/IL10_...,10,22.244973182678223 Seconds,2.8954,2.8954,[],['A'],v_48_020,8907e6671bfbfc92303b5f79c4b5e6ce47cdef57,215
3,ProteinMPNN,/content/drive/MyDrive/Generative_Models/IL10_...,c44462fd-e20b-4f78-b147-af279a30af71,RFdiffusion_150it,Sequence Redesign (fixed motif),2024-06-23 05:35:05.899811,T4 GPU,/content/drive/MyDrive/Generative_Models/IL10_...,10,17.56941056251526 Seconds,2.8847,2.8847,[],['A'],v_48_020,8907e6671bfbfc92303b5f79c4b5e6ce47cdef57,958
4,ProteinMPNN,/content/drive/MyDrive/Generative_Models/IL10_...,9fb6f4cf-c2cd-4144-9ca4-4b7aca9fa989,RFdiffusion_150it,Sequence Redesign (fixed motif),2024-06-23 05:35:23.504224,T4 GPU,/content/drive/MyDrive/Generative_Models/IL10_...,10,17.479437112808228 Seconds,2.8726,2.8726,[],['A'],v_48_020,8907e6671bfbfc92303b5f79c4b5e6ce47cdef57,773
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
407,ProteinMPNN,/content/drive/MyDrive/Generative_Models/IL10_...,80b873d2-5b71-4a19-bef9-6cf8a59bd513,RFdiffusion_150it,Sequence Redesign (fixed motif),2024-06-23 08:21:01.197611,T4 GPU,/content/drive/MyDrive/Generative_Models/IL10_...,10,20.498369693756104 Seconds,3.0553,3.0553,[],['A'],v_48_020,8907e6671bfbfc92303b5f79c4b5e6ce47cdef57,782
408,ProteinMPNN,/content/drive/MyDrive/Generative_Models/IL10_...,4f9c5216-de0b-4cec-abb0-fa4158a28c6d,RFdiffusion_150it,Sequence Redesign (fixed motif),2024-06-23 08:21:21.752074,T4 GPU,/content/drive/MyDrive/Generative_Models/IL10_...,10,20.97646975517273 Seconds,2.9466,2.9466,[],['A'],v_48_020,8907e6671bfbfc92303b5f79c4b5e6ce47cdef57,967
409,ProteinMPNN,/content/drive/MyDrive/Generative_Models/IL10_...,c25c7794-91e3-4741-8ad6-b5430a0d5021,RFdiffusion_150it,Sequence Redesign (fixed motif),2024-06-23 08:21:42.767216,T4 GPU,/content/drive/MyDrive/Generative_Models/IL10_...,10,19.674657583236694 Seconds,2.9820,2.9820,[],['A'],v_48_020,8907e6671bfbfc92303b5f79c4b5e6ce47cdef57,654
410,ProteinMPNN,/content/drive/MyDrive/Generative_Models/IL10_...,df133588-3ef7-4cf2-b2dc-3fc749fe84bd,RFdiffusion_150it,Sequence Redesign (fixed motif),2024-06-23 08:22:02.482719,T4 GPU,/content/drive/MyDrive/Generative_Models/IL10_...,10,21.584928512573242 Seconds,2.9290,2.9290,[],['A'],v_48_020,8907e6671bfbfc92303b5f79c4b5e6ce47cdef57,713


In [55]:
import uuid
#for _, row in test.iterrows():

# Re-label the designs in the ProteinMPNN output FASTA and write them to a
# new FASTA file. Each kept record gets ONE fresh identifier, stored as its
# id (the FASTA writer builds the header from id and description) and
# mirrored in name. The description is cleared so the original header text
# is not carried over next to the new id.
mpnn_solo_fasta_in = '/content/drive/MyDrive/Generative_Models/IL10_scaffolding/MPNN_solo_redesigns/seqs/IL10_Mutant_model1.fa'
mpnn_solo_fasta_out = '/content/drive/MyDrive/Generative_Models/IL10_scaffolding/MPNN_solo_redesigns/il10.fa'

records = []
for i, design in enumerate(SeqIO.parse(mpnn_solo_fasta_in, "fasta")):
  if i == 0:
    # The first record is skipped, as before — presumably it is the input
    # sequence that ProteinMPNN echoes ahead of its designs; TODO confirm.
    continue
  design_id = f"il10_mpnn_solo_{uuid.uuid4()}"
  design.id = design_id
  design.name = design_id
  design.description = ""
  records.append(design)
with open(mpnn_solo_fasta_out, 'w') as f:
  SeqIO.write(records, f, 'fasta')




In [None]:
records[1]


SeqRecord(seq=Seq('MKEELEKRIEKLRKRIEEIEEKIKKREEKVKKFEEELPKLLEEIEKKFKKLEEA...KKI'), id='RFdiffusion_150it_il10_06cc55b4-c549-45f5-b5d0-38f999f290cb_design2', name='', description='', dbxrefs=[])

In [None]:
# Write whatever `records` currently holds to the RFdiffusion/MPNN FASTA.
# NOTE(review): `records` is reassigned by earlier cells for other design
# sets; confirm it holds the intended records before running this cell.
all_seqs_fasta_path = '/content/drive/MyDrive/Generative_Models/IL10_scaffolding/rfdiffusion_il10/MPNN_redesigns/all_generated_seqs.fa'
with open(all_seqs_fasta_path, 'w') as fasta_handle:
  SeqIO.write(records, fasta_handle, 'fasta')

In [None]:
!ls /content/drive/MyDrive/Generative_Models/Utilities

allseqs_batches.json	ESM_Embeddings.ipynb	unconditional_generation
allseqs_embeddings.csv	extract_all_seqs.ipynb	UniProt_Dist.ipynb
allseqs.fa		metadata		uniref_50_1_batches.json
cost_estimates.ipynb	NOT_INCLUDED		uniref_50_1_embeddings.csv
envs			simplified		Utilities


In [None]:
#from google.colab import files
#files.download('sequences.fasta')