In [16]:
#%pip install biopython
#%pip install pandas
from Bio import SeqIO
import pandas as pd

In [17]:
# Reference sequence in one-letter amino-acid codes. The '-' characters are
# presumably alignment/numbering gaps (TODO confirm); the sampled sequences
# displayed further down show 'X' at those same positions.
ref_seq = "EVQLVESGG-GLVQPGGSLRLSCAASGFNI----KDTYIHWVRQAPGKGLEWVARIYPT--NGYTRYADSVK-GRFTISADTSKNTAYLQMNSLRAEDTAVYYCSRWGGDGFYAMDYWGQGTLVTVSS"
# Display the total length, gap characters included.
len(ref_seq)

128

In [18]:
# Sanity check: 1-based positions 107..116 (inclusive) of ref_seq, i.e. the
# Python slice [106:116], should spell the 10-residue segment "WGGDGFYAMD".
ref_seq[107-1: 116] == "WGGDGFYAMD"

True

In [19]:
def get_edit_distance(original_seq, mutated_seq):
    '''
    Count the positions at which two equal-length sequences differ.

    Only substitutions are counted (a position-by-position comparison, i.e.
    the Hamming distance); insertions and deletions are not modelled.

    :param original_seq: str of AAs e.g. Trastuzumab's H3
    :param mutated_seq: str of AAs of mutated seq
    :returns: int of number of mutations
    :raises ValueError: if the two sequences do not have the same length
    '''
    if len(original_seq) != len(mutated_seq):
        # Enforce the documented precondition explicitly: previously a shorter
        # mutated_seq surfaced as a bare IndexError, and a longer one was
        # silently truncated, under-counting the mutations.
        raise ValueError(
            "Sequences must be the same length, got {} and {}".format(
                len(original_seq), len(mutated_seq)
            )
        )
    # Each mismatching pair contributes True (== 1) to the sum.
    return sum(
        original_AA != mutated_AA
        for original_AA, mutated_AA in zip(original_seq, mutated_seq)
    )

In [23]:
# Segment that every sampled sequence is compared against.
ref_segment = "WGGDGFYAMD"
# 1-based positions 107..116 inclusive -> the 10 residues lined up with ref_segment.
SEGMENT_SLICE = slice(107 - 1, 116)
# Every sampled sequence is asserted to have exactly this many characters.
EXPECTED_SEQ_LEN = 128

samples_set = set()  # distinct sequences seen so far
rows = []            # one record per distinct sequence, in file order
with open("outputs/1n8z_1000k_proteinmpnn/seqs/1n8z_abb2.fa") as handle:
    for record in SeqIO.parse(handle, "fasta"):
        # Only records whose header mentions "sample" are processed; any other
        # record (e.g. a leading non-sample entry) is skipped.
        if "sample" not in record.description:
            continue

        # The header is a ", "-separated list of key=value items. maxsplit=1
        # keeps a value intact even if it happens to contain '=' itself.
        info = dict(item.split('=', 1) for item in record.description.split(', '))

        seq = str(record.seq)
        if seq in samples_set:
            # Duplicate sequence: keep only the first occurrence.
            continue
        samples_set.add(seq)

        assert len(seq) == EXPECTED_SEQ_LEN, (
            "expected %d residues, got %d in record: %s"
            % (EXPECTED_SEQ_LEN, len(seq), record.description)
        )
        mutates = get_edit_distance(ref_segment, seq[SEGMENT_SLICE])
        # Metadata values are kept as the strings parsed from the header.
        rows.append([seq, mutates, info['sample'], info['score'], info['global_score'], info['seq_recovery']])

# Both counts should match: one row is appended per newly seen sequence.
print(len(samples_set), len(rows))

46552 46552


In [25]:
# Assemble the collected records into a table; the column order mirrors the
# layout of each entry appended to `rows`.
column_names = ['seq', 'mutates', 'sample', 'score', 'global_score', 'seq_recovery']
df = pd.DataFrame(rows, columns=column_names)
df

Unnamed: 0,seq,mutates,sample,score,global_score,seq_recovery
0,EVQLVESGGXGLVQPGGSLRLSCAASGFNIXXXXKDTYIHWVRQAP...,8,1,1.1743,1.1789,0.2000
1,EVQLVESGGXGLVQPGGSLRLSCAASGFNIXXXXKDTYIHWVRQAP...,8,2,1.2568,1.1619,0.2000
2,EVQLVESGGXGLVQPGGSLRLSCAASGFNIXXXXKDTYIHWVRQAP...,8,3,1.1610,1.1858,0.2000
3,EVQLVESGGXGLVQPGGSLRLSCAASGFNIXXXXKDTYIHWVRQAP...,8,4,1.2284,1.1977,0.2000
4,EVQLVESGGXGLVQPGGSLRLSCAASGFNIXXXXKDTYIHWVRQAP...,7,5,1.1738,1.1842,0.3000
...,...,...,...,...,...,...
46547,EVQLVESGGXGLVQPGGSLRLSCAASGFNIXXXXKDTYIHWVRQAP...,8,999816,1.3803,1.1782,0.2000
46548,EVQLVESGGXGLVQPGGSLRLSCAASGFNIXXXXKDTYIHWVRQAP...,7,999825,1.4511,1.1959,0.3000
46549,EVQLVESGGXGLVQPGGSLRLSCAASGFNIXXXXKDTYIHWVRQAP...,8,999842,1.4620,1.2034,0.2000
46550,EVQLVESGGXGLVQPGGSLRLSCAASGFNIXXXXKDTYIHWVRQAP...,9,999881,1.4729,1.1866,0.1000


In [26]:
# Persist the table of unique sequences to CSV; index=False leaves the
# DataFrame's row index out of the file.
df.to_csv("proteinmpnn_s1m_unique_results.csv", index=False)