In [10]:
import pandas as pd
import numpy as np
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO

In [2]:
# Load the SARS-CoV-2 RBD table from the CSV next to the notebook and preview its first five rows
df = pd.read_csv(filepath_or_buffer='SARS-CoV-2 RBD data.csv')
df.head(n=5)

Unnamed: 0,Sequence,Ligand,Data,Units,Assay/Protocol
0,NITNLCPFGEVFNATRFASVYAWNRKRISNCVADYSVLYNSASFST...,,-0.14,unitless,expr_lib1
1,AITNLCPFGEVFNATRFASVYAWNRKRISNCVADYSVLYNSASFST...,,-0.14,unitless,expr_lib1
2,AITNLCPFGEVFNATRFASVYAWNRKRISNCVADYSVLYNSASFST...,,-0.11,unitless,expr_avg
3,AITNLCPFGEVFNATRFASVYAWNRKRISNCVADYSVLYNSASFST...,,-0.08,unitless,expr_lib2
4,AITNLCPFGEVFNATRFASVYAWNRKRISNCVADYSVLYNSASFST...,,-0.05,unitless,bind_lib1


In [3]:
# The first row of the table is used as the template sequence; wrap it in a SeqRecord
# so later cells can compare every other row against `template.seq`.
template_seq = Seq(df['Sequence'].iloc[0])
template = SeqRecord(
    seq=template_seq,
    name='SARS-CoV-2 RBD Template',
    description='Parental sequence for RBD mutagenesis',
)


In [4]:
# Every row after the template row is treated as a mutant. Work on a copy so that
# columns added later do not touch `df`.
mutants_df = df.iloc[1:].copy()

# Sanity check on sequence lengths: a single unique length (201 residues here) means the
# sequences line up position-by-position, so the variants are likely substitutions only.
mutants_df['Sequence'].map(len).unique()



array([201], dtype=int64)

In [5]:
# Define a function to find substitution
def sub_finder(seq, reference=None):
    """Describe how ``seq`` differs from a reference sequence as substitution codes.

    Each differing position is written as
    ``<reference residue><1-based position><residue in seq>``, e.g. ``N1A``.

    Parameters
    ----------
    seq : str
        Sequence to compare. Must have the same length as the reference.
    reference : str or Seq, optional
        Sequence to compare against. When omitted, the notebook-level
        ``template.seq`` is used, so ``Series.apply(sub_finder)`` keeps working.

    Returns
    -------
    str
        The single code when exactly one position differs (``'N1A'``), the
        codes joined with ``'_'`` when several differ (``'N1A_T3G'``), and
        ``'WT'`` when ``seq`` is identical to the reference. Previously the
        multi/zero cases returned the ``str()`` of a Python list
        (``"['N1A', 'T3G']"`` / ``"[]"``), which put brackets, quotes and
        spaces into anything built from the result.

    Raises
    ------
    ValueError
        If ``seq`` and the reference have different lengths; a positional
        comparison is meaningless in that case.
    """
    if reference is None:
        # Fall back to the template record defined earlier in the notebook.
        reference = template.seq
    reference = str(reference)

    if len(seq) != len(reference):
        raise ValueError(
            "sequence length {} does not match reference length {}".format(
                len(seq), len(reference)
            )
        )

    mutations = [
        ref_aa + str(position) + aa
        for position, (ref_aa, aa) in enumerate(zip(reference, seq), start=1)
        if aa != ref_aa
    ]

    if not mutations:
        return "WT"
    return "_".join(mutations)

In [6]:
# Annotate each mutant row with the substitution code(s) reported by sub_finder
mutants_df['Substitution'] = mutants_df['Sequence'].map(sub_finder)


In [35]:
# Build one SeqRecord per mutant row: the ID carries the substitution label and the
# name carries a 1-based running number.
# NOTE(review): rows that share a sequence get identical IDs. The df.head() preview
# suggests several rows per variant (one per assay) - confirm whether the records
# should be de-duplicated before export.
seq_lst = mutants_df['Sequence'].tolist()
sub_lst = mutants_df['Substitution'].tolist()

seq_record_lst = []
for variant_no, (sequence, sub) in enumerate(zip(seq_lst, sub_lst), start=1):
    seq_record_lst.append(
        SeqRecord(
            seq=Seq(sequence),
            id='RBD_' + sub,
            name='Variant_' + str(variant_no),
            description='Mutant variant of SARS-CoV-2 RBD',
        )
    )

In [45]:
# Spot-check the record list: print every 1000th record among the first 5000,
# each followed by two blank lines.
for rec in seq_record_lst[0:5000:1000]:
    print(rec, end='\n\n\n')

ID: RBD_N1A
Name: Variant_1
Description: Mutant variant of SARS-CoV-2 RBD
Number of features: 0
Seq('AITNLCPFGEVFNATRFASVYAWNRKRISNCVADYSVLYNSASFSTFKCYGVSP...KST')


ID: RBD_V20H
Name: Variant_1001
Description: Mutant variant of SARS-CoV-2 RBD
Number of features: 0
Seq('NITNLCPFGEVFNATRFASHYAWNRKRISNCVADYSVLYNSASFSTFKCYGVSP...KST')


ID: RBD_S36A
Name: Variant_2001
Description: Mutant variant of SARS-CoV-2 RBD
Number of features: 0
Seq('NITNLCPFGEVFNATRFASVYAWNRKRISNCVADYAVLYNSASFSTFKCYGVSP...KST')


ID: RBD_Y50W
Name: Variant_3001
Description: Mutant variant of SARS-CoV-2 RBD
Number of features: 0
Seq('NITNLCPFGEVFNATRFASVYAWNRKRISNCVADYSVLYNSASFSTFKCWGVSP...KST')


ID: RBD_Y66P
Name: Variant_4001
Description: Mutant variant of SARS-CoV-2 RBD
Number of features: 0
Seq('NITNLCPFGEVFNATRFASVYAWNRKRISNCVADYSVLYNSASFSTFKCYGVSP...KST')




In [34]:
# Export all variant records to a single FASTA file (overwritten on every run)
with open("SARS-CoV-2 RBD Variants.fasta", mode="w") as output_handle:
    SeqIO.write(sequences=seq_record_lst, handle=output_handle, format='fasta')