In [1]:
# !pip install xlwings
# !pip install torch

In [2]:
# Import soft alignment functions

from soft_align import *

In [3]:
# Import necessary libraries

import xlwings as xw

In [4]:
def build_excel_sheet(seq_1_str, seq_2_str, seq_1_embedding, seq_2_embedding,
                      highlight_color=(254, 254, 69)):
    """Write the cosine distance matrix of two sequences into a new Excel workbook.

    Sheet layout:
      * A3 downward / B3 downward: position index and character of sequence 1
      * C1 rightward / C2 rightward: position index and character of sequence 2
      * C3 onward: the matrix returned by ``get_data_matrix``

    Every cell on the path returned by ``get_longest_path`` is filled with
    ``highlight_color``.

    Excel must be available to xlwings (the program should already be open).

    Parameters
    ----------
    seq_1_str, seq_2_str : str
        The two sequences; sequence 1 labels the rows, sequence 2 the columns.
    seq_1_embedding, seq_2_embedding
        Embeddings of the two sequences, passed unchanged to ``get_data_matrix``.
    highlight_color : tuple of int, optional
        RGB fill colour for the path cells. Defaults to yellow ``(254, 254, 69)``.

    Returns
    -------
    None
        The result is the populated workbook left open in Excel.
    """
    # Zero-based (row, column) of cell C3, the top-left corner of the matrix:
    # two header rows and two header columns come before it.
    matrix_origin = (2, 2)

    # Create new excel workbook containing an empty sheet
    wb = xw.Book()
    sheet = wb.sheets[0]

    # Sequence 1 runs down the sheet: a list of one-element lists writes a column
    sheet['A3'].value = [[x] for x in range(0, len(seq_1_str))]
    sheet['B3'].value = [[x] for x in seq_1_str]

    # Sequence 2 runs across the sheet: a flat list writes a row
    sheet['C1'].value = list(range(0, len(seq_2_str)))
    sheet['C2'].value = list(seq_2_str)

    # Generate matrix containing cosine distances of amino acid embeddings from sequence representations
    data = get_data_matrix(seq_1_embedding, seq_2_embedding)

    # Generate matches of amino acids from cosine distance matrix
    matches = get_matches(seq_1_str, seq_2_str, data)

    # Generate a longest path of matches that diagonally traverse the matrix, to be considered soft alignment
    longest_path = get_longest_path(data, matches)

    # Populate the sheet with the cosine distance matrix
    sheet['C3'].options(index=False).value = data.to_numpy()

    # Color the longest path in the cosine distance matrix.
    # Assumes each path entry is a zero-based (row, column) pair into `data` - TODO confirm.
    for m in longest_path:
        # int() keeps the index usable if the path holds NumPy integers rather than
        # plain Python ints (presumably possible; harmless when they are already ints).
        cell = (int(m[0]) + matrix_origin[0], int(m[1]) + matrix_origin[1])
        sheet[cell].color = highlight_color
    

In [5]:
# Locations of the example inputs: the FASTA file holding the sequences and
# the directory holding one saved embedding file per sequence

fasta_directory = './soft_align_example/'
fasta_file = 'example.fasta'

embedding_directory = './soft_align_example/example_embeddings/'


In [6]:
# Parse the fasta file into a dictionary:
#   key   -> sequence name
#   value -> the record stored for that sequence in the fasta file

seqs = SeqIO.to_dict(
    SeqIO.parse(f"{fasta_directory}{fasta_file}", 'fasta')
)

# Sequence names, in the order the dictionary holds them
seq_names = list(seqs)
print(seq_names)

['YP_006990334.1', 'YP_001468397.1', 'WP_016056174.1']


In [7]:
# Preview the first 40 positions of one record, looked up by its sequence name

seqs['YP_006990334.1'].seq[:40]

Seq('MQNNKNFQNVLLAHINNIKDLPLKARIDYFEDDKDDLVIN')

In [8]:
# Plain-string versions of the first two sequences in the fasta file,
# which are the pair being compared

seq_1_str, seq_2_str = (
    str(seqs[seq_names[idx]].seq) for idx in (0, 1)
)


In [9]:
# Print the number of characters in each sequence string
# (sequence 1 first, then sequence 2)

print("Length Sequence 1: ", len(seq_1_str))
print("Length Sequence 2: ", len(seq_2_str))

Length Sequence 1:  135
Length Sequence 2:  135


In [10]:
# Load the saved embedding of each sequence from "<embedding_directory><sequence name>.pt".
# Each loaded object is used as a dictionary by the cells that follow.
# map_location="cpu" lets the files load on a machine without a GPU even if the
# tensors were saved from a CUDA device.
# NOTE(review): torch.load unpickles the file - only load embeddings from a trusted source.

seq_1_embedding = torch.load(f"{embedding_directory}{seq_names[0]}.pt", map_location="cpu")
seq_2_embedding = torch.load(f"{embedding_directory}{seq_names[1]}.pt", map_location="cpu")

In [None]:
# Show the keys of each embedding dictionary loaded with torch.load

print("Sequence 1 Dictionary Keys: ", seq_1_embedding.keys())
print("Sequence 2 Dictionary Keys: ", seq_2_embedding.keys())

In [None]:
# Cosine distances between the amino acid embeddings of the two sequences;
# the last line previews the top-left 5 x 5 corner of the matrix

data = get_data_matrix(seq_1_embedding, seq_2_embedding)
data.iloc[:5, :5]

In [None]:
# Shape of the cosine distance matrix:
# (length of sequence 1, length of sequence 2), i.e. rows follow sequence 1
# and columns follow sequence 2

data.shape

In [None]:
# Write the cosine distance matrix to a new Excel workbook and color the longest path

build_excel_sheet(
    seq_1_str=seq_1_str,
    seq_2_str=seq_2_str,
    seq_1_embedding=seq_1_embedding,
    seq_2_embedding=seq_2_embedding,
)