In [1]:
#Importing the necesary biopython library
import Bio
import time
from prettytable import PrettyTable

In [2]:
# Smith-Waterman function from previous response
def smith_waterman(A, B, match_score=2, mismatch_score=-1, gap_penalty=-1):
    m, n = len(A), len(B)
    H = [[0 for x in range(n+1)] for y in range(m+1)]

    for i in range(1, m+1):
        for j in range(1, n+1):
            match = H[i-1][j-1] + (match_score if A[i-1] == B[j-1] else mismatch_score)
            delete = H[i-1][j] + gap_penalty
            insert = H[i][j-1] + gap_penalty
            H[i][j] = max(match, delete, insert, 0)
    
    # Traceback
    a, b = '', ''
    i, j = max([(i, j) for i in range(m+1) for j in range(n+1)], key=lambda x: H[x[0]][x[1]])
    while H[i][j] != 0:
        current_score = H[i][j]
        if i > 0 and H[i][j] == H[i-1][j] + gap_penalty:
            a += A[i-1]
            b += '-'
            i -= 1
        elif j > 0 and H[i][j] == H[i][j-1] + gap_penalty:
            a += '-'
            b += B[j-1]
            j -= 1
        else:
            a += A[i-1]
            b += B[j-1]
            i -= 1
            j -= 1

    return a[::-1], b[::-1]

In [3]:
# Test
A = "GATTCA"
B = "GATCA"
alignmentA, alignmentB = smith_waterman(A, B)
print(alignmentA)
print(alignmentB)

GATTCA
GAT-CA


In [4]:
def read_fasta(file_name):
    """
    Reads a .fasta file and returns the sequences and descriptions in separate lists.

    Args:
    - file_name (str): Name of the .fasta file.

    Returns:
    - sequences (list of str): List of sequences from the file.
    - descriptions (list of str): List of sequence descriptions from the file.
    """
    
    sequences = []
    descriptions = []

    with open(file_name, 'r') as file:
        sequence = ''
        description = None
        
        for line in file:
            line = line.strip()

            if line.startswith('>'):
                if description:  # this checks if there's a previously read sequence description
                    sequences.append(sequence)
                    sequence = ''

                description = line[1:]  # remove the '>'
                descriptions.append(description)

            else:
                sequence += line

        if sequence:  # add the last read sequence
            sequences.append(sequence)

    return sequences, descriptions

In [5]:
# Modified function to display the results
#def align_selected_sequences(sequences, descriptions, indices):
#    for i in range(len(indices) - 1):
#        for j in range(i + 1, len(indices)):
#            print(f"Aligning:\n{descriptions[indices[i]]}\n{descriptions[indices[j]]}\n")
#            alignmentA, alignmentB = smith_waterman(sequences[indices[i]], sequences[indices[j]])
            #print(colorize_sequence(alignmentA))
            #print(colorize_sequence(alignmentB))
            #print("\n" + "-"*50 + "\n")

            # Align selected sequences
def align_selected_sequences(sequences, indices):
    for i in range(len(indices) - 1):
        for j in range(i + 1, len(indices)):
            print(f"Aligning sequence {indices[i]} and sequence {indices[j]}:")
            alignmentA, alignmentB = smith_waterman(sequences[indices[i]], sequences[indices[j]])
            print("Alignment 1:")
            print(alignmentA)
            print("\nAlignment 2:")
            print(alignmentB)
            print("\n" + "-"*50 + "\n")


In [6]:
COLORS = {
    'A': '\033[91m',  # Red
    'C': '\033[92m',  # Green
    'D': '\033[93m',  # Yellow
    'E': '\033[94m',  # Blue
    'F': '\033[95m',  # Magenta
    'G': '\033[96m',  # Cyan
    'H': '\033[97m',  # White
    'I': '\033[41m',  # Red background
    'K': '\033[42m',  # Green background
    'L': '\033[43m',  # Yellow background
    'M': '\033[44m',  # Blue background
    'N': '\033[45m',  # Magenta background
    'P': '\033[46m',  # Cyan background
    'Q': '\033[47m',  # White background
    'R': '\033[100m', # Bright Black background
    'S': '\033[101m', # Bright Red background
    'T': '\033[102m', # Bright Green background
    'V': '\033[103m', # Bright Yellow background
    'W': '\033[104m', # Bright Blue background
    'Y': '\033[105m', # Bright Magenta background
    '-': '\033[0m',   # Reset
    'reset': '\033[0m'
}
def colorize_sequence(sequence):
    return ''.join(COLORS.get(nucleotide, COLORS['reset']) + nucleotide + COLORS['reset'] for nucleotide in sequence)

In [None]:
# Main
file_name = "COVID-GENOMES.fasta"
sequences, descriptions = read_fasta(file_name)

# Get sequence indices from the user
sequence_indices = input("Enter the indices of the sequences you want to align (comma separated): ")
indices_list = list(map(int, sequence_indices.split(',')))

# Align the sequences based on the selected indices
start_time = time.time()  # Start timing
align_selected_sequences(sequences, descriptions, indices_list)
end_time = time.time()  # End timing
print(end_time-start_time)

Enter the indices of the sequences you want to align (comma separated): 1, 2
Aligning:
accn|OP370189   OP370189.1   [Severe acute respiratory syndrome coronavirus 2 strain SARS-CoV-2/human/USA/GA-CDC-STM-BUUA3B3QF/2022 strain SARS-CoV-2/Human/USA/GA-CDC-STM-BUUA3B3QF/2022 | 2697049.6553178]
accn|OP370368   OP370368.1   [Severe acute respiratory syndrome coronavirus 2 strain SARS-CoV-2/human/USA/CO-CDC-STM-GV6KGWQXZ/2022 strain SARS-CoV-2/Human/USA/CO-CDC-STM-GV6KGWQXZ/2022 | 2697049.6553532]



In [None]:
print(sequences)

In [None]:
def create_table():
    x = PrettyTable()

    # Define the table header
    x.field_names = ["", "Computational time 1", "Computational time 2", "Computational time 3", "Computational time 4", "Computational time 5"]

    # Predefined alignments to test
    alignments_to_test = [
        (1, 2),
        (1, 2, 3, 4, 5),
        (0, 1, 2, 3, 4, 5, 6, 7, 8, 9),
        (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14)
    ]

    # For each alignment set, calculate the computational time
    all_times = []
    for alignment_set in alignments_to_test:
        start_time = time.time()
        
        # Align sequences (without printing)
        for i in range(len(alignment_set) - 1):
            for j in range(i + 1, len(alignment_set)):
                smith_waterman(sequences[alignment_set[i]], sequences[alignment_set[j]])
        
        end_time = time.time()
        all_times.append(end_time - start_time)

    # Add the results to the table
    x.add_row(["Alignment 1 and 2", all_times[0], "", "", "", ""])
    x.add_row(["Alignment 1, 2,3,4,5", all_times[0], all_times[1], "", "", ""])
    x.add_row(["Alignment 0,1,2,3,4,5,6,7,8,9", all_times[0], all_times[1], all_times[2], "", ""])
    x.add_row(["Alignment0,1,2,3,4,5,6,7,8,9,10,11,12,13,14", all_times[0], all_times[1], all_times[2], all_times[3], ""])
    
    print(x)

In [None]:
# Main part of the code
sequences, descriptions = read_fasta("sequence_spike_protein - Copy.txt")
create_table()