In [1]:
# Import the necessary libraries (Biopython, plus timing and table-formatting helpers)
import Bio
import time
from prettytable import PrettyTable

In [2]:
def smith_waterman(A, B, match_score=2, mismatch_score=-1, gap_penalty=-1):
    """
    Locally aligns two sequences with the Smith-Waterman algorithm (linear gap cost).

    Args:
    - A (str): First sequence.
    - B (str): Second sequence.
    - match_score (int): Score added when the two compared characters are equal.
    - mismatch_score (int): Score added when the two compared characters differ.
    - gap_penalty (int): Score added when a gap is opened or extended in either sequence.

    Returns:
    - (str, str): The aligned region of A and the aligned region of B, with '-'
      marking gaps. Both strings are empty when no cell scores above zero.

    Note:
    The full (len(A)+1) x (len(B)+1) score matrix is kept for the traceback, so
    memory use grows with the product of the two sequence lengths.
    """
    m, n = len(A), len(B)
    H = [[0] * (n + 1) for _ in range(m + 1)]

    # Remember the best-scoring cell while filling the matrix instead of
    # materialising every (i, j) pair afterwards. The strict '>' keeps the first
    # maximum in row-major order, and (0, 0) when every score is zero.
    best_score = 0
    best_i, best_j = 0, 0

    for i in range(1, m + 1):
        prev_row, row = H[i - 1], H[i]
        a_char = A[i - 1]
        for j in range(1, n + 1):
            match = prev_row[j - 1] + (match_score if a_char == B[j - 1] else mismatch_score)
            delete = prev_row[j] + gap_penalty
            insert = row[j - 1] + gap_penalty
            score = max(match, delete, insert, 0)
            row[j] = score
            if score > best_score:
                best_score = score
                best_i, best_j = i, j

    # Traceback: walk from the best cell until a zero-score cell is reached.
    # Characters are collected back-to-front and reversed once at the end.
    a_parts, b_parts = [], []
    i, j = best_i, best_j
    while H[i][j] != 0:
        score = H[i][j]
        if i > 0 and score == H[i - 1][j] + gap_penalty:
            # Came from the cell above: A's character is paired with a gap.
            a_parts.append(A[i - 1])
            b_parts.append('-')
            i -= 1
        elif j > 0 and score == H[i][j - 1] + gap_penalty:
            # Came from the cell on the left: B's character is paired with a gap.
            a_parts.append('-')
            b_parts.append(B[j - 1])
            j -= 1
        else:
            # Otherwise the score came from the diagonal (match or mismatch).
            a_parts.append(A[i - 1])
            b_parts.append(B[j - 1])
            i -= 1
            j -= 1

    return ''.join(reversed(a_parts)), ''.join(reversed(b_parts))

# Quick check on two short sequences that differ by a single base
A, B = "GATTCA", "GATCA"
alignmentA, alignmentB = smith_waterman(A, B)
print(alignmentA, alignmentB, sep="\n")

GATTCA
GAT-CA


In [3]:
def read_fasta(file_name):
    """
    Reads a .fasta file and returns the sequences and descriptions in separate lists.

    Every header line (a line starting with '>') yields exactly one description and
    one sequence, so the two lists stay index-aligned even when a header has an
    empty description or a record has no sequence lines.

    Args:
    - file_name (str): Name of the .fasta file.

    Returns:
    - sequences (list of str): List of sequences from the file, with the lines of
      each record joined together.
    - descriptions (list of str): List of sequence descriptions from the file
      (header lines without the leading '>').
    """

    sequences = []
    descriptions = []

    with open(file_name, 'r') as file:
        chunks = []          # stripped lines of the record currently being read
        description = None   # stays None until the first header line is seen

        for line in file:
            line = line.strip()

            if line.startswith('>'):
                # Compare against None rather than truthiness: an empty
                # description ('>' alone) is still a real header.
                if description is not None:
                    sequences.append(''.join(chunks))
                    chunks = []

                description = line[1:]  # remove the '>'
                descriptions.append(description)

            else:
                chunks.append(line)

        # Flush the last record. A header with no sequence lines still gets an
        # (empty) sequence entry; text from a file without any header is kept
        # as a single sequence, as before.
        sequence = ''.join(chunks)
        if description is not None or sequence:
            sequences.append(sequence)

    return sequences, descriptions

In [4]:
def multiple_sw_alignment(sequences):
    """
    Locally aligns every sequence against the first one with pairwise Smith-Waterman.

    The result holds one aligned string per input sequence, in input order, so it
    can be paired with the matching descriptions. Entry 0 is the first sequence as
    aligned against the second; every later entry is that sequence as aligned
    against the (ungapped) first sequence.

    NOTE(review): this is a set of pairwise local alignments anchored on
    sequences[0], not a column-consistent multiple sequence alignment; the
    returned strings may cover different regions and have different lengths.

    Args:
    - sequences (list of str): Sequences to align.

    Returns:
    - aligned_seqs (list of str): One aligned string per input sequence. With
      fewer than two sequences the input list is returned unchanged.
    """
    if len(sequences) < 2:
        return sequences

    reference = sequences[0]

    # Align the first two sequences; this also provides the reference's entry
    aligned_reference, aligned_second = smith_waterman(reference, sequences[1])
    aligned_seqs = [aligned_reference, aligned_second]

    # Align each remaining sequence against the original reference. Re-aligning
    # already gapped strings would treat '-' as an ordinary residue, and
    # replacing the list each round would drop the earlier sequences.
    for seq in sequences[2:]:
        _, aligned_seq = smith_waterman(reference, seq)
        aligned_seqs.append(aligned_seq)

    return aligned_seqs

In [5]:
def select_sequences(sequences, descriptions, indices):
    """
    Picks the sequences and descriptions found at the requested positions.

    Args:
    - sequences (list of str): All available sequences.
    - descriptions (list of str): Descriptions, index-aligned with `sequences`.
    - indices (list of int): Positions to pick, in the order they should be returned.

    Returns:
    - selected_sequences (list of str): `sequences[i]` for every i in `indices`.
    - selected_descriptions (list of str): `descriptions[i]` for every i in `indices`.
    """

    # First pass: gather the sequences
    chosen_seqs = []
    for position in indices:
        chosen_seqs.append(sequences[position])

    # Second pass: gather the matching descriptions
    chosen_descs = []
    for position in indices:
        chosen_descs.append(descriptions[position])

    return chosen_seqs, chosen_descs

In [6]:
def align_selected_sequences(sequences, descriptions, indices):
    """
    Selects the sequences at `indices`, aligns them, and prints the result.

    Args:
    - sequences (list of str): All available sequences.
    - descriptions (list of str): Descriptions, index-aligned with `sequences`.
    - indices (list of int): Positions of the sequences to align.

    Returns:
    - aligned_sequences (list of str): Whatever `multiple_sw_alignment` returns
      for the selected sequences.
    """
    chosen_seqs, chosen_descs = select_sequences(sequences, descriptions, indices)
    alignment = multiple_sw_alignment(chosen_seqs)

    print("\nAlignment of sequences:", indices)
    # One description line followed by its aligned sequence, per pair
    for pair in zip(chosen_descs, alignment):
        print(*pair, sep="\n")

    return alignment

In [7]:
# Read the .fasta file
file_name = "COVID-GENOMES.fasta"
sequences, descriptions = read_fasta(file_name)

# Get sequence indices from the user (0-based positions in the lists read above)
sequence_indices = input("Enter the indices of the sequences you want to align (comma separated): ")
# Skip empty entries so a stray or trailing comma does not make int() fail
indices_list = [int(token) for token in sequence_indices.split(',') if token.strip()]

# Align the sequences based on the selected indices.
# NOTE: smith_waterman keeps a full score matrix, so memory grows with the
# product of the sequence lengths; very long inputs can exhaust memory.
start_time = time.perf_counter()  # Start timing (monotonic clock meant for intervals)
align_selected_sequences(sequences, descriptions, indices_list)
end_time = time.perf_counter()  # End timing
print("Computational time:", end_time-start_time)

Enter the indices of the sequences you want to align (comma separated): 1, 2


MemoryError: 