In [1]:
##Importing Packages
import numpy as np
import os
import time 
from typing import Tuple, Dict, Any

In [17]:
class ScoringSystem:
    def __init__(self, match: int = 2, mismatch: int = -1, gap: int = -2, quality_weights: Dict[str, int] = None) -> None:
        self.match = match
        self.mismatch = mismatch
        self.gap = gap
        ##Need to add more weights
        self.quality_weights = quality_weights if quality_weights else {'-': 3, '|': 2, '7': 2}

    def _fuzzy_similarity(self, a: str, b: str, quality_a: str, quality_b: str) -> float:
        weight_a = self.quality_weights.get(quality_a,1)
        weight_b = self.quality_weights.get(quality_b,1)
        #weight = min(weight_a,weight_b)
        weight = weight_a
        print(f"Comparing '{a}' with '{b}': Fuzzy Weight = {weight}")
        return weight
        
    def _default_scoring(self, a: str, b: str, quality_a: str, quality_b: str) -> float:
        if a == '-' or b == '-':
            return self.gap
        fuzzy_score = self._fuzzy_similarity(a, b, quality_a, quality_b)
        if a != b:
            score = self.mismatch * (1-fuzzy_score)
        else:
            score = self.match * fuzzy_score
        return score

    def score(self, a: str, b: str, quality_a: str, quality_b: str) -> float:
        assert isinstance(a, str) and isinstance(b, str)
        assert len(a) == 1 and len(b) == 1
        return self._default_scoring(a, b, quality_a, quality_b)

    def __str__(self):
        return f'Match: {self.match}, Mismatch: {self.mismatch}, Gap: {self.gap}, Quality Weights: {self.quality_weights}'


In [18]:
class SequencesAnalyzer:
    traceback_symbols = {
        0: '↖',  # Diagonal
        1: '↑',  # Up
        2: '←'   # Left
    }

    def __init__(self, seq_a_file: str, seq_b_file: str, match: int = 2, mismatch: int = -1, gap: int = -2, quality_weights: Dict[str, int] = None) -> None:
        self.seq_a, self.quality_a = self._load_sequence_from_file(seq_a_file)
        self.seq_b, self.quality_b = self._load_sequence_from_file(seq_b_file)
        self.scoring_sys = ScoringSystem(match, mismatch, gap, quality_weights)

    def _load_sequence_from_file(self,filename:str) -> Tuple[str,str]:
        with open(filename,'r') as file:
            lines = file.readlines()
            sequence = lines[0].strip()
            quality = lines[1].strip()
        return sequence,quality

    def smith_waterman_algorithm(self) -> Dict[str, Any]:
        rows, cols = len(self.seq_a) + 1, len(self.seq_b) + 1
        H = np.zeros(shape=(rows, cols), dtype=int)
        traceback = np.zeros(shape=(rows, cols), dtype=np.dtype('U5'))

        max_score = 0
        max_pos = (0, 0)

        for row in range(1, rows):
            for col in range(1, cols):
                a = self.seq_a[row - 1]
                b = self.seq_b[col - 1]
                qa = self.quality_a[row - 1] if row - 1 < len(self.quality_a) else '-'
                qb = self.quality_b[col - 1] if col - 1 < len(self.quality_b) else '-'

                score_diag = H[row - 1, col - 1] + self.scoring_sys.score(a, b, qa, qb)
                score_up = H[row - 1, col] + self.scoring_sys.gap
                score_left = H[row, col - 1] + self.scoring_sys.gap

                H[row, col] = max(0, score_diag, score_up, score_left)

                if H[row, col] == score_diag:
                    traceback[row, col] = self.traceback_symbols[0]
                elif H[row, col] == score_up:
                    traceback[row, col] = self.traceback_symbols[1]
                elif H[row, col] == score_left:
                    traceback[row, col] = self.traceback_symbols[2]

                if H[row, col] > max_score:
                    max_score = H[row, col]
                    max_pos = (row, col)

        return {
            'result_matrix': H,
            'traceback_matrix': traceback,
            'score': max_score,
            'score_pos': max_pos
        }
    
    def local_alignment(self, output_filename: str) -> Tuple[str, str, str, np.ndarray]:
        start_time = time.time()
        result = self.smith_waterman_algorithm()
        alignment_a, alignment_b, matches = self._traceback(
            result_matrix=result['result_matrix'],
            traceback_matrix=result['traceback_matrix'],
            start_pos=result['score_pos']
        )
        '''temp = len(result['traceback_matrix'])
        for i in range(1, temp):
            result['traceback_matrix'][:, 0][i] = self.seq_a[i - 1]
        for i in range(1, temp):
            result['traceback_matrix'][0, :][i] = self.seq_b[i - 1]'''

        end_time = time.time()
        elapsed_time = end_time - start_time

        with open(output_filename, "w", encoding='utf-8') as f:
            f.write(f"Input Sequence A: {self.seq_a}\n")
            f.write(f"Input Sequence B: {self.seq_b}\n")
            f.write(f"Alignment Score: {result['score']}\n\n")
            f.write("Alignment:\n")
            f.write(f"{alignment_a}\n")
            f.write(f"{matches}\n")
            f.write(f"{alignment_b}\n\n")
            f.write(f"Result Matrix:\n {result['result_matrix']}\n\n")
            f.write(f"Traceback Matrix:\n {result['traceback_matrix']}\n\n")
            f.write(f"Elapsed Time: {elapsed_time:.4f} seconds\n")
        
        print(f"[INFO] Alignment process complete. Time taken: {elapsed_time:.4f} seconds")
        return alignment_a, alignment_b, matches, result['traceback_matrix']

    def _traceback(self, result_matrix, traceback_matrix, start_pos: Tuple[int, int]) -> Tuple[str, str, str]:
        seq_a_aligned = ''
        seq_b_aligned = ''
        matches = ''

        row, col = start_pos
        steps = 0

        while result_matrix[row, col] > 0:
            symbol = traceback_matrix[row, col]

            if symbol == '↖':
                seq_a_aligned += self.seq_a[row - 1]
                seq_b_aligned += self.seq_b[col - 1]
                matches += '|' if self.seq_a[row - 1] == self.seq_b[col - 1] else ' '
                row -= 1
                col -= 1
            elif symbol == '↑':
                seq_a_aligned += self.seq_a[row - 1]
                seq_b_aligned += '-'
                matches += ' '
                row -= 1
            elif symbol == '←':
                seq_a_aligned += '-'
                seq_b_aligned += self.seq_b[col - 1]
                matches += ' '
                col -= 1
            else:
                print(f"[ERROR] Invalid symbol '{symbol}' at ({row}, {col}). Exiting traceback.")
                break

            steps += 1
            if steps > len(self.seq_a) + len(self.seq_b):
                print(f"[ERROR] Too many steps in traceback. Potential infinite loop detected. Exiting.")
                break

        return seq_a_aligned[::-1], seq_b_aligned[::-1], matches[::-1]

In [19]:
seq_1_content = "AGCTTAGCTA\n|-|77|7-|\n"  ##Read_Sequence with Quality Value
seq_2_content = "AGTTCGCTA\n7-||7|-||\n"    ##Reference_Sequence with Quality Value
seq_1_path = "seq_1_fuzzy.txt"
seq_2_path = "seq_2_fuzy.txt"
##Writing the content to the files
with open(seq_1_path, 'w') as f:
    f.write(seq_1_content)

with open(seq_2_path, 'w') as f:
    f.write(seq_2_content)

In [20]:
quality_weights = {'-': 3, '|': 2, '7': 2}

In [21]:
analyzer = SequencesAnalyzer('seq_1_fuzzy.txt', 'seq_2_fuzy.txt', match=3, mismatch=-1, gap=-2, quality_weights=quality_weights)

In [22]:
alignment_a, alignment_b, matches, trac_mat = analyzer.local_alignment(output_filename='alig_output_fuzzy.txt')

Comparing 'A' with 'A': Fuzzy Weight = 2
Comparing 'A' with 'G': Fuzzy Weight = 2
Comparing 'A' with 'T': Fuzzy Weight = 2
Comparing 'A' with 'T': Fuzzy Weight = 2
Comparing 'A' with 'C': Fuzzy Weight = 2
Comparing 'A' with 'G': Fuzzy Weight = 2
Comparing 'A' with 'C': Fuzzy Weight = 2
Comparing 'A' with 'T': Fuzzy Weight = 2
Comparing 'A' with 'A': Fuzzy Weight = 2
Comparing 'G' with 'A': Fuzzy Weight = 3
Comparing 'G' with 'G': Fuzzy Weight = 3
Comparing 'G' with 'T': Fuzzy Weight = 3
Comparing 'G' with 'T': Fuzzy Weight = 3
Comparing 'G' with 'C': Fuzzy Weight = 3
Comparing 'G' with 'G': Fuzzy Weight = 3
Comparing 'G' with 'C': Fuzzy Weight = 3
Comparing 'G' with 'T': Fuzzy Weight = 3
Comparing 'G' with 'A': Fuzzy Weight = 3
Comparing 'C' with 'A': Fuzzy Weight = 2
Comparing 'C' with 'G': Fuzzy Weight = 2
Comparing 'C' with 'T': Fuzzy Weight = 2
Comparing 'C' with 'T': Fuzzy Weight = 2
Comparing 'C' with 'C': Fuzzy Weight = 2
Comparing 'C' with 'G': Fuzzy Weight = 2
Comparing 'C' wi