In [19]:
import sys
import pandas as pd

In [20]:
# Returns the count of a specified pattern

def pattern_count(text: str, pattern: str) -> int:
    """Count the occurrences of ``pattern`` in ``text``.

    Overlapping occurrences are counted separately, so ``"AA"`` occurs
    three times in ``"AAAA"``.
    """
    last_start = len(text) - len(pattern)
    # Probe every start index at which a full-length window still fits.
    return sum(
        1
        for start in range(last_start + 1)
        if text.startswith(pattern, start)
    )

In [21]:
# Given a frequency table matching strings to integers, return the highest integer value

def max_map(freq_map):
    """Return the largest value stored in ``freq_map``.

    The result is never below 0: an empty map, or a map whose values are
    all negative, yields 0.
    """
    # Seeding the candidates with 0 reproduces the floor of the running
    # maximum and keeps the call valid for an empty map.
    return max([0, *freq_map.values()])

In [22]:
# Generate a frequency table given a string and pattern length k

def frequency_table(text: str, k: int):
    """Count every length-``k`` substring of ``text``.

    Returns a dict mapping each substring to the number of (possibly
    overlapping) times it occurs, in order of first appearance. The dict
    is empty when ``text`` is shorter than ``k``.
    """
    counts = {}
    for start in range(len(text) - k + 1):
        kmer = text[start:start + k]
        counts[kmer] = counts.get(kmer, 0) + 1
    return counts

In [23]:
# Given a string and sliding window size k, return the patterns of length k with the highest frequency

def frequent_words(text: str, k: int) -> list[str]:
    """Return every length-``k`` substring of ``text`` with the top count.

    All substrings tied for the highest frequency are returned, in order
    of first appearance in ``text``. The list is empty when ``text`` has
    no substring of length ``k``.
    """
    counts = frequency_table(text, k)
    top = max_map(counts)
    return [kmer for kmer, occurrences in counts.items() if occurrences == top]

In [24]:
# Return the reverse complement of a DNA string

# Complement of each upper-case DNA base; any other character maps to itself.
_COMPLEMENT_TABLE = str.maketrans("ACGT", "TGCA")


def reverse_complement(pattern: str) -> str:
    """Return the reverse complement of a DNA string, in upper case.

    Each base is swapped for its complement (A<->T, C<->G) and the result
    is reversed. Input is case-insensitive: lower-case bases are
    complemented as well, not merely upper-cased. Characters other than
    A/C/G/T (for example ``N``) are kept, upper-cased, in their mirrored
    position.
    """
    # Upper-case first so that a single translation pass handles both
    # cases and no base can be swapped twice.
    return pattern.upper().translate(_COMPLEMENT_TABLE)[::-1]

In [25]:
# Return the starting positions of a pattern in genome

def pattern_matching(pattern: str, genome: str) -> list[int]:
    """Return every 0-based index at which ``pattern`` occurs in ``genome``.

    Overlapping occurrences are all reported, in increasing order.
    """
    last_start = len(genome) - len(pattern)
    return [
        start
        for start in range(last_start + 1)
        if genome.startswith(pattern, start)
    ]

In [27]:
# Given a genome, find strings of length k and frequency >= t that appear in windows of length l

def find_clumps(genome: str, k: int, l: int, t: int) -> set[str]:
    """Find the k-mers that form an (l, t)-clump somewhere in ``genome``.

    A k-mer qualifies when at least one window of length ``l`` contains it
    ``t`` or more times (overlapping occurrences count).

    Args:
        genome: Sequence to scan.
        k: Length of the k-mers.
        l: Length of the sliding window.
        t: Minimum number of occurrences inside a single window.

    Returns:
        The set of qualifying k-mers. It is empty when no window fits
        (``l > len(genome)``), when a k-mer cannot fit in a window
        (``k > l``), or when ``k`` is not positive.
    """
    n = len(genome)
    if k <= 0 or k > l or l > n:
        return set()

    # Count the k-mers of the first window once ...
    counts = {}
    for start in range(l - k + 1):
        kmer = genome[start:start + k]
        counts[kmer] = counts.get(kmer, 0) + 1
    clumps = {kmer for kmer, seen in counts.items() if seen >= t}

    # ... then slide one position at a time: exactly one k-mer leaves on
    # the left and one enters on the right, so only the entering k-mer can
    # newly reach the threshold.
    for start in range(1, n - l + 1):
        leaving = genome[start - 1:start - 1 + k]
        counts[leaving] -= 1
        entering = genome[start + l - k:start + l]
        counts[entering] = counts.get(entering, 0) + 1
        if counts[entering] >= t:
            clumps.add(entering)
    return clumps

In [28]:
# Calculate the skew of a genome, G = +1, C = -1

def skew(genome: str):
    """Return the running G-minus-C skew of ``genome``.

    The result has ``len(genome) + 1`` entries: entry ``i`` is the number
    of ``'G'`` minus the number of ``'C'`` among the first ``i`` symbols,
    so entry 0 is always 0. Any other symbol (lower case included) leaves
    the skew unchanged.
    """
    step = {'G': 1, 'C': -1}
    running = 0
    prefix = [running]
    for base in genome:
        running += step.get(base, 0)
        prefix.append(running)
    return prefix

In [29]:
# Determine the positions in a genome at which the skew is minimized

def min_skew(genome: str):
    """Return every position at which the skew of ``genome`` is lowest.

    Positions index the list produced by ``skew``: position ``i`` refers
    to the skew after the first ``i`` symbols, and position 0 is the
    empty prefix.
    """
    prefix = skew(genome)
    # skew() always starts its list with 0, so the smallest entry is never
    # above 0 and the list is never empty.
    lowest = min(prefix)
    return [index for index, value in enumerate(prefix) if value == lowest]

In [30]:
# Calculate the hamming distance between two strings

def hamming_distance(p: str, q: str) -> int:
    """Count the positions at which ``p`` and ``q`` differ.

    The strings are compared pairwise up to the length of the shorter
    one; surplus characters of the longer string are ignored.
    """
    return sum(1 for left, right in zip(q, p) if left != right)

In [31]:
# Return the starting positions of all windows within Hamming distance d (inclusive) of the pattern

def approximate_pattern_matching(pattern: str, text: str, d: int) -> list[int]:
    """Return the start of every window of ``text`` close to ``pattern``.

    A window has the same length as ``pattern`` and is reported when its
    Hamming distance to ``pattern`` is at most ``d``. Positions are
    0-based and returned in increasing order.
    """
    width = len(pattern)
    return [
        start
        for start in range(len(text) - width + 1)
        if hamming_distance(text[start:start + width], pattern) <= d
    ]

In [32]:
# Return the number of approximately matching patterns of hamming distance d in a string

def approximate_pattern_count(text: str, pattern: str, d: int) -> int:
    """Count the windows of ``text`` within Hamming distance ``d`` of ``pattern``.

    Note the argument order: ``text`` comes first here, whereas
    ``approximate_pattern_matching`` takes ``pattern`` first.
    """
    matches = approximate_pattern_matching(pattern, text, d)
    return len(matches)

In [33]:
# Generate the neighborhood of a pattern

def neighbors(s: str, d: int) -> list[str]:
    """Return the ``d``-neighbourhood of ``s`` over the alphabet A/C/G/T.

    The neighbourhood is built recursively from that of the suffix
    ``s[1:]`` and is meant to hold every string of the same length whose
    Hamming distance to ``s`` is at most ``d``. The order of the returned
    list is unspecified whenever it is derived from a set.
    """
    if d == 0:
        return [s]
    alphabet = ['A', 'C', 'G', 'T']
    if len(s) == 1:
        # With at least one mismatch allowed, any single base is a neighbour.
        return alphabet

    head, tail = s[0], s[1:]
    found = set()
    for tail_variant in neighbors(tail, d):
        if hamming_distance(tail, tail_variant) < d:
            # A mismatch is still available, so the first symbol is free.
            found.update(base + tail_variant for base in alphabet)
        else:
            # Budget exhausted by the suffix: the first symbol must match.
            found.add(head + tail_variant)
    return list(found)

In [34]:
# Find the most frequent kmer with at most d mismatches in a string

def frequent_words_with_mismatches(text: str, k: int, d: int) -> list[str]:
    """Return the k-mers that approximately occur most often in ``text``.

    Each length-``k`` window of ``text`` gives one vote to every k-mer in
    its ``d``-neighbourhood; all k-mers tied for the most votes are
    returned. The winners need not occur exactly in ``text``. The list is
    empty when ``text`` is shorter than ``k``.
    """
    votes = {}
    for start in range(len(text) - k + 1):
        for candidate in neighbors(text[start:start + k], d):
            votes[candidate] = votes.get(candidate, 0) + 1
    best = max_map(votes)
    return [kmer for kmer, tally in votes.items() if tally == best]


In [35]:
# Find the most frequent kmer including its reverse complement with at most d mismatches in a string

def frequent_words_mismatches_reverse_complements(text: str, k: int, d: int) -> list[str]:
    """Return the k-mers maximising Count_d(P) + Count_d(rc(P)) in ``text``.

    ``Count_d(P)`` is the number of length-``k`` windows of ``text``
    within Hamming distance ``d`` of ``P``, and ``rc`` is the reverse
    complement. All k-mers tied for the highest combined count are
    returned; the list is empty when ``text`` is shorter than ``k``.

    Args:
        text: Sequence to scan.
        k: Length of the k-mers.
        d: Maximum number of mismatches allowed.
    """
    freq_map = {}
    for i in range(len(text) - k + 1):
        pattern = text[i:i + k]
        # The window and its reverse complement vote independently. A
        # k-mer close to both must be counted twice, so the two
        # neighbourhoods are deliberately not merged into one set.
        for strand in (pattern, reverse_complement(pattern)):
            for neighbor in neighbors(strand, d):
                freq_map[neighbor] = freq_map.get(neighbor, 0) + 1

    m = max_map(freq_map)
    return [pattern for pattern, count in freq_map.items() if count == m]