# Pattern count function



In [None]:
'''
- Theoretical example:
- If we have the gene GCGCG, and we want to count the pattern of GCG in the given gene
- We want to slide the pattern GCG over the gene GCGCG, so it would be:

- First iteration:
- Text   : GCGCG
- Pattern: GCG

- Second iteration:
- Text   : GCGCG
- Pattern:  GCG

- Third iteration:
- Text   : GCGCG
- Pattern:   GCG

- As you can see, we can slide the pattern over the whole text of the gene in just three iterations for the given example! 
- We want to express that in code, and make it general for any text and any pattern, as we do in the code below:
'''

def PatternCount(Text, Pattern):
    """Count how many times Pattern occurs in Text, overlaps included.

    A window as wide as Pattern is slid across Text one character at a
    time; every window equal to Pattern adds one to the tally.

    Worked example for Text = "GCGCG" and Pattern = "GCG":

        window start 0 -> Text[0:3] == "GCG"  (match)
        window start 1 -> Text[1:4] == "CGC"
        window start 2 -> Text[2:5] == "GCG"  (match)

    so the result is 2.

    Args:
        Text: the string to search in.
        Pattern: the string to look for.

    Returns:
        The number of (possibly overlapping) occurrences as an int.
    """
    width = len(Pattern)

    # The last window that still fits starts at len(Text) - width, so there
    # is one more window position than that.  When Pattern is longer than
    # Text the value is <= 0 and range() simply yields nothing.
    window_starts = range(len(Text) - width + 1)

    return sum(
        1 for begin in window_starts if Text[begin:begin + width] == Pattern
    )

In [None]:
# Run the function on the worked example: "GCG" occurs twice in "GCGCG"
# (at indices 0 and 2; the two occurrences overlap).
PatternCount("GCGCG","GCG")

In [None]:
# Try a much longer sequence, counting occurrences of the 9-mer "TCGCTCGTC".
PatternCount("CGTCGCTCGTCTCGCTCGAATTCGCTCGAGGTTGACGGCAAATTGCCATCGCTCGCGTCGCTCGAGGTTCGCTCGATCGCTCGTCGCTCGTTCCACTCGCTCGTCGCTCGATCGCTCGCACTCGCTCGGCACTGGTCGCTCGGTCGCTCGTCGCTCGCGTCGCTCGCCTGGTCATCGCTCGAATCGCTCGTCGATCATCGCTCGCTCGCATCGCTCGTATCCCTCGCTCGCATCGCTCGTCGCTCGACTTCGCTCGCTCGCTCGTCGCTCGTTTGTTCGCTCGAAGATAATCCTCGCTCGGGTCGCTCGTCGCTCGTCGCTCGGGTCGCTCGGAGAATCGCTCGTCGCTCGTCGCTCGTCGCTCGTCGCTCGTTCCCCGTTCGCTCGCTATCGCTCGCAAGGTCGCTCGATGCTCGCTCGAATCGCTCGTCGCTCGTATTAACAACTTCGCTCGGTCGCTCGTCGCTCGTCGCTCGCTTCGCTCGTCGCTCGGTCGCTCGGTCGCTCGAGCTCGCTCGTCGCTCGCTCTCGCTCGGTGATCTCGCTCGTCTCGCTCGCATCGCTCGACTTTCTAGTCGCTCGGGTCTCGCTCGCCATCGCTCGGTCGCTCGCTAGCGTCTTCGCTCGTTAACGTCGCTCGTACGTCGCTCGATATCGCTCGGTTCCTCGCTCGTACTTCGCTCGTTATCGCTCGCATCGCTCGATAGTTCGCTCGTCGCTCGCTGCTCGCTCGTCGCTCGTATCGCTCGGCGTTCGCTCGTCGCTCGTCGCTCGATGGTCGCTCGTTCGCTCGTCGCTCGTTCGCTCGCAGATGCGTATCGCTCGGTATCGCTCGGTACATCGCTCGCGCTCGCATTCGCTCGATCGCTCGCTGTCGCTCGGCTGGTCGCTCGTCGCTCGTTATCGCTCGTGATCGCTCGTATCGCTCGACTCGCTCGTGACAATTCGCTCGGTCGCTCGCTCGCTCGCCTCGCTCGGGTCGCTCGTTCGCTCGATCGCTCGTCGCTCGCGATCGCTCGCTCGCTCGTCGCTCG","TCGCTCGTC")

# FrequentWords function

In [None]:
def FrequentWords(Text, k):
    """Return the set of most frequent k-mers in Text.

    Every window of length k in Text is tallied (overlapping occurrences
    count separately), and all k-mers that reach the highest tally are
    returned.

    Args:
        Text: the string to scan.
        k: length of the substrings (k-mers) to count.

    Returns:
        A set with every k-mer whose occurrence count equals the maximum.
        The set is empty when no window of length k fits in Text.
    """
    # Tally each k-mer in a single pass.  The "+ 1" is essential: the last
    # window starts at index len(Text) - k, and range() excludes its stop
    # value, so without it the final k-mer would never be examined.
    count = {}
    for i in range(len(Text) - k + 1):
        pattern = Text[i : i + k]
        count[pattern] = count.get(pattern, 0) + 1

    # k is longer than Text (or Text is empty): there is nothing to rank,
    # and max() below would fail on an empty collection.
    if not count:
        return set()

    # The maximum is computed once, after all windows have been counted.
    max_count = max(count.values())
    return {
        pattern
        for pattern, occurrences in count.items()
        if occurrences == max_count
    }

# Sanity check against a known answer: the most frequent 4-mers of this
# sample text are CATG and GCAT.
assert FrequentWords("ACGTTGCATGTCGCATGATGCATGAGAGCT", 4) == {'CATG', 'GCAT'}

In [None]:
# Find the most frequent 3-mers of a short sample sequence.
FrequentWords("CGGAGGACTCTAGGTAACGCTTATCAGGTCCATAGGACATTCA" , 3)

# FrequencyTable function

In [None]:
def FrequencyTable(Text, k):
    """Build a table mapping each k-mer of Text to its occurrence count.

    Args:
        Text: the string to scan.
        k: length of the substrings (k-mers) to count.

    Returns:
        A dict whose keys are the k-mers, in order of first appearance,
        and whose values are how many windows of Text equal that k-mer.
        The dict is empty when no window of length k fits in Text.
    """
    tally = {}
    # One window per valid start index; the last one begins at len(Text) - k.
    for start in range(len(Text) - k + 1):
        kmer = Text[start:start + k]
        if kmer in tally:
            tally[kmer] += 1
        else:
            tally[kmer] = 1
    return tally

def MaxMap(freqMap):
    """Return the largest value stored in freqMap.

    Args:
        freqMap: a mapping whose values can be compared with each other.

    Returns:
        The maximum of the mapping's values.

    Raises:
        ValueError: if freqMap has no entries (max() of an empty view).
    """
    occurrence_counts = freqMap.values()
    return max(occurrence_counts)

def FrequentWords(Text, k):
    """Return every most-frequent k-mer of Text as a list.

    The k-mers are counted with FrequencyTable, the highest count is found
    with MaxMap, and every k-mer reaching that count is collected.

    Args:
        Text: the string to scan.
        k: length of the substrings (k-mers) to count.

    Returns:
        A list of the k-mers whose count equals the maximum, in the order
        the frequency table yields them.  The list is empty when the table
        has no entries (no window of length k fits in Text).
    """
    freqMap = FrequencyTable(Text, k)

    # MaxMap takes max() over the table's values, which raises ValueError
    # on an empty table; with no k-mers at all the answer is simply empty.
    if not freqMap:
        return []

    maxCount = MaxMap(freqMap)
    return [
        Pattern
        for Pattern, occurrences in freqMap.items()
        if occurrences == maxCount
    ]

In [None]:
# Collect the most frequent 12-mers of a longer sample sequence in `output`.
output=FrequentWords("AGTGATCTGTACCTGATACCTGAGTACCAGATACGCTCTACGCTCTACGCTCTAGTGATCTGTACCTGAGTACCAGATGCCCGCCACGCTCTGTACCAGATAGTGATCTGACGCTCTACGCTCTAGTGATCTGACGCTCTACGCTCTAGTGATCTGGCCCGCCGCCCGCCGTACCAGATAGTGATCTGTACCTGAGCCCGCCGCCCGCCTACCTGAGTACCAGATAGTGATCTGGCCCGCCGTACCAGATTACCTGATACCTGAGTACCAGATGTACCAGATTACCTGAGTACCAGATAGTGATCTGACGCTCTTACCTGAGTACCAGATAGTGATCTGAGTGATCTGGCCCGCCGCCCGCCACGCTCTGCCCGCCAGTGATCTGACGCTCTGCCCGCCTACCTGAGCCCGCCAGTGATCTGACGCTCTTACCTGAGCCCGCCAGTGATCTGGCCCGCCGCCCGCCTACCTGAGTACCAGATTACCTGAGCCCGCCAGTGATCTGACGCTCTTACCTGAAGTGATCTGGTACCAGATGTACCAGATGTACCAGATAGTGATCTGACGCTCTGTACCAGATGCCCGCCAGTGATCTGGCCCGCCGTACCAGATGCCCGCCGTACCAGATAGTGATCTGGTACCAGATTACCTGAGTACCAGATACGCTCTACGCTCTGCCCGCCAGTGATCTGTACCTGAGCCCGCCTACCTGAGCCCGCCGTACCAGATGTACCAGATAGTGATCTGGCCCGCCGCCCGCCAGTGATCTGTACCTGAAGTGATCTGGTACCAGATGTACCAGATAGTGATCTGAGTGATCTG" , 12)

In [None]:
# Display the result stored in `output`.
output

In [None]:
" ".join(map(str, output))

# ReverseComplement function

In [None]:
def ReverseComplement(Text):
    """Return the reverse complement of the DNA string Text.

    Each nucleotide is replaced by its pairing partner (A<->T, G<->C) and
    the resulting sequence is reversed.

    Any character other than the uppercase letters A, C, G and T is
    reported by printing "Not ACTG!" and is left out of the result, so the
    returned string can be shorter than Text.

    Args:
        Text: the DNA string to process.

    Returns:
        The reverse complement as a string.
    """
    pairing = {"A": "T", "T": "A", "G": "C", "C": "G"}

    complement = []
    for nucleotide in Text:
        if nucleotide in pairing:
            complement.append(pairing[nucleotide])
        else:
            # Best-effort handling: report the unknown symbol and skip it.
            print("Not ACTG!")

    # Reverse in place; no intermediate lists are printed, so calling the
    # function on a long genome does not flood stdout.
    complement.reverse()
    return "".join(complement)

In [None]:
# The complement of TTGTGTC is AACACAG, so the reverse complement is GACACAA.
ReverseComplement("TTGTGTC")

# LocatePatternMatch function

In [None]:
def LocatePatternMatch(Pattern, Genome):
    """List every index at which Pattern occurs in Genome.

    Overlapping occurrences are all reported.

    Args:
        Pattern: the string to look for.
        Genome: the string to search in.

    Returns:
        A list of 0-based start indices, in increasing order.  The list is
        empty when Pattern does not occur (or is longer than Genome).
    """
    width = len(Pattern)
    # The last start position that leaves room for a full-width window.
    final_start = len(Genome) - width
    return [
        begin
        for begin in range(final_start + 1)
        if Genome[begin:begin + width] == Pattern
    ]

In [None]:
# "AA" starts at indices 0, 1 and 12 of this sequence (overlaps are reported).
LocatePatternMatch("AA", "AAACATAGGATCAAC")

# FindClumps function

In [None]:
def FindClumps(Genome, kmer_length, clump_length, times):
    """Find k-mers that occur at least `times` times inside some window.

    Every window of `clump_length` consecutive characters of Genome is
    examined on its own: the k-mers inside it are tallied from scratch, and
    any k-mer whose tally reaches `times` is kept.

    Args:
        Genome: the string to scan.
        kmer_length: length of the substrings (k-mers) to count.
        clump_length: width of the sliding window.
        times: minimum number of occurrences within one window
            (the comparison is "greater than or equal to").

    Returns:
        A set of the k-mers that qualify in at least one window.  The set
        is empty when Genome is shorter than `clump_length`.
    """
    clumped_kmers = set()

    # Number of window positions in the genome, and of k-mer positions
    # inside a single window.
    windows_in_genome = len(Genome) - clump_length + 1
    kmers_per_window = clump_length - kmer_length + 1

    for window_start in range(windows_in_genome):
        window = Genome[window_start:window_start + clump_length]

        # Fresh tally for this window only.
        tally = {}
        for offset in range(kmers_per_window):
            candidate = window[offset:offset + kmer_length]
            tally[candidate] = tally.get(candidate, 0) + 1

        clumped_kmers.update(
            candidate for candidate, seen in tally.items() if seen >= times
        )

    return clumped_kmers

In [None]:
# Look for 5-mers that occur at least 4 times within some 75-character window.
FindClumps("CGGACTCGACAGATGTGAAGAAATGTGAAGACTGAGTGAAGAGAAGAGGAAACACGACACGACATTGCGACATAATGTACGAATGTAATGTGCCTATGGC",
5, 75, 4)