In [None]:
import LinearAlgebra
import Random
import BenchmarkTools
import StaticArrays
import LinearAlgebra
import Distributions
import StatsBase
import Profile
import StatProfilerHTML
import Test
using DataFrames
using Gadfly

# Useful Functions for Testing
This section contains some basic functionality for generating test data and calculating scores of motifs based on starting positions.

In [None]:
"""
author: gtb
Reads in a fasta file.
#### Arguments
- `fname`: name of the file to be read
"""
# helper function to read in the DNA sequences
function ReadInputs(fname)
    lines = readlines(fname)
    
    # collect sequences into here
    DNA = Vector{Vector{Char}}()
    
    # start collecting the first sequence
    current = Vector{Char}()
    
    # go ahead and pop first line
    this_line = popat!(lines, 1)
    
    while length(lines) >= 1
        this_line = popat!(lines, 1)
        if this_line[1] == '>'
            # add the sequence to DNA
            push!(DNA, current)
            current = Vector{Char}()
        else
            current = vcat(current, collect(this_line))
        end 
    end
    
    push!(DNA, current)
    # now read the other N lines and collect the strings into a matrix of characters

    return(DNA)
end

In [None]:
"""
author: gtb
Reads in inputs given in the format Dr. Heber posted on Piazza.
#### Arguments
- `fname`: name of the file to be read
"""
# helper function to read in the DNA sequences
function ReadInputs_class(fname)
    
    lines = readlines(fname)
    
    # collect sequences into here
    DNA = Vector{Vector{Char}}()
    
    # read in the parameters from the first line
    lines1 = parse.(Int64, split(popat!(lines, 1)))
    
    k = lines1[1]
    t = lines1[2]    
    
    for line in lines
        push!(DNA, collect(line))
    end
    # now read the other N lines and collect the strings into a matrix of characters

    return(k, DNA)
end

In [None]:
"""
Author: gtb
Generate a random sequence of length `Length`. Returns a vector of type `Char`.
...
#### Arguments
- `Length`: the length of the sequence to generate using the standard ACGT alphabet
...
"""
function GenerateSequence(Length)
    return collect(Random.randstring("ACGT", Length))
end

In [None]:
"""
Author: gtb
Generate mutliple sequences, each of a given length. Calls `GenerateSequence`. Return a vector of vectors of type `Char`.
...
#### Arguments
- `NumberOfSequences`: the number of sequences to generate
- `Length`: the number of nucleotides in each DNA sequences to be generated
...
"""
function GenerateSequences(NumberOfSequences, Length)
    return Sequences = map(i -> GenerateSequence(Length), 1:NumberOfSequences)
end

In [None]:
"""
Author: gtb
Mutate a DNA sequence to have a certain distance from the input sequence. Expects and input and output of type `Vec{Char}`.
...
#### Arguments
- `Sequence`: the sequence that will be mutated.
- `Distance`: the number of nucleotides in the sequence to change.
...
"""
function Mutate!(Sequence, Distance) # distance is no. of mutations
    # return the sequence if the Distance is 0
    if Distance == 0
        return Sequence
    end
    # choose the sites to mutate
    posToMutate = StatsBase.sample(1:length(Sequence), Distance, replace = false)
    
    # pick the letters that can be used at each position
    basesAtPositions = [string(i) for i in Sequence[posToMutate]]

    lettersToFill = ["ACGT" for i in 1:length(Sequence)]
    
    # figure out what letters each positions can be changed to
    lettersToFill = map((x, y) -> replace(x, y => ""), lettersToFill, basesAtPositions)
    
    # select one character from each of these positions and place them where they should be
    # use only to cast the string to a character
    # https://stackoverflow.com/questions/59946081/julia-convert-string-to-char-or-convert-arraysubstringstring-1-to-char
    Sequence[posToMutate] = map(x -> only(Random.randstring(x, 1)), lettersToFill)
    
    return Sequence
end

In [None]:
"""
Author: gtb
Generate test data for testing the (l,d) planted motif problem. Returns the:
- `motif` as type `Vec{Char}`, 
- starting positions of the planted motifs in each sequence `motif_starts` as `Vec{Int64}`,
- the actual planted motif including mutations as `motifs_implanted` as `Vec{Vec{Char}}`,
- the `sequences` of nucleotides including the motifs as `Vec{Vec{Char}}`.
...
#### Arguments
- `NumberOfSequences`: the number of DNA sequences to produce
- `LengthMotif`: the length of the motif to plant
- `LengthSequences`: the length of each of the DNA sequences the motif will be planted into
- `Distance`: the hamming distance of each planted sequences from the consensus motif
...
"""
function GenerateTestData_ld(NumberOfSequences, LengthMotif, LengthSequences, Distance)
    # do some error checking to make sure the values provided are valid
    # specifically, Disance <= LengthMotif
    if Distance > LengthMotif
        error("The Distance if larger than the LengthMotif.")
    end

    # LengthMotif <= LengthSequences
    if LengthMotif > LengthSequences
        error("The LengthMotif is longer than the LengthSequences")
    end
    
    # NumberOfSequences >= 2
    if NumberOfSequences <= 1
        error("The NumberOfSequences is 1, which is too small for motif detection")
    end

    # generate the input sequences
    sequences = GenerateSequences(NumberOfSequences, LengthSequences)
    
    # make the motif
    motif = GenerateSequence(LengthMotif)
    
    # generate a mutated motif for each sequence to implant
    motifs = [copy(motif) for i in 1:NumberOfSequences]
    
    # broadcast the mutated motif
    motifs .= Mutate!.(motifs, Distance)
    
    # now implant the mutated motifs and record where we implant it
    motifStarts = rand(1:(LengthSequences - LengthMotif + 1), NumberOfSequences)
    
    # now place the motifs into the sequences vector
    for i in 1:length(sequences)
        sequences[i][motifStarts[i]:(motifStarts[i] + LengthMotif - 1)] = motifs[i]
    end
    
    # now return all the things we may want later
    return (motif = motif, motifs_starts = motifStarts, motifs_implanted = motifs, sequences = sequences)
end

# PSO Helper Functions
Helper functions that are useful for the implementation described in our template paper.

## Basic Functions

### make_profile
This function is used to turn a set of sequences into a sequence profile. Pseudo_counts are enabled by default.

In [None]:
"""
Author: ks; mod gtb 11/10  

Returns the profile probabilities for the given inputs. Normalized frequencies are given in the order A, C, G, T.

#### Arguments
- `k`: length of motifs
- `t`: number of sequences
- `arr`: sequences to operate over, given as a `Vec{Vec{Char}}`
- `pseudo_counts`: optional argument for whether or not pseudo counts are used. Defaults to `true`.
"""
function make_profile(k, t, arr; pseudo_counts=true) 
    res = zeros(4,k)
    if pseudo_counts==false
        for i in 1:k
            c = Dict{Char, Int64}('A'=>0,'C'=>0,'G'=>0,'T'=>0)
            for j in 1:t
                c[arr[j][i]]+=1
            end
            res[1,i] = c['A']/t
            res[2,i] = c['C']/t
            res[3,i] = c['G']/t
            res[4,i] = c['T']/t

        end
    else
        for i in 1:k
            c = Dict{Char, Int64}('A'=>1,'C'=>1,'G'=>1,'T'=>1)
            for j in 1:t
                c[arr[j][i]]+=1
            end
            res[1,i] = (c['A'])/(4+t)
            res[2,i] = (c['C'])/(4+t)
            res[3,i] = (c['G'])/(4+t)
            res[4,i] = (c['T'])/(4+t)

        end
    end
    return res
end

### background_frequency
This function is used to find the background frequencies for each nucleotide.

In [None]:
"""
Author: ks  

Returns the background frequency of each nucleotide as a dictionary.

#### Arguments
- `sequence`: the sequence as `Vec{Char}` that will be counted.
- `prop`: optional parameter - will convert values to probabilities
"""
function background_frequency(sequence; prop=false)
    bg = StatsBase.countmap(sequence)
    # add keys if they are not present
    if !('A' in keys(bg))
        bg['A'] = 0
    end
    if !('C' in keys(bg))
        bg['C'] = 0
    end
    if !('G' in keys(bg))
        bg['G'] = 0
    end
    if !('T' in keys(bg))
        bg['T'] = 0
    end
    if prop
        bg_prop = Dict{Char, Float64}()
        for k in keys(bg)
            bg_prop[k] = bg[k]/length(sequence)
        end
        return(bg_prop)
    else
        return(bg)
    end
end


### get_index
This function is used to map the nucleotides to the integers. Useful to have at compile time.

In [None]:
"""
Author: ks, mod gtb 11/10  

Helper function that maps the nucleotides `[A, C, G, T]` to the integers `1:4`. Useful for accessing certain arrays.

TODO: check a valid nucleotide is being passed

#### Arguments
- `c`: the nucleotide character that is being accessed.
"""
function get_index(c)
    #lobal d = Dict('A'=>1,'C'=>2,'G'=>3,'T'=>4)
    if c == 'A'
        return 1
    elseif c == 'C'
        return 2
    elseif c == 'G'
        return 3
    elseif c == 'T'
        return 4
    else
        print("ERROR!!")
    end
end

### consensus_score
This function uses the position weight matrix to calculate a score for a given set of starting positions.

In [None]:
"""
Author: ks, mod gtb 11/11

Helper function to calculate the consensus score given a set of motifs and the background frequencies of the nucleotides.

#### Arguments 
- `matches`: obtained motif-matching substrings for all sequences
- `background_frequency`: probability of each base in the background
"""
function consensus_score(matches, background_frequency)
    # build the map from numbers to nucleotides
    int_char = Dict{Int8, Char}(1=>'A',2=>'C',3=>'G',4=>'T')
    
    # determine the length of the motif
    k = length(matches[1])
    t = length(matches)
    
    # build a profile for each column
    profile = make_profile(k, t, matches; pseudo_counts=false)
    
    # in case of ties it sticks to A > C > G > T
    # generate a vector for storing the most common character in each column
    
    # find the most common value in each column
    # cmotif = StaticArrays.SVector{k, Char}([int_char[i] for i in vec(mapslices(i -> findmax(i)[2], profile; dims = 1))])
    cmotif = StaticArrays.SVector{k, Char}([int_char[i] for i in [ch[1] for ch in findmax(profile, dims = 1)[2]]])
    
    # now sum over the calculation in the paper
    s = 0
    for i in 1:k
        for j in 1:4
            idx_char = int_char[j]
            toadd = profile[j, i] * log(2, profile[j, i]/background_frequency[idx_char])
            # add nothing if we get NaN, since this corresponds to no matches
            if !isnan(toadd)
                s += toadd
            end
        end
    end
    return s, cmotif
end

### weights
This function is used to turn nucleotide frequencies into weights, similar to how our template paper describes.

In [None]:
"""
Author: ks  

Calculates the weights for each nucleotide based on their frequencies. Returns these values as a `Dict{Char -> Float64}`.

#### Arguments
- `background_frequency`: the background frequencies of each nucleotide from `background_frequency`
"""
function weights(background_frequency)
    ma = max(1/background_frequency['A'],1/background_frequency['C'],
        1/background_frequency['G'],1/background_frequency['T'])
    
    d = zeros(Float64, 4)
    
    d[1] = (1/background_frequency['A'])/ma
    d[2] = (1/background_frequency['C'])/ma
    d[3] = (1/background_frequency['G'])/ma
    d[4] = (1/background_frequency['T'])/ma
    
    # failsafe: if any are NaN, return all ones
    if any(isnan.(d))
        return ones(Float64, 4)
    else
        return d
    end
end

### update_motif
Given the current particle, its personal best, and the global best, this function proposes a new particle that will then be used for search.

In [None]:
"""
Author: ks, mod gtb 11/11  

Update the current nucleotide for a particle given the personal best, global best, and the weighting vectors for the vectors, `c`, and nucleotides, `w`.

#### Arguments
- `current_seq`: the current motif sequence for the particle
- `pbest_seq`: the motif sequence that corresponds to the best this particle has ever been
- `gbest_seq`: the motif sequence corresponding to the best any particle has achieved
- `weights`: a vector of weights for each nucleotide, based on their frequencies
- `background_frequency`: probability of each base in the background
"""
function update_motif(current_seq, pbest_seq, gbest_seq, weights, background_frequency; deterministic=false)
    k = length(current_seq)
    
    if deterministic
        Random.seed!(101)
        scale_vals = ones(k, 4) .* permutedims(repeat(weights', k))
        rand_seq = collect(Random.randstring(['A','C','G','T'], k))
    else
        rand_seq = collect(Random.randstring(['A','C','G','T'], k))
        scale_vals = rand(4, k) .* permutedims(repeat(weights', k))
    end

    # bind all of the sequences together
    seq_bind = permutedims(reduce(hcat, [current_seq, pbest_seq, gbest_seq, rand_seq]))
    
    # now get the cumulative sum in each column so we can sample a probability to figure out which base to keep
    # now return the "best" nucleotide for that position based on our rules
    return(seq_bind[vec(findmax(scale_vals; dims = 1)[2])])
end

# Our Creative Contributions
This section contains most of our group's creative contributions. We use pre-computation, indexing, and optimistic score-thresholding to reduce the number of wasteful computations. How these functions work may be more obvious by looking at the test cases.

## Precomputation

### precompute_matchscores
This function is used to avoid having to do expensive log calculations thousands of times.

In [None]:
"""
Precompute the log match scores. Returns a dictionary whose keys are (MOTIF LETTER, SEQUENCE LETTER)

#### Arguments
- `background_frequency`: the background frequency for each nucleotide
"""
function precompute_matchscores(background_frequency)
    pc_dict = Dict{Tuple{Char, Char}, Float64}()
    for i in ['A', 'C', 'G', 'T']
        for j in ['A', 'C', 'G', 'T']
            if i == j
                pc_dict[(i, j)] = 1 + log(4, (0.25/background_frequency[i]))
            else
                pc_dict[(i, j)] = log(4, 0.25/sqrt(background_frequency[i] * background_frequency[j]))
            end
        end
    end
    return(pc_dict)
end

### precompute_hash
This function creates an indexable dictionary used to find the substring most similar to a proposed motif.

In [None]:
"""
Author: gtb 11/23

Returns a dictionary with the following structure:
    Level 1: positions 1:k
    Level 2: characters A, C, G, T
    Level 3: starting positions with this value

#### Arguments
- `seq`: the sequence to be hashed
- `k`: the length of the pattern to be hashed
"""
function precompute_hash(seq, k)
    # make a dictionary with
    out_dict = Dict{Int64, Dict{Char, BitVector}}()
    
    # get all chars
    allchars = permutedims(reduce(hcat, [seq[i:(i+k-1)] for i in 1:(length(seq)-k+1)]))
    
    for pos in 1:k
        intermediate_dict = Dict{Char, BitVector}()
        for ch in ['A', 'C', 'G', 'T']
            intermediate_dict[ch] = allchars[:,pos] .== ch
        end
        out_dict[pos] = intermediate_dict
    end
    return(out_dict)
end

### precompute_motifscores

In [None]:
"""
Author gtb 11/26

Precomputes the "phases" of the motif, so we know what the fitness will be along the entire sequence, just by knowing which base is in which position.

Will store these values in a Dictionary with the followign structure:
    {Motif Pos -> {{Char in Motif} -> {Vector of Float}}

#### Arguments
- `seq`: the sequence to be hashed
- `seq_hashed`: the sequence that has already been hashed, with the BitVector method
- `k` the length of the motif
- `matchscores` precomputed values from precompute_matchscores
"""
function precompute_motifscores(seq, seq_hashed, k, matchscores)
    ms_dict = Dict{Int64, Dict{Char, Vector{Float64}}}()
    # iterate over the positions in the motif
    for m in 1:k
        # create a dict to store value for this match position
        intermediate_dict = Dict{Char, Vector{Float64}}()
        # add a key for if the base in the motif is A, C, G, or T
        for ch1 in ['A', 'C', 'G', 'T']
            # set it to all zeroes to start
            intermediate_dict[ch1] = zeros(Float64, length(seq) - k + 1)
            # now add in the corresponding score for each start position in the sequence
            for ch2 in ['A', 'C', 'G', 'T']
                mscore = matchscores[(ch1, ch2)]
                intermediate_dict[ch1][seq_hashed[m][ch2]] .+= mscore
            end
        end
        # add the intermediate dict to the big dict
        ms_dict[m] = intermediate_dict
    end
    return(ms_dict)
end

## Faster Searches

### best_match_efficient
This function finds the substring that matches most closely to the proposed motif. It only does addition and indexing operations, making it have very low overhead.

In [None]:
"""
Author: gtb 11/23

Returns the score of the closest matching subsequence in `seq` to `motif`, in a more efficient way.

#### Arguments
- `seq`: the sequence to search
- `motifscore_hashed`: the hashed scores from precompute_motifscores
- `motif`: the motif to search for
- `matchschores`: precomputed values from precompute_matchscores
- `scores`: the matrix scores will be stored in
"""
function best_match_efficient(seq, motifscore_hashed, motif, matchscores, scores)  
    # iterate over the positions in the motif
    for m in 1:length(motif)
        @views scores .+= motifscore_hashed[m][motif[m]]
    end
    
    return(findmax(scores))        
end

### best_possible
This function computes the highest score that could still be achieved, given the background frequencies, the current profile, the number of sequences examined so far, and the total number of sequences to examine.

In [None]:
"""
Author: gtb

Finds the maximum fitness that could be achieved with a given motif profile

#### Arguments
- `probs_matrix`: the background nucleotide frequencies, in a special matrix format
- `profile`: the profile up until this point
- `nseq_sofar`: the number of sequences scored so far
- `nseq`: the number of sequences being scored total
- `matrix_forfindmax`: the matix that will be used for find max (4 x length motif)
- `bestpossible_fitness`: a pre-allocated vector Float64 that is the length of the motif
"""
function best_possible(probs_matrix, profile, nseq_sofar, nseq, matrix_forfindmax, bestpossible_fitness)
    # iterate over the columns of the profile
    for j in 1:size(profile, 2)
        # place the current values into matrix_forfindmax
        @inbounds matrix_forfindmax .= profile[1:4, j]
        
        # now add in the values along nseq
        matrix_forfindmax[LinearAlgebra.diagind(matrix_forfindmax)] .+= nseq - nseq_sofar
        
        # convert to probabilities
        matrix_forfindmax ./= nseq
        
        # now do the actual math
        matrix_forfindmax .= matrix_forfindmax .* log2.(matrix_forfindmax ./ probs_matrix)
        
        matrix_forfindmax[isnan.(matrix_forfindmax)] .= 0

        # sum over the columns
        bestpossible_fitness[j] = maximum(sum(matrix_forfindmax; dims = 1))
    end
    
    return(sum(bestpossible_fitness))
end

### optimistic_search
This function looks for a given motif in each of the provided sequences. Its main advantage is that it quits as soon as it determines that it cannot beat the current best score for the particle. It makes this determination with the help of `best_possible`.

In [None]:
"""
Author: gtb

Searches using a motif consensus sequence until the optimistic score cannot reach pbest

#### Arguments
- `seqs`: the sequences that are to be searched
- `motif`: the motif sequence to search for
- `background_frequency`: the background nucleotide frequencies
- `pbest_thresh`: the current best fitness of the particle
- `motifscore_hashed`: the hashed scores, a dict that makes it faster to search
- `precomp`: the precomputed scoring calculations (no more logs!)
"""
function optimistic_search(seqs, motif, background_frequency, pbest_thresh, motifscore_hashed, precomp; vanilla = false)
    best_match_starts = zeros(Int64, length(seqs))

    # start a profile we will add on to
    search_profile = zeros(Float64, 4, length(motif))
    
    optimal = zeros(Float64, 1)
    
    # convert the background frequencies into a matrix
    probs_matrix = permutedims(repeat([background_frequency['A'] background_frequency['C'] background_frequency['G'] background_frequency['T']], 4))
    
    # create a 4x4 matrix we will use for each column; we will keep putting values in here
    mfm = zeros(Float64, 4, 4)
    
    # create a vector to store the current fitness in each column of the profile matrix
    bpf = zeros(Float64, size(search_profile, 2))
    
     # create a l - k + 1 vector for storing the scores
    scores = zeros(Float64, length(seqs[1]) - length(motif) + 1)
    
    for seq in 1:length(seqs)
        # make sure scores is set to zero before passing to best_match_efficient
        scores .= 0
        
        # find the best match subsequence
        best_match_starts[seq] = best_match_efficient(seqs[seq], motifscore_hashed[seq], motif, precomp, scores)[2]
        
        # iterate along the best match string
        for ch in 1:length(motif)
            search_profile[get_index(seqs[seq][best_match_starts[seq] + ch - 1]), ch] += 1 
        end
        
        # skip the break condition if we are doing vanilla search
        if vanilla
            continue
        end
        
        optimal[1] = best_possible(probs_matrix, search_profile, seq, length(seqs), mfm, bpf)

        # find the optimistic score
        if optimal[1] < pbest_thresh
            # return some condition that says this particle isn't going to beat the personal best
            return(best_match_starts)
        end
    end
    
    # return at the end
    return(best_match_starts)
end

# PSO Implementation
This program actually runs the particle swarm optimization.
It has complexity approximately equal to:  
`# of initializations` $\times$ `# of particles` $\times$ `# of iterations` $\times$ `(sequence lengths - motif length + 1)` $\times$ `# of sequences - 1`

In [None]:
# Scratch cell: only evaluates a parametric type expression; nothing is assigned.
# NOTE(review): `Array` takes its parameters as `Array{T, N}` (element type first, then
# number of dimensions), so the arguments here look swapped - presumably leftover
# exploration; confirm whether this cell can be deleted.
Array{10, StaticArrays.SVector{5, Char}}

In [None]:
"""
Author: ks  

PSO Motif is the main function for this project. It takes some set of parameters, and then calls other functions to do the actual work.

#### Arguments
- `sequences`: the input DNA sequences, stored as a `Vec{Vec{Char}}`
- `motiflen`: the motif length to look for
- `max_reset`: the number of times to re-intialize
- `max_iteration`: the number of times to update each particle
- `nparticles`: the number of particles to track
"""
function pso_motif(sequences, motiflen, max_reset, max_iteration, nparticles; includestarts = false)
    ## hash all the sequences so they are easier to search
    sh = precompute_hash.(sequences, (motiflen,))
    
    # set the particles fitness to negative infinity to start
    final_fitness = -Inf16
    
    # create a vector to store the final motif in
    final_motif = []
    
    # and the starting positions
    final_starts = []
    
    # set the weight parameters
    # current, pbest, gbest, random
    c = StaticArrays.SVector{4, Float64}([1.0, 1.0, 1.0, 1.0])
    
    # calculate the background frequencies
    bg = background_frequency(reduce(hcat, sequences); prop = true)
    
    # precompute all the scores for each type of match
    ms = precompute_matchscores(bg)
    
    # hash the motif scores as well
    msh = precompute_motifscores.(sequences, sh, (motiflen,), (ms,))
    
    # and their weights
    w = weights(bg)
    
    # initialize some arrays!
    particle_startingpos = zeros(Int64, nparticles)
    particle_startingseq = zeros(Int64, nparticles)
    
    particles = [StaticArrays.SVector{motiflen, Char}(['A' for i in 1:motiflen]) for i in 1:nparticles]

    match_sequences = [['A' for i in 1:motiflen] for i in 1:length(sequences)]
    
    # pbest will be an array of arrays, corresponding to the best consensus motif each particle has personally seen
    pbest = [StaticArrays.SVector{motiflen, Char}(['A' for i in 1:motiflen]) for i in 1:nparticles]
    
    for i in 1:max_reset 
        # sample particle starting positions
        # and the sequences they come from
        
        ## TODO: maybe ensure the starting positions result in non-overlaping particles? in terms of consensus motifs
        particle_startingpos .= StatsBase.sample(1:(length(sequences[1]) - motiflen + 1), nparticles, replace = true)
        particle_startingseq .= StatsBase.sample(1:length(sequences), nparticles, replace = true)

        # extract these sequences from the input sequences
        for p in 1:nparticles
            # add these particles into a static array
            particles[p] = StaticArrays.SVector{motiflen, Char}(sequences[particle_startingseq[p]][particle_startingpos[p]:(particle_startingpos[p]+motiflen-1)])
            pbest[p] = StaticArrays.SVector{motiflen, Char}(sequences[particle_startingseq[p]][particle_startingpos[p]:(particle_startingpos[p]+motiflen-1)])
        end
        
        # set the personal best fitness for each particle
        # and for global to be -Inf
        # and create a vector to store the current scores inside of
        fitness_current = [-Inf16 for i in 1:nparticles]
        fitness_pbest = [-Inf16 for i in 1:nparticles]
        fitness_gbest = [-Inf16]
        
        gbest = [StaticArrays.SVector{motiflen, Char}(['A' for i in 1:motiflen])]
            
        # iterate until convergance
        j = 1
        no_updates = 0

        match_positions = zeros(Int64, length(sequences))
        best_match_saves_starts = [zeros(Int64, length(sequences)) for i in 1:nparticles]
        while (j <= max_iteration) & (no_updates <= 5)           
            for k in 1:nparticles
                #=display(fitness_current)
                display(fitness_pbest)
                display(fitness_gbest)
                display(particles)
                display(pbest)
                display(gbest[1])
                # first, query each sequence to see if we can find the best match to the motif
                # do this until we find that our optimistic score cannot be as good as fitness_pbest (and by extension, gbest)
                # basically just keep collecting sequences until we're sure we can't do as well as we've done in the past
                # then, we can quit trying to update this particle
                # basically, this will be a short-circuiting operation=#
                match_positions .= optimistic_search(sequences, particles[k], bg, fitness_pbest[k], msh, ms; vanilla = true)

                # skip the rest if we aborted the search
                if match_positions[length(sequences)] == 0
                    continue
                end
                
                for ms in 1:length(sequences)
                    #match_sequences[ms] = StaticArrays.SVector{motiflen, Char}(sequences[ms][match_positions[ms]:(match_positions[ms] + motiflen - 1)])
                    @views match_sequences[ms] .= sequences[ms][match_positions[ms]:(match_positions[ms] + motiflen - 1)]
                end
                
                # and calculate the consensus score like the paper says - this will become our fitness
                fitness_current[k], particles[k] = consensus_score(match_sequences, bg)

                if fitness_current[k] > fitness_pbest[k]
                    best_match_saves_starts[k] .= copy(match_positions)
                    fitness_pbest[k] = copy(fitness_current[k])
                    pbest[k] = particles[k]
                end
                
                # update global best if this is the best we've ever had
                if fitness_current[k] > fitness_gbest[1]
                    final_starts = copy(match_positions)
                    fitness_gbest[1] = copy(fitness_current[k])
                    gbest[1] = particles[k]
                end   
            end

            # do check shift one in every 10 times
            if (j % floor(max_iteration/10)) == 0
                for k in 1:nparticles
                    starts_toworkfrom = copy(best_match_saves_starts[k])
                    for shift in [-5, -4, -3, -2, -1, 1, 2, 3, 4, 5]
                        tocheck_matchstarts = copy(starts_toworkfrom)
                        tocheck_matchstarts .+= shift
                        tocheck_matchstarts[tocheck_matchstarts .< 1] .= 1
                        tocheck_matchstarts[tocheck_matchstarts .> (length(sequences[1]) - motiflen + 1)] .= (length(sequences[1]) - motiflen + 1)
                        
                        for ms in 1:length(sequences)
                            @views match_sequences[ms] .= sequences[ms][tocheck_matchstarts[ms]:(tocheck_matchstarts[ms] + motiflen - 1)]
                        end
                        
                        potential_fitness_current, potential_new_part = consensus_score(match_sequences, bg)
                        
                        if potential_fitness_current > fitness_current[k]
                            fitness_current[k] = copy(potential_fitness_current)
                            particles[k] = StaticArrays.SVector{motiflen, Char}(potential_new_part)
                            best_match_saves_starts[k] = copy(tocheck_matchstarts)
                        end
                        
                        if fitness_current[k] > fitness_pbest[k]
                            fitness_pbest[k] = copy(fitness_current[k])
                            pbest[k] = particles[k]
                        end

                        # upate global best if this is the best we've ever had
                        if fitness_current[k] > fitness_gbest[1]
                            no_updates = 0
                            final_starts = copy(tocheck_matchstarts)
                            fitness_gbest[1] = copy(fitness_current[k])
                            gbest[1] = particles[k]
                        end  
                    end
                end
            end
            
            no_updates += 1
            
            # broadcast the update operation across all the particles
            particles .= update_motif.(particles, pbest, (gbest[1], ), (w, ), (bg, ))

            j += 1

            # break if >50% of particles are the same
            #=
            if maximum(values(StatsBase.countmap(pbest))) > floor(nparticles * 0.90)
                break
            end=#
        end
        
        # wrap up by looking for the motif sequence in all sequences one more time
        
        #=
        display(fitness_current)
        display(fitness_pbest)
        display(fitness_gbest)
        display(particles)
        display(pbest)
        display(gbest[1])=#
        
        # get the _actual_ best score using all the data
        match_positions = optimistic_search(sequences, gbest[1], bg, -Inf, msh, ms; vanilla = true)

        for ms in 1:length(sequences)
            @views match_sequences[ms] .= sequences[ms][match_positions[ms]:(match_positions[ms] + motiflen - 1)]
        end

        # and calculate the consensus score like the paper says - this will become our fitness
        fitness_gbest[1], gbest[1] = consensus_score(match_sequences, bg)
        
        # check if what we got so far is better than in the other runs
        if fitness_gbest[1] > final_fitness
            final_starts = copy(match_positions)
            final_fitness = copy(fitness_gbest[1])
            final_motif = StaticArrays.SVector{motiflen, Char}(gbest[1])
        end     
    end

    if includestarts
        return(final_fitness, final_motif, final_starts)
    else
        return(final_fitness, final_motif)
    end
end


# Unit Testing

## PSO Helper Functions

### make_profile

In [None]:
?make_profile

In [None]:
# make_profile without pseudocounts: three identical length-3 homopolymer sequences
# must give a 4x3 profile holding 1.0 in that nucleotide's row (rows follow the
# A, C, G, T order used by the expected matrices) and 0.0 everywhere else.
let
    # Three separate copies of a length-3 sequence made of one nucleotide.
    homopolymers(nuc) = [fill(nuc, 3) for _ in 1:3]
    # 4x3 Float64 matrix that is 1.0 along `row` and 0.0 elsewhere.
    one_hot(row) = [Float64(r == row) for r in 1:4, c in 1:3]

    Test.@test make_profile(3, 3, homopolymers('A'); pseudo_counts = false) == one_hot(1)
    Test.@test make_profile(3, 3, homopolymers('C'); pseudo_counts = false) == one_hot(2)
    Test.@test make_profile(3, 3, homopolymers('G'); pseudo_counts = false) == one_hot(3)
    Test.@test make_profile(3, 3, homopolymers('T'); pseudo_counts = false) == one_hot(4)
end

In [None]:
# make_profile with pseudocounts: for three identical homopolymer sequences the
# expected profile holds 4/7 in the matching nucleotide's row and 1/7 elsewhere.
# The expected entries are non-representable fractions, so compare with ≈ rather than
# exact ==; the test then does not depend on the order of floating-point operations
# inside make_profile.
begin
    Test.@test make_profile(3, 3, [['A', 'A', 'A'], ['A', 'A', 'A'], ['A', 'A', 'A']] ; pseudo_counts = true) ≈ [4/7 4/7 4/7; 1/7 1/7 1/7; 1/7 1/7 1/7; 1/7 1/7 1/7;]
    Test.@test make_profile(3, 3, [['C', 'C', 'C'], ['C', 'C', 'C'], ['C', 'C', 'C']] ; pseudo_counts = true) ≈ [1/7 1/7 1/7; 4/7 4/7 4/7; 1/7 1/7 1/7; 1/7 1/7 1/7;]
    Test.@test make_profile(3, 3, [['G', 'G', 'G'], ['G', 'G', 'G'], ['G', 'G', 'G']] ; pseudo_counts = true) ≈ [1/7 1/7 1/7; 1/7 1/7 1/7; 4/7 4/7 4/7; 1/7 1/7 1/7;]
    Test.@test make_profile(3, 3, [['T', 'T', 'T'], ['T', 'T', 'T'], ['T', 'T', 'T']] ; pseudo_counts = true) ≈ [1/7 1/7 1/7; 1/7 1/7 1/7; 1/7 1/7 1/7; 4/7 4/7 4/7;]
end

### background_frequency

In [None]:
?background_frequency

In [None]:
# background_frequency on a 100-character homopolymer: the expected dictionary maps
# the repeated nucleotide to 100 and the other three nucleotides to 0.
let
    Test.@test background_frequency(fill('A', 100)) == Dict('A' => 100, 'C' => 0, 'G' => 0, 'T' => 0)
    Test.@test background_frequency(fill('C', 100)) == Dict('A' => 0, 'C' => 100, 'G' => 0, 'T' => 0)
    Test.@test background_frequency(fill('G', 100)) == Dict('A' => 0, 'C' => 0, 'G' => 100, 'T' => 0)
    Test.@test background_frequency(fill('T', 100)) == Dict('A' => 0, 'C' => 0, 'G' => 0, 'T' => 100)
end

### get_index

In [None]:
?get_index

In [None]:
# get_index is expected to map the nucleotides A, C, G, T to the indices 1, 2, 3, 4,
# the same row order the make_profile tests use for their expected matrices.
begin
    Test.@test get_index('A') == 1
    Test.@test get_index('C') == 2
    Test.@test get_index('G') == 3
    Test.@test get_index('T') == 4
end

### consensus_score

In [None]:
?consensus_score

In [None]:
# consensus_score(match_sequences, bg) is expected to return a (score, consensus)
# tuple. All cases use a uniform background and two length-2 sequences.
let
    # Fresh uniform background dictionary for every call.
    uniform_bg() = Dict('A' => 1/4, 'C' => 1/4, 'G' => 1/4, 'T' => 1/4)

    # One mismatching position out of four.
    Test.@test consensus_score([['A', 'A'], ['A', 'T']], uniform_bg()) == (3.0, ['A', 'A'])
    # Identical sequences.
    Test.@test consensus_score([['A', 'T'], ['A', 'T']], uniform_bg()) == (4.0, ['A', 'T'])
    # Both columns are tied between 'A' and 'T'; the expected consensus picks 'A'.
    Test.@test consensus_score([['T', 'A'], ['A', 'T']], uniform_bg()) == (2.0, ['A', 'A'])
end

### weights

In [None]:
?weights

In [None]:
# weights(bg) is expected to return one weight per nucleotide in A, C, G, T order.
# The expected values include the computed fraction 1/3, so the comparisons use ≈
# instead of exact == to stay independent of how the ratio is evaluated internally.
begin
    # Uniform background: every weight is 1.
    Test.@test weights(Dict('A' => 1/4, 'C' => 1/4, 'G' => 1/4, 'T' => 1/4)) ≈ ones(Float64, 4)
    # 'A' is three times as frequent as the others and is expected to get weight 1/3.
    Test.@test weights(Dict('A' => 1/2, 'C' => 1/6, 'G' => 1/6, 'T' => 1/6)) ≈ [1/3, 1, 1, 1]
    # Background containing zero frequencies: all weights are expected to be 1.
    Test.@test weights(Dict('A' => 1/2, 'C' => 1/2, 'G' => 0, 'T' => 0)) ≈ ones(Float64, 4)
end

### update_motif

In [None]:
?update_motif

In [None]:
# update_motif in deterministic mode: the three motif arguments are passed in the
# order (particle, personal best, global best), matching the broadcast call in
# pso_motif. In each case exactly one of the first three weights is raised to 2.0,
# and the expected result is the motif in the corresponding argument position.
let
    # Run one deterministic update with weight vector `w`; every input is built fresh.
    function deterministic_update(w)
        current = ['A', 'A', 'A', 'T']
        personal_best = ['C', 'C', 'C', 'C']
        global_best = ['G', 'G', 'G', 'G']
        uniform_bg = Dict('A' => 1/4, 'C' => 1/4, 'G' => 1/4, 'T' => 1/4)
        return update_motif(current, personal_best, global_best, w, uniform_bg; deterministic=true)
    end

    Test.@test deterministic_update([2.0, 1.0, 1.0, 1.0]) == ['A', 'A', 'A', 'T']
    Test.@test deterministic_update([1.0, 2.0, 1.0, 1.0]) == ['C', 'C', 'C', 'C']
    Test.@test deterministic_update([1.0, 1.0, 2.0, 1.0]) == ['G', 'G', 'G', 'G']
end

## Our Creative Contributions

### precompute_matchscores

In [None]:
?precompute_matchscores

In [None]:
# precompute_matchscores with a uniform background: the expected table maps every
# ordered nucleotide pair to 1 when both nucleotides are equal and to 0 otherwise.
let
    nucleotides = ('A', 'C', 'G', 'T')
    uniform_bg = Dict('A' => 1/4, 'C' => 1/4, 'G' => 1/4, 'T' => 1/4)
    # All 16 (first, second) pairs; Int(first == second) gives the 1/0 entries.
    expected = Dict((a, b) => Int(a == b) for a in nucleotides, b in nucleotides)

    Test.@test precompute_matchscores(uniform_bg) == expected
end

### precompute_hash

In [None]:
?precompute_hash

In [None]:
# Display the raw output of precompute_hash for the sequence "AACGT" with second
# argument 3, for visual comparison with the expected value in the test that follows.
precompute_hash(collect("AACGT"), 3) 

In [None]:
# TODO (original author's note): this test case still needs to be fixed.
#
# Expected layout: outer key = position inside a length-3 window, inner key =
# nucleotide, value = one 0/1 entry per window start (3 starts for "AACGT") marking
# where the sequence carries that nucleotide at that window position.
let
    seq = collect("AACGT")
    expected = Dict(
        1 => Dict('A' => [1, 1, 0], 'C' => [0, 0, 1], 'G' => [0, 0, 0], 'T' => [0, 0, 0]),
        2 => Dict('A' => [1, 0, 0], 'C' => [0, 1, 0], 'G' => [0, 0, 1], 'T' => [0, 0, 0]),
        3 => Dict('A' => [0, 0, 0], 'C' => [1, 0, 0], 'G' => [0, 1, 0], 'T' => [0, 0, 1]),
    )

    Test.@test precompute_hash(seq, 3) == expected
end

### precompute_motifscores

In [None]:
?precompute_motifscores

In [None]:
# precompute_motifscores for the sequence AAATTT with motif length 2 and uniform
# background match scores. Expected layout: outer key = motif position, inner key =
# nucleotide, value = one score per window start (5 starts for a length-6 sequence).
let
    seq = ['A', 'A', 'A', 'T', 'T', 'T']
    match_scores = precompute_matchscores(Dict('A' => 1/4, 'C' => 1/4, 'G' => 1/4, 'T' => 1/4))
    motif_scores = precompute_motifscores(seq, precompute_hash(seq, 2), 2, match_scores)

    expected = Dict(
        1 => Dict(
            'A' => [1.0, 1.0, 1.0, 0.0, 0.0],
            'C' => zeros(5),
            'G' => zeros(5),
            'T' => [0.0, 0.0, 0.0, 1.0, 1.0],
        ),
        2 => Dict(
            'A' => [1.0, 1.0, 0.0, 0.0, 0.0],
            'C' => zeros(5),
            'G' => zeros(5),
            'T' => [0.0, 0.0, 1.0, 1.0, 1.0],
        ),
    )

    Test.@test motif_scores == expected
end

### best_match_efficient

In [None]:
?best_match_efficient

In [None]:
# TODO: modify the test cases to include passing the "scores" matrix

In [None]:
# best_match_efficient on the sequence AAATTT with a uniform background. The expected
# result is a (best score, start position) tuple for the given motif.
let
    uniform_bg() = Dict('A' => 1/4, 'C' => 1/4, 'G' => 1/4, 'T' => 1/4)

    # Build every precomputed input fresh for `motif` and run the search. The last
    # argument is a zero-filled vector with one entry per window start.
    function best_match(motif)
        seq = ['A', 'A', 'A', 'T', 'T', 'T']
        k = length(motif)
        motif_scores = precompute_motifscores(
            seq, precompute_hash(seq, k), k, precompute_matchscores(uniform_bg())
        )
        return best_match_efficient(
            seq,
            motif_scores,
            motif,
            precompute_matchscores(uniform_bg()),
            zeros(Float64, length(seq) - k + 1),
        )
    end

    # Window "AT" starting at position 3 matches both motif positions.
    Test.@test best_match(['A', 'T']) == (2.0, 3)
    # 'C' never occurs, so at most two positions can match (window "ATT").
    Test.@test best_match(['A', 'T', 'C']) == (2.0, 3)
    Test.@test best_match(['A', 'T', 'C', 'C']) == (2.0, 3)
    # Perfect match of "AAAT" at the start of the sequence.
    Test.@test best_match(['A', 'A', 'A', 'T']) == (4.0, 1)
end



### best_possible

In [None]:
?best_possible

In [None]:
# best_possible with a uniform 4x4 first argument and different 4-row integer
# matrices (one column per motif position). The first two cases pin exact values;
# the last two only compare results between calls.
# NOTE(review): the roles of the scalar arguments (2 or 3, 10) and of the zero-filled
# arrays are defined by best_possible's signature, which is outside this cell.
let
    # Fresh inputs for every call so that no array is shared between calls.
    uniform() = fill(1/4, 4, 4)
    zero_matrix() = zeros(Float64, 4, 4)

    Test.@test best_possible(uniform(), [2 2; 0 0; 0 0; 0 0], 2, 10, zero_matrix(), zeros(Float64, 2)) == 4.0
    Test.@test best_possible(uniform(), [2 2 2; 0 0 0; 0 0 0; 0 0 0], 2, 10, zero_matrix(), zeros(Float64, 3)) == 6.0

    # Moving the same counts from rows 1-2 to rows 3-4 must not change the result.
    Test.@test best_possible(uniform(), [1 1 1; 1 1 1; 0 0 0; 0 0 0], 2, 10, zero_matrix(), zeros(Float64, 3)) ==
               best_possible(uniform(), [0 0 0; 0 0 0; 1 1 1; 1 1 1], 2, 10, zero_matrix(), zeros(Float64, 3))

    # The mixed-count input with third argument 3 must score strictly lower than the
    # pure-count input with third argument 2.
    Test.@test best_possible(uniform(), [2 2 2; 1 1 1; 0 0 0; 0 0 0], 3, 10, zero_matrix(), zeros(Float64, 3)) <
               best_possible(uniform(), [2 2 2; 0 0 0; 0 0 0; 0 0 0], 2, 10, zero_matrix(), zeros(Float64, 3))
end

### optimistic_search

In [None]:
?optimistic_search

In [None]:
 # Display the precomputed motif scores (motif length 3, uniform background) of the
 # first of three test sequences, restricted to outer key 3.
 let
     seqs = [
         ['A', 'A', 'A', 'T', 'T', 'T'],
         ['C', 'A', 'A', 'T', 'T', 'G'],
         ['T', 'A', 'T', 'T', 'G', 'G'],
     ]
     match_scores = precompute_matchscores(Dict('A' => 1/4, 'C' => 1/4, 'G' => 1/4, 'T' => 1/4))
     hashes = precompute_hash.(seqs, (3,))
     per_sequence_scores = precompute_motifscores.(seqs, hashes, (3,), (match_scores,))
     per_sequence_scores[1][3]
 end

In [None]:
# optimistic_search for the motif ATT in three length-6 sequences with a uniform
# background. The fourth positional argument is a score threshold; the expected
# vectors hold one start position per sequence, with 0 entries in the cases where the
# threshold is not reached.
let
    # Sequence set used by the first four cases (rebuilt on every call).
    base_seqs() = [
        ['A', 'A', 'A', 'T', 'T', 'T'],
        ['C', 'A', 'A', 'T', 'T', 'G'],
        ['T', 'A', 'T', 'T', 'G', 'G'],
    ]
    # Same set with the third base of the first sequence changed from 'A' to 'C'.
    mutated_seqs() = [
        ['A', 'A', 'C', 'T', 'T', 'T'],
        ['C', 'A', 'A', 'T', 'T', 'G'],
        ['T', 'A', 'T', 'T', 'G', 'G'],
    ]

    # Precompute all lookup structures for `seqs` and search for ATT with `threshold`.
    function search(seqs, threshold)
        bg = Dict('A' => 1/4, 'C' => 1/4, 'G' => 1/4, 'T' => 1/4)
        match_scores = precompute_matchscores(bg)
        hashes = precompute_hash.(seqs, (3,))
        motif_scores = precompute_motifscores.(seqs, hashes, (3,), (match_scores,))
        return optimistic_search(seqs, ['A', 'T', 'T'], bg, threshold, motif_scores, match_scores)
    end

    # Thresholds up to 6 still yield a start position in every sequence.
    Test.@test search(base_seqs(), -Inf) == [3, 3, 2]
    Test.@test search(base_seqs(), 5.99) == [3, 3, 2]
    Test.@test search(base_seqs(), 6) == [3, 3, 2]

    # A threshold of 7 leaves only the first sequence with a position.
    Test.@test search(base_seqs(), 7) == [3, 0, 0]

    # With the mutated first sequence and threshold 6 the last entry is 0.
    Test.@test search(mutated_seqs(), 6) == [2, 3, 0]
end

# Benchmarking on Dr. Heber's test data from Piazza

In [None]:
# Load the "perfect" dummy data set in the class format: ReadInputs_class returns the
# first number of the header line (k) together with the remaining lines as sequences.
dummy_perfect_k, dummy_perfect_seqs = ReadInputs_class("./benchmarking/dummy_data_perfect.txt")
# Run the PSO motif search with motif length k; includestarts = true makes pso_motif
# return the start positions in addition to the fitness and the motif.
# NOTE(review): the meaning of the positional arguments 5, 50, 10 is set by pso_motif's
# signature, which is not visible here - confirm before changing them.
pso_motif(dummy_perfect_seqs, dummy_perfect_k, 5, 50, 10; includestarts = true)

In [None]:
# Same run as for the perfect data set, but on the "noisy" dummy data file.
dummy_noisy_k, dummy_noisy_seqs = ReadInputs_class("./benchmarking/dummy_data_noisy.txt")
# includestarts = true returns (fitness, motif, start positions).
pso_motif(dummy_noisy_seqs, dummy_noisy_k, 5, 50, 10; includestarts = true)

# Benchmarking on Real Data (cotton fiber promoters)

In [None]:
# Read the cotton fiber promoter sequences from FASTA; ReadInputs returns one
# Vector{Char} per sequence record.
cotton_seqs = ReadInputs("./benchmarking/cotton_fiber_promoters.fasta")

In [None]:
# Motif search on the cotton promoters with motif length 6; prints the best fitness
# and the motif as a string.
begin
    cotton_k = 6
    cotton_score, cotton_motif, cotton_starts = pso_motif(
        cotton_seqs, cotton_k, 10, 50, 100; includestarts = true
    )
    # Extract the length-`cotton_k` window at the reported start of every sequence.
    cotton_match_seqs = map(eachindex(cotton_seqs)) do i
        first_pos = cotton_starts[i]
        join(cotton_seqs[i][first_pos:(first_pos + cotton_k - 1)])
    end
    println(cotton_score)
    println(join(cotton_motif))
    println()
    # To inspect the matched windows, print each element of `cotton_match_seqs`.
end

In [None]:
# Motif search on the cotton promoters with motif length 8; prints the best fitness
# and the motif as a string.
begin
    cotton_k = 8
    cotton_score, cotton_motif, cotton_starts = pso_motif(
        cotton_seqs, cotton_k, 10, 50, 100; includestarts = true
    )
    # Extract the length-`cotton_k` window at the reported start of every sequence.
    cotton_match_seqs = map(eachindex(cotton_seqs)) do i
        first_pos = cotton_starts[i]
        join(cotton_seqs[i][first_pos:(first_pos + cotton_k - 1)])
    end
    println(cotton_score)
    println(join(cotton_motif))
    println()
    # To inspect the matched windows, print each element of `cotton_match_seqs`.
end

In [None]:
# Motif search on the cotton promoters with motif length 10; prints the best fitness
# and the motif as a string.
begin
    cotton_k = 10
    cotton_score, cotton_motif, cotton_starts = pso_motif(
        cotton_seqs, cotton_k, 10, 50, 100; includestarts = true
    )
    # Extract the length-`cotton_k` window at the reported start of every sequence.
    cotton_match_seqs = map(eachindex(cotton_seqs)) do i
        first_pos = cotton_starts[i]
        join(cotton_seqs[i][first_pos:(first_pos + cotton_k - 1)])
    end
    println(cotton_score)
    println(join(cotton_motif))
    println()
    # To inspect the matched windows, print each element of `cotton_match_seqs`.
end

# Benchmarking on Synthetic Data

In [None]:
?GenerateTestData_ld

In [None]:
"""
code recycled from gtb, HW2
"""
function runTrial(l, d, seq_len, seq_num)
    # Run one synthetic recovery trial and return 1 if the motif was recovered
    # exactly, 0 otherwise.
    #
    # GenerateTestData_ld is called as (number of sequences, motif length, sequence
    # length, distance); only its first (true motif) and fourth (sequences) return
    # values are needed here.
    planted_motif, _, _, simulated = GenerateTestData_ld(seq_num, l, seq_len, d)

    # Search for a motif of length `l`; the returned fitness is not used.
    _, recovered_motif = pso_motif(simulated, l, 5, 50, 10)

    # Position-by-position comparison of the recovered and the planted motif.
    return Int(all(recovered_motif .== planted_motif))
end

In [None]:
"""
code recycled from gtb, HW2
"""
function benchmarker(l, d, seq_len, seq_num, zz)
    # Run `zz` independent trials and return a 1x3 row holding the column means of
    # (trial result, elapsed seconds, allocated bytes).
    per_trial = map(1:zz) do _
        stats = @timed runTrial(l, d, seq_len, seq_num)
        # Keep only the three fields of interest, as a plain vector.
        [field for field in stats[(:value, :time, :bytes)]]
    end

    # One column per trial after hcat; transpose so that each trial becomes a row.
    res = permutedims(reduce(hcat, per_trial))
    return StatsBase.mean(res; dims = 1)
end

In [None]:
# Iterate over motif length: benchmark l = 4, 6, ..., 20 while holding d = 2,
# seq_len = 300, seq_num = 30 and zz = 100 trials fixed (the 1-tuples keep those
# arguments constant under broadcasting). Each element is the 1x3 row of means
# returned by benchmarker.
res_motiflength = benchmarker.(4:2:20, (2,), (300,), (30,), (100,))

In [None]:
# Stack the nine 1x3 result rows into one table with named columns.
# NOTE(review): "Percent Correct" holds the mean of 0/1 trial outcomes, i.e. a
# fraction in [0, 1]; the memory column still holds raw bytes at this point.
df_length = DataFrame(reduce(vcat, res_motiflength), ["Percent Correct", "Time (s)", "Memory (Mb)"])

In [None]:
# Rescale the memory column from bytes to millions of bytes.
df_length[!, "Memory (Mb)"] .= df_length[!, "Memory (Mb)"] ./ 1000000
# Prepend the experiment settings. The pairs are inserted left to right starting at
# column 1, so they end up in exactly this order in front of the result columns.
insertcols!(
    df_length,
    1,
    "Number of Trials" => fill(100, 9),
    "Number of Sequences" => fill(30, 9),
    "Length of Sequences" => fill(300, 9),
    "Number of Mutations" => fill(2, 9),
    "Motif Length" => 4:2:20,
)

In [None]:
# Print the motif-length benchmark table as plain text.
print(df_length)

# Troubleshooting
This code was used to figure out whether the above programs were actually working.

In [None]:
#import Pkg
#Pkg.add(url="https://github.com/carstenbauer/TimerOutputsTracked")
#import TimerOutputsTracked

In [None]:
# Seed the global RNG so the generated test data below is reproducible.
Random.seed!(101)
# Generate one synthetic data set and unpack the returned tuple.
# Argument order: NumberOfSequences, LengthMotif, LengthSequences, Distance
# (here 30 sequences of length 300 with a motif of length 10 and distance 2).
correct_motif, motif_starts, motifs_implanted, sequences = GenerateTestData_ld(30, 10, 300, 2)

In [None]:
# Consensus score and consensus sequence of the implanted motif instances under a
# uniform background, as a reference value for what pso_motif reports.
consensus_score(motifs_implanted, Dict('A' => 1/4, 'C' => 1/4, 'G' => 1/4, 'T' => 1/4))

# Memory Profiling

In [None]:
using Profile

In [None]:
using PProf

In [None]:
# Discard any allocation-profile data collected by earlier runs.
Profile.Allocs.clear()

In [None]:
# Record an allocation profile of a small pso_motif run, sampling allocations at
# rate 0.01.
Profile.Allocs.@profile sample_rate=0.01 pso_motif(sequences, length(correct_motif), 1, 10, 10)

In [None]:
# Export the collected allocation profile through PProf.
# NOTE(review): from_c=false presumably leaves C frames out of the report - confirm
# against the PProf documentation.
PProf.Allocs.pprof(from_c=false)

In [None]:
# Time a larger pso_motif configuration (unseeded). The globals are interpolated with
# `$` so the benchmark measures pso_motif itself rather than access to untyped
# global variables.
BenchmarkTools.@btime pso_motif($sequences, length($correct_motif), 5, 100, 100)

In [None]:
# Time the small pso_motif configuration from a fixed RNG state.
# @btime evaluates the expression many times, so seeding once beforehand only covers
# the first evaluation. The seed is therefore also set in `setup` with `evals = 1`,
# which re-seeds the global RNG before every timed evaluation. Globals are
# interpolated with `$` to keep untyped-global access out of the measurement.
Random.seed!(101)
BenchmarkTools.@btime pso_motif($sequences, length($correct_motif), 1, 10, 10) setup = (Random.seed!(101)) evals = 1

In [None]:
# Time the larger pso_motif configuration. The globals are interpolated with `$` so
# the benchmark measures pso_motif itself rather than access to untyped globals.
BenchmarkTools.@btime pso_motif($sequences, length($correct_motif), 5, 100, 100)

In [None]:
# Manual run on the synthetic troubleshooting data. The commented-out lines are
# alternative configurations (and an optional seed) that were tried earlier; only the
# last call is executed.
#pso_motif(sequences, length(correct_motif), 5, 100, 10)
# Random.seed!(105)
#pso_motif(sequences, length(correct_motif), 5, 50, 10)
# pso_motif(sequences, length(correct_motif), 5, 10, 10)
pso_motif(sequences, length(correct_motif), 5, 100, 100)

In [None]:
# Profile the small pso_motif configuration with StatProfilerHTML's @profilehtml.
StatProfilerHTML.@profilehtml pso_motif(sequences, length(correct_motif), 1, 10, 10)

In [None]:
# Profile the larger pso_motif configuration with StatProfilerHTML's @profilehtml.
StatProfilerHTML.@profilehtml pso_motif(sequences, length(correct_motif), 5, 100, 100)