In [None]:
import LinearAlgebra
import Random
import BenchmarkTools
import StaticArrays
import LinearAlgebra
import Distributions
import StatsBase
import Profile
import StatProfilerHTML
import Test

# Useful Functions for Testing
This section contains some basic functionality for generating test data and calculating scores of motifs based on starting positions.

In [None]:
"""
Author: gtb
Generate a random sequence of length `Length`. Returns a vector of type `Char`.
...
# Arguments
- `Length`: the length of the sequence to generate using the standard ACGT alphabet
...
"""
function GenerateSequence(Length)
    # Draw `Length` letters from the four-letter DNA alphabet as a single string ...
    nucleotides = Random.randstring("ACGT", Length)
    # ... and split it into a `Vector{Char}` so individual positions can be indexed
    # and overwritten by the callers.
    return collect(nucleotides)
end

In [None]:
"""
Author: gtb
Generate multiple sequences, each of a given length. Calls `GenerateSequence`. Returns a vector of vectors of type `Char`.
...
# Arguments
- `NumberOfSequences`: the number of sequences to generate
- `Length`: the number of nucleotides in each DNA sequences to be generated
...
"""
function GenerateSequences(NumberOfSequences, Length)
    # One independent `GenerateSequence` call per requested sequence, made in order so
    # the random draws happen in the same sequence-by-sequence order.
    return [GenerateSequence(Length) for _ in 1:NumberOfSequences]
end

In [None]:
"""
Author: gtb
Mutate a DNA sequence in place so that it differs from the input sequence at `Distance` positions. Expects an input, and returns an output, of type `Vec{Char}`.
...
# Arguments
- `Sequence`: the sequence that will be mutated.
- `Distance`: the number of nucleotides in the sequence to change.
...
"""
function Mutate!(Sequence, Distance) # distance is no. of mutations
    # Nothing to change: hand the (unmodified) sequence straight back.
    if Distance == 0
        return Sequence
    end

    # Choose `Distance` distinct positions to mutate.
    posToMutate = StatsBase.sample(1:length(Sequence), Distance, replace = false)

    # For each chosen position, remove the base currently there from "ACGT" and draw one
    # of the remaining letters, so a position holding A/C/G/T always ends up different.
    # Only the selected positions are processed (one candidate string per mutation).
    # `randstring` yields a 1-character string; `only` converts it to a `Char`, see
    # https://stackoverflow.com/questions/59946081/julia-convert-string-to-char-or-convert-arraysubstringstring-1-to-char
    Sequence[posToMutate] = map(Sequence[posToMutate]) do base
        candidates = replace("ACGT", string(base) => "")
        only(Random.randstring(candidates, 1))
    end

    return Sequence
end

In [None]:
"""
Author: gtb
Generate test data for testing the (l,d) planted motif problem. Returns the:
- `motif` as type `Vec{Char}`, 
- starting positions of the planted motifs in each sequence `motifs_starts` as `Vec{Int64}`,
- the actual planted motif including mutations as `motifs_implanted` as `Vec{Vec{Char}}`,
- the `sequences` of nucleotides including the motifs as `Vec{Vec{Char}}`.
...
# Arguments
- `NumberOfSequences`: the number of DNA sequences to produce
- `LengthMotif`: the length of the motif to plant
- `LengthSequences`: the length of each of the DNA sequences the motif will be planted into
- `Distance`: the Hamming distance of each planted motif from the consensus motif
...
"""
function GenerateTestData_ld(NumberOfSequences, LengthMotif, LengthSequences, Distance)
    # Validate the inputs up front so that bad values fail with a clear message before
    # any random data is generated.

    # 0 <= Distance <= LengthMotif
    if Distance < 0
        error("The Distance is negative.")
    end
    if Distance > LengthMotif
        error("The Distance is larger than the LengthMotif.")
    end

    # LengthMotif <= LengthSequences
    if LengthMotif > LengthSequences
        error("The LengthMotif is longer than the LengthSequences")
    end

    # NumberOfSequences >= 2
    if NumberOfSequences <= 1
        error("The NumberOfSequences is less than 2, which is too small for motif detection")
    end

    # generate the background sequences
    sequences = GenerateSequences(NumberOfSequences, LengthSequences)

    # make the consensus motif
    motif = GenerateSequence(LengthMotif)

    # give every sequence its own copy of the motif and mutate each copy in place
    # (`Mutate!` modifies its argument, so no re-assignment is needed)
    motifs = [copy(motif) for i in 1:NumberOfSequences]
    foreach(m -> Mutate!(m, Distance), motifs)

    # pick, for every sequence, a start such that the whole motif fits inside it
    motifStarts = rand(1:(LengthSequences - LengthMotif + 1), NumberOfSequences)

    # overwrite the chosen window of each sequence with its mutated motif
    for i in eachindex(sequences)
        window = motifStarts[i]:(motifStarts[i] + LengthMotif - 1)
        sequences[i][window] = motifs[i]
    end

    # Named tuple; note the second field is called `motifs_starts`.
    return (motif = motif, motifs_starts = motifStarts, motifs_implanted = motifs, sequences = sequences)
end

In [None]:
"""
Author: gtb
Function to score a set of starting positions. Score is at the "best" values when it is =1, signifying all the sequences are identical at this position.

May be DEPRECATED.
...
# Arguments
- `Starts_inp`: starting positions of the suspected locations of the motifs in the DNA sequences. Type `Vec{Int64}`
- `Length`: the length of the input motifs. Defaults to `length(motif)`, where `motif` is from the output of `GenerateTestData_ld()`
- `Seqs`: the input `Vec{Vec{Char}}` with the DNA sequences. Defaults to `sequences`, where `sequences` is from the output of `GenerateTestData_ld()`
- `ln`: optional argument that takes the natural log of the score; useful for HMC
...
"""
function Score(Starts_inp, Length=length(motif), Seqs=sequences; ln=false)
    # NOTE: the positional defaults read the globals `motif` and `sequences`; they are
    # only evaluated when the corresponding argument is omitted.

    # Largest possible agreement count: every sequence agrees in every column.
    maxScore = Length * length(Seqs)

    # A start is valid when the whole window fits; the limit is taken from the first
    # sequence only. Invalid starts get the poor score `maxScore`, expressed on the same
    # scale as the regular result: as-is for the raw score (lower is better) and as
    # `-log(maxScore)` for the log score (higher is better, valid scores are <= 0).
    lastStart = length(Seqs[1]) - Length + 1
    if minimum(Starts_inp) < 1 || maximum(Starts_inp) > lastStart
        return ln ? -log(maxScore) : maxScore
    end

    # One window per (sequence, start) pair; views avoid copying the nucleotides.
    windows = map((s, i) -> view(s, i:(i + Length - 1)), Seqs, Starts_inp)

    # For every motif column add the count of its most frequent nucleotide.
    thisScore = 0
    for col in 1:Length
        nA = count(w -> w[col] == 'A', windows)
        nC = count(w -> w[col] == 'C', windows)
        nG = count(w -> w[col] == 'G', windows)
        nT = count(w -> w[col] == 'T', windows)
        thisScore += max(nA, nC, nG, nT)
    end

    # Shift by one so the best (fully conserved) raw score is 1 and its log is 0.
    penalty = maxScore - thisScore + 1
    return ln ? -1 * log(penalty) : penalty
end

In [None]:
"""
Author: gtb
Finds the consensus motif for a given set of starting positions in the DNA sequences.

May be DEPRECATED.
...
# Arguments
- `Starts_inp`: starting positions of the suspected locations of the motifs in the DNA sequences. Type `Vec{Int64}`
- `Length`: the length of the input motifs. Defaults to `length(motif)`, where `motif` is from the output of `GenerateTestData_ld()`
- `Seqs`: the input `Vec{Vec{Char}}` with the DNA sequences. Defaults to `sequences`, where `sequences` is from the output of `GenerateTestData_ld()`
- Unlike `Score`, this function has no `ln` keyword; it returns the consensus motif as a `Vec{Char}`, breaking ties in the order A, C, G, T.
...
"""
function ConsensusMotif(Starts_inp, Length=length(motif), Seqs=sequences)
    # NOTE: the positional defaults read the globals `motif` and `sequences`.

    # Cut the candidate window out of every sequence and stack the windows into a
    # character matrix: one row per sequence, one column per motif position.
    windows = map((s, i) -> s[i:(i + Length - 1)], Seqs, Starts_inp)
    seqsMatrix = mapreduce(permutedims, vcat, windows)

    alphabet = ('A', 'C', 'G', 'T')
    consensus = Array{Char}(undef, size(seqsMatrix, 2))

    for col in axes(seqsMatrix, 2)
        column = view(seqsMatrix, :, col)
        # occurrences of A, C, G and T in this column, in that order
        tallies = [count(x -> x == base, column) for base in alphabet]
        # `argmax` reports the first maximal entry, so ties resolve A > C > G > T
        consensus[col] = alphabet[argmax(tallies)]
    end

    return consensus
end

# PSO Helper Functions
Helper functions that are useful for the implementation described in our template paper.

In [None]:
"""
Author: ks; mod gtb 11/10  

Returns the profile probabilities for the given inputs. Normalized frequencies are given in the order A, C, G, T.

# Arguments
- `k`: length of motifs
- `t`: number of sequences
- `arr`: sequences to operate over, given as a `Vec{Vec{Char}}`
- `pseudo_counts`: optional argument for whether or not pseudo counts are used. Defaults to `true`.
"""
function make_profile(k, t, arr; pseudo_counts=true)
    # Profile row that each nucleotide is tallied into (A, C, G, T -> rows 1..4).
    # Indexing with any other character raises a `KeyError`.
    rows = Dict('A' => 1, 'C' => 2, 'G' => 3, 'T' => 4)

    # With pseudo counts every nucleotide starts from one observation, which also adds
    # four to the denominator; without them the plain relative frequency is used.
    pseudo = (pseudo_counts == false) ? 0 : 1
    denominator = t + 4 * pseudo

    res = zeros(4, k)
    for i in 1:k
        # counts for column i, over the first `t` sequences
        tallies = fill(pseudo, 4)
        for j in 1:t
            tallies[rows[arr[j][i]]] += 1
        end
        for r in 1:4
            res[r, i] = tallies[r] / denominator
        end
    end
    return res
end

In [None]:
"""
Author: ks  

Returns the background frequency of each nucleotide as a dictionary.

Nucleotides that are absent from the sequence are added to the result of `StatsBase.countmap` with a count (or proportion) of 0.
# Arguments
- `sequence`: the sequence as `Vec{Char}` that will be counted.
- `prop`: optional parameter - will convert values to probabilities
"""
function background_frequency(sequence; prop=false)
    # Count every distinct element of `sequence`.
    bg = StatsBase.countmap(sequence)

    # Guarantee that all four nucleotides are present, with count 0 if unseen.
    for base in ('A', 'C', 'G', 'T')
        get!(bg, base, 0)
    end

    if prop
        # Convert counts to proportions of the total number of elements. Building the
        # dictionary from typed pairs gives it concrete key/value types instead of
        # `Dict{Any,Any}`, which keeps later lookups type-stable.
        # NOTE: an empty `sequence` yields 0/0 = NaN for every nucleotide.
        total = length(sequence)
        return Dict(base => count / total for (base, count) in bg)
    else
        return bg
    end
end


In [None]:
"""
Author: ks , mod gtb 11/11 

Returns the score, where a higher score is achieved if the motif is more highly conserved.  

Utilizes the background frequency of the sequence, as in the template paper.  

# Arguments
- `seq`: the sequence that is being scored
- `motif`: the motif sequence we are scoring against
- `background_frequency`: the background frequency for each nucleotide
"""
function new_score(seq, motif, background_frequency)
    # Float accumulator: keeps the type fixed inside the loop and makes the result a
    # Float64 even when `seq` is empty.
    s = 0.0
    for i in 1:length(seq)
        observed = seq[i]
        expected = motif[i]
        if observed == expected
            # match: base reward of 1 plus a log-base-4 term that is larger for
            # nucleotides with a lower background frequency
            s += 1 + log(4, 0.25 / background_frequency[observed])
        else
            # mismatch: log-base-4 term using the geometric mean of the two background
            # frequencies involved
            s += log(4, 0.25 / sqrt(background_frequency[observed] * background_frequency[expected]))
        end
    end
    return s
end

In [None]:
"""
Author: ks, mod gtb 11/10  

Helper function that maps the nucleotides `[A, C, G, T]` to the integers `1:4`. Useful for accessing certain arrays.

TODO: check a valid nucleotide is being passed

# Arguments
- `c`: the nucleotide character that is being accessed.
"""
function get_index(c)
    # Fixed mapping A, C, G, T -> 1, 2, 3, 4. `isequal` mirrors dictionary key lookup;
    # anything that is not one of the four nucleotides raises a `KeyError`.
    if isequal(c, 'A')
        return 1
    elseif isequal(c, 'C')
        return 2
    elseif isequal(c, 'G')
        return 3
    elseif isequal(c, 'T')
        return 4
    end
    throw(KeyError(c))
end

In [None]:
"""
Author: ks, mod gtb 11/11

Helper function to calculate the consensus score given a set of motifs and the background frequencies of the nucleotides.

# Arguments
- `matches`: obtained motif-matching substrings for all sequences
- `background_frequency`: probability of each base in the background
"""
function consensus_score(matches, background_frequency)
    # profile row number -> nucleotide
    nucleotides = Dict(1 => 'A', 2 => 'C', 3 => 'G', 4 => 'T')

    # motif length and number of matched substrings
    k = length(matches[1])
    t = length(matches)

    # column-wise relative frequencies, without pseudo counts
    profile = make_profile(k, t, matches; pseudo_counts=false)

    # Consensus motif: most frequent nucleotide of every column. `argmax` returns the
    # first maximal row, so ties resolve in the order A > C > G > T.
    consensus = [nucleotides[argmax(view(profile, :, col))] for col in 1:k]
    cmotif = StaticArrays.SVector{k, Char}(consensus)

    # Sum p * log2(p / background) over every profile cell, column by column.
    s = 0
    for col in 1:k, row in 1:4
        p = profile[row, col]
        term = p * log(2, p / background_frequency[nucleotides[row]])
        # cells with p == 0 evaluate to NaN (0 * -Inf) and contribute nothing
        isnan(term) || (s += term)
    end
    return s, cmotif
end

In [None]:
"""
Author: ks, mod gtb 11/11

Returns the score of the closest matching subsequence in `seq` to `motif`.

TODO: implement branch and bound, or some other rule, to avoid scoring starting positions with a bad score?

# Arguments
- `seq`: the sequence to search
- `motif`: the motif to search for
- `background_frequency`: probability of each base in the background
"""
function best_match(seq, motif, background_frequency)
    l = length(seq)
    k = length(motif)

    # Start below every attainable score so that the first window is always accepted.
    # Starting from 0 would leave `best_pos` at the invalid position 0 whenever no
    # window scores strictly above 0 (e.g. no matching base under a uniform background).
    best_score = -Inf
    best_pos = 0

    # Score every window of length k; on ties the earliest window is kept.
    for i in 1:(l - k + 1)
        s = new_score(view(seq, i:(i + k - 1)), motif, background_frequency)
        if s > best_score
            best_score = s
            best_pos = i
        end
    end

    # When `seq` is shorter than `motif` there is no window at all and the result is
    # (-Inf, 0); position 0 therefore means "no match available".
    return (best_score, best_pos)
end

In [None]:
"""
Author: ks, mod gtb 11/11  

Update the motif sequence of a particle: at every position, one nucleotide is chosen from the current sequence, the personal best, the global best, or a freshly drawn random sequence, according to the `weights` given for these four sources.

TODO: update the entire particle sequence in one step, rather than calling `update` `l` times?

# Arguments
- `current_seq`: the current motif sequence for the particle
- `pbest_seq`: the motif sequence that corresponds to the best this particle has ever been
- `gbest_seq`: the motif sequence corresponding to the best any particle has achieved
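- `weights`: a vector of four weights, applied in order to the current, personal-best, global-best and random candidate sequences
- `background_frequency`: probability of each base in the background (accepted for interface compatibility; not used by the current implementation)
- `deterministic`: optional keyword; when `true` the random generator is re-seeded and the weights are applied without random scaling (used by the unit tests)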
"""
function update_motif(current_seq, pbest_seq, gbest_seq, weights, background_frequency; deterministic=false)
    k = length(current_seq)

    # Deterministic mode re-seeds the global RNG so the random candidate is reproducible.
    if deterministic
        Random.seed!(101)
    end

    # Fourth candidate source: a random sequence of the same length.
    rand_seq = collect(Random.randstring(['A','C','G','T'], k))

    # 4 x k matrix of selection scores, one row per candidate source. Every row is
    # scaled by its weight; outside deterministic mode each cell also gets its own
    # uniform random factor. (The matrix is 4 x k for any motif length k.)
    draws = deterministic ? ones(4, k) : rand(4, k)
    scale_vals = draws .* weights

    # 4 x k matrix of candidate nucleotides; rows are, in order:
    # current, personal best, global best, random.
    seq_bind = permutedims(reduce(hcat, [current_seq, pbest_seq, gbest_seq, rand_seq]))

    # For each position keep the nucleotide of the source with the largest score
    # (the first such source on ties).
    winners = vec(findmax(scale_vals; dims = 1)[2])
    return seq_bind[winners]
end

In [None]:
"""
Author: ks  

Calculates the weights for each nucleotide based on their frequencies. Returns these values as a `Dict{Char -> Float64}`.

# Arguments
- `background_frequency`: the background frequencies of each nucleotide from `background_frequency`
"""
function weights(background_frequency)
    bases = ('A', 'C', 'G', 'T')

    # Inverse background frequency of every nucleotide: rarer bases get larger values.
    inverse = map(base -> 1 / background_frequency[base], bases)

    # Normalise by the largest inverse so the rarest nucleotide ends up with weight 1.
    # (A min-max normalisation was tried earlier and is not in use.)
    largest = max(inverse...)

    d = Dict()
    for (base, value) in zip(bases, inverse)
        d[base] = value / largest
    end
    return d
end

# PSO Implementation
This program actually runs the particle swarm optimization.
It has complexity approximately equal to:  
`# of initializations` $\times$ `# of particles` $\times$ `# of iterations` $\times$ `(sequence length - motif length + 1)` $\times$ `min(# of sequences, 30)` (in every iteration each particle is matched against at most 30 randomly sampled sequences)

In [None]:
"""
Author: ks  

PSO Motif is the main function for this project. It takes some set of parameters, and then calls other functions to do the actual work.

# Arguments
- `sequences`: the input DNA sequences, stored as a `Vec{Vec{Char}}`
- `motiflen`: the motif length to look for
- `max_reset`: the number of times to re-intialize
- `max_iteration`: the number of times to update each particle
- `nparticles`: the number of particles to track
"""
function pso_motif(sequences, motiflen, max_reset, max_iteration, nparticles)
    # Best fitness and motif found over all resets. Fitness values are kept as Float64
    # throughout so the scores returned by `consensus_score` are stored and compared
    # without being rounded to a narrower float type.
    final_fitness = -Inf
    final_motif = []

    # Background proportion of each nucleotide, pooled over all input sequences.
    # `reduce(hcat, ...)` requires every sequence to have the same length.
    bg = background_frequency(reduce(hcat, sequences); prop = true)

    # Weights handed to `update_motif`.
    #### TODO: FIX THE WEIGHTS FUNCTION and call that instead
    # NOTE(review): `update_motif` applies its four weights to the four candidate
    # sources (current, pbest, gbest, random), whereas these are the nucleotide
    # frequencies of A, C, G, T - confirm that this is intended.
    w = [bg['A'], bg['C'], bg['G'], bg['T']]

    # Last valid window start (taken from the first sequence) and the number of
    # sequences each particle is matched against per iteration (capped at 30).
    last_start = length(sequences[1]) - motiflen + 1
    nsearch = min(length(sequences), 30)

    # Buffers that are reused across resets.
    particle_startingpos = zeros(Int64, nparticles)
    particle_startingseq = zeros(Int64, nparticles)

    particles = [StaticArrays.SVector{motiflen, Char}(['A' for i in 1:motiflen]) for i in 1:nparticles]

    # pbest[p] is the best consensus motif particle p has personally seen
    pbest = [StaticArrays.SVector{motiflen, Char}(['A' for i in 1:motiflen]) for i in 1:nparticles]

    for i in 1:max_reset
        # Sample, with replacement, a source sequence and a start position per particle.
        ## TODO: maybe ensure the starting positions result in non-overlaping particles? in terms of consensus motifs
        particle_startingpos .= StatsBase.sample(1:last_start, nparticles, replace = true)
        particle_startingseq .= StatsBase.sample(1:length(sequences), nparticles, replace = true)

        # Initialise each particle (and its personal best) with the sampled window.
        for p in 1:nparticles
            window = particle_startingpos[p]:(particle_startingpos[p] + motiflen - 1)
            particles[p] = StaticArrays.SVector{motiflen, Char}(sequences[particle_startingseq[p]][window])
            pbest[p] = particles[p]
        end

        # Current, personal-best and global-best fitness all start at -Inf.
        fitness_current = fill(-Inf, nparticles)
        fitness_pbest = fill(-Inf, nparticles)
        fitness_gbest = -Inf

        # One-element vector so the global best can be replaced in place.
        gbest = [StaticArrays.SVector{motiflen, Char}(['A' for i in 1:motiflen])]

        # Indices of the sequences searched by the particle currently being processed.
        seqs_tosearch = zeros(Int64, nsearch)

        # NOTE: starting at 1 with a strict `<` runs max_iteration - 1 sweeps.
        j = 1
        while (j < max_iteration)
            # Per particle: which sequences were searched and where the best match
            # started in each of them (needed by the shift refinement below).
            best_match_saves_seqs = Vector{Vector{Int}}()
            best_match_saves_starts = Vector{Vector{Int}}()

            for k in 1:nparticles
                # Match this particle against a fresh random subset of the sequences.
                seqs_tosearch .= StatsBase.sample(1:length(sequences), nsearch, replace = false)
                push!(best_match_saves_seqs, copy(seqs_tosearch))

                # Start of the best-scoring window in every sampled sequence.
                match_positions = [best_match(sequences[seq], particles[k], bg)[2] for seq in seqs_tosearch]
                push!(best_match_saves_starts, copy(match_positions))

                # The matched substrings themselves.
                match_sequences = [sequences[x][match_positions[i]:(match_positions[i] + motiflen - 1)] for (i, x) in enumerate(seqs_tosearch)]

                # The consensus score of the matches is the particle's fitness, and the
                # particle is replaced by the consensus motif of those matches
                # (similar in spirit to Gibbs sampling).
                fitness_current[k], particles[k] = consensus_score(match_sequences, bg)

                ### TODO: might want to add here that the particle is updated based on the PWM from consensus_score

                # update the personal best
                if fitness_current[k] > fitness_pbest[k]
                    fitness_pbest[k] = fitness_current[k]
                    pbest[k] = particles[k]
                end

                # update the global best
                if fitness_current[k] > fitness_gbest
                    fitness_gbest = fitness_current[k]
                    gbest[1] = particles[k]
                end
            end

            # Shift refinement, run whenever j is a multiple of floor(max_iteration/10).
            # NOTE(review): for max_iteration < 10 the modulus is 0.0, the remainder is
            # NaN and this branch presumably never runs - confirm that is acceptable.
            if (j % floor(max_iteration/10)) == 0
                for k in 1:nparticles
                    starts_toworkfrom = copy(best_match_saves_starts[k])
                    for shift in [-5, -4, -3, -2, -1, 1, 2, 3, 4, 5]
                        # Move all match starts by `shift`, clamped to valid starts.
                        tocheck_matchstarts = copy(starts_toworkfrom)
                        tocheck_matchstarts .+= shift
                        tocheck_matchstarts[tocheck_matchstarts .< 1] .= 1
                        tocheck_matchstarts[tocheck_matchstarts .> last_start] .= last_start

                        match_sequences = [sequences[x][tocheck_matchstarts[i]:(tocheck_matchstarts[i] + motiflen - 1)] for (i, x) in enumerate(best_match_saves_seqs[k])]

                        potential_fitness_current, potential_new_part = consensus_score(match_sequences, bg)

                        # keep the shifted alignment only if it is strictly better
                        if potential_fitness_current > fitness_current[k]
                            fitness_current[k] = potential_fitness_current
                            particles[k] = StaticArrays.SVector{motiflen, Char}(potential_new_part)
                            best_match_saves_starts[k] = copy(tocheck_matchstarts)
                        end

                        if fitness_current[k] > fitness_pbest[k]
                            fitness_pbest[k] = fitness_current[k]
                            pbest[k] = particles[k]
                        end

                        # update global best if this is the best we've ever had
                        if fitness_current[k] > fitness_gbest
                            fitness_gbest = fitness_current[k]
                            gbest[1] = particles[k]
                        end
                    end
                end
            end

            # Move every particle: the tuple wrappers keep gbest, w and bg from being
            # broadcast over.
            particles .= update_motif.(particles, pbest, (gbest[1], ), (w, ), (bg, ))

            j += 1
        end

        # Re-evaluate the global best against ALL sequences (not just a sample) to get
        # the fitness and consensus motif that represent this reset.
        match_positions = [best_match(sequences[seq], gbest[1], bg)[2] for seq in 1:length(sequences)]
        match_sequences = [sequences[i][match_positions[i]:(match_positions[i] + motiflen - 1)] for i in 1:length(sequences)]
        fitness_gbest, gbest[1] = consensus_score(match_sequences, bg)

        # keep the best result over all resets
        if fitness_gbest > final_fitness
            final_fitness = fitness_gbest
            final_motif = StaticArrays.SVector{motiflen, Char}(gbest[1])
        end
    end

    return(final_fitness, final_motif)
end


# Unit Testing

## Simulation Code

## PSO Helper Functions

### make_profile

In [None]:
?make_profile

In [None]:
begin
    # Without pseudo counts: three identical sequences made of a single nucleotide give
    # frequency 1.0 in that nucleotide's row (A, C, G, T = rows 1..4) and 0.0 elsewhere.
    Test.@test make_profile(3, 3, [['A', 'A', 'A'], ['A', 'A', 'A'], ['A', 'A', 'A']] ; pseudo_counts = false) == [1.0 1.0 1.0; 0.0 0.0 0.0; 0.0 0.0 0.0; 0.0 0.0 0.0;]
    Test.@test make_profile(3, 3, [['C', 'C', 'C'], ['C', 'C', 'C'], ['C', 'C', 'C']] ; pseudo_counts = false) == [0.0 0.0 0.0; 1.0 1.0 1.0; 0.0 0.0 0.0; 0.0 0.0 0.0;]
    Test.@test make_profile(3, 3, [['G', 'G', 'G'], ['G', 'G', 'G'], ['G', 'G', 'G']] ; pseudo_counts = false) == [0.0 0.0 0.0; 0.0 0.0 0.0; 1.0 1.0 1.0; 0.0 0.0 0.0;]
    Test.@test make_profile(3, 3, [['T', 'T', 'T'], ['T', 'T', 'T'], ['T', 'T', 'T']] ; pseudo_counts = false) == [0.0 0.0 0.0; 0.0 0.0 0.0; 0.0 0.0 0.0; 1.0 1.0 1.0;]
end

In [None]:
begin
    # With pseudo counts and t = 3: the observed nucleotide gets (3 + 1) / (4 + 3) = 4/7
    # and each of the other three gets (0 + 1) / (4 + 3) = 1/7.
    Test.@test make_profile(3, 3, [['A', 'A', 'A'], ['A', 'A', 'A'], ['A', 'A', 'A']] ; pseudo_counts = true) == [4/7 4/7 4/7; 1/7 1/7 1/7; 1/7 1/7 1/7; 1/7 1/7 1/7;]
    Test.@test make_profile(3, 3, [['C', 'C', 'C'], ['C', 'C', 'C'], ['C', 'C', 'C']] ; pseudo_counts = true) == [1/7 1/7 1/7; 4/7 4/7 4/7; 1/7 1/7 1/7; 1/7 1/7 1/7;]
    Test.@test make_profile(3, 3, [['G', 'G', 'G'], ['G', 'G', 'G'], ['G', 'G', 'G']] ; pseudo_counts = true) == [1/7 1/7 1/7; 1/7 1/7 1/7; 4/7 4/7 4/7; 1/7 1/7 1/7;]
    Test.@test make_profile(3, 3, [['T', 'T', 'T'], ['T', 'T', 'T'], ['T', 'T', 'T']] ; pseudo_counts = true) == [1/7 1/7 1/7; 1/7 1/7 1/7; 1/7 1/7 1/7; 4/7 4/7 4/7;]
end

### background_frequency

In [None]:
?background_frequency

In [None]:
begin
    # Raw counts (prop defaults to false) for 100 copies of one nucleotide: that
    # nucleotide is counted 100 times and the three absent ones are reported as 0.
    Test.@test background_frequency(collect(repeat('A', 100))) == Dict('A' => 100, 'C' =>   0, 'G' =>   0, 'T' =>   0)
    Test.@test background_frequency(collect(repeat('C', 100))) == Dict('A' =>   0, 'C' => 100, 'G' =>   0, 'T' =>   0)
    Test.@test background_frequency(collect(repeat('G', 100))) == Dict('A' =>   0, 'C' =>   0, 'G' => 100, 'T' =>   0)
    Test.@test background_frequency(collect(repeat('T', 100))) == Dict('A' =>   0, 'C' =>   0, 'G' =>   0, 'T' => 100)
end

### new_score

In [None]:
?new_score

In [None]:
begin
    # Uniform background (1/4 each): a matching position contributes 1 and a
    # mismatching position contributes 0, so the score equals the number of matches.
    Test.@test new_score(['A', 'T'], ['A', 'T'], Dict('A' => 1/4, 'C' => 1/4, 'G' => 1/4, 'T' => 1/4)) == 2.0
    Test.@test new_score(['T', 'T'], ['A', 'T'], Dict('A' => 1/4, 'C' => 1/4, 'G' => 1/4, 'T' => 1/4)) == 1.0
    Test.@test new_score(['T', 'A'], ['A', 'T'], Dict('A' => 1/4, 'C' => 1/4, 'G' => 1/4, 'T' => 1/4)) == 0.0
end

### get_index

In [None]:
?get_index

In [None]:
begin
    # The four nucleotides map to 1..4 in the order A, C, G, T.
    Test.@test get_index('A') == 1
    Test.@test get_index('C') == 2
    Test.@test get_index('G') == 3
    Test.@test get_index('T') == 4
end

### consensus_score

In [None]:
?consensus_score

In [None]:
begin
    # Uniform background. A fully conserved column contributes log2(1 / 0.25) = 2 and a
    # column split 50/50 contributes 2 * 0.5 * log2(0.5 / 0.25) = 1. In a tied column
    # the consensus takes the first of A, C, G, T that reaches the maximum.
    Test.@test consensus_score([['A', 'A'], ['A', 'T']], Dict('A' => 1/4, 'C' => 1/4, 'G' => 1/4, 'T' => 1/4)) == (3.0, ['A', 'A'])
    Test.@test consensus_score([['A', 'T'], ['A', 'T']], Dict('A' => 1/4, 'C' => 1/4, 'G' => 1/4, 'T' => 1/4)) == (4.0, ['A', 'T'])
    Test.@test consensus_score([['T', 'A'], ['A', 'T']], Dict('A' => 1/4, 'C' => 1/4, 'G' => 1/4, 'T' => 1/4)) == (2.0, ['A', 'A'])
end

### best_match

In [None]:
?best_match

In [None]:
begin
    # Uniform background, so a window's score is its number of matching positions.
    # Each result is (best score, 1-based start of the earliest window reaching it).
    Test.@test best_match(['A', 'A', 'A', 'T', 'T', 'T'], ['A', 'T'], Dict('A' => 1/4, 'C' => 1/4, 'G' => 1/4, 'T' => 1/4)) == (2.0, 3)
    Test.@test best_match(['A', 'A', 'A', 'T', 'T', 'T'], ['A', 'T', 'C'], Dict('A' => 1/4, 'C' => 1/4, 'G' => 1/4, 'T' => 1/4)) == (2.0, 3)
    Test.@test best_match(['A', 'A', 'A', 'T', 'T', 'T'], ['A', 'T', 'C', 'C'], Dict('A' => 1/4, 'C' => 1/4, 'G' => 1/4, 'T' => 1/4)) == (2.0, 3)
    Test.@test best_match(['A', 'A', 'A', 'T', 'T', 'T'], ['A', 'A', 'A', 'T'], Dict('A' => 1/4, 'C' => 1/4, 'G' => 1/4, 'T' => 1/4)) == (4.0, 1)
end

### update_motif

In [None]:
?update_motif

In [None]:
begin
    # Deterministic mode: the source with the largest weight wins at every position.
    # The weight vector is ordered (current, personal best, global best, random), so a
    # weight of 2.0 in slot 1, 2 or 3 returns the current, pbest or gbest sequence.
    # All motifs here have length 4.
    Test.@test update_motif(['A', 'A', 'A', 'T'], ['C', 'C', 'C', 'C'], ['G', 'G', 'G', 'G'], [2.0, 1.0, 1.0, 1.0], Dict('A' => 1/4, 'C' => 1/4, 'G' => 1/4, 'T' => 1/4); deterministic=true) == ['A', 'A', 'A', 'T']
    Test.@test update_motif(['A', 'A', 'A', 'T'], ['C', 'C', 'C', 'C'], ['G', 'G', 'G', 'G'], [1.0, 2.0, 1.0, 1.0], Dict('A' => 1/4, 'C' => 1/4, 'G' => 1/4, 'T' => 1/4); deterministic=true) == ['C', 'C', 'C', 'C']
    Test.@test update_motif(['A', 'A', 'A', 'T'], ['C', 'C', 'C', 'C'], ['G', 'G', 'G', 'G'], [1.0, 1.0, 2.0, 1.0], Dict('A' => 1/4, 'C' => 1/4, 'G' => 1/4, 'T' => 1/4); deterministic=true) == ['G', 'G', 'G', 'G']
end

### weights

In [None]:
?weights

# Troubleshooting
This code was used to figure out whether the above programs were actually working.

In [None]:
?GenerateTestData_ld

In [None]:
# set the seed
Random.seed!(101)
# use tuple unpacking to get some test values
# NumberOfSequences, LengthMotif, LengthSequences, Distance
# The returned named tuple is unpacked by position, in the order
# (motif, motifs_starts, motifs_implanted, sequences).
# NOTE: this defines the global `sequences` that `Score` and `ConsensusMotif` use as a
# default argument; their other default reads a global called `motif`, which is NOT
# defined here (the motif is bound to `correct_motif`).
correct_motif, motif_starts, motifs_implanted, sequences = GenerateTestData_ld(30, 8, 100, 1)

In [None]:
# Consensus score and consensus motif of the motifs exactly as they were implanted,
# evaluated against a uniform background; useful as a reference value for the
# results of `pso_motif` below.
consensus_score(motifs_implanted, Dict('A' => 1/4, 'C' => 1/4, 'G' => 1/4, 'T' => 1/4))

In [None]:
?pso_motif

In [None]:
# Seed once so the first evaluation starts from a known RNG state; the further samples
# taken by `@btime` continue from wherever the generator was left.
Random.seed!(101)
# Interpolate the global inputs with `$` so that the benchmark measures `pso_motif`
# itself rather than the cost of accessing untyped global variables.
BenchmarkTools.@btime pso_motif($sequences, $(length(correct_motif)), 1, 50, 10)

In [None]:
# Repeat of the benchmark above (e.g. to compare timings after a code change).
# Seed once so the first evaluation starts from a known RNG state; the further samples
# taken by `@btime` continue from wherever the generator was left.
Random.seed!(101)
# Interpolate the global inputs with `$` so that the benchmark measures `pso_motif`
# itself rather than the cost of accessing untyped global variables.
BenchmarkTools.@btime pso_motif($sequences, $(length(correct_motif)), 1, 50, 10)

In [None]:
# Single unseeded run on the troubleshooting data; returns (fitness, motif).
# Arguments after the motif length are: max_reset, max_iteration, nparticles.
# The commented-out calls are alternative parameter settings that were tried.
#pso_motif(sequences, length(correct_motif), 5, 100, 10)
# Random.seed!(105)
#pso_motif(sequences, length(correct_motif), 5, 50, 10)
# pso_motif(sequences, length(correct_motif), 5, 10, 10)
pso_motif(sequences, length(correct_motif), 1, 50, 10)

In [None]:
# Profile a larger run (3 resets, 100 iterations, 10 particles) with StatProfilerHTML's
# `@profilehtml` macro to find where `pso_motif` spends its time.
StatProfilerHTML.@profilehtml pso_motif(sequences, length(correct_motif), 3, 100, 10)