In [None]:
import LinearAlgebra
import Random
import BenchmarkTools
import StaticArrays
import NBInclude
import LinearAlgebra
import Distributions
import StatsBase
import Profile
import StatProfilerHTML
import StatsBase

# Useful Functions for Testing
This section contains some basic functionality for generating test data and calculating scores of motifs based on starting positions.

In [None]:
"""
Author: gtb
Generate a random sequence of length `Length`. Returns a vector of type `Char`.
...
# Arguments
- `Length`: the length of the sequence to generate using the standard ACGT alphabet
...
"""
function GenerateSequence(Length)
    # Draw `Length` random letters from the ACGT alphabet as one String ...
    nucleotides = Random.randstring("ACGT", Length)
    # ... and split that string into its individual characters.
    return Char[base for base in nucleotides]
end

In [None]:
"""
Author: gtb
Generate multiple sequences, each of a given length. Calls `GenerateSequence`. Returns a vector of vectors of type `Char`.
...
# Arguments
- `NumberOfSequences`: the number of sequences to generate
- `Length`: the number of nucleotides in each DNA sequences to be generated
...
"""
function GenerateSequences(NumberOfSequences, Length)
    # One independent call to `GenerateSequence` per requested sequence, in order.
    return [GenerateSequence(Length) for _ in 1:NumberOfSequences]
end

In [None]:
"""
Author: gtb
Mutate a DNA sequence to have a certain distance from the input sequence. Expects an input of type `Vec{Char}`, which is modified in place and also returned.
...
# Arguments
- `Sequence`: the sequence that will be mutated.
- `Distance`: the number of nucleotides in the sequence to change.
...
"""
function Mutate!(Sequence, Distance) # distance is no. of mutations
    # Nothing to mutate: hand the input back untouched.
    Distance == 0 && return Sequence

    # Pick `Distance` distinct positions to change.
    posToMutate = StatsBase.sample(1:length(Sequence), Distance, replace = false)

    for position in posToMutate
        # Remove the nucleotide currently at this position from the alphabet so
        # the replacement is guaranteed to differ from it.
        alternatives = replace("ACGT", string(Sequence[position]) => "")

        # `randstring` yields a 1-character String; `only` unwraps it to a Char.
        # https://stackoverflow.com/questions/59946081/julia-convert-string-to-char-or-convert-arraysubstringstring-1-to-char
        Sequence[position] = only(Random.randstring(alternatives, 1))
    end

    return Sequence
end

In [None]:
"""
Author: gtb
Generate test data for testing the (l,d) planted motif problem. Returns the:
- `motif` as type `Vec{Char}`, 
- starting positions of the planted motifs in each sequence `motifs_starts` as `Vec{Int64}`,
- the actual planted motif including mutations as `motifs_implanted` as `Vec{Vec{Char}}`,
- the `sequences` of nucleotides including the motifs as `Vec{Vec{Char}}`.
...
# Arguments
- `NumberOfSequences`: the number of DNA sequences to produce
- `LengthMotif`: the length of the motif to plant
- `LengthSequences`: the length of each of the DNA sequences the motif will be planted into
- `Distance`: the hamming distance of each planted sequences from the consensus motif
...
"""
function GenerateTestData_ld(NumberOfSequences, LengthMotif, LengthSequences, Distance)
    # Validate the inputs up front so that failures carry a clear message
    # instead of surfacing from deep inside the sampling code.

    # 0 <= Distance <= LengthMotif
    if Distance < 0
        error("The Distance is negative.")
    end
    if Distance > LengthMotif
        error("The Distance is larger than the LengthMotif.")
    end

    # LengthMotif <= LengthSequences
    if LengthMotif > LengthSequences
        error("The LengthMotif is longer than the LengthSequences")
    end

    # NumberOfSequences >= 2
    if NumberOfSequences <= 1
        error("The NumberOfSequences is less than 2, which is too small for motif detection")
    end

    # Random background sequences that the motif instances are planted into.
    sequences = GenerateSequences(NumberOfSequences, LengthSequences)

    # The consensus motif.
    motif = GenerateSequence(LengthMotif)

    # One independent copy of the motif per sequence, each mutated in place at
    # `Distance` positions.
    motifs = [copy(motif) for _ in 1:NumberOfSequences]
    for planted in motifs
        Mutate!(planted, Distance)
    end

    # Choose a start position per sequence such that the motif fits entirely.
    motifStarts = rand(1:(LengthSequences - LengthMotif + 1), NumberOfSequences)

    # Overwrite the chosen window of each sequence with its mutated motif.
    for i in eachindex(sequences)
        window = motifStarts[i]:(motifStarts[i] + LengthMotif - 1)
        sequences[i][window] = motifs[i]
    end

    # Named tuple; note the starts field is spelled `motifs_starts`.
    return (motif = motif, motifs_starts = motifStarts, motifs_implanted = motifs, sequences = sequences)
end

In [None]:
"""
Author: gtb
Function to score a set of starting positions. Score is at the "best" values when it is =1, signifying all the sequences are identical at this position.

May be DEPRECATED.
...
# Arguments
- `Starts_inp`: starting positions of the suspected locations of the motifs in the DNA sequences. Type `Vec{Int64}`
- `Length`: the length of the input motifs. Defaults to `length(motif)`, where `motif` is from the output of `GenerateTestData_ld()`
- `Seqs`: the input `Vec{Vec{Char}}` with the DNA sequences. Defaults to `sequences`, where `sequences` is from the output of `GenerateTestData_ld()`
- `ln`: optional argument that takes the natural log of the score; useful for HMC
...
"""
function Score(Starts_inp, Length=length(motif), Seqs=sequences; ln=false)
    # Worst possible value: every one of the Length * length(Seqs) cells disagrees.
    maxScore = Length * length(Seqs)

    # A start is valid only if the whole window fits inside ITS OWN sequence.
    # Out-of-bounds starts get the worst score. In `ln` mode the score is
    # negated-log (larger is better), so the penalty must be -log(maxScore);
    # returning +maxScore there would rank invalid starts above every valid one.
    for (s, i) in zip(Seqs, Starts_inp)
        if i < 1 || i > length(s) - Length + 1
            return ln ? -log(maxScore) : maxScore
        end
    end

    # For each motif column, count the nucleotides across all sequences and add
    # the count of the most common one. Characters outside ACGT are not counted.
    thisScore = 0
    for offset in 0:(Length - 1)
        nA = 0
        nC = 0
        nG = 0
        nT = 0
        for (s, i) in zip(Seqs, Starts_inp)
            base = s[i + offset]
            if base == 'A'
                nA += 1
            elseif base == 'C'
                nC += 1
            elseif base == 'G'
                nG += 1
            elseif base == 'T'
                nT += 1
            end
        end
        thisScore += max(nA, nC, nG, nT)
    end

    # Shift by one so that the best (fully conserved) score is 1 and its log is 0.
    penalty = maxScore - thisScore + 1
    return ln ? -log(penalty) : penalty
end

In [None]:
"""
Author: gtb
Finds the consensus motif for a given set of starting positions in the DNA sequences.

May be DEPRECATED.
...
# Arguments
- `Starts_inp`: starting positions of the suspected locations of the motifs in the DNA sequences. Type `Vec{Int64}`
- `Length`: the length of the input motifs. Defaults to `length(motif)`, where `motif` is from the output of `GenerateTestData_ld()`
- `Seqs`: the input `Vec{Vec{Char}}` with the DNA sequences. Defaults to `sequences`, where `sequences` is from the output of `GenerateTestData_ld()`
- Note: unlike `Score`, this function has no `ln` option; it returns the consensus motif as a vector of `Char`, with ties broken in the order A, C, G, T.
...
"""
function ConsensusMotif(Starts_inp, Length=length(motif), Seqs=sequences)
    # Cut the candidate window out of every sequence and stack the windows as
    # the rows of a character matrix (one row per sequence, one column per
    # motif position).
    windows = map((s, i) -> s[i:(i + Length - 1)], Seqs, Starts_inp)
    seqsMatrix = mapreduce(permutedims, vcat, windows)

    ncols = size(seqsMatrix, 2)
    mostCommon = Vector{Char}(undef, ncols)

    for col in 1:ncols
        column = view(seqsMatrix, :, col)

        # Start with 'A' and only switch on a strictly larger count, so ties
        # resolve in the order A, C, G, T.
        winner = 'A'
        winnerCount = count(==('A'), column)
        for candidate in ('C', 'G', 'T')
            candidateCount = count(==(candidate), column)
            if candidateCount > winnerCount
                winner = candidate
                winnerCount = candidateCount
            end
        end
        mostCommon[col] = winner
    end

    return mostCommon
end

# PSO Helper Functions
Helper functions that are useful for the implementation described in our template paper.

In [None]:
"""
Author: ks; mod gtb 11/10
Returns the profile probabilities for the given inputs. Normalized frequencies are given in the order A, C, G, T.
...
# Arguments
- `k`: length of motifs
- `t`: number of sequences
- `arr`: sequences to operate over, given as a `Vec{Vec{Char}}`
- `pseudo_counts`: optional argument for whether or not pseudo counts are used. Defaults to `true`.
...
"""
function make_profile(k,t,arr;pseudo_counts=true)
    # Row of the profile matrix that belongs to each nucleotide (order A, C, G, T).
    # Looking up any other character raises a KeyError.
    row = Dict('A'=>1,'C'=>2,'G'=>3,'T'=>4)

    # With pseudo counts every nucleotide gets one extra observation, which adds
    # 4 to the denominator; without them nothing is added.
    pad = pseudo_counts == false ? 0 : 1

    res = zeros(4,k)
    for i in 1:k
        # Tally the nucleotides found at position i of the first t sequences.
        for j in 1:t
            res[row[arr[j][i]], i] += 1
        end
        # Turn the tallies of this column into normalized frequencies.
        for r in 1:4
            res[r, i] = (res[r, i] + pad) / (t + 4 * pad)
        end
    end
    return res
end

In [None]:
"""
Author: ks
Returns the number of occurrences of each nucleotide in the sequence as a dictionary (raw counts, not normalized frequencies).

TODO: do we need to account for when a nucleotide is absent from the sequence by providing `StatsBase.countmap` default values?
...
# Arguments
- `sequence`: the sequence as `Vec{Char}` that will be counted.
...
"""
function background_frequency(sequence)
    # StatsBase exports `countmap` (no underscore); `count_map` does not exist
    # and raised an UndefVarError. The result maps each distinct element of
    # `sequence` to its number of occurrences; absent nucleotides have no key.
    return StatsBase.countmap(sequence)
end


In [None]:
"""
Author: ks
Returns the score, where a higher score is achieved if the motif is more highly conserved. 
Utilizes the background frequency of the sequence, as in the template paper.
...
# Arguments
- `seq`: the sequence that is being scored
- `motif`: the motif sequence we are scoring against
- `background_frequency`: the background frequency for each nucleotide
...
"""
function new_score(seq,motif,background_frequency)
    s = 0
    for (i, observed) in enumerate(seq)
        expected = motif[i]
        if observed == expected
            # Match: reward of 1 plus a log-base-4 bonus that grows as the
            # matched nucleotide gets rarer in the background.
            s += (1 + log(4, (0.25 / background_frequency[observed])))
        else
            # Mismatch: log-base-4 of the geometric mean of the two background
            # values involved.
            s += log(4, (sqrt(background_frequency[observed] * background_frequency[expected])))
        end
    end
    return s
end

In [None]:
"""
Author: ks, mod gtb 11/10
Helper function that maps the nucleotides `[A, C, G, T]` to the integers `1:4`. Useful for accessing certain arrays.

TODO: check a valid nucleotide is being passed
...
# Arguments
- `c`: the nucleotide character that is being accessed.
...
"""
function get_index(c)
    # Fixed mapping A -> 1, C -> 2, G -> 3, T -> 4.
    if isequal(c, 'A')
        return 1
    elseif isequal(c, 'C')
        return 2
    elseif isequal(c, 'G')
        return 3
    elseif isequal(c, 'T')
        return 4
    end
    # Anything else is not a known nucleotide.
    throw(KeyError(c))
end

In [None]:
"""
Author: ks
Helper function to calculate the consensus score given a set of motifs and the background frequencies of the nucleotides.

TODO: do we need nseq? we can just get this from `motifs`, right?
...
# Arguments
- `motifs`: obtained motifs for all sequences
- `background_frequency`: of all sequences
- `nseq`: number of sequences
...
"""
function consensus_score(motifs,background_frequency,nseq) #information content,
    nucleotides = ('A', 'C', 'G', 'T')
    l = length(motifs[1])

    # `pseudo_counts` is a keyword argument of `make_profile`; passing it as a
    # fourth positional argument raised a MethodError. Pseudo counts are
    # switched off here, so the profile holds plain observed frequencies.
    profile = make_profile(l, nseq, motifs; pseudo_counts=false)

    # Consensus motif: the most frequent nucleotide of every column.
    # In case of ties it sticks to the earlier nucleotide in A, C, G, T order;
    # maybe change that to be based on background frequency and generate the
    # consensus motif using entropy directly.
    cmotif = repeat(['A'], l)
    for i in 1:l
        m = profile[1,i]
        for j in 2:4
            if profile[j,i] > m
                m = profile[j,i]
                cmotif[i] = nucleotides[j]
            end
        end
    end

    # Sum p * log2(p) over every cell of the motif set, each term divided by
    # the background value of that nucleotide in the matching entry of
    # `background_frequency`. Only nucleotides that actually occur in a column
    # are looked up, so p > 0 and log2(p) is finite.
    s = 0.0
    for i in 1:nseq
        for j in 1:l
            c = motifs[i][j]
            ind = get_index(c)
            # A nucleotide missing from the background map defaults to 1, not 0,
            # to avoid dividing by zero.
            s += ((profile[ind,j]*log2(profile[ind,j]))/get(background_frequency[i],c,1))
        end
    end
    return s,cmotif
end

In [None]:
"""
Author: ks
Returns the score of the closest matching subsequence in `seq` to `motif`.

TODO: implement branch and bound, or some other rule, to avoid scoring starting positions with a bad score?
...
# Arguments
- `seq`: the sequence to search
- `motif`: the motif to search for
...
"""
function best_match(seq,motif)
    # To do: make it efficient; the background is rebuilt on every call.
    l = length(seq)
    k = length(motif)

    # `new_score` works with background FREQUENCIES (it compares against the
    # uniform value 0.25), so the raw counts must be normalized. A motif
    # nucleotide that never occurs in `seq` gets a count of 1, which avoids both
    # a KeyError and a zero frequency inside the logarithm.
    counts = StatsBase.countmap(seq)
    for base in motif
        get!(counts, base, 1)
    end
    total = sum(values(counts))
    frequency = Dict(base => n / total for (base, n) in counts)

    # Scores can be negative, so start below every attainable value. If the
    # motif is longer than the sequence there is no window and -Inf is returned.
    best_score = -Inf

    # Slide a window of the motif's length over every start position. The
    # original loop visited only the single index l-k+1 and sliced `seq[i:k]`.
    for i in 1:(l - k + 1)
        s = new_score(view(seq, i:(i + k - 1)), motif, frequency)
        if s > best_score
            best_score = s
        end
    end
    return best_score
end

In [None]:
"""
Author: ks
Update the current nucleotide for a particle given the personal best, global best, and the weighting vectors for the vectors, `c`, and nucleotides, `w`.

TODO: update the entire particle sequence in one step, rather than calling `update` `l` times?
...
# Arguments
- `current`: the current nucleotide at this position of the particle
- `pbest`: the nucleotide at this position of the personal best motif
- `gbest`: the nucleotide at this position of the global best motif
- `c`: the vector of weights for each score
- `w`: a vector of weights for each nucleotide, based on their frequencies
...
"""
function update(current,pbest,gbest,c,w)
    # Open question kept from the original: should current, pbest and gbest be
    # treated independently of the character they represent, or is it intended
    # that their contributions add up when they are the same character?

    # Fourth candidate: a uniformly random nucleotide.
    x4 = Random.randstring(['A','C','G','T'], 1)[1]

    # Accumulated score per nucleotide.
    score = Dict('A'=>0.0,'C'=>0.0,'G'=>0.0,'T'=>0.0)

    # c holds one weight per candidate, in the order current, pbest, gbest,
    # random. Each contribution is scaled by a fresh random factor from
    # 0.1:0.1:1.0 and by the nucleotide weight w.
    for (slot, base) in enumerate((current, pbest, gbest, x4))
        score[base] += c[slot]*(rand(1:10)/10)*w[base]
    end

    # Left-to-right scan over the dictionary keys; the running winner is kept
    # on ties, so tie-breaking follows the dictionary's iteration order.
    candidates = keys(score)
    max_score_char = first(candidates)
    for base in Iterators.drop(candidates, 1)
        if !(score[max_score_char] >= score[base])
            max_score_char = base
        end
    end
    return max_score_char
end

    

In [None]:
"""
Calculates the weights for each nucleotide based on their frequencies. Returns these values as a `Dict{Char -> Float64}`.
...
# Arguments
- `background_frequency`: the background frequencies of each nucleotide from `background_frequency`
...
"""
function weights(background_frequency)
    bases = ('A', 'C', 'G', 'T')

    # Inverse background value per nucleotide: rarer nucleotides weigh more.
    # A nucleotide missing from `background_frequency` raises a KeyError.
    reciprocals = map(base -> 1/background_frequency[base], bases)

    # Scale by the largest inverse so the rarest nucleotide gets weight 1.
    ma = maximum(reciprocals)

    # Typed dictionary, matching the documented Char -> Float64 return.
    d = Dict{Char,Float64}()
    for (base, reciprocal) in zip(bases, reciprocals)
        d[base] = reciprocal/ma
    end
    return d
end

# PSO Implementation
This program actually runs the particle swarm optimization.
It has complexity approximately equal to:  
`# of initializations` $\times$ `# of particles` $\times$ `# of iterations` $\times$ `(sequence lengths - motif length + 1)` $\times$ `(# of sequences - 1)`

In [None]:
?StatsBase.countmap

In [None]:
"""
Author: ks
PSO Motif is the main function for this project. It takes some set of parameters, and then calls other functions to do the actual work.
...
# Arguments
- `sequences`: the input DNA sequences, stored as a `Vec{Vec{Char}}`
- `motiflen`: the motif length to look for
- `max_reset`: the number of times to re-intialize
- `max_iteration`: the number of times to update each particle
- `nparticles`: the number of particles to track
...
"""
function pso_motif(sequences,motiflen,max_reset,max_iteration,nparticles)
    # Returns `(final_motif, fitness)`: the consensus motif of the best
    # re-initialization and its consensus score. The original returned `motif`,
    # a name local to the loop below, so the return statement silently picked
    # up whatever global `motif` happened to exist.

    # Every particle starts at a distinct window of the first sequence.
    npositions = length(sequences[1]) - motiflen + 1
    if nparticles < 1
        error("nparticles must be at least 1")
    end
    if nparticles > npositions
        error("nparticles is larger than the number of start positions in the first sequence")
    end

    # Float64 throughout; the original Float16 buffers truncated the scores.
    fitness = -Inf
    final_motif = Char[]
    # Weights for the current, personal-best, global-best and random candidates.
    c = [1,1,1,1]
    firstCounts = StatsBase.countmap(sequences[1])
    w = weights(firstCounts)

    for _ in 1:max_reset
        particle_pos = StatsBase.sample(1:npositions, nparticles, replace = false)
        @debug "particle start positions" particle_pos

        # Each particle's motif is seeded with a copy of its window.
        motif = [sequences[1][p:(p + motiflen - 1)] for p in particle_pos]

        # Personal bests are tracked PER PARTICLE, and bests are stored as
        # copies so that later in-place updates of `motif` cannot alter them.
        fitness_pbest = fill(-Inf, nparticles)
        pbest = [copy(m) for m in motif]
        fitness_gbest = -Inf
        gbest = copy(motif[1])

        for j in 1:max_iteration
            # In the first iteration the motifs are exact substrings of
            # sequence 1, so that sequence is skipped when scoring.
            starting_seq_index = j == 1 ? 2 : 1

            for k in 1:nparticles
                pscore = 0.0
                for a in starting_seq_index:length(sequences)
                    pscore += best_match(sequences[a], motif[k])
                end
                if pscore > fitness_pbest[k]
                    fitness_pbest[k] = pscore
                    pbest[k] = copy(motif[k])
                end
                if pscore > fitness_gbest
                    fitness_gbest = pscore
                    gbest = copy(motif[k])
                end
            end

            # Add check shift, can use hmc here
            # optimize ci and wi
            # Update rule: move every position of every particle, starting from
            # the particle's CURRENT nucleotide (the original always restarted
            # from the untouched character in sequence 1).
            for n in 1:nparticles
                for x in 1:motiflen
                    motif[n][x] = update(motif[n][x], pbest[n][x], gbest[x], c, w)
                end
            end
        end
        @debug "particle motifs after the final iteration" motif

        # `consensus_score` reads one background map per motif. There is one
        # motif per particle, all seeded from sequence 1, so that sequence's
        # counts are supplied for each of them. The original passed one map per
        # input sequence, which went out of bounds once nparticles exceeded
        # the number of sequences.
        # NOTE(review): confirm sequence 1 is the intended background here.
        backgrounds = fill(firstCounts, nparticles)
        cscore, cmotif = consensus_score(motif, backgrounds, nparticles)

        if cscore > fitness
            fitness = cscore
            final_motif = cmotif
        end
    end

    return final_motif, fitness
end


# Troubleshooting
This code was used to figure out whether the above programs were actually working.

In [None]:
# set the seed of the global random number generator
Random.seed!(101)
# use tuple unpacking (positional) to turn the returned named tuple into globals
# arguments, in order: NumberOfSequences, LengthMotif, LengthSequences, Distance
# NOTE: `motif` and `sequences` are the globals that `Score` and `ConsensusMotif`
# use as default arguments
motif, motif_starts, motifs_implanted, sequences = GenerateTestData_ld(3, 10, 30, 0)

In [None]:
# run the PSO search on the test data: max_reset = 1, max_iteration = 5, nparticles = 5
pso_motif(sequences, length(motif), 1, 5, 5)