In [247]:
import LinearAlgebra
import Random
import BenchmarkTools
import StaticArrays
import NBInclude
import LinearAlgebra
import Distributions
import StatsBase
import Profile
import StatProfilerHTML
using StatsBase
using Random

In [2]:
function GenerateSequence(Length)
    return collect(Random.randstring("ACGT", Length))
end

GenerateSequence (generic function with 1 method)

In [4]:
a=GenerateSequence(10)

10-element Vector{Char}:
 'G': ASCII/Unicode U+0047 (category Lu: Letter, uppercase)
 'A': ASCII/Unicode U+0041 (category Lu: Letter, uppercase)
 'T': ASCII/Unicode U+0054 (category Lu: Letter, uppercase)
 'A': ASCII/Unicode U+0041 (category Lu: Letter, uppercase)
 'T': ASCII/Unicode U+0054 (category Lu: Letter, uppercase)
 'T': ASCII/Unicode U+0054 (category Lu: Letter, uppercase)
 'T': ASCII/Unicode U+0054 (category Lu: Letter, uppercase)
 'C': ASCII/Unicode U+0043 (category Lu: Letter, uppercase)
 'G': ASCII/Unicode U+0047 (category Lu: Letter, uppercase)
 'A': ASCII/Unicode U+0041 (category Lu: Letter, uppercase)

In [6]:
# create a vector of vector{chars}, each with the same length
function GenerateSequences(NumberOfSequences, Length)
    return Sequences = map(i -> GenerateSequence(Length), 1:NumberOfSequences)
end

GenerateSequences (generic function with 1 method)

In [31]:
seq = GenerateSequences(10,10)

10-element Vector{Vector{Char}}:
 ['G', 'C', 'T', 'G', 'G', 'C', 'G', 'G', 'G', 'A']
 ['A', 'T', 'T', 'T', 'C', 'G', 'T', 'C', 'G', 'T']
 ['A', 'G', 'C', 'C', 'G', 'C', 'T', 'A', 'G', 'G']
 ['G', 'T', 'T', 'A', 'G', 'C', 'G', 'T', 'C', 'T']
 ['G', 'G', 'A', 'G', 'T', 'C', 'A', 'G', 'A', 'G']
 ['C', 'G', 'G', 'T', 'T', 'T', 'A', 'T', 'G', 'T']
 ['A', 'A', 'G', 'T', 'G', 'A', 'T', 'T', 'C', 'T']
 ['T', 'G', 'C', 'G', 'T', 'T', 'T', 'G', 'G', 'C']
 ['C', 'A', 'G', 'A', 'G', 'T', 'T', 'G', 'C', 'T']
 ['T', 'T', 'G', 'C', 'A', 'A', 'G', 'T', 'T', 'G']

In [7]:
function Mutate!(Sequence, Distance) # distance is no. of mutations
    # return the sequence if the Distance is 0
    if Distance == 0
        return Sequence
    end
    # choose the sites to mutate
    posToMutate = StatsBase.sample(1:length(Sequence), Distance, replace = false)
    
    # pick the letters that can be used at each position
    basesAtPositions = [string(i) for i in Sequence[posToMutate]]

    lettersToFill = ["ACGT" for i in 1:length(Sequence)]
    
    # figure out what letters each positions can be changed to
    lettersToFill = map((x, y) -> replace(x, y => ""), lettersToFill, basesAtPositions)
    
    # select one character from each of these positions and place them where they should be
    # use only to cast the string to a character
    # https://stackoverflow.com/questions/59946081/julia-convert-string-to-char-or-convert-arraysubstringstring-1-to-char
    Sequence[posToMutate] = map(x -> only(Random.randstring(x, 1)), lettersToFill)
    
    return Sequence
end

Mutate! (generic function with 1 method)

In [32]:
mseq1 = Mutate!(seq[1],2) ## ! is used so it happens inplace, seq[1] is changed

10-element Vector{Char}:
 'G': ASCII/Unicode U+0047 (category Lu: Letter, uppercase)
 'C': ASCII/Unicode U+0043 (category Lu: Letter, uppercase)
 'T': ASCII/Unicode U+0054 (category Lu: Letter, uppercase)
 'C': ASCII/Unicode U+0043 (category Lu: Letter, uppercase)
 'G': ASCII/Unicode U+0047 (category Lu: Letter, uppercase)
 'C': ASCII/Unicode U+0043 (category Lu: Letter, uppercase)
 'G': ASCII/Unicode U+0047 (category Lu: Letter, uppercase)
 'G': ASCII/Unicode U+0047 (category Lu: Letter, uppercase)
 'C': ASCII/Unicode U+0043 (category Lu: Letter, uppercase)
 'A': ASCII/Unicode U+0041 (category Lu: Letter, uppercase)

In [34]:
# generate test data
# distance is the number of mutations in each implanted motif
function GenerateTestData_ld(NumberOfSequences, LengthMotif, LengthSequences, Distance)
    # do some error checking to make sure the values provided are valid
    # specifically, Disance <= LengthMotif
    if Distance > LengthMotif
        error("The Distance if larger than the LengthMotif.")
    end

    # LengthMotif <= LengthSequences
    if LengthMotif > LengthSequences
        error("The LengthMotif is longer than the LengthSequences")
    end
    
    # NumberOfSequences >= 2
    if NumberOfSequences <= 1
        error("The NumberOfSequences is 1, which is too small for motif detection")
    end

    # generate the input sequences
    sequences = GenerateSequences(NumberOfSequences, LengthSequences)
    
    # make the motif
    motif = GenerateSequence(LengthMotif)
    
    # generate a mutated motif for each sequence to implant
    motifs = [copy(motif) for i in 1:NumberOfSequences]
    
    # broadcast the mutated motif
    motifs .= Mutate!.(motifs, Distance)
    
    # now implant the mutated motifs and record where we implant it
    motifStarts = rand(1:(LengthSequences - LengthMotif + 1), NumberOfSequences)
    
    # now place the motifs into the sequences vector
    for i in 1:length(sequences)
        sequences[i][motifStarts[i]:(motifStarts[i] + LengthMotif - 1)] = motifs[i]
    end
    
    # now return all the things we may want later
    return (motif = motif, motifs_starts = motifStarts, motifs_implanted = motifs, sequences = sequences)
end

GenerateTestData_ld (generic function with 1 method)

In [269]:
motif, motif_starts, motifs_implanted, sequences = GenerateTestData_ld(10,6,20,2)

(motif = ['C', 'G', 'T', 'C', 'G', 'C'], motifs_starts = [13, 8, 3, 6, 9, 14, 2, 3, 7, 5], motifs_implanted = [['C', 'A', 'T', 'C', 'G', 'A'], ['C', 'G', 'C', 'G', 'G', 'C'], ['C', 'T', 'T', 'G', 'G', 'C'], ['C', 'G', 'G', 'C', 'G', 'T'], ['C', 'G', 'A', 'A', 'G', 'C'], ['T', 'G', 'T', 'C', 'G', 'T'], ['C', 'T', 'A', 'C', 'G', 'C'], ['A', 'G', 'T', 'T', 'G', 'C'], ['C', 'T', 'T', 'C', 'G', 'A'], ['C', 'G', 'A', 'C', 'G', 'A']], sequences = [['C', 'A', 'G', 'T', 'T', 'C', 'C', 'A', 'A', 'T', 'G', 'T', 'C', 'A', 'T', 'C', 'G', 'A', 'A', 'C'], ['C', 'C', 'T', 'C', 'G', 'G', 'T', 'C', 'G', 'C', 'G', 'G', 'C', 'T', 'G', 'G', 'G', 'C', 'G', 'T'], ['G', 'T', 'C', 'T', 'T', 'G', 'G', 'C', 'T', 'A', 'G', 'A', 'T', 'C', 'C', 'T', 'G', 'A', 'G', 'A'], ['C', 'A', 'G', 'G', 'T', 'C', 'G', 'G', 'C', 'G', 'T', 'C', 'T', 'A', 'C', 'C', 'G', 'C', 'G', 'A'], ['T', 'A', 'C', 'G', 'C', 'C', 'C', 'T', 'C', 'G', 'A', 'A', 'G', 'C', 'A', 'T', 'G', 'G', 'A', 'A'], ['C', 'C', 'A', 'G', 'A', 'A', 'A', 'G', 'T',

In [52]:
# it is written in such a way that it is continuously defined
function Score(Starts_inp, Length=length(motif), Seqs=sequences; ln=false)
    maxScore = Length * length(Seqs)
    # MODIFY the inputs if they are out of bounds :-)
    ### TO DO: return a very large number of the starts are out of bounds
    if @inbounds (minimum(Starts_inp) < 1) | (maximum(Starts_inp) > (length(Seqs[1]) - Length + 1))
        return(maxScore)
    end
    # in the mean time, just fix out of bounds errors
    
    # generate the character matrix
    # then store the results inside of it
    #@inbounds @views seqsMatrix = mapreduce(permutedims, vcat, map((s, i) -> s[i:(i + Length - 1)], Seqs, Starts_inp))
    @inbounds seqsMatrix = mapreduce(permutedims, vcat, map((s, i) -> s[i:(i + Length - 1)], Seqs, Starts_inp))
    #print(seqsMatrix)
    # allocate an array for the most common values in each column
    @inbounds mostCommon = Array{Char}(undef, size(seqsMatrix, 2))
    thisScore = 0
   
    # find the most common element in each column 
    for i in 1:size(seqsMatrix, 2)
        A = count(x -> x == 'A', @views @inbounds seqsMatrix[:,i])
        C = count(x -> x == 'C', @views @inbounds seqsMatrix[:,i])
        G = count(x -> x == 'G', @views @inbounds seqsMatrix[:,i])
        T = count(x -> x == 'T', @views @inbounds seqsMatrix[:,i])
        
        char_max = max(A, C, G, T)
        thisScore += char_max
        
        if char_max == A
            @inbounds mostCommon[i] = 'A'
        elseif char_max == C
            @inbounds mostCommon[i] = 'C'
        elseif char_max == G
            @inbounds mostCommon[i] = 'G'
        else
            @inbounds mostCommon[i] = 'T'
        end     
    end

    if ln
        return(-1 * log(maxScore - thisScore + 1))
    else
    # make the minimum (best) score 1 so we can compute the log
        return(maxScore - thisScore + 1)
    end
end

Score (generic function with 3 methods)

In [66]:
# it is written in such a way that it is continuously defined
function ConsensusMotif(Starts_inp, Length=length(motif), Seqs=sequences; ln=false)
    
    # generate the character matrix
    # then store the results inside of it
    #@inbounds @views seqsMatrix = mapreduce(permutedims, vcat, map((s, i) -> s[i:(i + Length - 1)], Seqs, Starts_inp))
    @inbounds seqsMatrix = mapreduce(permutedims, vcat, map((s, i) -> s[i:(i + Length - 1)], Seqs, Starts_inp))
    
    # allocate an array for the most common values in each column
    @inbounds mostCommon = Array{Char}(undef, size(seqsMatrix, 2))
    thisScore = 0
   
    # find the most common element in each column 
    for i in 1:size(seqsMatrix, 2)
        A = count(x -> x == 'A', @views @inbounds seqsMatrix[:,i])
        C = count(x -> x == 'C', @views @inbounds seqsMatrix[:,i])
        G = count(x -> x == 'G', @views @inbounds seqsMatrix[:,i])
        T = count(x -> x == 'T', @views @inbounds seqsMatrix[:,i])
        
        char_max = max(A, C, G, T)
        thisScore += char_max
        
        if char_max == A
            @inbounds mostCommon[i] = 'A'
        elseif char_max == C
            @inbounds mostCommon[i] = 'C'
        elseif char_max == G
            @inbounds mostCommon[i] = 'G'
        else
            @inbounds mostCommon[i] = 'T'
        end     
    end
    
    return(mostCommon)
end

ConsensusMotif (generic function with 3 methods)

In [67]:
ConsensusMotif(motif_starts,length(motif),sequences)

6-element Vector{Char}:
 'G': ASCII/Unicode U+0047 (category Lu: Letter, uppercase)
 'G': ASCII/Unicode U+0047 (category Lu: Letter, uppercase)
 'C': ASCII/Unicode U+0043 (category Lu: Letter, uppercase)
 'C': ASCII/Unicode U+0043 (category Lu: Letter, uppercase)
 'T': ASCII/Unicode U+0054 (category Lu: Letter, uppercase)
 'G': ASCII/Unicode U+0047 (category Lu: Letter, uppercase)

In [68]:
function H(particle, velocity, k, Seqs)
    return(U(particle, k, Seqs) + 1/2 * velocity' * inv(LinearAlgebra.I) * velocity)
end

H (generic function with 1 method)

In [75]:
sequences[1][20]

'C': ASCII/Unicode U+0043 (category Lu: Letter, uppercase)

# PSO

In [86]:
sample(1:14,5,replace = false)

5-element Vector{Int64}:
 1
 5
 4
 9
 2

In [103]:
countmap(seq[1])

Dict{Char, Int64} with 4 entries:
  'A' => 1
  'G' => 4
  'T' => 1
  'C' => 4

In [375]:
function make_profile(k,t,arr,pseudo_counts) 
   
"""
k : length of motifs
t : number of sequences
arr: motifs
returns the profile probabilities for the given inputs
    normalized frequencies
A
C
G
T
    
"""
    res = zeros(4,k)
    if pseudo_counts==false
        for i in 1:k
            c = Dict('A'=>0,'C'=>0,'G'=>0,'T'=>0)
            for j in 1:t
                c[arr[j][i]]+=1
            end
            res[1,i] = c['A']/t
            res[2,i] = c['C']/t
            res[3,i] = c['G']/t
            res[4,i] = c['T']/t

        end
    else
        for i in 1:k
            c = Dict('A'=>0,'C'=>0,'G'=>0,'T'=>0)
            for j in 1:t
                c[arr[j][i]]+=1
            end
            res[1,i] = (c['A']+1)/(4+t)
            res[2,i] = (c['C']+1)/(4+t)
            res[3,i] = (c['G']+1)/(4+t)
            res[4,i] = (c['T']+1)/(4+t)

        end
    end
    return res
end
function background_frequency(sequence)
    return count_map(sequence)
end

function new_score(seq,motif,background_frequency)
    
    l = length(seq)
    s = 0
    for i in 1:l
        if seq[i] == motif[i]
            s += (1 + log(4,(0.25/background_frequency[seq[i]])))
        else
            s += log(4,(sqrt(background_frequency[seq[i]]*background_frequency[motif[i]])))
        end
    end
    return s
end

function get_index(c)
    d = Dict('A'=>1,'C'=>2,'G'=>3,'T'=>4)
    return d[c]
end

function consensus_score(motifs,background_frequency,nseq) #information content,
"""
motifs: obtained motifs for all sequences
background_frequency of all sequences
nseq: no.of sequences
"""
    int_char = Dict(1=>'A',2=>'C',3=>'G',4=>'T')
    l = length(motifs[1])
    profile = make_profile(l,nseq,motifs,0)
    cmotif = repeat(['A'],l)
    #in case of ties it sticks to the earlier character in the sequence, 
    #maybe change that to based on background frequency and generate
    #consensus motif using entropy directly
    for i in 1:l
        m=profile[1,i]
        for j in 2:4
            if profile[j,i]>m
                m = profile[j,i]
                cmotif[i] = int_char[j]
            end
        end
    end
            
    #print(profile)
    s = 0
    for i in 1:nseq
        for j in 1:l
            c = motifs[i][j]
            ind = get_index(c)
            #print(background_frequency[i])
            s += ((profile[ind,j]*log2(profile[ind,j]))/get(background_frequency[i],c,1))#default value of count map = 1, not 0 to avoid 0 probabilities.
        end
    end
    return s,cmotif
end

consensus_score (generic function with 1 method)

In [217]:
function best_match(seq,motif)
# To do: make it effecient, countmap is passed for every iteration,run iteration in new_score function?
    l = length(seq)
    k = length(motif)
    d = countmap(seq)
    best_score = 0
    #motif = seq[motif_pos:motif_pos+k-1]
    for i in l-k+1
        s = new_score(seq[i:k],motif,d)
        if s>best_score
            best_score = s
        end
    end
    return best_score
        
end

best_match (generic function with 1 method)

In [340]:
function update(current,pbest,gbest,c,w)
    
    #check if current,pbest,gbest are 
    #independent of the character they are representing 
    #or if they need to be added if the character is same
    # c is 4 element vector, has weights for current,pbest,gbest and random
    x4 = randstring(['A','C','G','T'],1)[1]
    score = Dict('A'=>0.0,'C'=>0.0,'G'=>0.0,'T'=>0.0)
    #print(score)
    score[current] += c[1]*(rand(1:10)/10)*w[current]
    score[pbest] += c[2]*(rand(1:10)/10)*w[pbest]
    score[gbest] += c[3]*(rand(1:10)/10)*w[gbest]
    score[x4] += c[4]*(rand(1:10)/10)*w[x4]
    max_score_char = reduce((x, y) -> score[x] >= score[y] ? x : y, keys(d))
    #print(score)
    return max_score_char
end

    

update (generic function with 1 method)

In [233]:
function weights(background_frequency)
    ma = max(1/background_frequency['A'],1/background_frequency['C'],
        1/background_frequency['G'],1/background_frequency['T'])
#     mi = mi(1/background_frequency['A'],1/background_frequency['C'],
#         1/background_frequency['G'],1/background_frequency['T'])
    d = Dict()
#     d['A'] = (1/background_frequency['A']-mi)/(ma - mi)
#     d['C'] = (1/background_frequency['C']-mi)/(ma - mi)
#     d['G'] = (1/background_frequency['G']-mi)/(ma - mi)
#     d['T'] = (1/background_frequency['T']-mi)/(ma - mi)
    
    d['A'] = (1/background_frequency['A'])/ma
    d['C'] = (1/background_frequency['C'])/ma
    d['G'] = (1/background_frequency['G'])/ma
    d['T'] = (1/background_frequency['T'])/ma
    return d
end

weights (generic function with 1 method)

In [235]:
rand(1:10)/10

0.2

In [373]:
function pso_motif(sequences,motiflen,max_reset,max_iteration,nparticles)
    fitness = -Inf16
    final_motif = []
    c=[1,1,1,1]
    w = weights(countmap(sequences[1]))
    
    for i in 1:max_reset
        
        #1:length(sequences)-length(motifs)+1 should be greater than nparticles
        particle_pos = sample(1:length(sequences[1])-motiflen+1,nparticles,replace = false)
        print(particle_pos)
        fitness_pbest = repeat([-Inf16],nparticles)
        fitness_gbest = -Inf16
        #fitness_gbest = repeat([-Inf16],nparticles)
        pbest = []
        gbest = []
        motif = []
        for j in 1:max_iteration
            for k in 1:nparticles
                pscore = 0
                if j==1
                    starting_seq_index = 2
                    push!(motif,sequences[1][particle_pos[k]: particle_pos[k]+motiflen-1])
                    #motif = sequences[1 : particle_pos[k]: length(sequences[1]) - motiflen + 1]#generate motif from first sequence
                else
                    starting_seq_index = 1
                end
                for a in starting_seq_index:length(sequences)#check on all the other sequences
                    pscore += best_match(sequences[a],motif[k])
                end
                #print(pscore,fitness_pbest)
                if pscore > fitness_pbest[k]
                    fitness_pbest[k] = pscore
                    pbest = motif[k]
                end
                if pscore > fitness_gbest
                    fitness_gbest = pscore
                    gbest = motif[k]
                end
                
            end
            # Add check shift, can use hmc here
            #update rule
            #optimize ci and wi
            #print(motif)
            for n in 1:nparticles
                for x in 1:motiflen
                    motif[n][x] = update(sequences[1][particle_pos[n]+x-1],pbest[x],gbest[x],c,w)
                end
            end
        end
        print(motif)
        #cscore,cmotif = consensus_score(motif,[countmap(i) for i in sequences],length(sequences),nparticles)
        cscore,cmotif = consensus_score(motif,[countmap(i) for i in sequences],nparticles)

        if cscore>fitness
            fitness = cscore
            final_motif = cmotif
        end

        
    end

    return motif,fitness
end


pso_motif (generic function with 1 method)

In [343]:
sequences

10-element Vector{Vector{Char}}:
 ['C', 'A', 'G', 'T', 'T', 'C', 'C', 'A', 'A', 'T', 'G', 'T', 'C', 'A', 'T', 'C', 'G', 'A', 'A', 'C']
 ['C', 'C', 'T', 'C', 'G', 'G', 'T', 'C', 'G', 'C', 'G', 'G', 'C', 'T', 'G', 'G', 'G', 'C', 'G', 'T']
 ['G', 'T', 'C', 'T', 'T', 'G', 'G', 'C', 'T', 'A', 'G', 'A', 'T', 'C', 'C', 'T', 'G', 'A', 'G', 'A']
 ['C', 'A', 'G', 'G', 'T', 'C', 'G', 'G', 'C', 'G', 'T', 'C', 'T', 'A', 'C', 'C', 'G', 'C', 'G', 'A']
 ['T', 'A', 'C', 'G', 'C', 'C', 'C', 'T', 'C', 'G', 'A', 'A', 'G', 'C', 'A', 'T', 'G', 'G', 'A', 'A']
 ['C', 'C', 'A', 'G', 'A', 'A', 'A', 'G', 'T', 'G', 'C', 'G', 'T', 'T', 'G', 'T', 'C', 'G', 'T', 'A']
 ['C', 'C', 'T', 'A', 'C', 'G', 'C', 'A', 'C', 'A', 'C', 'A', 'C', 'T', 'T', 'T', 'T', 'T', 'T', 'C']
 ['T', 'A', 'A', 'G', 'T', 'T', 'G', 'C', 'G', 'G', 'C', 'G', 'T', 'T', 'T', 'C', 'A', 'G', 'A', 'C']
 ['C', 'T', 'G', 'G', 'C', 'C', 'C', 'T', 'T', 'C', 'G', 'A', 'G', 'T', 'A', 'T', 'G', 'G', 'A', 'T']
 ['A', 'G', 'A', 'G', 'C', 'G', 'A', 'C', 'G', 'A

In [253]:
abc = []

Any[]

In [262]:
push!(abc,[1,4,3])

3-element Vector{Any}:
 [1, 2, 3]
 [1, 2, 3]
 [1, 4, 3]

In [263]:
abc[3][2]

4

In [270]:
motif, motif_starts, motifs_implanted, sequences

(['C', 'G', 'T', 'C', 'G', 'C'], [13, 8, 3, 6, 9, 14, 2, 3, 7, 5], [['C', 'A', 'T', 'C', 'G', 'A'], ['C', 'G', 'C', 'G', 'G', 'C'], ['C', 'T', 'T', 'G', 'G', 'C'], ['C', 'G', 'G', 'C', 'G', 'T'], ['C', 'G', 'A', 'A', 'G', 'C'], ['T', 'G', 'T', 'C', 'G', 'T'], ['C', 'T', 'A', 'C', 'G', 'C'], ['A', 'G', 'T', 'T', 'G', 'C'], ['C', 'T', 'T', 'C', 'G', 'A'], ['C', 'G', 'A', 'C', 'G', 'A']], [['C', 'A', 'G', 'T', 'T', 'C', 'C', 'A', 'A', 'T', 'G', 'T', 'C', 'A', 'T', 'C', 'G', 'A', 'A', 'C'], ['C', 'C', 'T', 'C', 'G', 'G', 'T', 'C', 'G', 'C', 'G', 'G', 'C', 'T', 'G', 'G', 'G', 'C', 'G', 'T'], ['G', 'T', 'C', 'T', 'T', 'G', 'G', 'C', 'T', 'A', 'G', 'A', 'T', 'C', 'C', 'T', 'G', 'A', 'G', 'A'], ['C', 'A', 'G', 'G', 'T', 'C', 'G', 'G', 'C', 'G', 'T', 'C', 'T', 'A', 'C', 'C', 'G', 'C', 'G', 'A'], ['T', 'A', 'C', 'G', 'C', 'C', 'C', 'T', 'C', 'G', 'A', 'A', 'G', 'C', 'A', 'T', 'G', 'G', 'A', 'A'], ['C', 'C', 'A', 'G', 'A', 'A', 'A', 'G', 'T', 'G', 'C', 'G', 'T', 'T', 'G', 'T', 'C', 'G', 'T', 'A']

In [374]:
(res_motif,res_fitness)=pso_motif(sequences,length(motif),10,100,6)

[12, 14, 13, 1, 3, 11]Any[['T', 'C', 'G', 'T', 'T', 'G'], ['G', 'C', 'G', 'G', 'T', 'G'], ['G', 'C', 'G', 'G', 'T', 'G'], ['G', 'C', 'G', 'T', 'T', 'G'], ['G', 'T', 'G', 'T', 'T', 'G'], ['G', 'G', 'G', 'T', 'T', 'C']][0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.6666666666666666 0.0 0.0 0.0 0.16666666666666666; 0.8333333333333334 0.16666666666666666 1.0 0.3333333333333333 0.0 0.8333333333333334; 0.16666666666666666 0.16666666666666666 0.0 0.6666666666666666 1.0 0.0]Dict('A' => 6, 'G' => 3, 'T' => 5, 'C' => 6)Dict('A' => 6, 'G' => 3, 'T' => 5, 'C' => 6)Dict('A' => 6, 'G' => 3, 'T' => 5, 'C' => 6)Dict('A' => 6, 'G' => 3, 'T' => 5, 'C' => 6)Dict('A' => 6, 'G' => 3, 'T' => 5, 'C' => 6)Dict('A' => 6, 'G' => 3, 'T' => 5, 'C' => 6)Dict('G' => 9, 'T' => 4, 'C' => 7)Dict('G' => 9, 'T' => 4, 'C' => 7)Dict('G' => 9, 'T' => 4, 'C' => 7)Dict('G' => 9, 'T' => 4, 'C' => 7)Dict('G' => 9, 'T' => 4, 'C' => 7)Dict('G' => 9, 'T' => 4, 'C' => 7)Dict('A' => 4, 'G' => 6, 'T' => 6, 'C' => 4)Dict('A' => 4, 'G' => 6, 'T' => 

(['C', 'G', 'T', 'C', 'G', 'C'], -0.6774227432057964)

In [353]:
aaaa=Any[['A', 'G', 'G', 'G', 'G', 'C'], ['G', 'G', 'C', 'G', 'G', 'T'], ['A', 'A', 'T', 'G', 'G', 'C'], ['G', 'G', 'G', 'A', 'G', 'C'], ['G', 'G', 'G', 'G', 'G', 'G'], ['G', 'G', 'T', 'G', 'G', 'T']]

6-element Vector{Any}:
 ['A', 'G', 'G', 'G', 'G', 'C']
 ['G', 'G', 'C', 'G', 'G', 'T']
 ['A', 'A', 'T', 'G', 'G', 'C']
 ['G', 'G', 'G', 'A', 'G', 'C']
 ['G', 'G', 'G', 'G', 'G', 'G']
 ['G', 'G', 'T', 'G', 'G', 'T']

In [None]:
aaaa[]

In [277]:
length(sequences[1])

20

In [282]:
23423.34>11

true

In [285]:
-Inf16

-Inf16

In [303]:
[countmap(i) for i in sequences]

10-element Vector{Dict{Char, Int64}}:
 Dict('A' => 6, 'G' => 3, 'T' => 5, 'C' => 6)
 Dict('G' => 9, 'T' => 4, 'C' => 7)
 Dict('A' => 4, 'G' => 6, 'T' => 6, 'C' => 4)
 Dict('A' => 3, 'G' => 7, 'T' => 3, 'C' => 7)
 Dict('A' => 6, 'G' => 5, 'T' => 3, 'C' => 6)
 Dict('A' => 5, 'G' => 6, 'T' => 5, 'C' => 4)
 Dict('A' => 4, 'G' => 1, 'T' => 7, 'C' => 8)
 Dict('A' => 4, 'G' => 6, 'T' => 6, 'C' => 4)
 Dict('A' => 3, 'G' => 6, 'T' => 6, 'C' => 5)
 Dict('A' => 7, 'G' => 6, 'T' => 3, 'C' => 4)

In [304]:
zeros(Float64,2,3)

2×3 Matrix{Float64}:
 0.0  0.0  0.0
 0.0  0.0  0.0

In [309]:
sequences[1:10][1]

20-element Vector{Char}:
 'C': ASCII/Unicode U+0043 (category Lu: Letter, uppercase)
 'A': ASCII/Unicode U+0041 (category Lu: Letter, uppercase)
 'G': ASCII/Unicode U+0047 (category Lu: Letter, uppercase)
 'T': ASCII/Unicode U+0054 (category Lu: Letter, uppercase)
 'T': ASCII/Unicode U+0054 (category Lu: Letter, uppercase)
 'C': ASCII/Unicode U+0043 (category Lu: Letter, uppercase)
 'C': ASCII/Unicode U+0043 (category Lu: Letter, uppercase)
 'A': ASCII/Unicode U+0041 (category Lu: Letter, uppercase)
 'A': ASCII/Unicode U+0041 (category Lu: Letter, uppercase)
 'T': ASCII/Unicode U+0054 (category Lu: Letter, uppercase)
 'G': ASCII/Unicode U+0047 (category Lu: Letter, uppercase)
 'T': ASCII/Unicode U+0054 (category Lu: Letter, uppercase)
 'C': ASCII/Unicode U+0043 (category Lu: Letter, uppercase)
 'A': ASCII/Unicode U+0041 (category Lu: Letter, uppercase)
 'T': ASCII/Unicode U+0054 (category Lu: Letter, uppercase)
 'C': ASCII/Unicode U+0043 (category Lu: Letter, uppercase)
 'G': ASCII/Uni

In [314]:
[countmap(j) for j in countmapt(sequences[1:]) ]

20-element Vector{Char}:
 'C': ASCII/Unicode U+0043 (category Lu: Letter, uppercase)
 'A': ASCII/Unicode U+0041 (category Lu: Letter, uppercase)
 'G': ASCII/Unicode U+0047 (category Lu: Letter, uppercase)
 'T': ASCII/Unicode U+0054 (category Lu: Letter, uppercase)
 'T': ASCII/Unicode U+0054 (category Lu: Letter, uppercase)
 'C': ASCII/Unicode U+0043 (category Lu: Letter, uppercase)
 'C': ASCII/Unicode U+0043 (category Lu: Letter, uppercase)
 'A': ASCII/Unicode U+0041 (category Lu: Letter, uppercase)
 'A': ASCII/Unicode U+0041 (category Lu: Letter, uppercase)
 'T': ASCII/Unicode U+0054 (category Lu: Letter, uppercase)
 'G': ASCII/Unicode U+0047 (category Lu: Letter, uppercase)
 'T': ASCII/Unicode U+0054 (category Lu: Letter, uppercase)
 'C': ASCII/Unicode U+0043 (category Lu: Letter, uppercase)
 'A': ASCII/Unicode U+0041 (category Lu: Letter, uppercase)
 'T': ASCII/Unicode U+0054 (category Lu: Letter, uppercase)
 'C': ASCII/Unicode U+0043 (category Lu: Letter, uppercase)
 'G': ASCII/Uni

In [330]:
a,b = consensus_score(sequences,[countmap(i) for i in sequences],10)

[0.1 0.4 0.3 0.1 0.1 0.1 0.2 0.2 0.1 0.3 0.1 0.4 0.0 0.2 0.2 0.1 0.2 0.2 0.4 0.5; 0.6 0.3 0.2 0.1 0.4 0.4 0.4 0.4 0.3 0.2 0.3 0.1 0.4 0.2 0.2 0.3 0.1 0.3 0.0 0.3; 0.1 0.1 0.3 0.6 0.1 0.4 0.3 0.2 0.3 0.4 0.4 0.3 0.2 0.1 0.2 0.1 0.6 0.4 0.4 0.0; 0.2 0.2 0.2 0.2 0.4 0.1 0.1 0.2 0.3 0.1 0.2 0.2 0.4 0.5 0.4 0.5 0.1 0.1 0.2 0.2]

(-18.944911458728036, ['C', 'A', 'A', 'G', 'C', 'C', 'C', 'C', 'C', 'G', 'G', 'A', 'C', 'T', 'T', 'T', 'G', 'G', 'A', 'A'])

In [331]:
a

-18.944911458728036

In [332]:
b

20-element Vector{Char}:
 'C': ASCII/Unicode U+0043 (category Lu: Letter, uppercase)
 'A': ASCII/Unicode U+0041 (category Lu: Letter, uppercase)
 'A': ASCII/Unicode U+0041 (category Lu: Letter, uppercase)
 'G': ASCII/Unicode U+0047 (category Lu: Letter, uppercase)
 'C': ASCII/Unicode U+0043 (category Lu: Letter, uppercase)
 'C': ASCII/Unicode U+0043 (category Lu: Letter, uppercase)
 'C': ASCII/Unicode U+0043 (category Lu: Letter, uppercase)
 'C': ASCII/Unicode U+0043 (category Lu: Letter, uppercase)
 'C': ASCII/Unicode U+0043 (category Lu: Letter, uppercase)
 'G': ASCII/Unicode U+0047 (category Lu: Letter, uppercase)
 'G': ASCII/Unicode U+0047 (category Lu: Letter, uppercase)
 'A': ASCII/Unicode U+0041 (category Lu: Letter, uppercase)
 'C': ASCII/Unicode U+0043 (category Lu: Letter, uppercase)
 'T': ASCII/Unicode U+0054 (category Lu: Letter, uppercase)
 'T': ASCII/Unicode U+0054 (category Lu: Letter, uppercase)
 'T': ASCII/Unicode U+0054 (category Lu: Letter, uppercase)
 'G': ASCII/Uni