In [None]:
# import the packages used in this notebook
import NBInclude
import Random
import StatsBase

In [None]:
# import the code from the l,d motif simulation notebook
NBInclude.@nbinclude("simulate_ld_motif.ipynb")

In [None]:
# Seeding is currently disabled, so each run may produce different test data.
# Uncomment the next line to seed Julia's global RNG for repeatable runs
# (presumably GenerateTestData_ld draws from the global RNG -- TODO confirm).
# Random.seed!(100)
# Generate a small test data set and unpack the result into four variables.
# Argument order as noted by the original author (the function is defined in
# the included notebook and is not verified here):
# NumberOfSequences, LengthMotif, LengthSequences, Distance
motif, motif_starts, motifs_implanted, sequences = GenerateTestData_ld(20, 5, 10, 1)

In [None]:
# Implement the standard motif-detection algorithm from Bioinformatics 2,
# given that we know the length of the motif.
"""
    FindMotifs(Sequences, MotifLength; seed_background_weight = 0.25,
               refine_background_weight = 0.05)

Guess the start position of a motif of known length in every sequence.

The search runs in two stages:

1. Every window of length `MotifLength` in the first sequence is turned into a
   seed model. At each motif position the model is a mixture of the background
   base frequencies (weight `seed_background_weight`) and a point mass on the
   base observed in the seed window. Each seed model scores every window of
   every sequence with a log-likelihood ratio against the background, and the
   seed whose best-window scores, summed over all sequences, are largest wins.
2. The best window of each sequence under the winning seed is used to build a
   refined model (weight `refine_background_weight` on the background, the rest
   shared equally between the sequences). Every window is rescored against it.

# Arguments
- `Sequences`: non-empty collection of sequences. Each sequence must support
  integer indexing starting at 1 and `length`, and the collection must be
  reducible with `vcat` (for example a vector of character vectors).
- `MotifLength`: length of the motif, at least 1 and no longer than the shortest
  sequence.

# Keywords
- `seed_background_weight`: background share in the seed models, in `[0, 1]`.
- `refine_background_weight`: background share in the refined model, in `[0, 1]`.

# Returns
A vector holding, for each sequence in order, the 1-based start index of its
highest-scoring window under the refined model (first index wins on ties).

# Throws
- `ArgumentError` if `Sequences` is empty, `MotifLength` is not positive, a
  sequence is shorter than `MotifLength`, or a weight lies outside `[0, 1]`.
"""
function FindMotifs(
    Sequences, MotifLength; seed_background_weight = 0.25, refine_background_weight = 0.05
)
    isempty(Sequences) && throw(ArgumentError("Sequences must not be empty"))
    MotifLength >= 1 ||
        throw(ArgumentError("MotifLength must be positive, got $MotifLength"))
    shortest = minimum(length, Sequences)
    shortest >= MotifLength || throw(
        ArgumentError(
            "every sequence needs at least $MotifLength elements, shortest has $shortest",
        ),
    )
    for weight in (seed_background_weight, refine_background_weight)
        0 <= weight <= 1 ||
            throw(ArgumentError("background weights must lie in [0, 1], got $weight"))
    end

    # Overall base frequencies across all sequences; logs are taken once up front.
    background_rates = StatsBase.proportionmap(reduce(vcat, Sequences))
    log_background = Dict(base => log(rate) for (base, rate) in background_rates)

    # Seeds are taken from the first sequence only.
    starter = first(Sequences)
    n_seeds = length(starter) - MotifLength + 1
    # Sized by the longest sequence so sequences of unequal length are handled.
    n_windows = maximum(length, Sequences) - MotifLength + 1

    # Stage 1: one score matrix (sequence x window start) per seed window.
    seed_scores = map(1:n_seeds) do k
        seed_model = _mixture_model(
            background_rates, (starter,), (k,), MotifLength, seed_background_weight
        )
        return _score_windows(
            seed_model, log_background, Sequences, MotifLength, n_windows
        )
    end
    # A seed is judged by the sum over sequences of each sequence's best window.
    seed_totals = [sum(maximum(scores; dims = 2)) for scores in seed_scores]
    guessed_pos = _best_positions(seed_scores[argmax(seed_totals)])

    # Stage 2: rebuild the model from the guessed windows and rescore everything.
    tentative_model = _mixture_model(
        background_rates, Sequences, guessed_pos, MotifLength, refine_background_weight
    )
    refined_scores = _score_windows(
        tentative_model, log_background, Sequences, MotifLength, n_windows
    )
    return _best_positions(refined_scores)
end

# Build a position-specific model: at each motif position `background_weight` of
# the probability mass follows `background_rates`, and the remainder is shared
# equally between the bases found at that position in the windows that start at
# `starts[n]` in `seqs[n]`.
function _mixture_model(background_rates, seqs, starts, motif_length, background_weight)
    share = (1 - background_weight) / length(seqs)
    model = [
        Dict(base => background_weight * rate for (base, rate) in background_rates) for
        _ in 1:motif_length
    ]
    for (seq, start) in zip(seqs, starts), i in 1:motif_length
        model[i][seq[start + i - 1]] += share
    end
    return model
end

# Score every window of every sequence as 2 * (log P(window | model) -
# log P(window | background)). Rows index sequences, columns index window start
# positions. Starts that do not fit into a shorter sequence stay at -Inf so they
# can never be selected as a maximum.
function _score_windows(model, log_background, Sequences, motif_length, n_windows)
    log_model = [Dict(base => log(p) for (base, p) in position) for position in model]
    scores = fill(-Inf, length(Sequences), n_windows)
    for (i_seq, seq) in enumerate(Sequences)
        for start in 1:(length(seq) - motif_length + 1)
            a = sum(log_model[i][seq[start + i - 1]] for i in 1:motif_length)
            b = sum(log_background[seq[start + i - 1]] for i in 1:motif_length)
            scores[i_seq, start] = 2 * (a - b)
        end
    end
    return scores
end

# Column index of the largest entry in each row (first index wins on ties).
_best_positions(scores) = [argmax(view(scores, row, :)) for row in axes(scores, 1)]

In [None]:
# Guess the motif start position in each of the `sequences`, passing the length
# of `motif` as the known motif length.
FindMotifs(sequences, length(motif))

In [None]:
# Estimate how often FindMotifs recovers the implanted start positions.
# Each trial generates a fresh data set and records the fraction of sequences
# whose guessed start equals the recorded one.
begin
    n = 10
    # The do-block is a closure, so the per-trial data (`motif`, `sequences`, ...)
    # stays local and does not overwrite the notebook-level variables of the same
    # names that other cells read.
    summer = map(1:n) do _
        motif, motif_starts, _, sequences = GenerateTestData_ld(100, 14, 100, 2)
        return StatsBase.mean(FindMotifs(sequences, length(motif)) .== motif_starts)
    end
    println(summer)
end