In [40]:
include("../src/HDC.jl")
include("../src/math.jl")
include("../src/experimental.jl")
using DataFrames
using CSV
using JLD
using MultivariateStats
using Evolutionary
using Distances
using Random
using BioAlignments

function fitness_func(individual, distance_matrix, dim)
    num_vectors = size(distance_matrix, 1)
    binary_vectors = transpose(reshape(individual, (dim, num_vectors)))
    distances = zeros((num_vectors, num_vectors))
    
    Threads.@threads for i in 1:num_vectors
        for j in 1:num_vectors
            if i != j
                distances[i, j] = Distances.hamming(Vector{Int}(binary_vectors[i, :]), Vector{Int}(binary_vectors[j, :]))
            end
        end
    end
    
    return sum((distances - distance_matrix).^2)
end

function generate_initial_population(population_size, dim, num_vectors)
    return [bitrand(dim*num_vectors) for i in 1:population_size]
end

function generate_binary_vectors(dim, num_vectors, distance_matrix, max_generations=300, population_size=100, mutation_rate = 0.05)

    opts = Evolutionary.Options(iterations=max_generations, parallelization=:thread)

    ga = GA(
        populationSize=population_size,
        crossoverRate=0.99,
        mutationRate=mutation_rate,
        crossover=SPX,
        mutation=flip
    )
    
    objective = individual -> fitness_func(individual, distance_matrix, dim)

    initial_population = generate_initial_population(population_size, dim, num_vectors)

    result = Evolutionary.optimize(objective, initial_population, ga, opts)
    best_individual = result.minimizer
    best_solution = reshape(best_individual, (dim, num_vectors))

    return transpose(best_solution)
end

generate_binary_vectors (generic function with 4 methods)

In [47]:
using Printf

function similarity_error(desired_distance_matrix, calculated_distance_matrix)
    return sum((calculated_distance_matrix .- desired_distance_matrix).^2)
end

function find_best_binary_vectors(dim, num_vectors, distance_matrix, num_trials, max_generations=300, population_size=100, mutation_rate = 0.05)
    best_error = Inf
    best_vectors = nothing
    best_calculated_distance_matrix = nothing

    for trial in 1:num_trials
        binary_vectors = generate_binary_vectors(dim, num_vectors, distance_matrix, max_generations, population_size, mutation_rate)
        calculated_distance_matrix = zeros((num_vectors, num_vectors))

        for i in 1:num_vectors
            for j in 1:num_vectors
                if i != j
                    hamming_distance = hamming(Vector{Int}(binary_vectors[i, :]), Vector{Int}(binary_vectors[j, :]))
                    calculated_distance_matrix[i, j] = hamming_distance / dim
                end
            end
        end

        error = similarity_error(distance_matrix, calculated_distance_matrix)

        if error < best_error
            best_error = error
            best_vectors = binary_vectors
            best_calculated_distance_matrix = calculated_distance_matrix
        end
    end

    return best_vectors, best_calculated_distance_matrix, best_error
end

find_best_binary_vectors (generic function with 4 methods)

In [None]:
# Define the desired pairwise similarities between the 20 vectors
distance_matrix = [
    0.0 0.2 0.4 0.6 0.8;
    0.2 0.0 0.3 0.5 0.7;
    0.4 0.3 0.0 0.2 0.4;
    0.6 0.5 0.2 0.0 0.2;
    0.8 0.7 0.4 0.2 0.0]

# Generate binary vectors using the desired pairwise similarities
dim = 10000
num_vectors = size(distance_matrix, 1)
num_trials = 30

best_vectors, best_calculated_distance_matrix, best_error = find_best_binary_vectors(dim, num_vectors, distance_matrix, num_trials)

println("Desired pairwise similarities:")
println(distance_matrix)

println("\nBest generated binary vectors:")
println(best_vectors)

println("\nBest calculated pairwise similarities:")
println(best_calculated_distance_matrix)

@printf("\nBest error: %f", best_error)

In [10]:
AA_list = ['A','C','D','E','F','G','H','I','K','L','M','N','P','Q','R','S','T','V','W','Y']

20-element Vector{Char}:
 'A': ASCII/Unicode U+0041 (category Lu: Letter, uppercase)
 'C': ASCII/Unicode U+0043 (category Lu: Letter, uppercase)
 'D': ASCII/Unicode U+0044 (category Lu: Letter, uppercase)
 'E': ASCII/Unicode U+0045 (category Lu: Letter, uppercase)
 'F': ASCII/Unicode U+0046 (category Lu: Letter, uppercase)
 'G': ASCII/Unicode U+0047 (category Lu: Letter, uppercase)
 'H': ASCII/Unicode U+0048 (category Lu: Letter, uppercase)
 'I': ASCII/Unicode U+0049 (category Lu: Letter, uppercase)
 'K': ASCII/Unicode U+004B (category Lu: Letter, uppercase)
 'L': ASCII/Unicode U+004C (category Lu: Letter, uppercase)
 'M': ASCII/Unicode U+004D (category Lu: Letter, uppercase)
 'N': ASCII/Unicode U+004E (category Lu: Letter, uppercase)
 'P': ASCII/Unicode U+0050 (category Lu: Letter, uppercase)
 'Q': ASCII/Unicode U+0051 (category Lu: Letter, uppercase)
 'R': ASCII/Unicode U+0052 (category Lu: Letter, uppercase)
 'S': ASCII/Unicode U+0053 (category Lu: Letter, uppercase)
 'T': ASCII/Uni

In [21]:
simmat = copy(GRANTHAM1974)
simmatblo = copy(BLOSUM62)

SubstitutionMatrix{BioSymbols.AminoAcid, Int64}:
     A  R  N  D  C  Q  E  G  H  I  L  K  M  F  P  S  T  W  Y  V  O  U  B  J  Z  X  *
  A  4 -1 -2 -2  0 -1 -1  0 -2 -1 -1 -1 -1 -2 -1  1  0 -3 -2  0  0̲  0̲ -2  0̲ -1  0 -4
  R -1  5  0 -2 -3  1  0 -2  0 -3 -2  2 -1 -3 -2 -1 -1 -3 -2 -3  0̲  0̲ -1  0̲  0 -1 -4
  N -2  0  6  1 -3  0  0  0  1 -3 -3  0 -2 -3 -2  1  0 -4 -2 -3  0̲  0̲  3  0̲  0 -1 -4
  D -2 -2  1  6 -3  0  2 -1 -1 -3 -4 -1 -3 -3 -1  0 -1 -4 -3 -3  0̲  0̲  4  0̲  1 -1 -4
  C  0 -3 -3 -3  9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1  0̲  0̲ -3  0̲ -3 -2 -4
  Q -1  1  0  0 -3  5  2 -2  0 -3 -2  1  0 -3 -1  0 -1 -2 -1 -2  0̲  0̲  0  0̲  3 -1 -4
  E -1  0  0  2 -4  2  5 -2  0 -3 -3  1 -2 -3 -1  0 -1 -3 -2 -2  0̲  0̲  1  0̲  4 -1 -4
  G  0 -2  0 -1 -3 -2 -2  6 -2 -4 -4 -2 -3 -3 -2  0 -2 -2 -3 -3  0̲  0̲ -1  0̲ -2 -1 -4
  H -2  0  1 -1 -3  0  0 -2  8 -3 -3 -1 -2 -1 -2 -1 -2 -2  2 -3  0̲  0̲  0  0̲  0 -1 -4
  I -1 -3 -3 -3 -1 -3 -3 -4 -3  4  2 -3  1  0 -3 -2 -1 -3 -1  3  0̲  0̲ -3

In [22]:
grantham = [[simmat[i, j] for i in AA_list] for j in AA_list]
grantham = hcat(grantham...)

blosum = [[simmatblo[i, j] for i in AA_list] for j in AA_list]
blosum = hcat(blosum...)

20×20 Matrix{Int64}:
  4   0  -2  -1  -2   0  -2  -1  -1  …  -2  -1  -1  -1   1   0   0  -3  -2
  0   9  -3  -4  -2  -3  -3  -1  -3     -3  -3  -3  -3  -1  -1  -1  -2  -2
 -2  -3   6   2  -3  -1  -1  -3  -1      1  -1   0  -2   0  -1  -3  -4  -3
 -1  -4   2   5  -3  -2   0  -3   1      0  -1   2   0   0  -1  -2  -3  -2
 -2  -2  -3  -3   6  -3  -1   0  -3     -3  -4  -3  -3  -2  -2  -1   1   3
  0  -3  -1  -2  -3   6  -2  -4  -2  …   0  -2  -2  -2   0  -2  -3  -2  -3
 -2  -3  -1   0  -1  -2   8  -3  -1      1  -2   0   0  -1  -2  -3  -2   2
 -1  -1  -3  -3   0  -4  -3   4  -3     -3  -3  -3  -3  -2  -1   3  -3  -1
 -1  -3  -1   1  -3  -2  -1  -3   5      0  -1   1   2   0  -1  -2  -3  -2
 -1  -1  -4  -3   0  -4  -3   2  -2     -3  -3  -2  -2  -2  -1   1  -2  -1
 -1  -1  -3  -2   0  -3  -2   1  -1  …  -2  -2   0  -1  -1  -1   1  -1  -1
 -2  -3   1   0  -3   0   1  -3   0      6  -2   0   0   1   0  -3  -4  -2
 -1  -3  -1  -1  -4  -2  -2  -3  -1     -2   7  -1  -2  -1  -1  -2  -4  -3
 -1 

In [27]:
function norm_mat(matrix, zero=false)
    if zero == true
        min_value = 0
    else 
        min_value = minimum(matrix)
    end
    
    max_value = maximum(matrix)

    normalized_matrix = (matrix .- min_value) ./ (max_value - min_value)
    return normalized_matrix
end

norm_mat (generic function with 2 methods)

In [31]:
grantham

20×20 Matrix{Int64}:
   0  195  126  107  113   60   86   94  …   91  112   99   58   64  148  112
 195    0  154  170  205  159  174  198     154  180  112  149  192  215  194
 126  154    0   45  177   94   81  168      61   96   65   85  152  181  160
 107  170   45    0  140   98   40  134      29   54   80   65  121  152  122
 113  205  177  140    0  153  100   21     116   97  155  103   50   40   22
  60  159   94   98  153    0   98  135  …   87  125   56   59  109  184  147
  86  174   81   40  100   98    0   94      24   29   89   47   84  115   83
  94  198  168  134   21  135   94    0     109   97  142   89   29   61   33
 106  202  101   56  102  127   32  102      53   26  121   78   97  110   85
  96  198  172  138   22  138   99    5     113  102  145   92   32   61   36
  84  196  160  126   28  127   87   10  …  101   91  135   81   21   67   36
 111  139   23   42  158   80   68  149      46   86   46   65  133  174  143
  27  169  108   93  114   42   77   95    

In [39]:
norm_blosum = broadcast(abs, norm_mat(blosum) .-1)
setindex!.(Ref(norm_blosum), 0.0, 1:20, 1:20)
norm_grant = norm_mat(grantham, true)


20×20 Matrix{Float64}:
 0.0       0.906977  0.586047  0.497674  …  0.297674   0.688372  0.52093
 0.906977  0.0       0.716279  0.790698     0.893023   1.0       0.902326
 0.586047  0.716279  0.0       0.209302     0.706977   0.84186   0.744186
 0.497674  0.790698  0.209302  0.0          0.562791   0.706977  0.567442
 0.525581  0.953488  0.823256  0.651163     0.232558   0.186047  0.102326
 0.27907   0.739535  0.437209  0.455814  …  0.506977   0.855814  0.683721
 0.4       0.809302  0.376744  0.186047     0.390698   0.534884  0.386047
 0.437209  0.92093   0.781395  0.623256     0.134884   0.283721  0.153488
 0.493023  0.939535  0.469767  0.260465     0.451163   0.511628  0.395349
 0.446512  0.92093   0.8       0.64186      0.148837   0.283721  0.167442
 0.390698  0.911628  0.744186  0.586047  …  0.0976744  0.311628  0.167442
 0.516279  0.646512  0.106977  0.195349     0.618605   0.809302  0.665116
 0.125581  0.786047  0.502326  0.432558     0.316279   0.683721  0.511628
 0.423256  0.716

In [48]:
distance_matrix = norm_blosum
# Generate binary vectors using the desired pairwise similarities
dim = 10000
num_vectors = size(distance_matrix, 1)
num_trials = 100

best_vectors, best_calculated_distance_matrix, best_error = find_best_binary_vectors(dim, num_vectors, distance_matrix, num_trials, 500, 500)

println("Desired pairwise similarities:")
println(distance_matrix)

println("\nBest generated binary vectors:")
println(best_vectors)

println("\nBest calculated pairwise similarities:")
println(best_calculated_distance_matrix)

@printf("\nBest error: %f", best_error)

TaskFailedException: TaskFailedException

    nested task error: TaskFailedException
    Stacktrace:
      [1] wait
        @ ./task.jl:345 [inlined]
      [2] threading_run(fun::var"#146#threadsfor_fun#133"{var"#146#threadsfor_fun#132#134"{Matrix{Float64}, Transpose{Bool, BitMatrix}, Int64, UnitRange{Int64}}}, static::Bool)
        @ Base.Threads ./threadingconstructs.jl:38
      [3] macro expansion
        @ ./threadingconstructs.jl:89 [inlined]
      [4] fitness_func(individual::BitVector, distance_matrix::Matrix{Float64}, dim::Int64)
        @ Main ~/Master-Thesis/notebooks/AA_em2.ipynb:18
      [5] #137
        @ ~/Master-Thesis/notebooks/AA_em2.ipynb:45 [inlined]
      [6] value
        @ ~/.julia/packages/Evolutionary/65hL6/src/api/objective.jl:52 [inlined]
      [7] macro expansion
        @ ~/.julia/packages/Evolutionary/65hL6/src/api/objective.jl:90 [inlined]
      [8] (::Evolutionary.var"#37#threadsfor_fun#10"{Evolutionary.var"#37#threadsfor_fun#9#11"{EvolutionaryObjective{var"#137#138"{Int64, Matrix{Float64}}, Float64, BitVector, Val{:thread}}, Vector{Float64}, Vector{BitVector}, UnitRange{Int64}}})(tid::Int64; onethread::Bool)
        @ Evolutionary ./threadingconstructs.jl:84
      [9] #37#threadsfor_fun
        @ ./threadingconstructs.jl:51 [inlined]
     [10] (::Base.Threads.var"#1#2"{Evolutionary.var"#37#threadsfor_fun#10"{Evolutionary.var"#37#threadsfor_fun#9#11"{EvolutionaryObjective{var"#137#138"{Int64, Matrix{Float64}}, Float64, BitVector, Val{:thread}}, Vector{Float64}, Vector{BitVector}, UnitRange{Int64}}}, Int64})()
        @ Base.Threads ./threadingconstructs.jl:30
    
        nested task error: InterruptException:
        Stacktrace:
          [1] Array
            @ ./boot.jl:459 [inlined]
          [2] BitArray
            @ ./bitarray.jl:37 [inlined]
          [3] BitArray
            @ ./bitarray.jl:71 [inlined]
          [4] similar
            @ ./bitarray.jl:372 [inlined]
          [5] similar
            @ ~/.julia/juliaup/julia-1.8.5+0.x64.linux.gnu/share/julia/stdlib/v1.8/LinearAlgebra/src/adjtrans.jl:212 [inlined]
          [6] similar
            @ ./abstractarray.jl:795 [inlined]
          [7] _unsafe_getindex(::IndexCartesian, ::Transpose{Bool, BitMatrix}, ::Int64, ::Base.Slice{Base.OneTo{Int64}})
            @ Base ./multidimensional.jl:887
          [8] _getindex
            @ ./multidimensional.jl:875 [inlined]
          [9] getindex
            @ ./abstractarray.jl:1241 [inlined]
         [10] macro expansion
            @ ~/Master-Thesis/notebooks/AA_em2.ipynb:21 [inlined]
         [11] (::var"#146#threadsfor_fun#133"{var"#146#threadsfor_fun#132#134"{Matrix{Float64}, Transpose{Bool, BitMatrix}, Int64, UnitRange{Int64}}})(tid::Int64; onethread::Bool)
            @ Main ./threadingconstructs.jl:84
         [12] #146#threadsfor_fun
            @ ./threadingconstructs.jl:51 [inlined]
         [13] (::Base.Threads.var"#1#2"{var"#146#threadsfor_fun#133"{var"#146#threadsfor_fun#132#134"{Matrix{Float64}, Transpose{Bool, BitMatrix}, Int64, UnitRange{Int64}}}, Int64})()
            @ Base.Threads ./threadingconstructs.jl:30

In [None]:
distance_matrix = norm_grantham
# Generate binary vectors using the desired pairwise similarities
dim = 10000
num_vectors = size(distance_matrix, 1)
num_trials = 100

gr_best_vectors, gr_best_calculated_distance_matrix, gr_best_error = find_best_binary_vectors(dim, num_vectors, distance_matrix, num_trials, 500, 500)

println("Desired pairwise similarities:")
println(distance_matrix)

println("\nBest generated binary vectors:")
println(gr_best_vectors)

println("\nBest calculated pairwise similarities:")
println(gr_best_calculated_distance_matrix)

@printf("\nBest error: %f", gr_best_error)