https://www.kaggle.com/datasets/anuragupadhyaya/anticancer-peptides-data-set

In [42]:
using Random

"""
Construct a binary vector. By default 10000 elements long.
"""
bitHDV(N::Int=10000) = bitrand(N)


"""
Bundles binary hyperdimensional vectors based on the element-wise majority rule.
"""
function bitadd(vectors::BitVector ...)
    v = reduce(.+, vectors)
    n = length(vectors) / 2
    x = [i > n ? 1 : i < n ? 0 : rand(0:1) for i in v]
    return convert(BitVector, x)
end


"""
Binds binary hyperdimensional vectors based on an element-wise XOR gate.
"""
bitbind(vectors::BitVector ...) =  reduce(.⊻, vectors)


"""
Permutes a binary hyperdimensional vector by an adjustable circular shift.
"""
bitperm(vector::BitVector, k::Int=1) = circshift(vector, k)


"""
Calculates the Hamming distance between two binary vectors.
"""
hamming(x::BitVector, y::BitVector) = sum(x .!= y)/length(x)

hamming

In [43]:
using DataFrames, CSV
# Read the breast-cancer anticancer-peptide CSV into a DataFrame. The path is
# relative to the current working directory, so the `ProtExdata` folder must
# be reachable from wherever this cell is run.
data = CSV.read("ProtExdata/ACPs_Breast_cancer.csv", DataFrame)

Unnamed: 0_level_0,ID,sequence,class
Unnamed: 0_level_1,Int64,String,String31
1,1,AAWKWAWAKKWAKAKKWAKAA,mod. active
2,2,AIGKFLHSAKKFGKAFVGEIMNS,mod. active
3,3,AWKKWAKAWKWAKAKWWAKAA,mod. active
4,4,ESFSDWWKLLAE,mod. active
5,5,ETFADWWKLLAE,mod. active
6,6,ETFSDWWKLLAE,mod. active
7,7,FAKALAKLAKKLL,mod. active
8,8,FAKALKALLKALKAL,inactive - exp
9,9,FAKFLAKFLKKAL,mod. active
10,10,FAKIIAKIAKIAKKIL,inactive - exp


In [44]:
# Inspect which activity labels occur in the data set.
unique(data.class)

# Encode each activity label as an integer code:
#   "very active" => 1, "mod. active" => 2, "inactive - exp" => 3,
#   any other label => 4.
class_num = Vector{Int}(undef, length(data.class))
for (row, label) in enumerate(data.class)
    if label == "very active"
        class_num[row] = 1
    elseif label == "mod. active"
        class_num[row] = 2
    elseif label == "inactive - exp"
        class_num[row] = 3
    else
        class_num[row] = 4
    end
end

# Store the integer codes alongside the original labels.
data[!, :class_num] = class_num

947-element Vector{Int64}:
 2
 2
 2
 2
 2
 2
 2
 3
 2
 3
 3
 2
 3
 ⋮
 4
 4
 4
 4
 4
 4
 4
 4
 4
 4
 4
 4

In [52]:
# One-letter amino-acid codes (the 20 standard residues followed by
# O, U, B, J, Z and X), in the order their hypervectors are drawn.
AA_list = collect("ARNDCQEGHILKMFPSTWYVOUBJZX")

# Draw one independent random hypervector per amino-acid code.
AA_hdv = map(_ -> bitHDV(), AA_list)

# Lookup table: amino-acid code => its hypervector.
AA_dict = Dict(AA_list .=> AA_hdv)

Dict{Char, BitVector} with 26 entries:
  'E' => [1, 1, 1, 1, 0, 0, 0, 0, 1, 1  …  1, 0, 0, 0, 0, 1, 1, 0, 1, 0]
  'Z' => [0, 1, 0, 1, 1, 0, 1, 0, 1, 0  …  1, 1, 1, 0, 0, 1, 0, 1, 1, 0]
  'X' => [1, 0, 1, 0, 1, 1, 0, 1, 1, 0  …  1, 0, 1, 0, 1, 0, 1, 1, 0, 0]
  'C' => [0, 0, 1, 1, 1, 0, 1, 1, 1, 0  …  1, 1, 0, 1, 0, 0, 1, 1, 0, 1]
  'B' => [1, 1, 1, 1, 1, 0, 0, 1, 0, 1  …  0, 0, 0, 1, 0, 0, 0, 0, 0, 0]
  'D' => [1, 1, 1, 1, 0, 1, 1, 0, 1, 1  …  1, 1, 0, 1, 0, 1, 0, 0, 1, 0]
  'A' => [0, 1, 0, 1, 0, 0, 1, 0, 0, 1  …  1, 1, 0, 0, 1, 1, 0, 1, 1, 1]
  'R' => [0, 0, 0, 0, 0, 1, 0, 1, 0, 0  …  1, 0, 1, 0, 0, 1, 0, 0, 1, 1]
  'G' => [1, 0, 1, 0, 0, 0, 0, 1, 0, 1  …  1, 1, 1, 1, 1, 0, 0, 0, 1, 0]
  'N' => [1, 1, 1, 1, 0, 0, 0, 1, 1, 1  …  1, 1, 0, 1, 1, 0, 0, 1, 0, 0]
  'Q' => [1, 0, 1, 1, 1, 0, 0, 1, 1, 1  …  0, 1, 1, 0, 1, 0, 1, 1, 1, 1]
  'M' => [1, 0, 1, 0, 1, 0, 0, 0, 0, 0  …  1, 1, 1, 0, 0, 1, 1, 1, 1, 1]
  'K' => [0, 1, 0, 0, 0, 0, 1, 1, 1, 1  …  0, 1, 0, 0, 0, 0, 1, 0, 1, 0]
  'F' => [1,

In [55]:
# Build a hypervector for every ordered amino-acid trimer. Position is encoded
# by circularly shifting the second residue by 1 and the third by 2 before the
# three residue vectors are XOR-bound together.
trimer_hdvs = Dict(
    string(first_aa, second_aa, third_aa) => bitbind(
        AA_dict[first_aa],
        bitperm(AA_dict[second_aa], 1),
        bitperm(AA_dict[third_aa], 2),
    )
    for first_aa in AA_list
    for second_aa in AA_list
    for third_aa in AA_list
)

Dict{String, BitVector} with 17576 entries:
  "HTY" => [1, 1, 1, 0, 1, 1, 1, 0, 1, 0  …  0, 0, 1, 0, 1, 0, 0, 0, 0, 1]
  "HBJ" => [0, 1, 1, 1, 0, 1, 1, 1, 1, 1  …  1, 1, 1, 1, 0, 0, 1, 0, 1, 1]
  "MSW" => [1, 1, 1, 0, 0, 1, 1, 0, 0, 0  …  0, 0, 0, 1, 1, 0, 0, 0, 0, 1]
  "BKI" => [1, 1, 0, 1, 0, 0, 1, 0, 0, 1  …  1, 0, 0, 1, 0, 0, 0, 0, 1, 0]
  "ETI" => [1, 1, 0, 1, 0, 1, 0, 0, 1, 1  …  0, 1, 1, 0, 0, 0, 0, 1, 1, 1]
  "OLQ" => [0, 1, 0, 0, 0, 0, 1, 0, 0, 1  …  0, 0, 1, 0, 1, 1, 0, 1, 0, 1]
  "NMD" => [1, 0, 0, 1, 1, 0, 0, 0, 0, 1  …  1, 1, 0, 1, 1, 1, 1, 1, 1, 1]
  "YVC" => [0, 1, 1, 1, 1, 0, 1, 1, 1, 1  …  0, 0, 1, 1, 0, 1, 1, 1, 1, 1]
  "QPE" => [1, 0, 1, 0, 0, 0, 1, 1, 0, 0  …  1, 1, 1, 1, 0, 0, 1, 1, 0, 0]
  "KIW" => [1, 0, 1, 0, 1, 1, 0, 1, 0, 1  …  0, 0, 1, 1, 1, 0, 0, 1, 1, 0]
  "BPX" => [0, 1, 1, 1, 0, 1, 0, 0, 1, 1  …  1, 0, 0, 0, 0, 0, 1, 1, 1, 0]
  "NLG" => [0, 0, 0, 0, 0, 0, 0, 0, 0, 0  …  0, 1, 0, 0, 0, 1, 0, 0, 0, 0]
  "YWG" => [1, 0, 1, 1, 0, 1, 1, 0, 0, 1  …  1, 1, 1, 1,