https://www.kaggle.com/datasets/anuragupadhyaya/anticancer-peptides-data-set

In [2]:
using Random

"""
Construct a binary vector. By default 10000 elements long.
"""
bitHDV(N::Int=10000) = bitrand(N)


"""
Bundles binary hyperdimensional vectors based on the element-wise majority rule.
"""
function bitadd(vectors::BitVector ...)
    v = reduce(.+, vectors)
    n = length(vectors) / 2
    x = [i > n ? 1 : i < n ? 0 : rand(0:1) for i in v]
    return convert(BitVector, x)
end


"""
Binds binary hyperdimensional vectors based on an element-wise XOR gate.
"""
bitbind(vectors::BitVector ...) =  reduce(.⊻, vectors)


"""
Permutes a binary hyperdimensional vector by an adjustable circular shift.
"""
bitperm(vector::BitVector, k::Int=1) = circshift(vector, k)


"""
Calculates the Hamming distance between two binary vectors.
"""
hamming(x::BitVector, y::BitVector) = sum(x .!= y)/length(x)

hamming

In [3]:
using DataFrames, CSV
data = CSV.read("ProtExdata/ACPs_Lung_cancer.csv", DataFrame)

Unnamed: 0_level_0,ID,sequence,class
Unnamed: 0_level_1,Int64,String,String31
1,1,AIGKFLHSAKKFGKAFVGEIMNS,mod. active
2,2,FAKALAKLAKKLL,mod. active
3,3,FAKALKALLKALKAL,mod. active
4,4,FAKFLAKFLKKAL,mod. active
5,5,FAKIIAKIAKIAKKIL,inactive - exp
6,6,FAKKFAKKFKKFAKKFAKFAFAF,mod. active
7,7,FAKKLAKKLAKAAL,inactive - exp
8,8,FAKKLAKKLAKAL,inactive - exp
9,9,FAKKLAKKLAKLAL,inactive - exp
10,10,FAKKLAKKLAKLL,inactive - exp


In [4]:
unique(data.class)
class_num = [i == "very active" ? 1 : i == "mod. active" ? 2 : i == "inactive - exp" ? 3 : 4 for i in data.class]
data[!, :class_num] = class_num
first(data, 5)

Unnamed: 0_level_0,ID,sequence,class,class_num
Unnamed: 0_level_1,Int64,String,String31,Int64
1,1,AIGKFLHSAKKFGKAFVGEIMNS,mod. active,2
2,2,FAKALAKLAKKLL,mod. active,2
3,3,FAKALKALLKALKAL,mod. active,2
4,4,FAKFLAKFLKKAL,mod. active,2
5,5,FAKIIAKIAKIAKKIL,inactive - exp,3


In [6]:
AA_list = ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V', 'O', 'U', 'B', 'J', 'Z', 'X']
AA_hdv = [bitHDV() for i in AA_list]
AA_dict = Dict(zip(AA_list, AA_hdv))

Dict{Char, BitVector} with 26 entries:
  'E' => [0, 0, 0, 1, 0, 0, 1, 1, 1, 1  …  0, 1, 0, 1, 0, 0, 0, 1, 1, 1]
  'Z' => [0, 1, 1, 1, 0, 0, 0, 1, 1, 1  …  1, 0, 0, 1, 0, 0, 0, 1, 1, 1]
  'X' => [1, 1, 0, 1, 0, 1, 0, 1, 1, 1  …  1, 0, 1, 1, 0, 1, 1, 0, 0, 1]
  'C' => [1, 0, 0, 0, 0, 1, 1, 0, 0, 0  …  0, 1, 1, 0, 0, 0, 0, 1, 0, 0]
  'B' => [0, 1, 0, 0, 1, 1, 1, 1, 1, 1  …  1, 0, 0, 0, 0, 0, 1, 0, 0, 0]
  'D' => [1, 0, 0, 1, 1, 1, 0, 0, 0, 0  …  0, 0, 0, 1, 0, 0, 0, 0, 1, 1]
  'A' => [1, 1, 1, 0, 0, 1, 0, 1, 1, 1  …  0, 1, 0, 0, 1, 0, 1, 1, 1, 0]
  'R' => [0, 0, 1, 0, 1, 1, 0, 1, 0, 1  …  1, 0, 1, 0, 0, 1, 1, 0, 1, 0]
  'G' => [1, 0, 0, 0, 0, 0, 1, 1, 0, 1  …  0, 1, 1, 0, 0, 0, 1, 1, 0, 1]
  'N' => [0, 1, 1, 1, 1, 1, 1, 0, 0, 1  …  1, 1, 1, 1, 1, 0, 0, 1, 0, 1]
  'Q' => [1, 0, 1, 1, 0, 0, 0, 1, 1, 1  …  1, 1, 1, 0, 1, 0, 1, 0, 1, 0]
  'M' => [1, 1, 1, 1, 1, 1, 0, 0, 1, 0  …  1, 1, 0, 0, 1, 0, 0, 1, 0, 0]
  'K' => [1, 0, 1, 0, 0, 0, 0, 1, 0, 0  …  1, 0, 0, 0, 1, 0, 1, 1, 1, 0]
  'F' => [1,

In [7]:
trimer_hdvs = Dict(aa1 * aa2 * aa3 => 
bitbind(AA_dict[aa1], bitperm(AA_dict[aa2]), bitperm(AA_dict[aa3], 2)) 
for aa1 in AA_list for aa2 in AA_list for aa3 in AA_list)

Dict{String, BitVector} with 17576 entries:
  "HTY" => [0, 0, 0, 0, 0, 1, 1, 1, 0, 1  …  1, 1, 0, 1, 0, 0, 0, 1, 0, 0]
  "HBJ" => [0, 1, 1, 1, 1, 1, 1, 1, 1, 1  …  0, 1, 0, 0, 1, 1, 1, 1, 0, 1]
  "MSW" => [1, 0, 1, 0, 1, 0, 0, 0, 0, 0  …  0, 1, 1, 0, 1, 1, 1, 1, 0, 1]
  "BKI" => [1, 1, 0, 0, 1, 0, 1, 1, 1, 1  …  0, 0, 0, 0, 0, 0, 1, 0, 1, 1]
  "ETI" => [1, 1, 0, 1, 0, 0, 0, 1, 1, 1  …  0, 1, 0, 0, 1, 1, 1, 1, 0, 0]
  "OLQ" => [1, 0, 1, 0, 0, 0, 1, 1, 0, 1  …  1, 1, 0, 1, 1, 0, 1, 1, 0, 1]
  "NMD" => [1, 1, 1, 0, 0, 1, 1, 1, 0, 0  …  0, 1, 0, 1, 1, 0, 0, 1, 1, 1]
  "YVC" => [1, 1, 0, 1, 1, 0, 0, 1, 0, 1  …  0, 0, 0, 1, 1, 0, 0, 1, 1, 1]
  "QPE" => [1, 0, 0, 1, 0, 1, 0, 1, 1, 1  …  0, 1, 0, 1, 1, 1, 0, 1, 0, 0]
  "KIW" => [0, 0, 0, 1, 0, 1, 1, 0, 0, 1  …  1, 1, 0, 0, 1, 0, 1, 0, 1, 1]
  "BPX" => [1, 1, 0, 1, 1, 0, 1, 0, 0, 1  …  1, 0, 0, 0, 1, 1, 0, 0, 0, 1]
  "NLG" => [0, 0, 0, 0, 1, 0, 1, 0, 1, 1  …  1, 0, 0, 1, 1, 1, 1, 1, 1, 0]
  "YWG" => [0, 1, 1, 0, 0, 1, 1, 0, 1, 0  …  1, 0, 0, 0,

In [8]:
function embedder(sequence)
    l = [trimer_hdvs[sequence[i:i+2]] for i in 1:length(sequence)-2]
    v = bitadd(hcat(l)...)
    return v
end

embedder (generic function with 1 method)

In [9]:
l = BitVector[]
for i in data.sequence
    push!(l, embedder(i))
end
data[!, :hdv] = l

901-element Vector{BitVector}:
 [1, 1, 0, 0, 1, 1, 1, 0, 0, 0  …  1, 1, 0, 1, 1, 1, 0, 1, 1, 0]
 [1, 0, 1, 1, 1, 1, 0, 0, 0, 0  …  1, 1, 1, 1, 1, 0, 0, 1, 0, 0]
 [1, 1, 0, 1, 1, 0, 0, 1, 1, 1  …  1, 1, 0, 1, 0, 0, 0, 1, 0, 1]
 [1, 1, 0, 1, 1, 0, 0, 0, 1, 1  …  1, 1, 0, 1, 1, 0, 1, 1, 0, 0]
 [0, 0, 0, 0, 0, 0, 0, 0, 1, 1  …  0, 1, 0, 0, 1, 0, 1, 1, 0, 1]
 [0, 0, 1, 0, 1, 1, 0, 0, 0, 0  …  0, 1, 1, 1, 1, 1, 0, 0, 1, 0]
 [0, 0, 1, 1, 1, 1, 0, 0, 0, 0  …  0, 0, 1, 1, 1, 1, 0, 1, 0, 0]
 [0, 0, 0, 1, 1, 1, 0, 0, 0, 0  …  0, 0, 1, 1, 1, 0, 0, 1, 0, 0]
 [0, 0, 1, 1, 1, 1, 0, 0, 0, 0  …  1, 0, 1, 0, 0, 0, 1, 1, 0, 1]
 [1, 0, 1, 1, 1, 1, 1, 1, 0, 0  …  1, 0, 1, 0, 1, 0, 0, 1, 0, 0]
 [0, 1, 0, 1, 1, 0, 0, 1, 0, 0  …  0, 0, 1, 1, 1, 0, 0, 1, 0, 1]
 [0, 0, 1, 1, 1, 1, 0, 1, 0, 0  …  0, 0, 1, 1, 1, 1, 1, 1, 0, 1]
 [1, 0, 1, 1, 1, 1, 0, 1, 0, 0  …  0, 0, 1, 1, 0, 0, 0, 1, 0, 1]
 ⋮
 [1, 1, 0, 1, 0, 0, 0, 1, 1, 0  …  0, 1, 0, 0, 1, 1, 1, 1, 1, 1]
 [0, 0, 0, 0, 0, 1, 0, 0, 1, 1  …  0, 1, 0, 0, 1, 0, 0, 

In [10]:
active_hdv = bitadd(hcat([i for i in data[data.class_num .== 1, :hdv]])...)
modactive_hdv = bitadd(hcat([i for i in data[data.class_num .== 2, :hdv]])...)
notactive_hdv = bitadd(hcat([i for i in data[data.class_num .> 2, :hdv]])...)

10000-element BitVector:
 0
 1
 0
 1
 1
 0
 0
 1
 0
 1
 1
 0
 0
 ⋮
 1
 1
 1
 1
 0
 1
 0
 1
 0
 1
 0
 1

In [11]:
println(hamming(data[data.sequence .== "FLKLLKKLAAKFLPTIICKISYKC", :hdv]..., active_hdv))
println(hamming(data[data.sequence .== "FLKLLKKLAAKFLPTIICKISYKC", :hdv]..., modactive_hdv))
println(hamming(data[data.sequence .== "FLKLLKKLAAKFLPTIICKISYKC", :hdv]..., notactive_hdv))

0.4117
0.4348
0.4504


In [12]:
n = nrow(data)

train = rand(n) .< 0.8
test = train = .! train

train_df = data[[i for i in 1:n if train[i] == 1], :]
test_df = data[[i for i in 1:n if test[i] == 1], :]

active_hdv_t = bitadd(hcat([i for i in train_df[train_df.class_num .== 1, :hdv]])...)
modactive_hdv_t = bitadd(hcat([i for i in train_df[train_df.class_num .== 2, :hdv]])...)
notactive_exp_hdv_t = bitadd(hcat([i for i in train_df[train_df.class_num .== 3, :hdv]])...)
notactive_virt_hdv_t = bitadd(hcat([i for i in train_df[train_df.class_num .== 4, :hdv]])...)

function predict(seq)
    y = [hamming(active_hdv_t, seq), hamming(modactive_hdv_t, seq), hamming(notactive_exp_hdv_t, seq), hamming(notactive_virt_hdv_t, seq)]
    return findmin(y)[2]
end

using StatsBase
pred = [predict(i) for i in test_df.hdv]
mean(test_df.class_num .== pred)



0.9523809523809523