In [67]:
using Random
using LinearAlgebra
"""
Construct a bipolar vector. By default 10000 elements long.
"""
hdv(N::Int=10000) = vec(rand((-1,1), 1, N))


"""
Bundles bipolar hyperdimensional vectors.
"""
add(vectors::Vector...) = reduce(.+, vectors) .|> sign


"""
Binds binpolar hyperdimensional vectors.
"""
multiply(vectors::Vector...) = reduce(.*, vectors)


"""
Permutes a bipolar hyperdimensional vector by an adjustable circular shift.
"""
perm(vector::Vector, k::Int=1) = circshift(vector, k)


"""
Calculates the cosine similarity between two bipolar vectors.
"""
cosine(x, y) = dot(x, y) / (norm(x) * norm(y))

cosine

In [55]:
"""
Construct a binary vector. By default 10000 elements long.
"""
bithdv(N::Int=10000) = bitrand(N)


"""
Bundles binary hyperdimensional vectors based on the element-wise majority rule.
"""
function bitadd(vectors::BitVector ...)
    v = reduce(.+, vectors)
    n = length(vectors) / 2
    x = [i > n ? 1 : i < n ? 0 : rand(0:1) for i in v]
    return convert(BitVector, x)
end


"""
Binds binary hyperdimensional vectors based on an element-wise XOR gate.
"""
bitbind(vectors::BitVector ...) =  reduce(.⊻, vectors)


"""
Permutes a hyperdimensional vector by an adjustable circular shift.
"""
bitperm(vector::BitVector, k::Int=1) = circshift(vector, k)


"""
Calculates the Hamming distance between two binary vectors.
"""
hamming(x::BitVector, y::BitVector) = sum(x .!= y)/length(x)

hamming

In [56]:
using DataFrames, CSV
data = CSV.read("ProtExdata/ACPs_Breast_cancer.csv", DataFrame)
unique(data.class)
class_num = [i == "very active" ? 1 : i == "mod. active" ? 2 : i == "inactive - exp" ? 3 : 4 for i in data.class]
data[!, :class_num] = class_num
data = data[data.class_num .!= 4, :]
first(data, 5)

Unnamed: 0_level_0,ID,sequence,class,class_num
Unnamed: 0_level_1,Int64,String,String31,Int64
1,1,AAWKWAWAKKWAKAKKWAKAA,mod. active,2
2,2,AIGKFLHSAKKFGKAFVGEIMNS,mod. active,2
3,3,AWKKWAKAWKWAKAKWWAKAA,mod. active,2
4,4,ESFSDWWKLLAE,mod. active,2
5,5,ETFADWWKLLAE,mod. active,2


In [57]:
AA_list = ['A','C','D','E','F','G','H','I','K','L','M','N','P','Q','R','S','T','V','W','Y']
groups = Dict("polar" => ['S', 'T', 'Y', 'N', 'Q'], "non-polar" => ['G', 'A', 'V', 'C', 'P', 'L', 'I', 'M', 'W', 'F'], 
         "pos+" => ['L', 'R', 'H'], "neg-" => ['D', 'E'])

Dict{String, Vector{Char}} with 4 entries:
  "non-polar" => ['G', 'A', 'V', 'C', 'P', 'L', 'I', 'M', 'W', 'F']
  "polar"     => ['S', 'T', 'Y', 'N', 'Q']
  "pos+"      => ['L', 'R', 'H']
  "neg-"      => ['D', 'E']

In [58]:
sorted_list = [i for (k,v) in groups for i in v]

function AA_dist_calc(x, cos=true)
    df = DataFrame([[] for i in 1:20], string.(sorted_list), makeunique=true)
    for i in sorted_list
        r = []
        for j in sorted_list
            if cos == true
                push!(r, cosine(x[i], x[j]))
            else
                push!(r, hamming(x[i], x[j]))
            end
        end
        push!(df, r)
    end
    insertcols!(df, 1, :coef => sorted_list)
    return df
end
    

AA_dist_calc (generic function with 2 methods)

In [59]:
function scaler(row, lower, upper)
    minx = minimum(row)
    maxx = maximum(row)
    x = [lower + ((i - minx)*(upper-lower))/(maxx - minx) for i in row]
    return x
end

function mat_scaler(matrix, lower, upper, dim = 1)
    if dim == 1
        scaled = reduce(hcat, [scaler(matrix[:, i], lower, upper) for i in 1:size(matrix, 2)])
    elseif dim == 2
        scaled = reduce(hcat, [scaler(matrix[:, i], lower, upper) for i in 1:size(matrix, 1)])
    end
    return scaled
end

mat_scaler (generic function with 2 methods)

In [60]:
using PyCall
using TableTransforms
py"""
import pickle
infile = open("/home/mfat/Unief/Thesis/ThesisFatjanov/data/aa_embeddings",'rb')
embeddings = pickle.load(infile)
infile.close()
"""
embeddings = PyArray(py"embeddings"o)

embeddings_hdv = embeddings*random_hdv_bin
embeddings_hdv = mat_scaler(embeddings_hdv,-1,1)
hdvs = [embeddings_hdv[i, :] for i in 1:20]
AA_dict = Dict(zip(AA_list, hdvs))

embeddings_hdv_bin = round.(mat_scaler(embeddings*random_hdv_bin, 0, 1))
hdvs_bin = [convert(BitVector, embeddings_hdv_bin[i, :]) for i in 1:20]
AA_dict_bin = Dict(zip(AA_list, hdvs_bin))

embeddings_hdv_bip = mat_scaler(embeddings*random_hdv_bip, -1, 1) .|> sign
hdvs_bip = [embeddings_hdv_bip[i, :] for i in 1:20]
AA_dict_bip = Dict(zip(AA_list, hdvs_bip))

Dict{Char, Vector{Float32}} with 20 entries:
  'M' => [-1.0, 1.0, 1.0, -1.0, -1.0, -1.0, -1.0, 1.0, -1.0, 1.0  …  -1.0, -1.0…
  'K' => [-1.0, 1.0, 1.0, -1.0, -1.0, -1.0, -1.0, 1.0, 1.0, 1.0  …  -1.0, -1.0,…
  'P' => [1.0, -1.0, 1.0, -1.0, -1.0, -1.0, 1.0, 1.0, 1.0, -1.0  …  1.0, -1.0, …
  'Q' => [-1.0, 1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 1.0, -1.0, 1.0  …  -1.0, -1.…
  'I' => [-1.0, 1.0, 1.0, -1.0, -1.0, -1.0, -1.0, 1.0, 1.0, 1.0  …  -1.0, -1.0,…
  'H' => [-1.0, 1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 1.0, -1.0, 1.0  …  -1.0, -1.…
  'E' => [1.0, -1.0, -1.0, -1.0, 1.0, -1.0, -1.0, 1.0, 1.0, -1.0  …  1.0, -1.0,…
  'W' => [-1.0, -1.0, 1.0, -1.0, -1.0, -1.0, -1.0, 1.0, 1.0, -1.0  …  1.0, -1.0…
  'S' => [-1.0, 1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 1.0, -1.0, 1.0  …  -1.0, -1.…
  'T' => [-1.0, 1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 1.0, -1.0, 1.0  …  -1.0, -1.…
  'C' => [1.0, -1.0, -1.0, 1.0, 1.0, 1.0, 1.0, -1.0, 1.0, -1.0  …  1.0, -1.0, 1…
  'D' => [-1.0, 1.0, 1.0, -1.0, -1.0, -1.0, -1.0, 1.0, 1.0, 1.0 

In [61]:
df = AA_dist_calc(AA_dict)
df_bip = AA_dist_calc(AA_dict_bip)
df_bin = AA_dist_calc(AA_dict_bin, false)

Unnamed: 0_level_0,coef,G,A,V,C,P,L,I,M,W,F
Unnamed: 0_level_1,Char,Any,Any,Any,Any,Any,Any,Any,Any,Any,Any
1,G,0.0,0.019,0.8369,0.6442,0.2981,0.7909,0.0756,0.1097,0.2431,0.4323
2,A,0.019,0.0,0.8403,0.6316,0.3051,0.7943,0.088,0.1223,0.2551,0.4199
3,V,0.8369,0.8403,0.0,0.4959,0.769,0.0582,0.8413,0.7744,0.74,0.7024
4,C,0.6442,0.6316,0.4959,0.0,0.4093,0.5415,0.6284,0.6775,0.5181,0.2183
5,P,0.2981,0.3051,0.769,0.4093,0.0,0.7864,0.2357,0.2924,0.1374,0.2338
6,L,0.7909,0.7943,0.0582,0.5415,0.7864,0.0,0.8025,0.7286,0.7284,0.7434
7,I,0.0756,0.088,0.8413,0.6284,0.2357,0.8025,0.0,0.0823,0.1677,0.4237
8,M,0.1097,0.1223,0.7744,0.6775,0.2924,0.7286,0.0823,0.0,0.1882,0.4882
9,W,0.2431,0.2551,0.74,0.5181,0.1374,0.7284,0.1677,0.1882,0.0,0.3588
10,F,0.4323,0.4199,0.7024,0.2183,0.2338,0.7434,0.4237,0.4882,0.3588,0.0


In [66]:
trimer_hdvs = Dict(aa1 * aa2 * aa3 => 
multiply(AA_dict[aa1], perm(AA_dict[aa2]), perm(AA_dict[aa3], 2)) 
for aa1 in AA_list for aa2 in AA_list for aa3 in AA_list)

trimer_hdvs_bip = Dict(aa1 * aa2 * aa3 => 
multiply(AA_dict_bip[aa1], perm(AA_dict_bip[aa2]), perm(AA_dict_bip[aa3], 2)) 
for aa1 in AA_list for aa2 in AA_list for aa3 in AA_list)

trimer_hdvs_bin = Dict(aa1 * aa2 * aa3 => 
bitbind(AA_dict_bin[aa1], bitperm(AA_dict_bin[aa2]), bitperm(AA_dict_bin[aa3], 2)) 
for aa1 in AA_list for aa2 in AA_list for aa3 in AA_list)

Dict{String, BitVector} with 8000 entries:
  "DRR" => [1, 1, 1, 1, 1, 0, 0, 1, 1, 1  …  1, 1, 0, 1, 0, 0, 1, 1, 0, 1]
  "HTY" => [1, 1, 0, 0, 1, 1, 1, 1, 1, 1  …  0, 0, 0, 1, 1, 1, 1, 1, 0, 0]
  "QAM" => [0, 0, 1, 0, 0, 0, 0, 1, 1, 1  …  1, 1, 0, 0, 0, 0, 1, 1, 1, 1]
  "WMA" => [0, 1, 0, 1, 0, 0, 0, 1, 1, 0  …  1, 1, 1, 0, 1, 0, 1, 1, 0, 1]
  "PPV" => [0, 0, 1, 1, 0, 1, 1, 0, 0, 1  …  1, 1, 0, 1, 1, 0, 0, 0, 1, 0]
  "WNG" => [1, 1, 0, 1, 0, 0, 0, 1, 1, 0  …  1, 1, 1, 1, 1, 0, 1, 1, 0, 0]
  "MSW" => [1, 0, 1, 0, 1, 1, 0, 1, 1, 1  …  0, 1, 1, 0, 0, 1, 1, 1, 1, 1]
  "TKL" => [1, 0, 0, 1, 1, 0, 1, 0, 0, 0  …  0, 1, 1, 0, 0, 0, 0, 0, 0, 1]
  "ETI" => [1, 1, 0, 1, 0, 0, 0, 1, 1, 0  …  1, 1, 1, 1, 0, 0, 1, 1, 0, 0]
  "KLF" => [0, 0, 0, 1, 1, 0, 0, 0, 0, 0  …  0, 1, 0, 0, 1, 0, 0, 0, 0, 1]
  "AGI" => [1, 1, 1, 0, 0, 0, 0, 1, 1, 1  …  1, 1, 0, 0, 0, 0, 1, 1, 0, 1]
  "NMD" => [0, 1, 1, 0, 0, 0, 0, 1, 1, 1  …  1, 1, 0, 1, 1, 0, 1, 1, 0, 0]
  "YMQ" => [0, 1, 0, 1, 1, 0, 0, 1, 1, 0  …  1, 0, 0, 0, 

In [73]:
function embedder(sequence, c)
    if c == 1
        l = [trimer_hdvs[sequence[i:i+2]] for i in 1:length(sequence)-2]
        v = add(hcat(l)...)
        return v
    elseif c == 2
        l = [trimer_hdvs_bip[sequence[i:i+2]] for i in 1:length(sequence)-2]
        v = add(hcat(l)...)
        return v
    elseif c == 3
        l = [trimer_hdvs_bin[sequence[i:i+2]] for i in 1:length(sequence)-2]
        v = bitadd(hcat(l)...)
        return v
    end
end

l = []
for i in data.sequence
    push!(l, embedder(i, 1))
end
data[!, :hdv_r] = l

l = []
for i in data.sequence
    push!(l, embedder(i, 2))
end
data[!, :hdv_bip] = l

l = BitVector[]
for i in data.sequence
    push!(l, embedder(i, 3))
end
data[!, :hdv_bin] = l

197-element Vector{BitVector}:
 [0, 0, 1, 1, 1, 0, 0, 1, 1, 1  …  1, 1, 0, 1, 0, 0, 1, 1, 0, 0]
 [0, 0, 1, 0, 0, 0, 0, 1, 1, 1  …  1, 1, 0, 0, 1, 0, 1, 1, 0, 0]
 [0, 0, 1, 1, 0, 0, 0, 1, 1, 1  …  1, 1, 0, 1, 1, 0, 1, 1, 0, 0]
 [1, 1, 1, 0, 1, 0, 0, 1, 1, 1  …  0, 1, 1, 1, 0, 0, 1, 1, 0, 0]
 [1, 1, 1, 1, 1, 0, 0, 1, 1, 1  …  0, 0, 1, 1, 0, 0, 1, 1, 0, 0]
 [1, 0, 1, 0, 1, 0, 0, 1, 1, 1  …  0, 0, 1, 0, 0, 0, 1, 1, 0, 0]
 [1, 0, 0, 1, 1, 0, 1, 0, 0, 0  …  0, 1, 1, 0, 0, 0, 0, 0, 0, 0]
 [0, 0, 0, 1, 1, 0, 1, 0, 0, 0  …  0, 1, 1, 0, 0, 0, 0, 0, 0, 1]
 [0, 1, 0, 0, 1, 1, 1, 0, 0, 0  …  1, 1, 0, 0, 1, 1, 0, 0, 0, 1]
 [0, 1, 1, 0, 0, 0, 0, 1, 1, 1  …  1, 1, 0, 0, 1, 0, 1, 1, 0, 0]
 [0, 0, 0, 1, 1, 0, 1, 0, 0, 0  …  1, 1, 1, 0, 0, 0, 0, 0, 1, 1]
 [1, 1, 1, 1, 1, 1, 0, 1, 1, 1  …  0, 1, 0, 0, 1, 0, 1, 1, 0, 1]
 [1, 0, 0, 0, 0, 0, 1, 0, 0, 0  …  1, 1, 0, 0, 1, 0, 0, 0, 0, 0]
 ⋮
 [1, 1, 0, 1, 1, 1, 1, 0, 0, 0  …  0, 1, 1, 0, 0, 1, 0, 0, 1, 0]
 [1, 1, 0, 1, 1, 1, 1, 0, 0, 0  …  0, 1, 1, 0, 0, 1, 0, 

In [74]:
active_hdv = add(hcat([i for i in data[data.class_num .== 1, :hdv_r]])...)
modactive_hdv = add(hcat([i for i in data[data.class_num .== 2, :hdv_r]])...)
notactive_exp_hdv = add(hcat([i for i in data[data.class_num .== 3, :hdv_r]])...)

active_hdv_bip = add(hcat([i for i in data[data.class_num .== 1, :hdv_bip]])...)
modactive_hdv_bip = add(hcat([i for i in data[data.class_num .== 2, :hdv_bip]])...)
notactive_exp_hdv_bip = add(hcat([i for i in data[data.class_num .== 3, :hdv_bip]])...)

active_hdv_bin = bitadd(hcat([i for i in data[data.class_num .== 1, :hdv_bin]])...)
modactive_hdv_bin = bitadd(hcat([i for i in data[data.class_num .== 2, :hdv_bin]])...)
notactive_exp_hdv_bin = bitadd(hcat([i for i in data[data.class_num .== 3, :hdv_bin]])...)

10000-element BitVector:
 1
 0
 0
 1
 1
 0
 1
 0
 0
 0
 0
 1
 1
 ⋮
 1
 0
 1
 1
 0
 0
 1
 0
 0
 0
 1
 0

In [81]:
n = nrow(data)

train = rand(n) .< 0.8
test = train = .! train

train_df = data[[i for i in 1:n if train[i] == 1], :]
test_df = data[[i for i in 1:n if test[i] == 1], :]

active_hdv_t_bin = bitadd(hcat([i for i in train_df[train_df.class_num .== 1, :hdv_bin]])...)
modactive_hdv_t_bin = bitadd(hcat([i for i in train_df[train_df.class_num .== 2, :hdv_bin]])...)
notactive_exp_hdv_t_bin = bitadd(hcat([i for i in train_df[train_df.class_num .== 3, :hdv_bin]])...)

active_hdv_t = add(hcat([i for i in train_df[train_df.class_num .== 1, :hdv_r]])...)
modactive_hdv_t = add(hcat([i for i in train_df[train_df.class_num .== 2, :hdv_r]])...)
notactive_exp_hdv_t = add(hcat([i for i in train_df[train_df.class_num .== 3, :hdv_r]])...)

active_hdv_t_bip = add(hcat([i for i in train_df[train_df.class_num .== 1, :hdv_bip]])...)
modactive_hdv_t_bip  = add(hcat([i for i in train_df[train_df.class_num .== 2, :hdv_bip]])...)
notactive_exp_hdv_t_bip  = add(hcat([i for i in train_df[train_df.class_num .== 3, :hdv_bip]])...)

10000-element Vector{Float32}:
  1.0
 -1.0
  1.0
 -1.0
  1.0
 -1.0
 -1.0
  1.0
  1.0
  1.0
  1.0
 -1.0
  1.0
  ⋮
 -1.0
  1.0
  1.0
 -1.0
  1.0
  1.0
  1.0
  1.0
  1.0
 -1.0
  1.0
  1.0

In [77]:
function predict(seq, c)
    if c == 1
        y = [cosine(active_hdv_t, seq), cosine(modactive_hdv_t, seq), cosine(notactive_exp_hdv_t, seq)]
        return findmin(y)[2]
    elseif c == 3
        y = [hamming(active_hdv_t_bin, seq), hamming(modactive_hdv_t_bin, seq), hamming(notactive_exp_hdv_t_bin, seq)]
        return findmin(y)[2]
    elseif c == 2
        y = [cosine(active_hdv_t_bip, seq), cosine(modactive_hdv_t_bip, seq), cosine(notactive_exp_hdv_t_bip, seq)]
        return findmin(y)[2]
    end
end

predict (generic function with 1 method)

In [82]:
using StatsBase
pred = [predict(i, 1) for i in test_df.hdv_r]
println(mean(test_df.class_num .== pred))

pred = [predict(i, 2) for i in test_df.hdv_bip]
println(mean(test_df.class_num .== pred))

pred = [predict(i, 3) for i in test_df.hdv_bin]
println(mean(test_df.class_num .== pred))


0.07317073170731707
0.07317073170731707
0.5365853658536586


In [None]:
function convolved_embedding(sequence, tokens, k=3)
    """
    Simple 2-layer convolved embedding in hyperdimensional space
    """
    # layer 1
    kmer_hdvs = []
    for i in 1:length(sequence)-k+1
        kmer = sequence[i:i+k-1]
        aa_hdvs = [circshift(tokens[aa], k-l) for (l, aa) in enumerate(kmer)]
        push!(kmer_hdvs, bitbind(aa_hdvs))
    end
    
    # layer 2
    conv_kmer_hdvs = []
    for i in 1:length(kmer_hdvs)-k+1
        convolved_kmers = kmer_hdvs[i:i+k-1]
        conv_hdvs = [circshift(convolved_kmers[l], k-l) for (l, km) in enumerate(convolved_kmers)]
        push!(conv_kmer_hdvs, bitbind(conv_hdvs))
    end
    
    return bitadd(conv_kmer_hdvs)
end

convolved_embedding (generic function with 2 methods)