# How can biological information be included in hyperdimensional vectors, and is this useful?

In [1]:
include("../src/HDC.jl")
include("../src/math.jl")
using DataFrames
using CSV

## Method 1: Via embeddings (ESM-2)

In [2]:
# Read the amino-acid embeddings taken from the last hidden layer of the ESM-2 model (21x1280).
aa_embeddings = CSV.File("../data/amino_acid_embeddings.csv") |> DataFrame
# Identifier column; columns 2:end are used as the numeric embedding values
# (assumes `protein_ID` is the first column of the CSV — TODO confirm).
amino_acids_esm = aa_embeddings.protein_ID
aa_emb = Matrix(aa_embeddings[:, 2:end])

# Draw one random hypervector per embedding dimension (one binary set, one bipolar set)
# and pack each set into a matrix with `nested_arrays2mat`.
HDV_mat_bit = nested_arrays2mat([bithdv() for _ in axes(aa_emb, 2)])
HDV_mat_bip = nested_arrays2mat([hdv() for _ in axes(aa_emb, 2)])

# Project every embedding row into hyperdimensional space via a matrix product.
# Bipolar variant: keep only the element-wise sign of the projection.
AA_bip_esm = sign.(aa_emb * permutedims(HDV_mat_bip))
# Binary variant: rescale with `mat_scaler` (arguments 0, 1, 2 — presumably the target
# range and the dimension to scale along; confirm in math.jl), then round element-wise.
AA_bit_esm = round.(mat_scaler(aa_emb * permutedims(HDV_mat_bit), 0, 1, 2))

21×10000 Matrix{Float64}:
 0.0  1.0  0.0  1.0  1.0  1.0  1.0  0.0  …  0.0  0.0  0.0  0.0  0.0  1.0  0.0
 0.0  0.0  0.0  1.0  1.0  1.0  0.0  0.0     0.0  0.0  1.0  0.0  1.0  0.0  1.0
 0.0  0.0  0.0  1.0  1.0  0.0  1.0  1.0     1.0  0.0  1.0  0.0  1.0  0.0  0.0
 1.0  0.0  0.0  1.0  1.0  1.0  1.0  1.0     0.0  1.0  1.0  0.0  1.0  0.0  0.0
 1.0  1.0  0.0  0.0  0.0  1.0  0.0  1.0     0.0  1.0  1.0  0.0  1.0  0.0  0.0
 1.0  0.0  0.0  0.0  1.0  1.0  0.0  1.0  …  0.0  0.0  1.0  0.0  1.0  0.0  0.0
 1.0  0.0  1.0  0.0  0.0  1.0  0.0  1.0     0.0  1.0  0.0  0.0  1.0  0.0  0.0
 0.0  0.0  1.0  0.0  1.0  0.0  1.0  1.0     1.0  0.0  1.0  0.0  0.0  0.0  0.0
 0.0  0.0  1.0  1.0  1.0  1.0  1.0  0.0     0.0  0.0  0.0  0.0  1.0  0.0  0.0
 0.0  1.0  0.0  0.0  0.0  1.0  1.0  0.0     0.0  1.0  0.0  1.0  0.0  1.0  0.0
 ⋮                        ⋮              ⋱            ⋮                   
 1.0  0.0  1.0  1.0  0.0  0.0  0.0  0.0     0.0  1.0  1.0  1.0  1.0  1.0  1.0
 0.0  1.0  1.0  1.0  0.0  1.0  0.0  0.0  

## Method 2: Closer to the HDC framework, but naive (no proper training)

In [13]:
## Initialise a random HDV for every amino acid and mix in class information
AA_list = ['A','C','D','E','F','G','H','I','K','L','M','N','P','Q','R','S','T','V','W','Y']

# Physicochemical classes; every amino acid belongs to exactly one class.
# NOTE: the positively charged residues are K (lysine), R (arginine) and H (histidine).
# 'L' (leucine) is non-polar and must not be listed under "pos+": doing so bundles it
# with two class vectors and leaves 'K' without any class information.
groups = Dict(
    "polar" => ['S', 'T', 'Y', 'N', 'Q'],
    "non-polar" => ['G', 'A', 'V', 'C', 'P', 'L', 'I', 'M', 'W', 'F'],
    "pos+" => ['K', 'R', 'H'],
    "neg-" => ['D', 'E'],
)

# Fail early if the classes do not partition `AA_list` (a residue that is missing or
# listed twice would otherwise be silently skipped or updated twice in the loops below).
sort(collect(Iterators.flatten(values(groups)))) == sort(AA_list) ||
    error("`groups` must assign every amino acid in `AA_list` to exactly one class")

# Random binary and bipolar HDV per amino acid, keyed by its one-letter code.
# All binary vectors are drawn first, then all bipolar ones.
aa_bithdv = Dict(zip(AA_list, [bithdv() for _ in AA_list]))
aa_biphdv = Dict(zip(AA_list, [hdv() for _ in AA_list]))

# Make an HDV for every class by combining the HDVs of its members with the
# `add` / `bitadd` helpers from HDC.jl (presumably element-wise bundling — confirm there).
polar_hdv_bip = add([aa_biphdv[aa] for aa in groups["polar"]]...)
polar_hdv_bit = bitadd([aa_bithdv[aa] for aa in groups["polar"]]...)

nonpolar_hdv_bip = add([aa_biphdv[aa] for aa in groups["non-polar"]]...)
nonpolar_hdv_bit = bitadd([aa_bithdv[aa] for aa in groups["non-polar"]]...)

pos_hdv_bip = add([aa_biphdv[aa] for aa in groups["pos+"]]...)
pos_hdv_bit = bitadd([aa_bithdv[aa] for aa in groups["pos+"]]...)

neg_hdv_bip = add([aa_biphdv[aa] for aa in groups["neg-"]]...)
neg_hdv_bit = bitadd([aa_bithdv[aa] for aa in groups["neg-"]]...)

group_hdv_bip = Dict(
    "polar" => polar_hdv_bip,
    "non-polar" => nonpolar_hdv_bip,
    "pos+" => pos_hdv_bip,
    "neg-" => neg_hdv_bip,
)
group_hdv_bit = Dict(
    "polar" => polar_hdv_bit,
    "non-polar" => nonpolar_hdv_bit,
    "pos+" => pos_hdv_bit,
    "neg-" => neg_hdv_bit,
)

# Add each amino acid's random HDV to its class HDV to introduce similarity between
# members of the same class. The class HDVs were computed above from the untouched
# random vectors, so the updates below do not feed back into them.
for (class, members) in groups
    for aa in members
        aa_biphdv[aa] = add(aa_biphdv[aa], group_hdv_bip[class])
    end
end

for (class, members) in groups
    for aa in members
        aa_bithdv[aa] = bitadd(aa_bithdv[aa], group_hdv_bit[class])
    end
end