# How to include biological information into hyperdimensional vectors and is this useful?

In [2]:
include("../src/HDC.jl")
include("../src/math.jl")
include("../src/experimental.jl")
using DataFrames
using CSV
using JLD

## Method 1: Via embeddings (ESM-2)

In [12]:
#Load embeddings from last hidden layer of ESM-2 model (21x1280)
aa_embeddings = DataFrame(CSV.File("../data/amino_acid_embeddings.csv"))
amino_acids_esm = aa_embeddings.protein_ID
aa_emb = Matrix(aa_embeddings[:, 2:end])
# Create HDVs
HDV_mat_bit = nested_arrays2mat([bithdv() for i in 1:size(aa_emb)[2]], true)
HDV_mat_bip = nested_arrays2mat([hdv() for i in 1:size(aa_emb)[2]], true)

# Extend embeddings into hyperdimensional space
AA_bip_esm = permutedims(aa_emb * HDV_mat_bip .|> sign)
AA_bit_esm = permutedims(mat_scaler(aa_emb * HDV_mat_bit, 0, 1, 2) .|> round)

InterruptException: InterruptException:

## Method 2: A bit more true to HDC framework but naive without proper training

In [13]:
## Initiate random HDVs for every AA
AA_list = ['A','C','D','E','F','G','H','I','K','L','M','N','P','Q','R','S','T','V','W','Y']
groups = Dict("polar" => ['S', 'T', 'Y', 'N', 'Q'], "non-polar" => ['G', 'A', 'V', 'C', 'P', 'L', 'I', 'M', 'W', 'F'], 
         "pos+" => ['L', 'R', 'H'], "neg-" => ['D', 'E'])

aa_bithdv = [bithdv() for i in 1:length(AA_list)]
aa_biphdv = [hdv() for i in 1:length(AA_list)]

aa_bithdv = Dict(zip(AA_list, aa_bithdv))
aa_biphdv = Dict(zip(AA_list, aa_biphdv))

# Make an HDV for every class
polar_hdv_bip = add([aa_biphdv[i] for i in groups["polar"]]...)
polar_hdv_bit = bitadd([aa_bithdv[i] for i in groups["polar"]]...)

nonpolar_hdv_bip = add([aa_biphdv[i] for i in groups["non-polar"]]...)
nonpolar_hdv_bit = bitadd([aa_bithdv[i] for i in groups["non-polar"]]...)

pos_hdv_bip = add([aa_biphdv[i] for i in groups["pos+"]]...)
pos_hdv_bit = bitadd([aa_bithdv[i] for i in groups["pos+"]]...)

neg_hdv_bip = add([aa_biphdv[i] for i in groups["neg-"]]...)
neg_hdv_bit = bitadd([aa_bithdv[i] for i in groups["neg-"]]...)

group_hdv_bip = Dict("polar"=>polar_hdv_bip, "non-polar" => nonpolar_hdv_bip, "pos+" => pos_hdv_bip, "neg-" => neg_hdv_bip)
group_hdv_bit = Dict("polar"=>polar_hdv_bit, "non-polar" => nonpolar_hdv_bit, "pos+" => pos_hdv_bit, "neg-" => neg_hdv_bit)

# Add random AA HDV to its class HDV to introduce similarity
for i in keys(groups)
    for j in groups[i]
        aa_biphdv[j] = add(aa_biphdv[j], group_hdv_bip[i])
    end
end

for i in keys(groups)
    for j in groups[i]
        aa_bithdv[j] = bitadd(aa_bithdv[j], group_hdv_bit[i])
    end
end

naive_bit = nested_arrays2mat([aa_bithdv[i] for i in AA_list])
naive_bip = nested_arrays2mat([aa_biphdv[i] for i in AA_list])

InterruptException: InterruptException:

## Baseline random HDVs

In [14]:
rand_bit = nested_arrays2mat([bithdv() for i in 1:20])
rand_bip = nested_arrays2mat([hdv() for i in 1:20])

10000×20 Matrix{Float64}:
 -1.0  -1.0  -1.0  -1.0  -1.0   1.0  …   1.0   1.0  -1.0  -1.0   1.0   1.0
  1.0  -1.0  -1.0   1.0   1.0  -1.0     -1.0   1.0   1.0   1.0  -1.0   1.0
  1.0   1.0  -1.0   1.0   1.0  -1.0     -1.0   1.0  -1.0  -1.0  -1.0  -1.0
 -1.0   1.0  -1.0  -1.0  -1.0   1.0      1.0   1.0   1.0   1.0   1.0   1.0
  1.0   1.0  -1.0  -1.0  -1.0  -1.0      1.0   1.0   1.0  -1.0   1.0  -1.0
  1.0  -1.0   1.0   1.0  -1.0  -1.0  …  -1.0  -1.0  -1.0   1.0  -1.0   1.0
  1.0   1.0   1.0  -1.0   1.0  -1.0     -1.0   1.0   1.0  -1.0   1.0  -1.0
  1.0  -1.0   1.0  -1.0  -1.0  -1.0      1.0  -1.0  -1.0   1.0   1.0   1.0
  1.0   1.0   1.0  -1.0  -1.0  -1.0      1.0   1.0  -1.0  -1.0   1.0  -1.0
  1.0  -1.0  -1.0   1.0  -1.0  -1.0     -1.0   1.0   1.0  -1.0   1.0   1.0
  ⋮                             ⋮    ⋱         ⋮                      
  1.0  -1.0  -1.0  -1.0   1.0  -1.0      1.0  -1.0  -1.0  -1.0  -1.0  -1.0
  1.0   1.0  -1.0   1.0   1.0  -1.0     -1.0   1.0  -1.0  -1.0  -1.0  -1.0
  1

## PCA plots: is the physicochemical and biological information encoded into the HDVs?
### Binary HDVs

In [15]:
using MultivariateStats

PCA_rand_bit = fit(PCA, rand_bit; maxoutdim=2)
proj_rand_bit = projection(PCA_rand_bit)

PCA_ESM_bit = fit(PCA, AA_bit_esm; maxoutdim=2)
proj_ESM_bit = projection(PCA_ESM_bit)

PCA_naive_bit = fit(PCA, naive_bit; maxoutdim=2)
proj_naive_bit = projection(PCA_naive_bit)

#### Random binary HDVs

In [None]:
using Plots

key = collect(keys(groups))
indices = [[i for i in 1:length(AA_list) if AA_list[i] in groups[j]] for j in key]

colors = [:green, :red, :blue, :yellow]
fig = Plots.plot()

for i in 1:4
    scatter!(fig, (proj_rand_bit[indices[i], 1], proj_rand_bit[indices[i], 2]), label=key[i], mc = colors[i])
end

fig

#### ESM embedded HDVs

In [None]:
indices = [[i for i in 1:length(amino_acids_esm) if only(amino_acids_esm[i]) in groups[j]] for j in key]

colors = [:green, :red, :blue, :yellow]
fig = Plots.plot()

for i in 1:4
    scatter!(fig, (proj_ESM_bit[indices[i], 1], proj_ESM_bit[indices[i], 2]), label=key[i], mc = colors[i])
end
scatter!(fig, (proj_ESM_bit[21, 1], proj_ESM_bit[21, 2]), label="X", mc = :black)

fig

In [None]:
indices = [[i for i in 1:length(AA_list) if AA_list[i] in groups[j]] for j in key]

colors = [:green, :red, :blue, :yellow]
fig = Plots.plot()

for i in 1:4
    scatter!(fig, (proj_naive_bit[indices[i], 1], proj_naive_bit[indices[i], 2]), label=key[i], mc = colors[i])
end

fig

# Uniref embeddings

In [13]:
aa_embeddings = DataFrame(CSV.File("../data/amino_acid_embeddings.csv"))
amino_acids_esm = [only(i) for i in aa_embeddings.protein_ID]
aa_emb = Matrix(aa_embeddings[:, 2:end])
# Create HDVs
HDV_mat_bit = nested_arrays2mat([bithdv() for i in 1:size(aa_emb)[2]], true)

# Extend embeddings into hyperdimensional space
AA_bit_esm = permutedims(mat_scaler(aa_emb * HDV_mat_bit, 0, 1, 2) .|> round)

AA_dict = Dict(zip(amino_acids_esm, [convert(BitVector,AA_bit_esm[:, i]) for i in 1:21]))

Dict{Char, BitVector} with 21 entries:
  'P' => [0, 0, 0, 0, 1, 1, 1, 1, 1, 1  …  0, 1, 0, 1, 0, 1, 0, 1, 0, 0]
  'K' => [1, 0, 0, 1, 0, 1, 0, 1, 0, 1  …  1, 0, 1, 0, 0, 1, 0, 0, 0, 1]
  'M' => [0, 0, 1, 1, 0, 0, 0, 1, 0, 1  …  1, 1, 1, 0, 1, 1, 0, 0, 0, 0]
  'F' => [0, 0, 0, 1, 0, 1, 1, 1, 0, 1  …  1, 0, 1, 0, 1, 1, 0, 0, 0, 1]
  'I' => [1, 0, 0, 1, 0, 1, 0, 0, 0, 1  …  1, 0, 1, 0, 1, 1, 0, 0, 0, 0]
  'H' => [1, 0, 0, 1, 1, 0, 1, 0, 1, 0  …  1, 1, 0, 0, 1, 0, 1, 0, 1, 0]
  'E' => [1, 1, 0, 0, 1, 0, 1, 1, 0, 0  …  0, 0, 0, 0, 0, 0, 0, 1, 0, 0]
  'W' => [0, 0, 1, 0, 0, 1, 1, 0, 0, 1  …  0, 0, 0, 0, 0, 1, 1, 0, 1, 1]
  'S' => [0, 0, 1, 0, 1, 1, 1, 0, 0, 1  …  0, 1, 0, 1, 0, 0, 1, 0, 1, 0]
  'T' => [0, 1, 1, 1, 0, 1, 0, 1, 1, 0  …  0, 1, 1, 1, 0, 0, 0, 0, 0, 0]
  'C' => [0, 0, 1, 0, 1, 1, 1, 0, 1, 0  …  1, 1, 0, 1, 1, 1, 0, 1, 1, 0]
  'X' => [1, 1, 1, 1, 0, 1, 0, 1, 0, 0  …  1, 0, 1, 1, 0, 0, 0, 0, 0, 1]
  'D' => [1, 0, 1, 1, 1, 1, 0, 0, 1, 1  …  0, 1, 1, 1, 0, 1, 0, 1, 0, 1]
  'A' => [0,

In [15]:
dict4 = load("/home/mfat/Master-Thesis/data/aa_hdv_dict4.jld")["aa_hdv_dict"]


Dict{Any, Any} with 22 entries:
  'E' => [0.125091, 0.220092, 0.230829, 0.318459, 0.33199, 0.35439, 0.349447, 0…
  'X' => [0.567164, 0.283582, 0.208955, 0.238806, 0.208955, 0.268657, 0.626866,…
  'C' => [0.602932, 0.243082, 0.263814, 0.38665, 0.314673, 0.353856, 0.336264, …
  'D' => [0.64553, 0.219819, 0.504804, 0.314454, 0.397196, 0.356231, 0.373679, …
  'A' => [0.136929, 0.23487, 0.509148, 0.330794, 0.335764, 0.354117, 0.36473, 0…
  'R' => [0.661313, 0.501632, 0.234521, 0.313033, 0.378138, 0.360238, 0.341801,…
  'G' => [0.65254, 0.504937, 0.234531, 0.315271, 0.375423, 0.353973, 0.384252, …
  'Q' => [0.659055, 0.230191, 0.242632, 0.399406, 0.323656, 0.350014, 0.360674,…
  'N' => [0.640484, 0.222495, 0.236293, 0.393298, 0.296538, 0.372632, 0.378035,…
  'P' => [0.119705, 0.217133, 0.213401, 0.426339, 0.354804, 0.356779, 0.352623,…
  'K' => [0.642271, 0.516926, 0.240979, 0.31995, 0.306857, 0.364484, 0.369081, …
  'M' => [0.625044, 0.484361, 0.225744, 0.321283, 0.362602, 0.362598, 0.37398

In [17]:
mat = collect(values(dict4))
mat = nested_arrays2mat(mat)
AA_list = collect(keys(dict4))
groups = Dict("polar" => ['S', 'T', 'Y', 'N', 'Q'], "non-polar" => ['G', 'A', 'V', 'C', 'P', 'L', 'I', 'M', 'W', 'F'], 
         "pos+" => ['L', 'R', 'H'], "neg-" => ['D', 'E'], "ambiguous" => ['U', 'X'])

Dict{String, Vector{Char}} with 5 entries:
  "non-polar" => ['G', 'A', 'V', 'C', 'P', 'L', 'I', 'M', 'W', 'F']
  "polar"     => ['S', 'T', 'Y', 'N', 'Q']
  "pos+"      => ['L', 'R', 'H']
  "neg-"      => ['D', 'E']
  "ambiguous" => ['U', 'X']

In [20]:
using MultivariateStats

PCA_tr = fit(PCA, mat; maxoutdim=2)

PCA(indim = 10000, outdim = 2, principalratio = 0.19823586763585277)

Pattern matrix (unstandardized loadings):
─────────────────────────────────
                PC1           PC2
─────────────────────────────────
1      -0.0231762    -0.0837603
2      -0.0161453    -0.023929
3      -0.0273659     0.049411
4       0.0154729    -0.0191156
5       0.0185029    -0.000968417
6       0.00153764   -0.00412271
7      -0.0153587    -0.00121033
8       0.0026718     0.00109866
9       0.0145768    -0.000138827
10      0.0230211    -0.0396119
11      0.0014824    -0.0382832
12      0.00604239    0.0269956
13      0.0217709    -0.0205101
14     -0.00869218   -0.0151832
15      0.0451441    -0.0117014
16     -0.0463727    -0.000905012
17     -0.0229197     0.0115158
18      0.0211005     0.0233834
19      0.00130456   -0.00267595
20     -0.0153213     0.128701
21      0.231813      0.00254916
22      0.0770641     0.00273741
23      0.00161152    0.0421044
24     -0.0122556     0.024153
25     -0.

In [19]:
proj_tr = projection(PCA_tr)


10000×2 Matrix{Float64}:
 -0.00533673    0.019687
 -0.00371774    0.00562425
 -0.00630149   -0.0116135
  0.0035629     0.00449292
  0.00426061    0.000227616
  0.000354068   0.000968999
 -0.0035366     0.000284475
  0.000615228  -0.000258229
  0.00335656    3.26298e-5
  0.00530101    0.00931037
  ⋮            
 -0.00204665    0.000533198
  0.00688216   -0.00134229
 -0.000895985   0.00242885
  0.00240477    0.000896395
  0.00164835   -0.000361801
  0.00203437    0.00188047
 -0.00476081   -0.0135558
 -0.00935172   -0.000887811
  0.0188481    -0.0125815

In [21]:
using Plots

key = collect(keys(groups))
indices = [[i for i in 1:length(AA_list) if AA_list[i] in groups[j]] for j in key]

colors = [:green, :red, :blue, :yellow, :black]
fig = Plots.plot()

for i in 1:5
    scatter!(fig, (proj_tr[indices[i], 1], proj_tr[indices[i], 2]), label=key[i], mc = colors[i])
end




