# How can biological information be included in hyperdimensional vectors, and is this useful?

In [11]:
include("../src/HDC.jl")
include("../src/math.jl")
include("../src/experimental.jl")
using DataFrames
using CSV

## Method 1: Via embeddings (ESM-2)

In [12]:
# Load the per-amino-acid embeddings taken from the last hidden layer of the
# ESM-2 model (21x1280): the first column holds the residue identifier, the
# remaining columns hold the embedding values.
aa_embeddings = CSV.File("../data/amino_acid_embeddings.csv") |> DataFrame
amino_acids_esm = aa_embeddings[!, :protein_ID]
aa_emb = aa_embeddings[:, 2:end] |> Matrix

# Draw one random HDV per embedding dimension and stack them into a matrix.
# NOTE(review): the `true` flag of `nested_arrays2mat` presumably lays the HDVs
# out as rows (embedding_dim x hdv_dim) so the product below is defined — confirm.
n_embedding_dims = size(aa_emb, 2)
HDV_mat_bit = nested_arrays2mat([bithdv() for _ in 1:n_embedding_dims], true)
HDV_mat_bip = nested_arrays2mat([hdv() for _ in 1:n_embedding_dims], true)

# Extend embeddings into hyperdimensional space.
# Bipolar variant: keep only the sign of every projected component.
AA_bip_esm = permutedims(sign.(aa_emb * HDV_mat_bip))
# Binary variant: rescale the projection with `mat_scaler`, then round each entry.
AA_bit_esm = permutedims(round.(mat_scaler(aa_emb * HDV_mat_bit, 0, 1, 2)))

InterruptException: InterruptException:

## Method 2: Truer to the HDC framework, but naive (no proper training)

In [13]:
## Initiate random HDVs for every AA
AA_list = ['A','C','D','E','F','G','H','I','K','L','M','N','P','Q','R','S','T','V','W','Y']

# Physicochemical classes. Every residue of `AA_list` belongs to exactly one class:
# the positively charged residues are lysine ('K'), arginine ('R') and histidine ('H');
# leucine ('L') is non-polar only.
groups = Dict(
    "polar" => ['S', 'T', 'Y', 'N', 'Q'],
    "non-polar" => ['G', 'A', 'V', 'C', 'P', 'L', 'I', 'M', 'W', 'F'],
    "pos+" => ['K', 'R', 'H'],
    "neg-" => ['D', 'E'],
)

# One random binary and one random bipolar HDV per amino acid, keyed by residue.
aa_bithdv = [bithdv() for _ in eachindex(AA_list)]
aa_biphdv = [hdv() for _ in eachindex(AA_list)]

aa_bithdv = Dict(zip(AA_list, aa_bithdv))
aa_biphdv = Dict(zip(AA_list, aa_biphdv))

# Make an HDV for every class by aggregating the HDVs of its member residues
polar_hdv_bip = add([aa_biphdv[aa] for aa in groups["polar"]]...)
polar_hdv_bit = bitadd([aa_bithdv[aa] for aa in groups["polar"]]...)

nonpolar_hdv_bip = add([aa_biphdv[aa] for aa in groups["non-polar"]]...)
nonpolar_hdv_bit = bitadd([aa_bithdv[aa] for aa in groups["non-polar"]]...)

pos_hdv_bip = add([aa_biphdv[aa] for aa in groups["pos+"]]...)
pos_hdv_bit = bitadd([aa_bithdv[aa] for aa in groups["pos+"]]...)

neg_hdv_bip = add([aa_biphdv[aa] for aa in groups["neg-"]]...)
neg_hdv_bit = bitadd([aa_bithdv[aa] for aa in groups["neg-"]]...)

group_hdv_bip = Dict("polar" => polar_hdv_bip, "non-polar" => nonpolar_hdv_bip, "pos+" => pos_hdv_bip, "neg-" => neg_hdv_bip)
group_hdv_bit = Dict("polar" => polar_hdv_bit, "non-polar" => nonpolar_hdv_bit, "pos+" => pos_hdv_bit, "neg-" => neg_hdv_bit)

# Add random AA HDV to its class HDV to introduce similarity between residues
# of the same class (the class HDVs above were built from the untouched random HDVs).
for (class, members) in groups
    for aa in members
        aa_biphdv[aa] = add(aa_biphdv[aa], group_hdv_bip[class])
    end
end

for (class, members) in groups
    for aa in members
        aa_bithdv[aa] = bitadd(aa_bithdv[aa], group_hdv_bit[class])
    end
end

# Stack the per-residue HDVs in `AA_list` order so that column/row positions
# line up with the indices used by the plotting cells.
naive_bit = nested_arrays2mat([aa_bithdv[aa] for aa in AA_list])
naive_bip = nested_arrays2mat([aa_biphdv[aa] for aa in AA_list])

InterruptException: InterruptException:

## Baseline random HDVs

In [14]:
# Baseline: 20 independent random HDVs (one per standard amino acid) that carry
# no biological information, in a binary and a bipolar flavour.
rand_bit = nested_arrays2mat(map(_ -> bithdv(), 1:20))
rand_bip = nested_arrays2mat(map(_ -> hdv(), 1:20))

10000×20 Matrix{Float64}:
 -1.0  -1.0  -1.0  -1.0  -1.0   1.0  …   1.0   1.0  -1.0  -1.0   1.0   1.0
  1.0  -1.0  -1.0   1.0   1.0  -1.0     -1.0   1.0   1.0   1.0  -1.0   1.0
  1.0   1.0  -1.0   1.0   1.0  -1.0     -1.0   1.0  -1.0  -1.0  -1.0  -1.0
 -1.0   1.0  -1.0  -1.0  -1.0   1.0      1.0   1.0   1.0   1.0   1.0   1.0
  1.0   1.0  -1.0  -1.0  -1.0  -1.0      1.0   1.0   1.0  -1.0   1.0  -1.0
  1.0  -1.0   1.0   1.0  -1.0  -1.0  …  -1.0  -1.0  -1.0   1.0  -1.0   1.0
  1.0   1.0   1.0  -1.0   1.0  -1.0     -1.0   1.0   1.0  -1.0   1.0  -1.0
  1.0  -1.0   1.0  -1.0  -1.0  -1.0      1.0  -1.0  -1.0   1.0   1.0   1.0
  1.0   1.0   1.0  -1.0  -1.0  -1.0      1.0   1.0  -1.0  -1.0   1.0  -1.0
  1.0  -1.0  -1.0   1.0  -1.0  -1.0     -1.0   1.0   1.0  -1.0   1.0   1.0
  ⋮                             ⋮    ⋱         ⋮                      
  1.0  -1.0  -1.0  -1.0   1.0  -1.0      1.0  -1.0  -1.0  -1.0  -1.0  -1.0
  1.0   1.0  -1.0   1.0   1.0  -1.0     -1.0   1.0  -1.0  -1.0  -1.0  -1.0
  1

## PCA plots: is the physicochemical and biological information encoded into the HDVs?
### Binary HDVs

In [15]:
using MultivariateStats

# MultivariateStats treats every COLUMN of the input as one observation, so each
# matrix below is (hdv_dim x n_amino_acids) and one amino acid is one observation.
#
# `projection(model)` would return the (hdv_dim x 2) loading matrix, i.e. one row
# per hyperdimension, NOT one row per amino acid. What the plots need are the
# coordinates of every amino acid on the two principal components, which
# `predict(model, X)` returns as a (2 x n_amino_acids) matrix. It is transposed
# here so that `proj_*[k, 1]` / `proj_*[k, 2]` are PC1 / PC2 of the k-th amino acid.
# `predict` is module-qualified so it cannot be shadowed by a same-named function
# coming from the included project files.

PCA_rand_bit = fit(PCA, rand_bit; maxoutdim=2)
proj_rand_bit = permutedims(MultivariateStats.predict(PCA_rand_bit, rand_bit))

PCA_ESM_bit = fit(PCA, AA_bit_esm; maxoutdim=2)
proj_ESM_bit = permutedims(MultivariateStats.predict(PCA_ESM_bit, AA_bit_esm))

PCA_naive_bit = fit(PCA, naive_bit; maxoutdim=2)
proj_naive_bit = permutedims(MultivariateStats.predict(PCA_naive_bit, naive_bit))

#### Random binary HDVs

In [None]:
using Plots

# Scatter the first two columns of `proj_rand_bit`, one series per
# physicochemical class, for the random (baseline) binary HDVs.
key = collect(keys(groups))
# For every class: positions in `AA_list` of the residues that belong to it.
indices = [findall(in(groups[class]), AA_list) for class in key]

colors = [:green, :red, :blue, :yellow]
fig = Plots.plot()

for (rows, class, shade) in zip(indices, key, colors)
    coords = (proj_rand_bit[rows, 1], proj_rand_bit[rows, 2])
    scatter!(fig, coords, label=class, mc=shade)
end

fig

#### ESM embedded HDVs

In [None]:
# Scatter the first two columns of `proj_ESM_bit`, one series per
# physicochemical class, for the ESM-derived binary HDVs.
# For every class: positions in `amino_acids_esm` whose single-character
# identifier is a member of that class.
indices = [findall(id -> only(id) in groups[class], amino_acids_esm) for class in key]

colors = [:green, :red, :blue, :yellow]
fig = Plots.plot()

for (rows, class, shade) in zip(indices, key, colors)
    coords = (proj_ESM_bit[rows, 1], proj_ESM_bit[rows, 2])
    scatter!(fig, coords, label=class, mc=shade)
end
# Entry 21 is plotted separately and labelled "X".
# NOTE(review): presumably the unknown/any-residue token of the embedding table — confirm.
scatter!(fig, (proj_ESM_bit[21, 1], proj_ESM_bit[21, 2]), label="X", mc = :black)

fig

In [None]:
# Scatter the first two columns of `proj_naive_bit`, one series per
# physicochemical class, for the naive (class-bundled) binary HDVs.
# For every class: positions in `AA_list` of the residues that belong to it.
indices = map(class -> findall(in(groups[class]), AA_list), key)

colors = [:green, :red, :blue, :yellow]
fig = Plots.plot()

for (rows, class, shade) in zip(indices, key, colors)
    xs = proj_naive_bit[rows, 1]
    ys = proj_naive_bit[rows, 2]
    scatter!(fig, (xs, ys), label=class, mc=shade)
end

fig

# Uniref embeddings

In [None]:
# Rebuild the ESM-derived binary HDVs and expose them as a residue => BitVector
# lookup table for sequence encoding.
aa_embeddings = DataFrame(CSV.File("../data/amino_acid_embeddings.csv"))
# Identifiers are single-character strings; turn each into a `Char` key.
amino_acids_esm = [only(id) for id in aa_embeddings.protein_ID]
aa_emb = Matrix(aa_embeddings[:, 2:end])
# Create HDVs (one random binary HDV per embedding dimension)
HDV_mat_bit = nested_arrays2mat([bithdv() for _ in 1:size(aa_emb, 2)], true)

# Extend embeddings into hyperdimensional space; after `permutedims` every
# column of `AA_bit_esm` belongs to one row (residue) of the embedding table.
AA_bit_esm = permutedims(mat_scaler(aa_emb * HDV_mat_bit, 0, 1, 2) .|> round)

# Iterate over the columns that actually exist instead of a hard-coded 21, so the
# table stays in sync with however many residues the embedding file contains.
AA_dict = Dict(zip(amino_acids_esm, [convert(BitVector, AA_bit_esm[:, col]) for col in axes(AA_bit_esm, 2)]))

In [None]:
using FastaIO
using ProgressMeter

# Encode every sequence of the FASTA file with `encodeAA`, keyed by the sequence
# string itself (identical sequences therefore share one entry):
#   dict4 — `encodeAA` called with its default settings
#   dictk — `encodeAA` called with the full sequence length as third argument
# NOTE(review): the third argument is presumably a k-mer/window size — confirm
# against the definition of `encodeAA`.
dict4 = Dict{String,Any}()
dictk = Dict{String,Any}()

# NOTE(review): 20591 is presumably the number of records in the FASTA file — confirm.
p = Progress(20591)

# The do-block form closes the underlying file as soon as the loop finishes
# (or throws) instead of waiting for the reader to be garbage-collected.
FastaReader("../data/UP000005640_9606.fasta") do reader
    for (name, seq) in reader
        seq = String(seq)
        dict4[seq] = encodeAA(AA_dict, seq)
        dictk[seq] = encodeAA(AA_dict, seq, length(seq))
        next!(p)
    end
end
