In [76]:
using Random
using LinearAlgebra
"""
Construct a bipolar vector. By default 10000 elements long.
"""
hdv(N::Int=10000) = vec(rand((-1,1), 1, N))


"""
Bundles bipolar hyperdimensional vectors.
"""
add(vectors::Vector{Int}...) = reduce(.+, vectors) .|> sign


"""
Binds binpolar hyperdimensional vectors.
"""
multiply(vectors::Vector{Int}...) = reduce(.*, vectors)


"""
Permutes a bipolar hyperdimensional vector by an adjustable circular shift.
"""
perm(vector::Vector, k::Int=1) = circshift(vector, (0, k))


"""
Calculates the cosine similarity between two bipolar vectors.
"""
cosine(x, y) = dot(x, y) / (norm(x) * norm(y))

cosine

In [77]:
"""
Construct a binary vector. By default 10000 elements long.
"""
bithdv(N::Int=10000) = bitrand(N)


"""
Bundles binary hyperdimensional vectors based on the element-wise majority rule.
"""
function bitadd(vectors::BitVector ...)
    v = reduce(.+, vectors)
    n = length(vectors) / 2
    x = [i > n ? 1 : i < n ? 0 : rand(0:1) for i in v]
    return convert(BitVector, x)
end


"""
Binds binary hyperdimensional vectors based on an element-wise XOR gate.
"""
bitbind(vectors::BitVector ...) =  reduce(.⊻, vectors)


"""
Permutes a hyperdimensional vector by an adjustable circular shift.
"""
bitperm(vector::BitVector, k::Int=1) = circshift(vector, k)


"""
Calculates the Hamming distance between two binary vectors.
"""
hamming(x::BitVector, y::BitVector) = sum(x .!= y)/length(x)

hamming

In [78]:
using DataFrames, CSV
data = CSV.read("ProtExdata/ACPs_Breast_cancer.csv", DataFrame)
unique(data.class)
class_num = [i == "very active" ? 1 : i == "mod. active" ? 2 : i == "inactive - exp" ? 3 : 4 for i in data.class]
data[!, :class_num] = class_num
data = data[data.class_num .!= 4, :]
first(data, 5)

Unnamed: 0_level_0,ID,sequence,class,class_num
Unnamed: 0_level_1,Int64,String,String31,Int64
1,1,AAWKWAWAKKWAKAKKWAKAA,mod. active,2
2,2,AIGKFLHSAKKFGKAFVGEIMNS,mod. active,2
3,3,AWKKWAKAWKWAKAKWWAKAA,mod. active,2
4,4,ESFSDWWKLLAE,mod. active,2
5,5,ETFADWWKLLAE,mod. active,2


In [110]:

AA_list = ['A','C','D','E','F','G','H','I','K','L','M','N','P','Q','R','S','T','V','W','Y']
groups = Dict("polar" => ['S', 'T', 'Y', 'N', 'Q'], "non-polar" => ['G', 'A', 'V', 'C', 'P', 'L', 'I', 'M', 'W', 'F'], 
         "pos+" => ['L', 'R', 'H'], "neg-" => ['D', 'E'])

Dict{String, Vector{Char}} with 4 entries:
  "non-polar" => ['G', 'A', 'V', 'C', 'P', 'L', 'I', 'M', 'W', 'F']
  "polar"     => ['S', 'T', 'Y', 'N', 'Q']
  "pos+"      => ['L', 'R', 'H']
  "neg-"      => ['D', 'E']

In [85]:
sorted_list = [i for (k,v) in groups for i in v]

function AA_dist_calc(x, cos=true)
    df = DataFrame([[] for i in 1:20], string.(sorted_list), makeunique=true)
    for i in sorted_list
        r = []
        for j in sorted_list
            if cos == true
                push!(r, cosine(x[i], x[j]))
            else
                push!(r, hamming(x[i], x[j]))
            end
        end
        push!(df, r)
    end
    insertcols!(df, 1, :coef => sorted_list)
    return df
end
    

AA_dist_calc (generic function with 2 methods)

In [None]:
function scaler()
    
end

In [120]:
using PyCall
using StatsBase
py"""
import pickle
infile = open("/home/mfat/Unief/Thesis/ThesisFatjanov/data/aa_embeddings",'rb')
embeddings_trans = pickle.load(infile)
infile.close()
"""
embeddings_trans = PyArray(py"embeddings_trans"o)

random_hdv_bin = permutedims(hcat([bithdv() for i in 1:1024]...))
random_hdv_bip = permutedims(hcat([hdv() for i in 1:1024]...))

embeddings_trans_hdv = embeddings_trans*random_hdv_bin
hdvs_trans = [embeddings_trans_hdv[i, :] for i in 1:20]
AA_dict_trans = Dict(zip(AA_list, hdvs_trans))
dt = fit(UnitRangeTransform, embeddings_trans_hdv, dims = 1)
embeddings_trans_hdv = StatsBase.transform(dt, embeddings_trans_hdv)

embeddings_trans_hdv_bin = round.(embeddings_trans_hdv)
hdvs_trans_bin = [convert(BitVector, embeddings_trans_hdv_bin[i, :]) for i in 1:20]
AA_dict_trans_bin = Dict(zip(AA_list, hdvs_trans_bin))

embeddings_trans_hdv_bip = embeddings_trans*random_hdv_bip
hdvs_trans_bip = [embeddings_trans_hdv_bip[i, :] .|> sign for i in 1:20]
AA_dict_trans_bip = Dict(zip(AA_list, hdvs_trans_bip))

Dict{Char, Vector{Float32}} with 20 entries:
  'M' => [-1.0, 1.0, 1.0, -1.0, 1.0, -1.0, -1.0, -1.0, -1.0, 1.0  …  1.0, 1.0, …
  'K' => [-1.0, 1.0, 1.0, -1.0, 1.0, -1.0, -1.0, -1.0, -1.0, 1.0  …  1.0, 1.0, …
  'P' => [-1.0, 1.0, 1.0, -1.0, 1.0, -1.0, -1.0, -1.0, -1.0, 1.0  …  1.0, 1.0, …
  'Q' => [-1.0, 1.0, 1.0, -1.0, 1.0, -1.0, -1.0, -1.0, -1.0, 1.0  …  1.0, 1.0, …
  'I' => [-1.0, 1.0, 1.0, -1.0, 1.0, -1.0, -1.0, -1.0, -1.0, 1.0  …  1.0, 1.0, …
  'H' => [-1.0, 1.0, 1.0, -1.0, 1.0, -1.0, -1.0, -1.0, -1.0, 1.0  …  1.0, 1.0, …
  'E' => [-1.0, 1.0, 1.0, -1.0, 1.0, -1.0, -1.0, -1.0, -1.0, 1.0  …  1.0, 1.0, …
  'W' => [-1.0, 1.0, 1.0, -1.0, 1.0, -1.0, -1.0, -1.0, -1.0, 1.0  …  1.0, 1.0, …
  'S' => [-1.0, 1.0, 1.0, -1.0, 1.0, -1.0, -1.0, -1.0, -1.0, 1.0  …  1.0, 1.0, …
  'T' => [-1.0, 1.0, 1.0, -1.0, 1.0, -1.0, -1.0, -1.0, -1.0, 1.0  …  1.0, 1.0, …
  'C' => [-1.0, 1.0, 1.0, -1.0, 1.0, -1.0, -1.0, -1.0, -1.0, 1.0  …  1.0, 1.0, …
  'D' => [-1.0, 1.0, 1.0, -1.0, 1.0, -1.0, -1.0, -1.0, -1.0, 1.0

In [124]:
using PyCall
using TableTransforms
py"""
import pickle
infile = open("/home/mfat/Unief/Thesis/ThesisFatjanov/data/aa_embeddings",'rb')
embeddings_trans = pickle.load(infile)
infile.close()
"""
embeddings_trans = PyArray(py"embeddings_trans"o)

random_hdv_bin = permutedims(hcat([bithdv() for i in 1:1024]...))
random_hdv_bip = permutedims(hcat([hdv() for i in 1:1024]...))

embeddings_trans_hdv = embeddings_trans*random_hdv_bip
scaler = fit(FixedRangeScaler, embeddings_trans_hdv, -1, 1; obsdim=1)
transform!(embeddings_trans_hdv, scaler)
hdvs_trans = [embeddings_trans_hdv[i, :] for i in 1:20]
AA_dict_trans = Dict(zip(AA_list, hdvs_trans))

embeddings_trans_hdv_bin = embeddings_trans*random_hdv_bin
scaler = JuliaML.fit(FixedRangeScaler, embeddings_trans_hdv_bin, 0, 1; obsdim=1)
transform!(embeddings_trans_hdv_bin, scaler)
hdvs_trans_bin = [round.(embeddings_trans_hdv_bin[i, :]) for i in 1:20]
AA_dict_trans_bin = Dict(zip(AA_list, hdvs_trans_bin))

embeddings_trans_hdv_bip = embeddings_trans*random_hdv_bip
scaler = fit(FixedRangeScaler, embeddings_trans_hdv_bip, -1, 1; obsdim=1)
transform!(embeddings_trans_hdv_bip, scaler)
hdvs_trans_bip = [embeddings_trans_hdv_bip[i, :] .|> sign for i in 1:20]
AA_dict_trans_bip = Dict(zip(AA_list, hdvs_trans_bip))

ArgumentError: ArgumentError: Package JuliaML not found in current path:
- Run `import Pkg; Pkg.add("JuliaML")` to install the JuliaML package.


In [121]:
df_trans = AA_dist_calc(AA_dict_trans)
df_trans_bip = AA_dist_calc(AA_dict_trans_bip)
df_trans_bin = AA_dist_calc(AA_dict_trans_bin, false)

Unnamed: 0_level_0,coef,G,A,V,C,P,L,I,M,W,F
Unnamed: 0_level_1,Char,Any,Any,Any,Any,Any,Any,Any,Any,Any,Any
1,G,0.0,0.0222,0.8359,0.6464,0.3024,0.7856,0.0826,0.1193,0.2478,0.4394
2,A,0.0222,0.0,0.8401,0.6322,0.3076,0.7884,0.0976,0.1353,0.2626,0.4244
3,V,0.8359,0.8401,0.0,0.4939,0.7709,0.0609,0.8373,0.7658,0.7305,0.6971
4,C,0.6464,0.6322,0.4939,0.0,0.417,0.5406,0.6318,0.6895,0.5314,0.2132
5,P,0.3024,0.3076,0.7709,0.417,0.0,0.7844,0.2374,0.2967,0.148,0.2434
6,L,0.7856,0.7884,0.0609,0.5406,0.7844,0.0,0.7958,0.7175,0.715,0.7376
7,I,0.0826,0.0976,0.8373,0.6318,0.2374,0.7958,0.0,0.0881,0.1656,0.4332
8,M,0.1193,0.1353,0.7658,0.6895,0.2967,0.7175,0.0881,0.0,0.1865,0.5041
9,W,0.2478,0.2626,0.7305,0.5314,0.148,0.715,0.1656,0.1865,0.0,0.3782
10,F,0.4394,0.4244,0.6971,0.2132,0.2434,0.7376,0.4332,0.5041,0.3782,0.0


In [84]:
function convolved_embedding(sequence, tokens, k=3)
    """
    Simple 2-layer convolved embedding in hyperdimensional space
    """
    # layer 1
    kmer_hdvs = []
    for i in 1:length(sequence)-k+1
        kmer = sequence[i:i+k-1]
        aa_hdvs = [circshift(tokens[aa], k-l) for (l, aa) in enumerate(kmer)]
        push!(kmer_hdvs, bitbind(aa_hdvs))
    end
    
    # layer 2
    conv_kmer_hdvs = []
    for i in 1:length(kmer_hdvs)-k+1
        convolved_kmers = kmer_hdvs[i:i+k-1]
        conv_hdvs = [circshift(convolved_kmers[l], k-l) for (l, km) in enumerate(convolved_kmers)]
        push!(conv_kmer_hdvs, bitbind(conv_hdvs))
    end
    
    return bitadd(conv_kmer_hdvs)
end

convolved_embedding (generic function with 2 methods)