In [48]:
import os
import sys
import pickle as pkl
import time
import numpy as np
from SequenceEncoding import SequenceEncoding
from scipy import signal

In [3]:
# Folder holding the pickled dataset; the dataset name is the path component
# that immediately follows the last "data/" in that folder path.
dataset_folder = "../data/avgfp/"
dataset = dataset_folder.rsplit('data/', 1)[-1].partition('/')[0]

In [28]:
# Names of every encoding to load; each one is cached on disk as
# `<dataset_folder>/X_<encoding_name>.pkl`.
encoding_names = [
    "One_hot", "One_hot_6_bit", "Binary_5_bit",
    "Hydrophobicity_matrix", "Meiler_parameters", "Acthely_factors",
    "PAM250", "BLOSUM62",
    "Miyazawa_energies", "Micheletti_potentials",
    "AESNN3", "ANN4D", "ProtVec",
]

# Maps encoding name -> encoded dataset (numpy array, one entry per sequence).
encodings_dict = dict()

# Raw sequences are only needed when a cache file is missing, so they are read
# lazily and at most once instead of once per missing encoding.
X = None

for i, encoding_name in enumerate(encoding_names, start=1):
    encoding_file = os.path.join(dataset_folder, f'X_{encoding_name}.pkl')

    if os.path.exists(encoding_file):
        # Cache hit: reuse the stored encoding.
        # NOTE: pickle.load can execute arbitrary code; only load cache files
        # that were produced by this notebook / a trusted source.
        with open(encoding_file, 'rb') as handle:
            enc_X = pkl.load(handle)
    else:
        # Cache miss: encode every sequence and store the result.
        start = time.time()
        print(f"--> {encoding_name} [{i}/{len(encoding_names)}]", end="", flush=True)

        if X is None:
            X_file = os.path.join(dataset_folder, dataset+"_X.pkl")
            with open(X_file, 'rb') as handle:
                X = pkl.load(handle)

        # NOTE(review): a new SequenceEncoding is built for every sequence; if
        # its constructor is stateless it could be created once per encoding
        # -- confirm against SequenceEncoding before hoisting.
        enc_X = np.array([SequenceEncoding(encoding_name).get_encoding(seq) for seq in X])

        # The array is stored exactly as returned (not flattened).
        with open(encoding_file, 'wb') as handle:
            pkl.dump(enc_X, handle, protocol=pkl.HIGHEST_PROTOCOL)
        print(f"\tsize={round(enc_X.nbytes/(1024*1024), 2)} MB\t time={time.strftime('%Hh %Mm %Ss', time.gmtime(time.time() - start))}", flush=True)

    encodings_dict[encoding_name] = enc_X

In [29]:
def _per_residue(encoding_name, width):
    """Return the stored encoding reshaped to (N, seq_length, width).

    The cached arrays may have been flattened per sequence, so the middle
    axis is inferred (-1) from the number of sequences and `width`, the
    number of values the encoding uses per residue.
    """
    encoded = encodings_dict[encoding_name]
    return encoded.reshape(encoded.shape[0], -1, width)


# One 3-D array per encoding; the last axis is that encoding's per-residue width.
onehot = _per_residue("One_hot", 20)
onehot_6bit = _per_residue("One_hot_6_bit", 6)
binary_5bit = _per_residue("Binary_5_bit", 5)
hydrophobicity = _per_residue("Hydrophobicity_matrix", 20)
meiler = _per_residue("Meiler_parameters", 7)
acthely = _per_residue("Acthely_factors", 5)
pam250 = _per_residue("PAM250", 20)
blosum62 = _per_residue("BLOSUM62", 20)
miyazawa = _per_residue("Miyazawa_energies", 20)
micheletti = _per_residue("Micheletti_potentials", 20)
aesnn3 = _per_residue("AESNN3", 3)
ann4d = _per_residue("ANN4D", 4)
protvec = _per_residue("ProtVec", 100)

In [34]:
# Index of the single example sequence inspected in the cells below.
seq_idx = 42

# Take that sequence's 2-D slice from every per-residue encoding array.
(
    onehot_seq,
    onehot_6bit_seq,
    binary_5bit_seq,
    hydrophobicity_seq,
    meiler_seq,
    acthely_seq,
    pam250_seq,
    blosum62_seq,
    miyazawa_seq,
    micheletti_seq,
    aesnn3_seq,
    ann4d_seq,
    protvec_seq,
) = (
    encoded[seq_idx]
    for encoded in (
        onehot,
        onehot_6bit,
        binary_5bit,
        hydrophobicity,
        meiler,
        acthely,
        pam250,
        blosum62,
        miyazawa,
        micheletti,
        aesnn3,
        ann4d,
        protvec,
    )
)

In [39]:
# Show the shapes of the two single-sequence encodings that are combined next.
onehot_seq.shape, hydrophobicity_seq.shape

((235, 20), (235, 20))

In [57]:
# Combine the two encodings of one sequence by FFT-based convolution.
# Both inputs are 2-D, so the convolution runs over both axes; with the
# (default) "full" mode the output is larger than either input.
# NOTE(review): values around 1e-15 in the result look like FFT round-off
# rather than meaningful signal -- confirm before interpreting them.
combined_seq = signal.fftconvolve(
    onehot_seq,
    hydrophobicity_seq,
    mode="full",
)

In [58]:
# Inspect the convolved single-sequence result.
combined_seq

array([[-1.19608027e-15, -1.10134124e-15, -1.00660221e-15, ...,
        -1.72898732e-15, -1.56911521e-15, -2.41584530e-15],
       [-1.60464234e-15, -1.75267208e-15, -1.67569662e-15, ...,
        -4.26325641e-16,  9.47390314e-17, -9.47390314e-17],
       [ 8.33703477e-15,  7.56728014e-15,  8.10018719e-15, ...,
         7.29490542e-15,  7.29490542e-15,  7.98176340e-15],
       ...,
       [ 4.19220214e-15,  3.81324602e-15,  3.50534416e-15, ...,
         4.50000000e-01,  3.30000000e-01,  5.40000000e-01],
       [ 3.64745271e-15,  4.31062593e-15,  3.03164901e-15, ...,
         1.20000000e-01,  2.50000000e-01,  4.00000000e-02],
       [ 1.87109587e-15,  1.10134124e-15,  1.12502600e-15, ...,
         1.20000000e-01,  1.13982897e-15,  2.00000000e-01]])

In [61]:
# Collapse each sequence's per-residue matrix into a single row vector,
# giving one 2-D array (n_sequences, flattened_length) per encoding.
onehot_flatten = onehot.reshape(len(onehot), -1)
hydrophobicity_flatten = hydrophobicity.reshape(len(hydrophobicity), -1)

In [67]:
# Convolve the flattened encodings of the whole dataset, once with the
# FFT-specific routine and once with the general routine, keeping the output
# the same size as the first input (mode='same').
# NOTE(review): both inputs are 2-D, so the convolution also runs along the
# sequence (row) axis and mixes different sequences together; if a
# per-sequence combination is intended, restrict it to the feature axis
# (e.g. fftconvolve(..., axes=1)) -- confirm the intent.
# NOTE(review): signal.convolve picks its algorithm automatically
# (method='auto'), so it may use the same FFT path as fftconvolve here.
combine_fft = signal.fftconvolve(
    onehot_flatten,
    hydrophobicity_flatten,
    mode='same',
)
combine = signal.convolve(
    onehot_flatten,
    hydrophobicity_flatten,
    mode='same',
    method='auto',
)


In [68]:
# Compare the shapes of both inputs and both convolution results.
onehot_flatten.shape, hydrophobicity_flatten.shape, combine_fft.shape, combine.shape

((32610, 4700), (32610, 4700), (32610, 4700), (32610, 4700))

In [66]:
# Inspect the result produced by signal.fftconvolve.
combine_fft

array([[564799.68, 512938.36, 512683.87, ..., 558745.3 , 568475.34,
        561501.97],
       [564832.75, 512955.63, 512705.16, ..., 558772.57, 568511.88,
        561526.06],
       [564868.78, 513009.43, 512724.77, ..., 558810.46, 568551.61,
        561579.19],
       ...,
       [564654.53, 513149.58, 511575.44, ..., 558882.7 , 568622.3 ,
        561891.68],
       [564622.69, 513131.5 , 511528.55, ..., 558850.66, 568591.6 ,
        561867.94],
       [564590.96, 513086.75, 511497.74, ..., 558816.36, 568543.19,
        561830.49]])

In [69]:
# Inspect the result produced by signal.convolve, for comparison with combine_fft.
combine

array([[564799.68, 512938.36, 512683.87, ..., 558745.3 , 568475.34,
        561501.97],
       [564832.75, 512955.63, 512705.16, ..., 558772.57, 568511.88,
        561526.06],
       [564868.78, 513009.43, 512724.77, ..., 558810.46, 568551.61,
        561579.19],
       ...,
       [564654.53, 513149.58, 511575.44, ..., 558882.7 , 568622.3 ,
        561891.68],
       [564622.69, 513131.5 , 511528.55, ..., 558850.66, 568591.6 ,
        561867.94],
       [564590.96, 513086.75, 511497.74, ..., 558816.36, 568543.19,
        561830.49]])