### Numerical representation strategies demonstration

In [1]:
import pandas as pd
import sys
sys.path.insert(0, "../")
from numerical_rep.embedding_representations import BioEmbeddings
from numerical_rep.fft_encoder import FFTTransform
from numerical_rep.one_hot_encoding import OneHotEncoder
from numerical_rep.physicochemical_properties import PhysicochemicalEncoder

### Loading dataset

In [2]:
df_data = pd.read_csv("../../data/Antiviral.csv")
df_data.head(5)

Unnamed: 0,sequence,half_life_seconds,experimental_characteristics,hl_category
0,AAAMSQVTN,15840.0,Mammalian,Medium
1,AACEVAKNLNESLIDLQELGKYEQYIKW,15840.0,Mammalian,Medium
2,AAGAVVNDL,15840.0,Mammalian,Medium
3,AAHLIDALYAEFLGGRVLTT,15840.0,Mammalian,Medium
4,AAHLIDALYAEFLGGRVLTTPVVHRALFYASAVLRQPFLAGVPSA,15840.0,Mammalian,Medium


### Numerical representation strategies explored in this work

#### One Hot encoder

In [3]:
one_hot_instance = OneHotEncoder(
    dataset=df_data,
    column_sequence="sequence",
    max_length=150
)

data_coded = one_hot_instance.run_process()
data_coded["hl_category"] = df_data["hl_category"]
data_coded.to_csv("../results_demo/Antiviral_one_hot.csv", index=False)
data_coded.head(5)

Unnamed: 0,p_0,p_1,p_2,p_3,p_4,p_5,p_6,p_7,p_8,p_9,...,p_2991,p_2992,p_2993,p_2994,p_2995,p_2996,p_2997,p_2998,p_2999,hl_category
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Medium
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Medium
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Medium
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Medium
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Medium


#### Physicochemical properties

In [4]:
dataset_encoder=pd.read_csv("../numerical_rep/input_data/cluster_encoders.csv")
dataset_encoder.index = dataset_encoder["residue"]
dataset_encoder.head(5)

Unnamed: 0_level_0,residue,Group_0,Group_1,Group_2,Group_3,Group_4,Group_5,Group_6,Group_7
residue,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
A,A,290.40675,71.850787,6.250299,44.65141,-107.792042,15.33599,56.16028,92.925289
R,R,172.577375,-6.96389,84.091653,200.152218,51.157141,172.36012,1.448105,-37.39311
N,N,-38.377385,-90.145475,-21.731374,-191.180531,73.940581,-259.135737,-54.69043,-77.746565
D,D,159.436015,-56.585499,-28.963699,-232.261465,55.369736,-216.012067,-29.383132,-7.421269
C,C,-4.241925,15.678516,-34.886819,-156.2126,-54.192823,-242.000209,10.074813,40.041394


In [5]:
physicochemical_instance = PhysicochemicalEncoder(
    dataset=df_data,
    property_encoder="Group_0",
    dataset_encoder=dataset_encoder,
    name_column_seq="sequence",
    columns_to_ignore=["hl_category"]
)

physicochemical_instance.run_process()
physicochemical_instance.df_data_encoded.to_csv("../results_demo/Antiviral_physicochemical_properties.csv", index=False)
physicochemical_instance.df_data_encoded.head(5)

Encoding and Processing results
Creating dataset
Export dataset


Unnamed: 0,p_0,p_1,p_2,p_3,p_4,p_5,p_6,p_7,p_8,p_9,...,p_79,p_80,p_81,p_82,p_83,p_84,p_85,p_86,p_87,hl_category
0,290.40675,290.40675,290.40675,21.944601,-314.201739,-268.556728,150.752932,-252.509397,-38.377385,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Medium
1,290.40675,290.40675,-4.241925,-0.028483,150.752932,290.40675,195.599646,-38.377385,-91.117252,-38.377385,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Medium
2,290.40675,290.40675,-104.495222,290.40675,150.752932,150.752932,-38.377385,159.436015,-91.117252,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Medium
3,290.40675,290.40675,-159.877881,-91.117252,-34.080828,159.436015,290.40675,-91.117252,-10.20771,290.40675,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Medium
4,290.40675,290.40675,-159.877881,-91.117252,-34.080828,159.436015,290.40675,-91.117252,-10.20771,290.40675,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Medium


#### FFT transform

In [6]:
fft_instance = FFTTransform(
    dataset=physicochemical_instance.df_data_encoded,
    size_data=len(physicochemical_instance.df_data_encoded.columns)-1,
    columns_to_ignore=["hl_category"]
)

response_fft = fft_instance.encoding_dataset()
response_fft.to_csv("../results_demo/Antiviral_fft_encoded.csv", index=False)
response_fft.head(5)

Removing columns data
Get near pow 2 value
Apply zero padding
Creating dataset
Export dataset


Unnamed: 0,p_0,p_1,p_2,p_3,p_4,p_5,p_6,p_7,p_8,p_9,...,p_55,p_56,p_57,p_58,p_59,p_60,p_61,p_62,p_63,hl_category
0,170.272533,236.91564,367.732436,509.495844,647.133514,774.401692,887.583559,984.147478,1062.398403,1121.37968,...,256.023663,280.827457,326.4938,381.278126,436.705616,487.355145,529.714928,561.456241,581.070917,Medium
1,160.086406,639.090726,1131.031812,1431.218148,1516.189767,1431.67293,1268.739942,1116.597207,1004.314117,892.344688,...,887.653808,757.347105,549.91753,391.072578,386.92987,488.77914,652.03574,856.489994,1032.445291,Medium
2,1098.172271,1093.679546,1080.180507,1057.622717,1025.951267,985.152986,935.310194,876.657636,809.636598,734.941564,...,505.596816,553.718159,593.192192,624.056098,646.956624,662.970534,673.417689,679.654707,682.859214,Medium
3,449.749581,623.657361,903.946725,1089.477959,1116.106935,980.240075,724.882307,445.740303,334.223582,435.643472,...,396.479836,335.671194,326.711365,354.311987,380.656377,384.77148,365.34339,332.916621,303.26937,Medium
4,2532.42312,1870.792744,541.382484,949.743263,1071.937741,620.030758,879.240327,1015.717617,786.342202,626.30559,...,1799.900963,1272.367412,1303.457702,1573.258224,1487.943849,950.268395,585.20938,763.676322,600.077414,Medium


#### Embedding through bio-embedding tool

In [7]:
bioembedding_instance = BioEmbeddings(
    dataset=df_data,
    seq_column="sequence",
    is_reduced=True,
    device="cuda",
    column_response="hl_category",
    path_export="../results_demo/"
)

bioembedding_instance.apply_prottrans_t5_uniref(name_export="df_training")

NVIDIA GeForce RTX 4060 with CUDA capability sm_89 is not compatible with the current PyTorch installation.
The current PyTorch install supports CUDA capabilities sm_37 sm_50 sm_60 sm_70.
If you want to use the NVIDIA GeForce RTX 4060 GPU with PyTorch, please check the instructions at https://pytorch.org/get-started/locally/

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
RuntimeError for sequence with 9 residues: CUDA error: no kernel image is available for execution on the device
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below

Unnamed: 0,p_0,p_1,p_2,p_3,p_4,p_5,p_6,p_7,p_8,p_9,...,p_1015,p_1016,p_1017,p_1018,p_1019,p_1020,p_1021,p_1022,p_1023,hl_category
0,0.221108,-0.095024,-0.176811,-0.147592,0.098018,-0.012107,-0.163594,-0.084584,-0.093292,-0.073915,...,-0.066105,0.247374,-0.133751,-0.061800,-0.147012,-0.043793,0.068672,-0.002200,-0.049787,Medium
1,0.135052,-0.112733,-0.023905,-0.010038,0.098003,0.027644,0.048664,-0.102691,-0.022150,-0.107410,...,-0.088512,0.030768,-0.138818,0.011149,0.004017,-0.072209,-0.036229,-0.033255,0.157988,Medium
2,0.214635,-0.032838,-0.105045,-0.097705,0.076046,-0.055113,-0.062762,-0.045538,0.003070,-0.014289,...,-0.102643,0.159857,-0.088896,-0.094590,-0.053697,0.053822,0.100262,0.022551,-0.003262,Medium
3,0.084283,0.030307,-0.066888,-0.056141,0.097343,0.040673,0.030255,-0.073112,0.026516,-0.012377,...,0.006682,0.090254,-0.133367,-0.043872,-0.027266,0.023461,0.104748,-0.021168,-0.079446,Medium
4,0.054688,0.069592,-0.048406,-0.050511,0.026016,0.046918,0.015309,-0.076438,0.069543,-0.020661,...,-0.078738,0.045299,-0.056401,-0.062468,-0.037647,-0.051923,0.095160,-0.009511,-0.034595,Medium
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2263,0.136491,0.036897,-0.043670,-0.098495,0.057122,0.065973,-0.026807,-0.062816,0.052658,0.046822,...,0.032487,0.066900,-0.138251,0.034346,-0.025337,-0.015256,0.116411,-0.048493,-0.009225,Medium
2264,0.076845,-0.001439,-0.103801,-0.160190,0.046218,0.043718,0.027165,-0.152327,0.025885,0.008452,...,-0.040289,0.046481,-0.218028,0.070731,0.000944,0.001626,0.106362,0.005441,-0.003304,Low
2265,0.076845,-0.001439,-0.103801,-0.160190,0.046218,0.043718,0.027165,-0.152327,0.025885,0.008452,...,-0.040289,0.046481,-0.218028,0.070731,0.000944,0.001626,0.106362,0.005441,-0.003304,Medium
2266,0.076845,-0.001439,-0.103801,-0.160190,0.046218,0.043718,0.027165,-0.152327,0.025885,0.008452,...,-0.040289,0.046481,-0.218028,0.070731,0.000944,0.001626,0.106362,0.005441,-0.003304,Low


In [8]:
df_testing = pd.read_csv("../results_demo/independent_df.csv")
bioembedding_instance = BioEmbeddings(
    dataset=df_testing,
    seq_column="sequence",
    is_reduced=True,
    device="cuda",
    column_response="hl_category",
    path_export="../results_demo/"
)

bioembedding_instance.apply_prottrans_t5_uniref(name_export="independent_df")

FileNotFoundError: [Errno 2] No such file or directory: '../results_demo/independent_df.csv'