### Initialize the dataset from the COVID DB data

In [None]:
import pandas as pd
import numpy as np
import os
import random
import torch
import time

# Switch the working directory to the project directory so that the relative
# paths used later in this file (e.g. "data/covid/X_dataset.npy") resolve
# against it.
os.chdir("PATH_TO_PROJECT")

In [None]:
# Load the selected samples.
df_selected = pd.read_csv("PATH_TO_DATASET", sep=',')

# Distinct epitopes in order of first appearance. An epitope's position in this
# list is used as its integer label, so the order has to be reproducible:
# list(set(...)) orders strings by hash, which can differ between interpreter
# sessions.
Epitope = df_selected["Epitope"].drop_duplicates().to_list()

In [None]:
from transformers import AutoTokenizer, AutoModel

# Load the tokenizer and the model weights from the same pretrained checkpoint.
checkpoint = "Rostlab/prot_bert"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU, and move
# the model onto the chosen device.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda") if use_cuda else torch.device("cpu")
model = model.to(device)
print(f"Running on: {device}")

In [None]:
sequence_list = df_selected["TCR"].to_list()

# ProtBert's vocabulary has one token per amino acid, so every sequence is fed
# as space-separated residues (an unspaced string is treated as one word and
# falls back to the unknown token). Following the model card, the rare residues
# U, Z, O and B are mapped to X. Existing whitespace is stripped first so that
# already-spaced input is handled the same way.
rare_to_x = str.maketrans("UZOB", "XXXX")
spaced_sequences = [
    " ".join("".join(seq.split()).translate(rare_to_x))
    for seq in sequence_list
]

feature_list = []

batch_size = 16
for i in range(0, len(spaced_sequences), batch_size):
    batch = spaced_sequences[i:i + batch_size]
    # Pad only up to the longest sequence in the batch; the masked mean below
    # does not depend on how much padding was added.
    inputs = tokenizer(batch, return_tensors="pt", truncation=True, padding=True, max_length=512)
    inputs = {key: value.to(device) for key, value in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    # Mean-pool over real tokens only. A plain mean over the sequence axis
    # would also average in the hidden states at padding positions.
    hidden = outputs.last_hidden_state
    mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
    feature_vector = (hidden * mask).sum(dim=1) / mask.sum(dim=1)

    # Accumulate on the CPU so GPU memory use does not grow with the dataset.
    feature_list.append(feature_vector.cpu())

# torch.cat raises on an empty list, so an empty dataset is handled explicitly.
if feature_list:
    all_features = torch.cat(feature_list, dim=0)
else:
    all_features = torch.empty((0, model.config.hidden_size))
mtx = all_features.cpu().numpy()

In [None]:
# Encode each row's epitope as its position in `Epitope`, stored as a single
# float column. A dict gives constant-time lookups instead of a linear
# list.index() scan per row; setdefault keeps the first position of each value,
# which is what list.index() returns.
epitope_to_index = {}
for position, name in enumerate(Epitope):
    epitope_to_index.setdefault(name, position)

mtx_other = np.zeros((df_selected.shape[0], 1))
mtx_other[:, 0] = [epitope_to_index[name] for name in df_selected["Epitope"]]


In [None]:
import pickle

# np.save and open() do not create missing directories.
os.makedirs("data/covid", exist_ok=True)

# One row per sample: the columns of `mtx` followed by the epitope index
# column from `mtx_other`.
data = np.concatenate((mtx, mtx_other), axis=1)
np.save("data/covid/X_dataset.npy", data)

# Persist the epitope list so the integer labels in the last column can be
# mapped back to epitope values. The variable is not called `dict`, which
# would shadow the builtin for the rest of the session.
dataset_feature = {'Epitope': Epitope}

with open('data/covid/dataset_feature.pkl', 'wb') as f:
    pickle.dump(dataset_feature, f)