# Packages

In [2]:
import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# One-hot encoding.

In [9]:
# Load the two input tables from CSV files in the current working directory.
positives = pd.read_csv("./positive.csv")
negatives = pd.read_csv("./negative.csv")

# Running list of the distinct characters seen in the sequence data; it is
# filled in place by add_to_amino_acids.
amino_acids = []

def add_to_amino_acids(a_sequence: str):
    """Record each character of ``a_sequence`` that is not yet known.

    New characters are appended to the module-level ``amino_acids`` list in
    the order they are first encountered; characters already present are
    skipped. Nothing is returned.
    """
    for residue in a_sequence:
        already_known = residue in amino_acids
        if already_known:
            continue
        amino_acids.append(residue)

# Pass every cell of the positives table to add_to_amino_acids so that
# amino_acids ends up holding each distinct character found in the data.
(
    positives
    .stack()
    .reset_index(drop=True)
    .apply(add_to_amino_acids)
)

# Order the collected characters in place (the list object itself is kept).
amino_acids[:] = sorted(amino_acids)

# Fit an integer encoder on the character vocabulary.
amino_acid_label_encoder = LabelEncoder().fit(amino_acids)

# Integer code of each vocabulary entry; its length serves as the one-hot depth.
all_amino_acids = amino_acid_label_encoder.transform(amino_acids)

def feature_map(p_sequence):
    """One-hot encode every sequence in ``p_sequence``.

    Each sequence is split into its characters, the characters are turned
    into integer codes with ``amino_acid_label_encoder``, and the codes are
    expanded with ``tf.one_hot`` using ``len(all_amino_acids)`` as the depth.

    Returns a list holding one encoded tensor per input sequence, in input
    order.
    """
    encoded_sequences = []
    for sequence in p_sequence:
        characters = list(sequence)
        label_ids = amino_acid_label_encoder.transform(characters)
        depth = len(all_amino_acids)
        encoded_sequences.append(tf.one_hot(label_ids, depth))
    return encoded_sequences

# One-hot encode two columns of the positives table; each result is a list
# with one encoded tensor per row.
# NOTE(review): "cd3r" in the variable name looks like a transposition of the
# "cdr3" column name — left unchanged because other cells may reference it.
data_cd3r = feature_map(positives["cdr3"])
data_epitope = feature_map(positives["antigen.epitope"])


# BERT

In [10]:
from transformers import BertTokenizer, TFBertForMaskedLM
import keras
import torch

In [25]:
def convToArray(x):
    """Return the items of ``x`` joined by single spaces.

    Despite the name, the result is a string, not an array: ``"CAS"``
    becomes ``"C A S"``.
    """
    separator = ' '
    return separator.join(x)

# Rewrite every column of positives in place so that each value becomes its
# characters separated by single spaces (via convToArray).
# NOTE(review): this is not idempotent — running the cell a second time would
# also space out the spaces inserted by the first run.
for column in positives.columns:
    positives[column] = positives[column].apply(convToArray)
def construct_sentences(dataframe):
    """Build one BERT-style token string from a table of sequences.

    The result starts with ``[CLS]``, followed by every ``"cdr3"`` entry and
    then every ``"antigen.epitope"`` entry, each entry terminated by
    ``[SEP]``. All tokens are separated by single spaces.

    Args:
        dataframe: Mapping-like object (e.g. a DataFrame) whose ``"cdr3"``
            and ``"antigen.epitope"`` entries are iterables of strings.

    Returns:
        A single string containing all entries.
    """
    separator = " [SEP] "
    cdr3_sentences = "[CLS] " + separator.join(dataframe["cdr3"]) + " [SEP]"
    epitope_sentences = separator.join(dataframe["antigen.epitope"]) + " [SEP]"
    # The space keeps the last CDR3 "[SEP]" from fusing with the first
    # epitope token, which would break whitespace- and " [SEP] "-based
    # splitting.
    return cdr3_sentences + " " + epitope_sentences

# Build the single token string covering the whole positives table.
sentences = construct_sentences(positives)


# Number of space-delimited pieces in each " [SEP] "-delimited segment
# (one more than the number of spaces in the segment).
lengths_of_data = [
    segment.count(" ") + 1
    for segment in sentences.split(" [SEP] ")
]

def pad_sentence(sentence, max_length):
    """Pad ``sentence`` with ``[PAD]`` tokens up to ``max_length`` tokens.

    Tokens are counted by splitting ``sentence`` on whitespace.

    Args:
        sentence: Whitespace-separated token string.
        max_length: Minimum number of tokens the result should contain.

    Returns:
        ``sentence`` followed by enough space-separated ``[PAD]`` tokens to
        reach ``max_length`` tokens, or ``sentence`` unchanged when it
        already has at least ``max_length`` tokens.
    """
    missing = max_length - len(sentence.split())
    if missing <= 0:
        return sentence
    # Join with spaces so each "[PAD]" is its own token; plain string
    # repetition would produce one fused "[PAD][PAD]..." token.
    padding = " ".join(["[PAD]"] * missing)
    return sentence + " " + padding

# NOTE(review): with max_length = 1, pad_sentence only pads an input that has
# no tokens at all, so this call presumably leaves `sentences` unchanged —
# lengths_of_data may have been the intended source of the maximum; confirm.
max_length = 1
sentences = pad_sentence(sentences, max_length)

# NOTE(review): `sentences` is a single string here, not a collection of
# samples, so this split is unlikely to yield per-sequence train/test sets —
# confirm what train_test_split does with a str and whether a list of
# sentences was intended.
train_data, test_data = train_test_split(sentences, test_size=0.2, random_state=42)