# Packages

In [80]:
import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split



# One-hot encoding.

In [81]:
# Load the positive and negative example tables from the working directory.
positives = pd.read_csv(filepath_or_buffer="./positive.csv")
negatives = pd.read_csv(filepath_or_buffer="./negative.csv")

# Distinct characters collected so far, in order of first appearance.
amino_acids = []


def add_to_amino_acids(a_sequence: str):
    """Record every character of ``a_sequence`` not yet in ``amino_acids``.

    The module-level ``amino_acids`` list is extended in place; nothing is
    returned.
    """
    for residue in a_sequence:
        if residue in amino_acids:
            # Already known; keep the list free of duplicates.
            continue
        amino_acids.append(residue)

# Pass every stacked cell of the positives table to add_to_amino_acids so
# that amino_acids collects each distinct character found there.
(
    positives
    .stack()
    .reset_index(drop=True)
    .apply(add_to_amino_acids)
)

# Sort in place so the alphabet order is reproducible.
amino_acids.sort()

# fit() returns the encoder itself, so creation and fitting can be chained.
amino_acid_label_encoder = LabelEncoder().fit(amino_acids)

# Integer label of every entry in amino_acids, in the same (sorted) order.
all_amino_acids = amino_acid_label_encoder.transform(amino_acids)

def feature_map(p_sequence):
    """One-hot encode every element of ``p_sequence``.

    Each element is split into its items with ``list()``, turned into integer
    labels by ``amino_acid_label_encoder`` and passed to ``tf.one_hot`` with a
    depth of ``len(all_amino_acids)``.

    Returns a list holding one ``tf.one_hot`` result per element, in input
    order.
    """
    depth = len(all_amino_acids)
    encoded = []
    for x in p_sequence:
        label_indices = amino_acid_label_encoder.transform(list(x))
        encoded.append(tf.one_hot(label_indices, depth))
    return encoded

# One-hot encode the "cdr3" column first, then the "antigen.epitope" column,
# of the positive examples.
data_cd3r, data_epitope = (
    feature_map(positives[column_name])
    for column_name in ("cdr3", "antigen.epitope")
)

# BERT

In [82]:
from transformers import BertTokenizer, BertModel
import keras
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from transformers import get_linear_schedule_with_warmup


# Sentence Construction

In [83]:
import pandas as pd
from sklearn.model_selection import train_test_split

def convert_to_space_separated_string(series):
    """Return the items of ``series`` joined by single spaces.

    Iterating a string yields its characters, so ``"CAS"`` becomes
    ``"C A S"``.
    """
    separator = ' '
    spaced = separator.join(series)
    return spaced
def construct_sentences(dataframe):
    """Build one BERT-style string from the sequence columns of ``dataframe``.

    The result starts with ``[CLS]``, lists every ``"cdr3"`` entry followed by
    every ``"antigen.epitope"`` entry, and puts ``[SEP]`` after each entry.

    Args:
        dataframe: Mapping-like object whose ``"cdr3"`` and
            ``"antigen.epitope"`` items are iterables of strings.

    Returns:
        A single string containing all entries.
    """
    cdr3_sentences = "[CLS] " + " [SEP] ".join(dataframe["cdr3"]) + " [SEP]"
    epitope_sentences = " [SEP] ".join(dataframe["antigen.epitope"]) + " [SEP]"
    # The space matters: without it the last cdr3 "[SEP]" and the first
    # epitope token fuse into one piece ("[SEP]G"), which breaks any
    # whitespace-based splitting of the result.
    return cdr3_sentences + " " + epitope_sentences
def pad_sentence(sentence, max_length):
    """Pad ``sentence`` with ``[PAD]`` tokens up to ``max_length`` tokens.

    Tokens are counted by splitting ``sentence`` on whitespace.

    Args:
        sentence: Whitespace-separated token string.
        max_length: Desired minimum number of tokens.

    Returns:
        ``sentence`` followed by enough space-separated ``[PAD]`` tokens to
        reach ``max_length`` tokens, or ``sentence`` unchanged when it already
        has at least ``max_length`` tokens.
    """
    missing = max_length - len(sentence.split())
    if missing <= 0:
        return sentence
    # Join with spaces so every [PAD] is a separate token; plain string
    # repetition would glue them into one "[PAD][PAD]..." token.
    padding = " ".join(["[PAD]"] * missing)
    if not sentence:
        # Avoid a leading space when there is nothing to pad after.
        return padding
    return sentence + " " + padding
# Target token count handed to pad_sentence below.
max_length = 256
# Rewrite every column of positives in place, passing each cell through
# convert_to_space_separated_string (which joins the cell's items with
# spaces). Because the table itself is overwritten, re-running this cell
# applies the conversion again to the already converted values.
for column in positives.columns:
    positives[column] = positives[column].apply(convert_to_space_separated_string)
# construct_sentences returns one string covering the whole table.
sentences = construct_sentences(positives)

sentences = pad_sentence(sentences, max_length)

# Number of space-separated pieces in each " [SEP] "-delimited segment.
lengths_of_data = [len(sentence.split(" ")) for sentence in sentences.split(" [SEP] ")]

# NOTE(review): `sentences` is a single string at this point, not a list of
# per-sequence examples, so this split is not performed over individual
# sequences — confirm whether a list of sentences was intended here.
train_data, test_data = train_test_split(sentences, test_size=0.2, random_state=42)
print(sentences)

[CLS] C A S S S G Q L T N T E A F F [SEP] C A S S A S A R P E Q F F [SEP] C A S S S G L L T A D E Q F F [SEP] C A S S S G Q V S N T G E L F F [SEP] C S A R D R T G N G Y T F [SEP] C S A R G D G Q G D L L Q E T Q Y F [SEP] C S V G T G G T N E K L F F [SEP] C S V G S G G T N E K L F F [SEP] C S V G A G G T N E K L F F [SEP] C A S S P D R L G T G E L F F [SEP] C A S S Q S P G G I I Q Y F [SEP] C A S S S G Q T L P G E L F F [SEP] C A S S S Q R K V P G E L F F [SEP] C A S F S Q R K V P G E L F F [SEP] C A S S E V K V S P G E L F F [SEP] C A S S E G A V A P G E L F F [SEP] C S A R D R G L G N T I Y F [SEP] C A S S Q S P G G I A F F [SEP] C S A R D S T G N G Y T F [SEP] C S A R D R T G N T I Y F [SEP] C S V G A A G T N E K L F F [SEP] C A S S Q A G L A A Y N E Q F F [SEP] C A S G G G G F Q E T Q Y F [SEP] C S A R D G T G N G Y T F [SEP] C S A R D R A Y G N T I Y F [SEP] C S A R D R S L G N T I Y F [SEP] C A S S L A A G L N L K N I Q Y F [SEP] C A S S Q D G A G G L G E Q F F [SEP] C A I S G E 

# Tokenization

In [93]:
vocab_file = './vocab.txt'
model_name = 'bert-base-uncased'

# Start from the pretrained tokenizer and extend it with the extra tokens
# listed one per line in vocab_file.
tokenizer = BertTokenizer.from_pretrained(model_name)
# Read through a context manager so the file handle is closed right away, and
# skip blank lines so no empty token is registered.
with open(vocab_file, encoding='utf-8') as vocab_handle:
    extra_tokens = [token for token in vocab_handle.read().splitlines() if token]
tokenizer.add_tokens(extra_tokens)
tokenizer.save_pretrained('/Users/ceejayarana/Desktop/ACM_Project/directory')

# Taken from Matt's code to check whether the tokenizer works.

# encoded_text = tokenizer.encode(" ".join(list("CSARGDGQGDLLQETQYF")))
# print(encoded_text)

('/Users/ceejayarana/Desktop/ACM_Project/directory/tokenizer_config.json',
 '/Users/ceejayarana/Desktop/ACM_Project/directory/special_tokens_map.json',
 '/Users/ceejayarana/Desktop/ACM_Project/directory/vocab.txt',
 '/Users/ceejayarana/Desktop/ACM_Project/directory/added_tokens.json')

In [85]:
import os
from transformers import BertTokenizer, BertForSequenceClassification

# Directory that receives both the model files and the tokenizer files.
output_directory = '/Users/ceejayarana/Desktop/ACM_Project/directory'

# exist_ok=True makes this a no-op when the directory is already there.
os.makedirs(name=output_directory, exist_ok=True)

# `model` and `tokenizer` are not created in this cell; they must already
# exist in the session.
model.save_pretrained(save_directory=output_directory)
tokenizer.save_pretrained(save_directory=output_directory)

('/Users/ceejayarana/Desktop/ACM_Project/directory/tokenizer_config.json',
 '/Users/ceejayarana/Desktop/ACM_Project/directory/special_tokens_map.json',
 '/Users/ceejayarana/Desktop/ACM_Project/directory/vocab.txt',
 '/Users/ceejayarana/Desktop/ACM_Project/directory/added_tokens.json')

In [88]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from transformers import get_linear_schedule_with_warmup
import os  

# Fine-tune a BERT sequence classifier, then save the model and tokenizer.
# `num_labels` and `train_dataloader` are not defined in this cell and must
# already exist in the session.
model_name = '/Users/ceejayarana/Desktop/ACM_Project/bert-based-uncased'
# Assigned before its first use so the cell does not rely on an earlier cell
# having set it.
output_directory = '/Users/ceejayarana/Desktop/ACM_Project/directory'

# NOTE(review): the recorded run of this cell stopped here with an OSError
# because no tokenizer could be loaded from `model_name` — confirm that the
# directory exists and contains the tokenizer files.
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

num_epochs = 3
# Not referenced again in this cell; presumably used where train_dataloader
# is built — TODO confirm.
batch_size = 32

optimizer = AdamW(model.parameters(), lr=2e-5)
# scheduler.step() runs once per batch, so the schedule has to span every
# batch of every epoch rather than len(train_data).
# Assumes train_dataloader supports len() — TODO confirm.
total_training_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_training_steps,
)

for epoch in range(num_epochs):
    model.train()
    for batch in train_dataloader:
        input_ids, attention_mask, labels = batch
        optimizer.zero_grad()
        # Passing labels makes the model output carry the loss.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()

tokenizer_files = [
    'tokenizer_config.json',
    'special_tokens_map.json',
    'vocab.txt',
    'added_tokens.json'
]

# Move previously saved tokenizer files into output_directory. The source
# directory and output_directory are currently the same path, so each file is
# replaced onto itself; this only moves anything if output_directory changes.
for file in tokenizer_files:
    source_path = f'/Users/ceejayarana/Desktop/ACM_Project/directory/{file}'
    dest_path = os.path.join(output_directory, file)
    if os.path.exists(source_path):
        os.replace(source_path, dest_path)

model.save_pretrained(output_directory)
tokenizer.save_pretrained(output_directory)

OSError: Can't load tokenizer for '/Users/ceejayarana/Desktop/ACM_Project/bert-based-uncased'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure '/Users/ceejayarana/Desktop/ACM_Project/bert-based-uncased' is the correct path to a directory containing all relevant files for a BertTokenizer tokenizer.