In [1]:
from torch.optim import AdamW
from torch.nn import CrossEntropyLoss
from torch.utils.data import DataLoader, Dataset
import torch
import pandas as pd
from transformers import BertTokenizer
from transformers import BertForSequenceClassification

  from .autonotebook import tqdm as notebook_tqdm


## Preparing Dataset

In [2]:
df = pd.read_csv("splicing_small.csv")

In [3]:
labels_mapping = {"intron": 0, "exon": 1}

In [4]:
sequence = df["sequence"][0]

In [5]:
def to_tokens_sequence(sequence):
  return "".join([f"[{nucl}]" for nucl in sequence])

In [6]:
crop = int(0.9 * df["sequence"].count())

df["sequence"] = df["sequence"].apply(to_tokens_sequence)
df["label"] = df["label"].map(labels_mapping)

sequences = df.iloc[:crop, 0].tolist()
labels = df.iloc[:crop, 1].tolist()

test_sequences = df.iloc[crop:, 0].tolist()
test_labels = df.iloc[crop:, 1].tolist()

## Creating & Training BERT

In [7]:
checkpoint = "bert-base-uncased"

tokenizer = BertTokenizer.from_pretrained(checkpoint)
model = BertForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
special_tokens = ["[A]", "[C]", "[G]", "[T]"]
tokenizer.add_tokens(special_tokens)
model.resize_token_embeddings(len(tokenizer))

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Embedding(30526, 768, padding_idx=0)

In [9]:
tokens = tokenizer(sequences, padding="longest", truncation=True, return_tensors="pt")

In [10]:
class SplicingDataset(Dataset):
	def __init__(self, tokens, labels):
		self.tokens = tokens
		self.labels = labels
		
	def __len__(self):
		return len(self.labels)
	
	def __getitem__(self, idx):
		return {
			"input_ids": self.tokens["input_ids"][idx],
			"attention_mask": self.tokens["attention_mask"][idx],
			"labels": torch.tensor(self.labels[idx])
		}

dataset = SplicingDataset(tokens, labels)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

optimizer = AdamW(model.parameters(), lr=0.0005)
loss_fn = CrossEntropyLoss()

In [11]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30526, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [12]:
epochs = 3
for epoch in range(epochs):
	model.train()
	total_loss = 0

	for batch in dataloader:
		optimizer.zero_grad()

		input_ids = batch["input_ids"].to(device)
		attention_mask = batch["attention_mask"].to(device)
		labels = batch["labels"].to(device)

		outputs = model(input_ids, attention_mask-attention_mask, labels=labels)
		loss = outputs.loss
		loss.backward()
	
	print(f"Epoch {epoch + 1}, Loss: {total_loss/len(dataloader)}")

  attn_output = torch.nn.functional.scaled_dot_product_attention(


Epoch 1, Loss: 0.0
Epoch 2, Loss: 0.0
Epoch 3, Loss: 0.0


In [13]:
from sklearn.metrics import classification_report

test_tokens = tokenizer(test_sequences, padding=True, truncation=True, return_tensors="pt")

model.eval()
with torch.no_grad():
	input_ids = test_tokens["input_ids"].to(device)
	attention_mask = test_tokens["attention_mask"].to(device)

	outputs = model(input_ids, attention_mask=attention_mask)
	predictions = torch.argmax(outputs.logits, dim=-1)

print(classification_report(test_labels, predictions.cpu()))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       128
           1       0.26      1.00      0.41        44

    accuracy                           0.26       172
   macro avg       0.13      0.50      0.20       172
weighted avg       0.07      0.26      0.10       172



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [14]:
test_tokens = tokenizer(test_sequences[0], padding=True, truncation=True, return_tensors="pt")

input_ids = test_tokens["input_ids"].to(device)
attention_mask = test_tokens["attention_mask"].to(device)

In [19]:
print(f"Sequence:{test_sequences[0]}")
print(f"Awnser:{test_labels[0]}")
print(f"input_ids: {input_ids}")
print(f"attention_mask: {attention_mask}")

Sequence:[C][C][T][C][C][A][T][T][G][T][C][G][G][G][C][G][C][C][C][T][C][G][C][C][A][C][C][A][T][G][G]
Awnser:1
input_ids: tensor([[  101, 30523, 30523, 30525, 30523, 30523, 30522, 30525, 30525, 30524,
         30525, 30523, 30524, 30524, 30524, 30523, 30524, 30523, 30523, 30523,
         30525, 30523, 30524, 30523, 30523, 30522, 30523, 30523, 30522, 30525,
         30524, 30524,   102]], device='cuda:0')
attention_mask: tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')


In [20]:
outputs = model(input_ids, attention_mask=attention_mask)
predictions = torch.argmax(outputs.logits, dim=-1)

In [21]:
predictions

tensor([1], device='cuda:0')