In [3]:
from transformers import AdamW, RobertaForSequenceClassification, RobertaTokenizer, Trainer, TrainingArguments, EarlyStoppingCallback
import torch.nn as nn
import torch
from torch.utils.data import DataLoader, RandomSampler, TensorDataset
from datasets import Dataset, load_dataset
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
NUM_EPOCHS = 2
TRAIN_BATCH_SIZE = 16
EVAL_BATCH_SIZE = 64
LEARNING_RATE = 5e-05

In [6]:
df = pd.read_json('/content/drive/MyDrive/RapNotRap.json')
train = df.sample(frac=0.8) # train split 80%
test = df.drop(train.index) # test split 20%
training_dataset = Dataset.from_pandas(train)
validation_dataset = Dataset.from_pandas(test)

In [7]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

def tokenization(batch):
  return tokenizer(batch['text'], padding = True, truncation=True)

In [8]:
train_data = training_dataset.map(tokenization, batched = True, batch_size = len(training_dataset))
test_data = validation_dataset.map(tokenization, batched = True, batch_size = len(validation_dataset))

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [9]:
train_data.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
test_data.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

In [8]:
train_set = TensorDataset(train_data['input_ids'], train_data['attention_mask'], train_data['label'])
train_dataloader = DataLoader(train_set, shuffle=True, batch_size=TRAIN_BATCH_SIZE)

test_set = TensorDataset(test_data['input_ids'], test_data['attention_mask'], test_data['label'])
test_dataloader = DataLoader(test_set, shuffle=True, batch_size=EVAL_BATCH_SIZE)

In [9]:
class RoBERTaBinaryClassifier(nn.Module):
    def __init__(self):
        super(RoBERTaBinaryClassifier, self).__init__()

        self.roberta = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)
        self.dropout = nn.Dropout(0.1)
        self.linear = nn.Linear(768, 1)
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, input_ids, attention_mask):
        sequence_output, pooled_output = self.roberta(
               input_ids, 
               attention_mask=attention_mask
          )
        dropout_output = self.dropout(sequence_output)
        logits = self.linear(dropout_output)
        proba = self.sigmoid(logits)
        return proba

In [10]:
model = RoBERTaBinaryClassifier()

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=481.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=501200538.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

In [42]:
torch.cuda.empty_cache()

In [None]:
for epoch in range(NUM_EPOCHS):
    model.train()
    train_loss = 0
    for step_num, batch_data in enumerate(train_dataloader):
        token_ids, masks, labels = tuple(t.to(device) for t in batch_data)
        logits = model(token_ids, masks)
        
        loss_func = nn.BCELoss()

        batch_loss = loss_func(logits, labels)
        train_loss += batch_loss.item()

        if optimizer is not None:       
          batch_loss.backward()
          optimizer.step()
          optimizer.zero_grad()
        
        print('Epoch: ', NUM_EPOCHS + 1)
        print("\r" + "{0}/{1} loss: {2} ".format(step_num, len(train_data) / TRAIN_BATCH_SIZE, train_loss / (step_num + 1)))

In [None]:
def save_model(model, path):
  torch.save(model.state_dict(), path)

save_model(model, './roberta_ranker.pth')

In [None]:
model.eval()
model_predicted = []
all_logits = []
with torch.no_grad():
    for step_num, batch_data in enumerate(test_dataloader):

        token_ids, masks, labels = tuple(t.to(device) for t in batch_data)

        logits = model(token_ids, masks)
        loss_func = nn.BCELoss()
        loss = loss_func(logits, labels)
        numpy_logits = logits.cpu().detach().numpy()
        
        model_predicted += list(numpy_logits[:, 0] > 0.5)
        all_logits += list(numpy_logits[:, 0])

In [None]:
print(classification_report(test_data['label'].tolist(), model_predicted))