In [None]:
# Install the Hugging Face libraries that the cells below import.
# NOTE(review): both installs are unpinned; the captured log shows transformers 4.16.1
# being resolved. Consider pinning known-good versions (and using %pip) so that a fresh
# runtime gets the same API — confirm the versions against the target environment.
!pip install transformers
!pip install datasets

Collecting transformers
  Downloading transformers-4.16.1-py3-none-any.whl (3.5 MB)
[K     |████████████████████████████████| 3.5 MB 7.5 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 60.3 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 6.2 MB/s 
Collecting tokenizers!=0.11.3,>=0.10.1
  Downloading tokenizers-0.11.4-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.8 MB)
[K     |████████████████████████████████| 6.8 MB 72.4 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 80.6 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
  

In [None]:
from transformers import AdamW, RobertaForSequenceClassification, RobertaTokenizer, Trainer, TrainingArguments, EarlyStoppingCallback
import torch.nn as nn
import torch
from torch.utils.data import DataLoader, RandomSampler, TensorDataset
from datasets import Dataset, load_dataset
import numpy as np
import pandas as pd
import tqdm
from sklearn.metrics import classification_report

In [None]:
# Mount Google Drive (Colab only) so files under /content/drive/MyDrive can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Training hyperparameters, collected in one place so they are easy to tune.
NUM_EPOCHS = 3  # number of full passes over the training dataloader
TRAIN_BATCH_SIZE = 16  # batch size of the training DataLoader
EVAL_BATCH_SIZE = 64  # batch size of the evaluation DataLoader
LEARNING_RATE = 5e-05  # learning rate handed to the AdamW optimizer

In [None]:
# Load the labelled corpus and carve out a reproducible 80/20 train/validation split.
SPLIT_SEED = 42  # fixed seed so that re-running the notebook yields the same split

df = pd.read_json('/content/drive/MyDrive/RapMachine/RapNotRap.json')
train = df.sample(frac=0.8, random_state=SPLIT_SEED)  # train split 80%
test = df.drop(train.index)  # test split 20%: every row that was not sampled above
# The two splits must partition the frame; this trips if the index holds duplicate labels,
# in which case drop() would silently remove more rows than were sampled.
assert len(train) + len(test) == len(df), 'train/test split does not partition the data'
training_dataset = Dataset.from_pandas(train)
validation_dataset = Dataset.from_pandas(test)

In [None]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')


def tokenization(batch):
    """Run the RoBERTa tokenizer over the ``text`` field of a batch.

    Padding and truncation are both enabled, so the encoded sequences of one
    batch come back padded to a common length.
    """
    texts = batch['text']
    encoded = tokenizer(texts, padding=True, truncation=True)
    return encoded

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [None]:
# Tokenize each split as ONE batch (batch_size equals the split size): `tokenization` pads
# per batch, so this gives every example within a split the same padded length.
train_data = training_dataset.map(tokenization, batched = True, batch_size = len(training_dataset))
test_data = validation_dataset.map(tokenization, batched = True, batch_size = len(validation_dataset))

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
# Expose only the model inputs and the label, as torch tensors, on both splits.
TORCH_COLUMNS = ['input_ids', 'attention_mask', 'label']
for tokenized_split in (train_data, test_data):
    tokenized_split.set_format('torch', columns=TORCH_COLUMNS)

In [None]:
# Wrap the tokenized tensors in DataLoaders; each item is (input_ids, attention_mask, label).
train_set = TensorDataset(train_data['input_ids'], train_data['attention_mask'], train_data['label'])
# Training batches are reshuffled every epoch.
train_dataloader = DataLoader(train_set, shuffle=True, batch_size=TRAIN_BATCH_SIZE)

test_set = TensorDataset(test_data['input_ids'], test_data['attention_mask'], test_data['label'])
# Evaluation must NOT shuffle: predictions are later compared position by position with
# test_data['label'], so the loader has to yield examples in dataset order. Shuffling here
# pairs each prediction with a random label and drives the reported metrics to chance level.
test_dataloader = DataLoader(test_set, shuffle=False, batch_size=EVAL_BATCH_SIZE)

In [None]:
class RoBERTaBinaryClassifier(nn.Module):
    """RoBERTa sequence classifier with a single-probability output head.

    The wrapped ``RobertaForSequenceClassification`` is configured with two
    labels; its two logits go through dropout, a ``Linear(2, 1)`` layer and a
    sigmoid, so ``forward`` returns one value in (0, 1) per example.
    """

    def __init__(self):
        super(RoBERTaBinaryClassifier, self).__init__()

        # Attribute names are part of the saved state_dict layout; keep them stable.
        self.roberta = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2, return_dict=False)
        self.dropout = nn.Dropout(0.1)
        self.linear = nn.Linear(2, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask=None, labels=None):
        """Return the positive-class probability for each input sequence.

        Args:
            input_ids: token id tensor fed to the wrapped RoBERTa model.
            attention_mask: optional mask forwarded to the wrapped model.
            labels: accepted for backward compatibility and ignored; the
                training loss is computed by the caller from the returned
                probabilities.

        Returns:
            Tensor with one sigmoid probability per example (last dim of size 1).
        """
        # Labels are deliberately NOT forwarded: with labels the wrapped model computes an
        # unused cross-entropy loss and returns (loss, logits); without them it returns
        # (logits,), so a fixed two-value unpack would fail at inference time. Reading
        # element 0 of the label-free output works in both training and inference.
        outputs = self.roberta(input_ids,
                               attention_mask=attention_mask,
                               return_dict=False
                               )
        class_logits = outputs[0]

        dropout_output = self.dropout(class_logits)
        logits = self.linear(dropout_output)
        proba = self.sigmoid(logits)
        return proba

In [None]:
# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Build the classifier, move its parameters to the chosen device and enable training mode.
model = RoBERTaBinaryClassifier()
model = model.to(device)
model.train()

# A single optimizer over every parameter of the wrapper (RoBERTa body and extra head).
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classi

In [None]:
# Ask PyTorch to release cached, currently unused GPU memory before the training loop runs.
torch.cuda.empty_cache()

In [None]:
# Fine-tune the classifier with binary cross-entropy on its sigmoid output.
loss_func = nn.BCELoss()  # stateless, so build it once rather than once per batch
model.train()  # keep dropout active even if this cell is re-run after the evaluation cell

for epoch in range(NUM_EPOCHS):
    # progress bar over the training batches, labelled with the current epoch
    loop = tqdm.tqdm(train_dataloader, desc=f'Epoch {epoch}')
    for batch in loop:
        # clear the gradients left over from the previous step
        optimizer.zero_grad()
        # move the (input_ids, attention_mask, label) tensors of this batch to the device
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)
        # forward pass: the model returns one sigmoid probability per example
        proba = model(input_ids, attention_mask=attention_mask, labels=labels)
        # BCELoss needs float targets shaped like the predictions, hence unsqueeze + float
        loss = loss_func(proba, labels.unsqueeze(1).float())
        # backpropagate and apply the parameter update
        loss.backward()
        optimizer.step()
        # show the loss of the most recent batch on the progress bar
        loop.set_postfix(loss=loss.item())

Epoch 0: 100%|██████████| 431/431 [06:06<00:00,  1.17it/s, loss=0.00418]
Epoch 1: 100%|██████████| 431/431 [06:06<00:00,  1.17it/s, loss=0.0103]
Epoch 2: 100%|██████████| 431/431 [06:07<00:00,  1.17it/s, loss=0.0333]


In [None]:
def save_model(model, path):
  """Write the model's ``state_dict`` (parameters and buffers) to ``path`` via ``torch.save``."""
  weights = model.state_dict()
  torch.save(weights, path)

# Persist the fine-tuned weights; the relative path resolves against the current working directory.
save_model(model, './roberta_ranker.pth')

In [None]:
# Run the trained model over the evaluation batches and collect its outputs.
model.eval()  # switch off dropout for inference
model_predicted = []  # one boolean decision per example (probability > 0.5)
all_logits = []  # one sigmoid probability per example (variable name kept for compatibility)
with torch.no_grad():
    loop = tqdm.tqdm(test_dataloader)
    for batch in loop:
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)
        proba = model(input_ids, attention_mask=attention_mask, labels=labels)

        # The model emits a trailing dimension of size 1; take column 0 and move it to NumPy.
        # No loss is computed here: it was never used, and no_grad() makes detach() redundant.
        batch_proba = proba[:, 0].cpu().numpy()

        model_predicted.extend((batch_proba > 0.5).tolist())
        all_logits.extend(batch_proba.tolist())

100%|██████████| 27/27 [00:30<00:00,  1.12s/it]


In [None]:
# Per-class precision/recall/F1 of the thresholded predictions against the reference labels.
# NOTE(review): test_data['label'] is in dataset order, so model_predicted must have been
# produced in that same order (the evaluation DataLoader must not shuffle); otherwise each
# prediction is scored against an unrelated label and the metrics collapse to chance level.
print(classification_report(test_data['label'].tolist(), model_predicted))

              precision    recall  f1-score   support

           0       0.52      0.52      0.52       886
           1       0.49      0.48      0.49       838

    accuracy                           0.50      1724
   macro avg       0.50      0.50      0.50      1724
weighted avg       0.50      0.50      0.50      1724

