In [None]:
!pip install transformers==4.2.0
!pip install datasets==1.2.0
!pip install tensorflow

In [1]:
from transformers import RobertaForSequenceClassification, RobertaTokenizer, Trainer, TrainingArguments, EarlyStoppingCallback
import torch.nn as nn
import torch
from datasets import load_dataset
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
NUM_EPOCHS = 2
TRAIN_BATCH_SIZE = 16
EVAL_BATCH_SIZE = 64
WARMUP_STEPS = 50
WEIGHT_DECAY = 0.01
LOGGING_STEPS = 50
LEARNING_RATE = 5e-05

In [4]:
import pandas as pd
from datasets import Dataset

df = pd.read_json('/content/drive/MyDrive/RapNotRap.json')
train = df.sample(frac=0.8) # train split 80%
test = df.drop(train.index) # test split 20%
training_dataset = Dataset.from_pandas(train)
validation_dataset = Dataset.from_pandas(test)

In [None]:
def run_tokenizer(train_dataset, dev_dataset):
    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

    def get_sequence_len(tokenizer, train_dataset, dev_dataset):

        def tokenize_for_lengths(batch):
            return tokenizer(batch['text'], padding=False, truncation=True, max_length=512, return_length = True)

        train_dataset_for_lengths = train_dataset.map(tokenize_for_lengths, batched=True, batch_size=len(train_dataset))

        tweet_lengths = np.array(train_dataset_for_lengths[:]['length'])
        chosen_sequence_len = int(np.percentile(tweet_lengths, 95)) +1 #EXAMPLE
        
        return chosen_sequence_len

    chosen_sequence_len = get_sequence_len(tokenizer, train_dataset, dev_dataset)

    def tokenize(batch, sequence_len):
        return tokenizer(batch['text'], padding="max_length", truncation=True, max_length=sequence_len)

    train_dataset = train_dataset.map(tokenize, fn_kwargs={'sequence_len': chosen_sequence_len}, batched=True, batch_size=len(train_dataset))
    dev_dataset = dev_dataset.map(tokenize, fn_kwargs={'sequence_len': chosen_sequence_len}, batched=True, batch_size=len(dev_dataset))

    return (train_dataset, dev_dataset)

train_dataset, dev_dataset = run_tokenizer(training_dataset, validation_dataset)

In [6]:
def set_format(train_dataset, dev_dataset):
    train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
    dev_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

    return (train_dataset, dev_dataset)

train_dataset, dev_dataset = set_format(train_dataset, dev_dataset)

In [67]:
train_dataset

Dataset({
    features: ['__index_level_0__', 'attention_mask', 'input_ids', 'label', 'text'],
    num_rows: 6895
})

In [61]:
class RoBERTaBinaryClassifier(nn.Module):
    def __init__(self):
        super(RoBERTaBinaryClassifier, self).__init__()

        self.roberta = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)
        self.dropout = nn.Dropout(0.1)
        self.linear = nn.Linear(768, 1)
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, ids, mask, token_type_ids, labels):
        sequence_output, pooled_output = self.roberta(ids,
                                                      attention_mask=mask,
                                                      token_type_ids=token_type_ids,
                                                      labels=labels)
        sequence_output = self.dropout(sequence_output)
        logits = self.linear(sequence_output)
        proba = self.sigmoid(logits)
        return proba

In [62]:
model = RoBERTaBinaryClassifier()

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

In [63]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

early_stopping_callback = EarlyStoppingCallback(early_stopping_patience= 2, early_stopping_threshold = 0.0)

In [64]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=EVAL_BATCH_SIZE,
    warmup_steps=WARMUP_STEPS,
    weight_decay=WEIGHT_DECAY,
    logging_dir='./logs/',
    evaluation_strategy="steps",
    logging_steps=LOGGING_STEPS,
    learning_rate=LEARNING_RATE,
    metric_for_best_model="f1",
    load_best_model_at_end=True
)

In [65]:
def define_trainer(model, training_args, train_dataset, dev_dataset, compute_metrics, early_stopping_callback):
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=dev_dataset,
        compute_metrics=compute_metrics,
        callbacks = [early_stopping_callback]
    )
    
    return trainer

device = 'cuda' if torch.cuda.is_available() else 'cpu'
trainer = define_trainer(model, training_args, train_dataset, dev_dataset, compute_metrics, early_stopping_callback)

In [66]:
def train(trainer):
    trainer.train() #EXAMPLE
    return trainer

trainer = train(trainer)

TypeError: ignored

In [None]:
def save_model(model, path):
  torch.save(model.state_dict(), path)

save_model(trainer)

In [None]:
def evaluate(trainer):
    trainer.evaluate()

evaluate(trainer)