In [None]:
!pip install datasets
!pip install transformers

In [None]:
!pip install wandb

In [3]:
import wandb

In [4]:
import pandas as pd
import datasets
from transformers import RobertaTokenizerFast, RobertaForSequenceClassification,Trainer, TrainingArguments
import torch.nn as nn
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm
from datasets import load_metric
# import wandb
# import os

In [None]:
from datasets import load_dataset

# Load the "trec" dataset; requesting a list of split names gives back one
# dataset per name, unpacked here in the same order (train, then test).
(train_data, test_data) = load_dataset(
    "trec",
    split=["train", "test"],
)

In [6]:
# Rename the 'fine_label' column to 'labels' on both splits (the name the
# Trainer-facing columns use later in this notebook).
train_data_f, test_data_f = (
    split.rename_column('fine_label', 'labels')
    for split in (train_data, test_data)
)

In [None]:
# `model_max_length` is the tokenizer-level option that bounds sequence length
# when a call uses truncation=True. `max_length` is a per-call argument and is
# not a constructor option, so passing it here had no effect.
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base', model_max_length=512)

In [8]:
def tokenization(batched_text):
    """Tokenize the ``'text'`` field of a batch with the notebook-level ``tokenizer``.

    The tokenizer is called with ``truncation=True`` and ``padding=True``
    (pad to the longest sequence in the batch it receives). The tokenizer's
    result is returned unchanged.
    """
    texts = batched_text['text']
    return tokenizer(texts, truncation=True, padding=True)


# Tokenize each split as a single batch (batch_size == number of rows), so
# padding=True pads every row of a split to that split's longest sequence.
train_data_f, test_data_f = (
    split.map(tokenization, batched=True, batch_size=len(split))
    for split in (train_data_f, test_data_f)
)

Map:   0%|          | 0/5452 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [9]:
# Have both splits return the model inputs and the target column as torch tensors.
for _split in (train_data_f, test_data_f):
    _split.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

In [10]:
# define accuracy metrics
from sklearn.metrics import f1_score, precision_score, recall_score

def compute_metrics(pred):
    """Compute accuracy and macro-averaged precision / recall / F1.

    Args:
        pred: Object exposing ``predictions`` (per-class scores; the argmax
            over the last axis is used as the predicted class) and
            ``label_ids`` (reference class ids).
            NOTE(review): presumably the ``EvalPrediction`` that ``Trainer``
            passes to ``compute_metrics`` — confirm.

    Returns:
        dict with keys ``"accuracy"``, ``"f1"``, ``"precision"`` and
        ``"recall"``; every value is a Python float rounded through float32.
    """
    labels = pred.label_ids
    predictions = pred.predictions.argmax(-1)

    # One call yields all three macro-averaged scores. zero_division=0 gives
    # the same numbers as the default ("warn" also scores such classes as 0)
    # but without a warning for every class that has no predicted/true samples.
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true=labels, y_pred=predictions, average="macro", zero_division=0
    )

    def _as_float(value):
        # np.float32(...) accepts NumPy scalars and built-in floats alike, so
        # this does not depend on which of the two scikit-learn returns
        # (calling .astype on a built-in float would raise AttributeError).
        return np.float32(value).item()

    return {
        "accuracy": (predictions == labels).astype(np.float32).mean().item(),
        "f1": _as_float(f1),
        "precision": _as_float(precision),
        "recall": _as_float(recall),
        }

**Default (baseline) configuration**

- `hidden_dropout_prob = 0.1`
- `num_hidden_layers = 6`
- `learning_rate = 2e-5`
- `batch_size = 16`



In [18]:
from transformers import RobertaConfig

In [19]:
# Start from the 'roberta-base' configuration and request a 50-way classification
# head via num_labels=50.
# NOTE(review): 50 presumably equals the number of distinct 'fine_label' classes
# in the dataset — confirm.
model_config = RobertaConfig.from_pretrained('roberta-base', num_labels=50)

In [20]:
# Per-run overrides applied on top of the loaded configuration.
config_overrides = {"num_hidden_layers": 6, "hidden_dropout_prob": 0.1}
for attr_name, attr_value in config_overrides.items():
    setattr(model_config, attr_name, attr_value)

In [None]:
# Build the sequence classifier from the 'roberta-base' checkpoint, shaped by
# model_config (50 labels, 6 hidden layers, dropout 0.1).
# NOTE(review): with num_hidden_layers reduced to 6, presumably only part of the
# checkpoint's encoder weights is used — confirm from the loading messages.
model = RobertaForSequenceClassification.from_pretrained('roberta-base', config=model_config)

In [22]:
# Training configuration for this run. The run name spells out the settings
# used (hl6 = 6 hidden layers, lr2e-5, bs16 = batch size 16, dr0.1 = dropout 0.1).
train_args = TrainingArguments(
    # -- bookkeeping --
    output_dir="./outputs",
    run_name="eric_hl6_lr2e-5_bs16_dr0.1",
    # report_to="wandb"
    # -- reproducibility --
    seed=417,
    data_seed=417,
    # -- optimisation --
    optim="adamw_hf",
    learning_rate=2e-5,  # hyperparameter under study (also in run_name)
    weight_decay=0,
    num_train_epochs=5.0,
    per_device_train_batch_size=16,  # training batch size (also in run_name)
    per_device_eval_batch_size=16,  # evaluation batch size
    # -- evaluate, checkpoint and log once per epoch --
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy='epoch',
    # logging_steps=600,
)

In [23]:
# Build the Trainer: fit on the tokenized train split, evaluate on the
# tokenized test split, and score with compute_metrics.
trainer = Trainer(
    args=train_args,
    model=model,
    train_dataset=train_data_f,
    eval_dataset=test_data_f,
    compute_metrics=compute_metrics,
)

# Show which device type is available. The string is only displayed by this
# cell; it is not passed to the Trainer here.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cuda'

In [None]:
# train the model
# Uses the settings in train_args: 5 epochs, with evaluation, checkpointing and
# logging once per epoch.
trainer.train()

In [None]:
# Evaluate on the Trainer's eval_dataset (test_data_f) and display the result.
trainer.evaluate()

In [None]:
# Display the entries the Trainer logged so far (logging_strategy='epoch').
trainer.state.log_history

In [None]:
# Close the current Weights & Biases run.
# NOTE(review): report_to is commented out in train_args, so whether a run was
# started depends on the Trainer's default reporting behaviour — confirm.
wandb.finish()

In [None]:
# evaluation
# NOTE(review): disabled scratch code. If re-enabled, the compute_metrics call
# below passes a plain (preds, labels) tuple, but compute_metrics reads
# `.predictions` and `.label_ids` from its argument, so it needs an object
# exposing those attributes instead.
# preds, labels, metrics = trainer.predict(test_data_f)
# print(preds.shape, labels.shape)

# print(compute_metrics((preds, labels)))

# %rm -rf wandb