In [1]:
import torch
from datasets import load_dataset
# Sanity-check the GPU before doing any work: print whether CUDA is usable
# and, only if it is, the device name and the current device index.
print(torch.cuda.is_available())
if torch.cuda.is_available():
    print(torch.cuda.get_device_name())
    print(torch.cuda.current_device())
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import BertTokenizerFast, RobertaTokenizerFast, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer, pipeline
import evaluate
# Base directory passed to load_data() to locate the Arrow dataset splits.
# NOTE(review): the trailing '/' combines with the leading '/' that load_data()
# prepends, giving '//' in the resulting paths — presumably harmless on POSIX; confirm.
path = '/app/Data/'
# Backbone selection. The BERT checkpoint/tokenizer pair is kept commented out
# as an alternative; the active pair below uses RoBERTa-base.
#model_path = 'google-bert/bert-base-uncased'
# tokenizer = BertTokenizerFast.from_pretrained(model_path)
model_path = 'FacebookAI/roberta-base'
tokenizer = RobertaTokenizerFast.from_pretrained(model_path)

  from .autonotebook import tqdm as notebook_tqdm


True
NVIDIA GeForce RTX 3080
0


2024-04-07 13:54:55.722781: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-07 13:54:55.841033: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Base name (without extension) shared by every Arrow shard on disk.
arrow_file_name = "data-00000-of-00001"
def load_data(path, tokenized=False):
    """Load the train/test/validation Arrow shards stored under ``path``.

    Args:
        path: Base directory of the dataset, as a string. Each shard location
            is built by appending ``/<split>_ds/<arrow_file_name>.arrow``.
        tokenized: When True, read the shards from the ``/tokenized``
            sub-directory of ``path`` instead of ``path`` itself.

    Returns:
        The object returned by ``load_dataset('arrow', data_files=...)`` for
        the three splits.
    """
    # NOTE(review): the validation split is keyed 'val' for tokenized data but
    # 'validation' for raw data; callers must use the matching key.
    if tokenized:
        subdir, validation_key = '/tokenized', 'val'
    else:
        subdir, validation_key = '', 'validation'

    # Split key -> directory that holds that split's shard (order preserved).
    shard_dirs = {'train': 'train_ds', 'test': 'test_ds', validation_key: 'val_ds'}
    data_files = {
        split: path + f'{subdir}/{folder}/{arrow_file_name}.arrow'
        for split, folder in shard_dirs.items()
    }
    return load_dataset('arrow', data_files=data_files)

### Load Data and Initialize Evaluation Metrics, Data Collator, and Model

In [3]:
# Load the pre-tokenized train/test/val splits from the tokenized sub-directory of `path`.
tokenized_ds = load_data(path,tokenized=True)

Generating train split: 15424 examples [00:00, 364574.10 examples/s]
Generating test split: 4035 examples [00:00, 233644.19 examples/s]
Generating val split: 8190 examples [00:00, 183637.15 examples/s]


In [4]:
# Display the training split to check its feature columns and row count.
tokenized_ds['train']

Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 15424
})

In [5]:
# Batch collator built around the tokenizer; handed to the Trainer below.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# Accuracy metric from the `evaluate` library; used inside compute_metrics().
accuracy = evaluate.load('accuracy')

def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: A two-item pair ``(scores, references)``. ``scores`` is
            reduced with ``argmax`` over axis 1 to obtain the predicted class
            index for each row.

    Returns:
        The result of ``accuracy.compute`` for the predicted class indices
        against ``references``.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

# Class index -> label name, passed to the model below.
id2label = {0: "ham", 1: "spam"}

# Inverse lookup, derived from id2label so the two mappings cannot drift apart.
label2id = {label: idx for idx, label in id2label.items()}

# Load the pretrained checkpoint with a two-class sequence-classification head
# and attach the ham/spam label mappings. The classifier weights are newly
# initialized (see the warning printed on load), so the model must be
# fine-tuned before its predictions are meaningful.
model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Model Training

In [6]:
# Fine-tuning hyperparameters. Logging, evaluation and checkpointing all run
# every 100 steps, and load_best_model_at_end=True asks the Trainer to restore
# the best checkpoint according to eval_loss once training finishes.
to_torch_compile = False # Set to True only when running on Linux
training_args = TrainingArguments(
    output_dir="/app/model_checkpoints",
    overwrite_output_dir=True,
    learning_rate=2e-5,
    optim="adamw_bnb_8bit", # 8-bit AdamW variant; chosen to use less optimizer memory than regular AdamW
    gradient_accumulation_steps=2, # Effective train batch per device = 8 * 2 = 16, for smoother gradient descent
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=4,
    weight_decay=0.01,
    evaluation_strategy="steps",
    push_to_hub=False,
    metric_for_best_model="eval_loss",
    logging_steps=100,
    save_steps=100, # same interval as eval_steps, so each evaluated step also has a checkpoint
    eval_steps=100,
    save_total_limit=5,
    load_best_model_at_end=True,
    torch_compile = to_torch_compile
)

# Fine-tuning driver: trains on the 'train' split, evaluates on the 'val'
# split with compute_metrics(), and batches examples with data_collator.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["val"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# NOTE: a `dataloader_config = DataLoaderConfiguration(...)` statement used to
# follow here. `DataLoaderConfiguration` is never imported in this notebook, so
# the statement raised NameError on a fresh kernel, and its result was never
# passed to the Trainer. It has been removed so the notebook survives
# Restart & Run All; training behaviour is unchanged.


In [7]:
# Run fine-tuning; progress is reported every 100 steps (see training_args).
trainer.train()

Step,Training Loss,Validation Loss,Accuracy
100,0.3654,0.159358,0.939072
200,0.1768,0.169605,0.956532
300,0.1798,0.124579,0.958852
400,0.1598,0.142904,0.957631
500,0.1638,0.267306,0.933822
600,0.1373,0.286411,0.933455
700,0.1351,0.16978,0.960806
800,0.1417,0.093186,0.970696
900,0.1099,0.086527,0.973626
1000,0.1136,0.111669,0.970818


TrainOutput(global_step=3856, training_loss=0.07063372224893931, metrics={'train_runtime': 4047.9589, 'train_samples_per_second': 15.241, 'train_steps_per_second': 0.953, 'total_flos': 1.372804953558144e+16, 'train_loss': 0.07063372224893931, 'epoch': 4.0})

In [8]:
# Evaluate the trained model on the Trainer's eval_dataset (the 'val' split).
eval_results = trainer.evaluate()

In [9]:
# Display the validation metrics returned by trainer.evaluate().
eval_results

{'eval_loss': 0.08284582942724228,
 'eval_accuracy': 0.9832722832722832,
 'eval_runtime': 67.9106,
 'eval_samples_per_second': 120.6,
 'eval_steps_per_second': 15.079,
 'epoch': 4.0}

In [10]:
# Persist the fine-tuned model; the evaluation section reloads it from this directory.
trainer.save_model("/app/models/roberta_aug")

## Model Evaluation

In [17]:
# Reload the tokenized splits and take the held-out test split for evaluation.
tokenized_ds = load_data(path,tokenized=True)
tokenized_test_ds = tokenized_ds['test']
# Build an inference pipeline from the saved fine-tuned model.
# torch.cuda.current_device() raises when no CUDA device is present, so it is
# only called when CUDA is available; otherwise fall back to the CPU (-1).
classifier = pipeline(
    'text-classification',
    model='/app/models/roberta_aug',
    device=torch.cuda.current_device() if torch.cuda.is_available() else -1,
)
def decode_tokens(example):
    """Decode one example's ``text`` field back into a plain string.

    Args:
        example: A dataset row; only ``example['text']`` is read.
            NOTE(review): this assumes the 'text' column holds token ids
            accepted by ``tokenizer.decode`` — confirm against the
            data-preparation step.

    Returns:
        A dict ``{'text': <decoded string>}`` with special tokens skipped.
    """
    decoded = tokenizer.decode(example['text'], skip_special_tokens=True)
    return {'text': decoded}

# Replace each test example's 'text' field with its decoded string so the
# text-classification pipeline can consume it.
tokenized_test_ds = tokenized_test_ds.map(decode_tokens)

In [18]:
# Score the fine-tuned pipeline on the decoded test split with the `evaluate`
# text-classification evaluator, combining accuracy, recall, precision and F1.
from evaluate import evaluator
task_evaluator = evaluator('text-classification')
results = task_evaluator.compute(
    model_or_pipeline=classifier,
    data=tokenized_test_ds,
    metric=evaluate.combine(['accuracy','recall','precision','f1']),
    label_mapping=label2id,  # presumably maps the pipeline's "ham"/"spam" labels to the dataset's integer labels — confirm
)

In [19]:
# Display the test-set metrics and timing statistics from the evaluator.
results

{'accuracy': 0.9871127633209418,
 'recall': 0.97,
 'precision': 0.9778225806451613,
 'f1': 0.9738955823293173,
 'total_time_in_seconds': 42.155235250997066,
 'samples_per_second': 95.71764873271732,
 'latency_in_seconds': 0.010447394114249582}