In [1]:
# --- Imports (kept in their original relative order, gathered at the top of the cell) ---
import torch
from datasets import load_dataset
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import (
    BertTokenizerFast,
    DataCollatorWithPadding,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    pipeline,
)
import evaluate

# --- GPU sanity check: print availability, then device name and index when CUDA is present ---
cuda_ready = torch.cuda.is_available()
print(cuda_ready)
if cuda_ready:
    print(torch.cuda.get_device_name())
    print(torch.cuda.current_device())

# --- Configuration ---
path = '/app/Data/'  # root directory passed to load_data
model_path = 'google-bert/bert-base-uncased'  # checkpoint id used for both tokenizer and model

# Tokenizer loaded from the same checkpoint id as the model.
tokenizer = BertTokenizerFast.from_pretrained(model_path)

  from .autonotebook import tqdm as notebook_tqdm


True
NVIDIA GeForce RTX 3080
0


2024-04-01 15:52:48.512143: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-01 15:52:48.570782: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Base name (without extension) shared by every split's Arrow shard.
arrow_file_name = "data-00000-of-00001"


def load_data(path, tokenized=False):
    """Load the train/test/validation Arrow shards with ``load_dataset``.

    Args:
        path: Root data directory as a string. Trailing ``/`` characters are
            stripped so the generated file paths never contain ``//``.
        tokenized: When true, read the shards under ``<path>/tokenized/``
            instead of the ones directly under ``<path>``.

    Returns:
        The result of ``load_dataset('arrow', data_files=...)`` for the three
        shard files, keyed ``'train'``, ``'test'`` and a validation key.

    NOTE(review): the validation split is keyed ``'val'`` for tokenized data
    but ``'validation'`` for raw data. Both keys are preserved as they were so
    existing lookups (e.g. ``tokenized_ds["val"]``) keep working; consider
    unifying them once all callers are known.
    """
    root = path.rstrip('/')
    if tokenized:
        root = f'{root}/tokenized'
        val_key = 'val'
    else:
        val_key = 'validation'

    def shard(split_dir):
        # Every split lives at <root>/<split_dir>/<arrow_file_name>.arrow
        return f'{root}/{split_dir}/{arrow_file_name}.arrow'

    return load_dataset('arrow', data_files={
        'train': shard('train_ds'),
        'test': shard('test_ds'),
        val_key: shard('val_ds'),
    })

### Load tokenized data and initialize the evaluation metric, data collator, and model

In [4]:
# Load the pre-tokenized splits (the tokenized branch of load_data keys them 'train', 'test', 'val').
tokenized_ds = load_data(path,tokenized=True)

In [4]:
# Display the training split's summary (feature names and row count).
tokenized_ds['train']

Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 12666
})

In [5]:
# Batch collator built from the tokenizer; handed to the Trainer as `data_collator`.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# Accuracy metric from the `evaluate` library; compute_metrics below calls its `.compute`.
accuracy = evaluate.load('accuracy')

def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level ``accuracy`` metric.

    ``eval_pred`` is unpacked into ``(scores, labels)``. The predicted class
    for each row is the index of the largest score along axis 1, and the
    return value is whatever ``accuracy.compute`` produces for those
    predictions against ``labels``.
    """
    scores, gold = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(references=gold, predictions=predicted_ids)

# Class index -> human-readable label.
id2label = dict(enumerate(("ham", "spam")))

# Inverse mapping (label -> class index), derived so the two can never drift apart.
label2id = {label: idx for idx, label in id2label.items()}

# Sequence-classification model loaded from the base checkpoint; the label
# maps are stored on the model config and the head size follows the label set.
model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    label2id=label2id,
    id2label=id2label,
    num_labels=len(id2label),
)

# Forwarded to TrainingArguments(torch_compile=...). Leave False unless running on Linux.
to_torch_compile = False

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Training configuration. Checkpoints are written and evaluated every 200
# optimizer steps; with load_best_model_at_end the checkpoint with the lowest
# eval_loss is restored once training finishes.
training_args = TrainingArguments(
    output_dir="model_checkpoints",
    overwrite_output_dir=True,
    learning_rate=2e-5,
    optim="adamw_bnb_8bit",
    # 8 samples per device x 3 accumulation steps = 24 samples per optimizer step.
    gradient_accumulation_steps=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=6,
    weight_decay=0.01,
    evaluation_strategy="steps",
    # Spelled out (it is the library default) because load_best_model_at_end
    # needs the save schedule to line up with the evaluation schedule.
    save_strategy="steps",
    push_to_hub=False,
    metric_for_best_model="eval_loss",
    logging_steps=200,
    save_steps=200,
    eval_steps=200,
    save_total_limit=5,
    load_best_model_at_end=True,
    torch_compile=to_torch_compile,
)

# Trainer wired to the tokenized train/val splits; compute_metrics adds accuracy
# to every evaluation.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["val"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# NOTE: a trailing `dataloader_config = DataLoaderConfiguration(...)` statement
# was removed from this cell. `DataLoaderConfiguration` is never imported in the
# notebook, so the line raised NameError on a fresh kernel, and its result was
# never passed to the Trainer. To customise these options, use the
# `accelerator_config` argument of TrainingArguments instead.


In [8]:
# Run fine-tuning with the arguments configured on the trainer.
trainer.train()

Step,Training Loss,Validation Loss,Accuracy
200,0.2428,0.117756,0.955368
400,0.1309,0.143002,0.967485
600,0.1371,0.103542,0.967791
800,0.1145,0.093857,0.972393
1000,0.066,0.10006,0.976687
1200,0.0498,0.101369,0.977607
1400,0.0561,0.088921,0.978988
1600,0.0602,0.099374,0.977147
1800,0.0237,0.083821,0.982669
2000,0.0339,0.074332,0.982055


TrainOutput(global_step=3168, training_loss=0.0618113267301309, metrics={'train_runtime': 1656.8514, 'train_samples_per_second': 30.578, 'train_steps_per_second': 1.912, 'total_flos': 1.208580899467008e+16, 'train_loss': 0.0618113267301309, 'epoch': 4.0})

In [9]:
# No dataset is passed, so this scores the trainer's configured eval_dataset (the 'val' split).
eval_results = trainer.evaluate()

In [10]:
# Display the validation metrics dictionary.
eval_results

{'eval_loss': 0.07433223724365234,
 'eval_accuracy': 0.9820552147239264,
 'eval_runtime': 39.2567,
 'eval_samples_per_second': 166.086,
 'eval_steps_per_second': 20.761,
 'epoch': 4.0}

In [12]:
# Persist the trainer's current model; the inference pipeline later loads it from this same directory.
trainer.save_model("/app/models/bert_model_new")

In [14]:
# Reload the tokenized splits and pick out the held-out test split.
tokenized_ds = load_data(path,tokenized=True)
tokenized_test_ds = tokenized_ds['test']

# Use the active GPU when CUDA is present; otherwise -1 selects the CPU.
# Calling torch.cuda.current_device() unconditionally raises on a CUDA-less machine.
inference_device = torch.cuda.current_device() if torch.cuda.is_available() else -1

# Text-classification pipeline backed by the fine-tuned model saved earlier.
classifier = pipeline('text-classification',model='/app/models/bert_model_new', device=inference_device)
def decode_tokens(example):
    """Return a ``{'text': ...}`` update holding the decoded form of ``example['text']``.

    The value stored under ``text`` is passed to the module-level
    ``tokenizer.decode`` with special tokens skipped. Only the ``text`` key is
    returned; the label is left untouched.

    NOTE(review): this presumes ``text`` holds token ids in the tokenized
    shards - confirm against the code that wrote them.
    """
    decoded = tokenizer.decode(example['text'], skip_special_tokens=True)
    return {'text': decoded}

# Apply decode_tokens to every test row, replacing its `text` value with the decoded string.
tokenized_test_ds = tokenized_test_ds.map(decode_tokens)

Map: 100%|██████████| 8150/8150 [00:14<00:00, 579.83 examples/s]


In [15]:
from evaluate import evaluator

# Evaluator for the text-classification task; it drives the pipeline over the test split.
task_evaluator = evaluator('text-classification')

# Bundle the four reported metrics into one combined metric object.
test_metrics = evaluate.combine(['accuracy','recall','precision','f1'])

results = task_evaluator.compute(
    data=tokenized_test_ds,
    model_or_pipeline=classifier,
    metric=test_metrics,
    # Maps the pipeline's "ham"/"spam" label strings to the dataset's integer ids.
    label_mapping=label2id,
)

Downloading builder script: 100%|██████████| 7.36k/7.36k [00:00<00:00, 13.4MB/s]
Downloading builder script: 100%|██████████| 7.55k/7.55k [00:00<00:00, 17.5MB/s]
Downloading builder script: 100%|██████████| 6.77k/6.77k [00:00<00:00, 24.5MB/s]


In [17]:
# Display the test-set metrics returned by the evaluator.
results

{'accuracy': 0.9808588957055214,
 'recall': 0.9864864864864865,
 'precision': 0.9385714285714286,
 'f1': 0.9619326500732065,
 'total_time_in_seconds': 91.088021577998,
 'samples_per_second': 89.47389413899187,
 'latency_in_seconds': 0.011176444365398528}