In [1]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding

import datasets
import evaluate
import numpy as np
import os 

In [5]:
# Read the preprocessed dataset back from disk; the location is relative to the
# directory the notebook kernel runs in.
preprocessed_path = '../../data/preprocessed'
data = datasets.load_from_disk(preprocessed_path)
data  # last expression: show the splits, their features and row counts

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 340675
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 272541
    })
})

In [6]:
# One checkpoint name shared by the model and the tokenizer so they cannot drift apart.
MODEL_CHECKPOINT = 'prajjwal1/bert-tiny'

# Sequence classifier with a freshly initialised 38-way head.
# compute_metrics scores each label independently through a sigmoid, so the
# multi-label objective is declared explicitly instead of being left for
# transformers to infer from the label dtype at the first forward pass.
# NOTE(review): assumes `labels` are float multi-hot vectors of length 38 — confirm
# against the preprocessing step.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_CHECKPOINT,
    num_labels=38,
    problem_type="multi_label_classification",
)

# `model_max_length` is the tokenizer attribute that bounds sequence length;
# `max_length` is a per-call encoding argument and does not set that bound when
# handed to `from_pretrained`.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_CHECKPOINT,
    use_fast=True,
    model_max_length=512,
)

# Pads each batch dynamically to its longest sequence at collation time.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
def tokenize_batch(batch):
    """Tokenize the ``text`` column of a batch and carry its ``labels`` through.

    Args:
        batch: mapping with a ``'text'`` entry (passed to the tokenizer) and a
            ``'labels'`` entry (copied unchanged into the result).

    Returns:
        The tokenizer output with an added ``'labels'`` entry.
    """
    # No padding here: sequences are only truncated to 512 tokens, padding is
    # left to the data collator.
    encoded = tokenizer(
        batch['text'],
        padding=False,
        truncation=True,
        max_length=512,
    )
    encoded['labels'] = batch['labels']
    return encoded

# Tokenize every split in batches, spread over 16 worker processes.
tokenized_dataset = data.map(
    tokenize_batch,
    batched=True,
    num_proc=16,
)

Map (num_proc=16):   0%|          | 0/340675 [00:00<?, ? examples/s]

Map (num_proc=16):   0%|          | 0/272541 [00:00<?, ? examples/s]

In [9]:
# Bundle the four classification metrics so a single compute() call reports all of them.
clf_metrics = evaluate.combine(
    [
        "accuracy",
        "f1",
        "precision",
        "recall",
    ]
)

def compute_metrics(eval_pred):
    """Score predictions after flattening every (example, label) decision.

    Args:
        eval_pred: ``(predictions, labels)`` pair; ``predictions`` holds the raw
            model logits, or a tuple whose first element is the logits.

    Returns:
        The result of ``clf_metrics.compute`` on the flattened 0/1 predictions
        and the flattened integer references.
    """
    logits, labels = eval_pred
    # Some models hand back extra outputs next to the logits; keep the logits only.
    if isinstance(logits, tuple):
        logits = logits[0]
    logits = np.asarray(logits)

    # sigmoid(x) > 0.5 holds exactly when x > 0, so threshold the logits directly.
    # This avoids np.exp overflowing (and its RuntimeWarning) on large negative logits.
    predictions = (logits > 0).astype(int).reshape(-1)
    references = np.asarray(labels).astype(int).reshape(-1)

    # Flattening means the metrics are computed over all label slots pooled
    # together, not per label.
    return clf_metrics.compute(predictions=predictions, references=references)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.79k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.56k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.38k [00:00<?, ?B/s]

In [10]:
# Switch off the Weights & Biases integration through its environment flag.
# NOTE: transformers marks this variable as deprecated in favour of `report_to`.
os.environ.update({"WANDB_DISABLED": "true"})

In [11]:
# Hyper-parameters for the fine-tuning run; checkpoints and logs go to ./logs.
training_args = TrainingArguments(
    output_dir="logs",
    learning_rate=2e-5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    num_train_epochs=2,
    weight_decay=0.01,
    # Evaluate and checkpoint on the same schedule so the best checkpoint can be
    # restored once training finishes.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # The string "none" is the documented way to disable every logging
    # integration; Python `None` falls back to the library default instead.
    report_to="none",
)

# Wire the model, data, collator and metric function into a Trainer.
# NOTE(review): the "test" split is used as the evaluation set during training, so
# any checkpoint selection based on evaluation results is made on test data —
# consider holding out a separate validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    # `processing_class` replaces the deprecated `tokenizer` argument of Trainer.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


In [12]:
# Run fine-tuning and keep the returned summary so it is shown as the cell output.
train_output = trainer.train()
train_output

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.1008,0.099482,0.970459,0.0,0.0,0.0
2,0.0966,0.096863,0.970459,0.0,0.0,0.0


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


TrainOutput(global_step=10648, training_loss=0.12584767898700006, metrics={'train_runtime': 2391.5433, 'train_samples_per_second': 284.9, 'train_steps_per_second': 4.452, 'total_flos': 818097840137748.0, 'train_loss': 0.12584767898700006, 'epoch': 2.0})