In [1]:
# !pip install git+https://github.com/IndoNLP/nusa-crowd.git@release_exp

In [2]:
import os, sys
# Configure the process environment in one place, ahead of the framework
# imports in the following cell.
os.environ.update({
    'TOKENIZERS_PARALLELISM': 'false',  # switch off tokenizer-level parallelism
    'CUDA_VISIBLE_DEVICES': '3,4',      # expose only GPU 3 and GPU 4 to this process
})

In [3]:
from nusacrowd import NusantaraConfigHelper
from transformers import (
    AutoConfig, AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
)
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

In [4]:
# Helper object used below to list and filter the available NusaCrowd dataset configs.
conhelps = NusantaraConfigHelper()

In [5]:
# Show how many datasets are available and preview the first 20 names.
len(conhelps.available_dataset_names), conhelps.available_dataset_names[:20]

(113,
 ['barasa',
  'bible_en_id',
  'bible_jv_id',
  'bible_su_id',
  'casa',
  'cc100',
  'cod',
  'code_mixed_jv_id',
  'covost2',
  'cvss',
  'emot',
  'emotcmt',
  'emotion_id_opinion',
  'facqa',
  'hoasa',
  'id_abusive',
  'id_abusive_news_comment',
  'id_clickbait',
  'id_frog_story',
  'id_google_play_review'])

# Loading SMSA Dataset

In [6]:
# Keep only the SMSA configs whose name contains 'nusantara_text', then load
# the first match.
smsa_configs = conhelps.filtered(
    lambda cfg: cfg.dataset_name == 'smsa' and 'nusantara_text' in cfg.config.name
)
smsa_dset = smsa_configs[0].load_dataset()
smsa_dset



  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'label'],
        num_rows: 11000
    })
    validation: Dataset({
        features: ['id', 'text', 'label'],
        num_rows: 1260
    })
    test: Dataset({
        features: ['id', 'text', 'label'],
        num_rows: 500
    })
})

## Load Model

In [7]:
# One checkpoint name shared by the tokenizer, the config and the model weights
model_name = 'indobenchmark/indobert-base-p1'

# Tokenizer and config come from the same checkpoint; the config's label count
# is taken from the number of classes in the dataset's label feature
tokenizer = AutoTokenizer.from_pretrained(model_name)
config = AutoConfig.from_pretrained(model_name)
config.num_labels = smsa_dset['train'].features['label'].num_classes

# Build the sequence-classification model from the pretrained checkpoint
model = AutoModelForSequenceClassification.from_pretrained(model_name, config=config)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Preprocess Dataset

In [8]:
def preprocess_data(row, max_length=512):
    """Tokenize one dataset row for sequence classification.

    Args:
        row: A single example (mapping) holding the raw sentence under ``'text'``.
        max_length: Maximum number of tokens kept per example. Without an
            explicit value the tokenizer reports that it has no predefined
            maximum and silently skips truncation. The default of 512 is the
            usual BERT position limit — confirm against
            ``config.max_position_embeddings`` when changing checkpoints.

    Returns:
        The same ``row`` with ``'input_ids'`` and ``'attention_mask'`` added as
        plain lists of ints.
    """
    # No padding and no tensor conversion here: a single row has nothing to be
    # padded against, and batches are padded later by the data collator passed
    # to the Trainer.
    encoded_input = tokenizer(row['text'], truncation=True, max_length=max_length)
    row['input_ids'] = encoded_input['input_ids']
    row['attention_mask'] = encoded_input['attention_mask']
    return row
# Tokenize every split row by row and drop the raw 'id' and 'text' columns.
# NOTE: this rebinds `smsa_dset`, so the cell cannot be re-run on its own —
# after the first run the 'text' column no longer exists; reload the dataset first.
smsa_dset = smsa_dset.map(preprocess_data, remove_columns=['id', 'text'])
smsa_dset

  0%|          | 0/11000 [00:00<?, ?ex/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


  0%|          | 0/1260 [00:00<?, ?ex/s]

  0%|          | 0/500 [00:00<?, ?ex/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 11000
    })
    validation: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 1260
    })
    test: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 500
    })
})

# Setup HF Trainer
See the [`TrainingArguments` documentation](https://huggingface.co/docs/transformers/v4.23.1/en/main_classes/trainer#transformers.TrainingArguments) for the full list of options.

In [9]:
def classification_metrics_fn(list_hyp, list_label):
    """Score predicted labels against gold labels.

    Args:
        list_hyp: Predicted class ids.
        list_label: Gold class ids, aligned with ``list_hyp``.

    Returns:
        Dict with accuracy (``"ACC"``) plus macro-averaged F1 (``"F1"``),
        recall (``"REC"``) and precision (``"PRE"``).
    """
    return {
        "ACC": accuracy_score(list_label, list_hyp),
        "F1": f1_score(list_label, list_hyp, average='macro'),
        "REC": recall_score(list_label, list_hyp, average='macro'),
        "PRE": precision_score(list_label, list_hyp, average='macro'),
    }

def compute_metrics(pred):
    """Metric callback for the Trainer.

    Takes the highest-scoring class along the last axis of
    ``pred.predictions`` and scores it against ``pred.label_ids`` with
    ``classification_metrics_fn``.
    """
    predicted_classes = np.argmax(pred.predictions, axis=-1)
    return classification_metrics_fn(predicted_classes, pred.label_ids)

In [10]:
# Fine-tuning configuration for the Hugging Face Trainer
training_args = TrainingArguments(
    # Output location and logging backend
    output_dir='./save',
    report_to='tensorboard',
    # Seeds for reproducibility
    seed=42,
    data_seed=42,
    # Optimisation schedule
    learning_rate=1e-5,
    num_train_epochs=5,
    gradient_accumulation_steps=1,
    # Per-device batch sizes and data loading
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    dataloader_num_workers=32,
    # Evaluate and checkpoint once per epoch, then restore the best checkpoint
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    # Optional memory/speed switches, currently disabled:
    # fp16=True,
    # gradient_checkpointing=True,
    # sharded_ddp='simple',
)

# Wire the model, data splits, batch collation and metrics into the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=smsa_dset["train"],
    eval_dataset=smsa_dset["validation"],
    tokenizer=tokenizer,
    # Rows are stored unpadded, so padding happens per batch at collation time
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
    # callbacks=[EarlyStoppingCallback(early_stopping_patience=5)]
)

# Run Training

In [11]:
# Fine-tune for the configured number of epochs; the validation split is
# evaluated after every epoch (see the per-epoch table in the output).
trainer.train()

***** Running training *****
  Num examples = 11000
  Num Epochs = 5
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 430
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fa

Epoch,Training Loss,Validation Loss,Acc,F1,Rec,Pre
1,No log,0.200725,0.921429,0.890267,0.891315,0.890494
2,No log,0.177245,0.938095,0.910536,0.905285,0.917461
3,No log,0.178013,0.938095,0.910406,0.900318,0.923312
4,No log,0.177218,0.94127,0.916916,0.909711,0.925415
5,No log,0.181773,0.938095,0.911994,0.904893,0.921222


***** Running Evaluation *****
  Num examples = 1260
  Batch size = 128
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertToken

TrainOutput(global_step=430, training_loss=0.15302500170330668, metrics={'train_runtime': 308.246, 'train_samples_per_second': 178.429, 'train_steps_per_second': 1.395, 'total_flos': 2640318480017568.0, 'train_loss': 0.15302500170330668, 'epoch': 5.0})

# Run Eval on Validation & Test Set

In [12]:
# Score the validation split with the trained model.
# Note: the returned metric keys are prefixed with `test_` (see output) even
# though this is the validation split.
trainer.predict(smsa_dset['validation']).metrics

***** Running Prediction *****
  Num examples = 1260
  Batch size = 128
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertToken

{'test_loss': 0.1772184520959854,
 'test_ACC': 0.9412698412698413,
 'test_F1': 0.9169156959066732,
 'test_REC': 0.9097113783837761,
 'test_PRE': 0.9254151933997643,
 'test_runtime': 3.5434,
 'test_samples_per_second': 355.596,
 'test_steps_per_second': 2.822}

In [13]:
# Score the held-out test split with the trained model.
trainer.predict(smsa_dset['test']).metrics

***** Running Prediction *****
  Num examples = 500
  Batch size = 128
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'test_loss': 0.2722337245941162,
 'test_ACC': 0.916,
 'test_F1': 0.8872090658812577,
 'test_REC': 0.8671014443073267,
 'test_PRE': 0.928317901234568,
 'test_runtime': 2.2613,
 'test_samples_per_second': 221.109,
 'test_steps_per_second': 1.769}

**Note**: For more examples covering other tasks, see https://github.com/huggingface/transformers/tree/main/examples/pytorch