In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
!pip install --q peft evaluate

In [2]:
from datasets import load_dataset, DatasetDict, Dataset

from transformers import (
    AutoTokenizer, AutoConfig, AutoModelForSequenceClassification,
    DataCollatorWithPadding, TrainingArguments, Trainer
)

from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np

2024-07-24 10:29:12.385591: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-24 10:29:12.385712: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-24 10:29:12.503975: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


**<h1>Base model**

In [3]:
model_checkpoint = 'distilbert-base-uncased'

# define label maps
id2label = {0: "sadness", 1: "joy", 2: "love", 3: "anger", 4: "fear", 5: "surprise"}
label2id = {"sadness": 0, "joy": 1, "love": 2, "anger": 3, "fear": 4, "surprise": 5}

# generate classification model from model_checkpoint
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=6, id2label=id2label, label2id=label2id)

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


**<h1>Load data**

In [4]:
dataset = load_dataset('SetFit/emotion')

Downloading readme:   0%|          | 0.00/194 [00:00<?, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.


Downloading data:   0%|          | 0.00/2.23M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/276k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/279k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/16000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [5]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'label_text'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label', 'label_text'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label', 'label_text'],
        num_rows: 2000
    })
})

**<h1>Preprocess data**

In [6]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [7]:
def tokenize_function(examples):
    text = examples['text']
    
    tokenized_inputs = tokenizer(text, truncation=True, return_tensors='np', max_length=512)
    
    return tokenized_inputs

In [8]:
# add pad token if none exists
if tokenizer.pad_token is None:
  tokenizer.add_special_tokens({'pad_token': '[PAD]'})
  model.resize_token_embeddings(len(tokenizer))

In [9]:
# tokenize training and validation datasets
tokenized_dataset = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/16000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [10]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'label_text', 'input_ids', 'attention_mask'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label', 'label_text', 'input_ids', 'attention_mask'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label', 'label_text', 'input_ids', 'attention_mask'],
        num_rows: 2000
    })
})

In [11]:
# create data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

**<h1>Evaluation metrics**

In [12]:
# import accuracy evaluation metric
accuracy = evaluate.load('accuracy')

# define an evaluation function to pass into trainer later
def compute_metrics(p):
  predictions, labels = p
  predictions = np.argmax(predictions, axis=1)

  return {"accuracy": accuracy.compute(predictions=predictions, references=labels)}

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

**<h2>Untrained model performance**

In [13]:
text_list = ["i am sad.", "today, i have a boy friend, i'm happy", "this is the most yummy dishes",
            "wowwww", "exciting film!"]

print("Utrained model predictions: ")
for text in text_list:
    inputs = tokenizer.encode(text, return_tensors='pt')
    logits = model(inputs).logits
    predictions = torch.argmax(logits, dim=1).item()
    
    print(text + " - " + id2label[predictions])

Utrained model predictions: 
i am sad. - fear
today, i have a boy friend, i'm happy - fear
this is the most yummy dishes - fear
wowwww - sadness
exciting film! - fear


**<h1>Fine-tuning with LoRA**

In [14]:
peft_config = LoraConfig(task_type="SEQ_CLS", # sequence classification
                         r=4, # intrinsic rank of trainable weight matrix
                         lora_alpha=32, # this is like a learning rate
                         lora_dropout=0.01, # probablity of dropout
                         target_modules=['q_lin']) # we apply lora to query layer

In [15]:
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 632,070 || all params: 67,590,156 || trainable%: 0.9352


In [16]:
# hyperparameters
lr = 1e-3
batch_size = 4
num_epochs = 10

# define training arguments
training_args = TrainingArguments(
    output_dir=model_checkpoint + '-lora-text-classification',
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
)



In [17]:
# creater trainer object
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6666,0.429088,{'accuracy': 0.878}
2,0.5672,0.561128,{'accuracy': 0.8925}
3,0.536,0.513265,{'accuracy': 0.9035}
4,0.4804,0.435311,{'accuracy': 0.9085}
5,0.496,0.386505,{'accuracy': 0.902}
6,0.4577,0.407456,{'accuracy': 0.91}
7,0.4421,0.360833,{'accuracy': 0.912}
8,0.3207,0.327478,{'accuracy': 0.916}
9,0.2966,0.30246,{'accuracy': 0.919}
10,0.2447,0.309313,{'accuracy': 0.919}


Trainer is attempting to log a value of "{'accuracy': 0.878}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.8925}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.9035}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.9085}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.902}" of type <class 'dict'> for key "eval/accuracy" as a scalar. Thi

TrainOutput(global_step=40000, training_loss=0.45008111057281497, metrics={'train_runtime': 875.7671, 'train_samples_per_second': 182.697, 'train_steps_per_second': 45.674, 'total_flos': 1471518910155168.0, 'train_loss': 0.45008111057281497, 'epoch': 10.0})

In [18]:
test_dataset = tokenized_dataset['test']

# Đánh giá mô hình trên tập dữ liệu kiểm tra
test_results = trainer.evaluate(eval_dataset=test_dataset)

# Hiển thị kết quả đánh giá
print("Test results:")
for key, value in test_results.items():
    print(f"{key}: {value}")


Trainer is attempting to log a value of "{'accuracy': 0.916}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


Test results:
eval_loss: 0.2890542149543762
eval_accuracy: {'accuracy': 0.916}
eval_runtime: 3.9139
eval_samples_per_second: 511.003
eval_steps_per_second: 127.751
epoch: 10.0


In [19]:
model.to('cpu')

print("Trained model predictions: ")
for text in text_list:
  # tokenize text
  inputs = tokenizer.encode(text, return_tensors='pt').to('cpu')
  # compute logits
  logits = model(inputs).logits
  # convert logits to label
  predictions = torch.argmax(logits, dim=1).item()

  print(text + " - " + id2label[predictions])

Trained model predictions: 
i am sad. - sadness
today, i have a boy friend, i'm happy - joy
this is the most yummy dishes - joy
wowwww - joy
exciting film! - joy


In [20]:
model.save_pretrained('lora_text_classification_model')
# tokenizer.save_pretrained('tokenizer')

In [28]:
# reload and merge
base_model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=6, 
    id2label=id2label, 
    label2id=label2id,
)
model = PeftModel.from_pretrained(base_model, '/kaggle/working/lora_text_classification_model')
model = model.merge_and_unload()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [31]:
# reload tokenize to save it
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [32]:
model.save_pretrained("models/finetune_model.pt")
tokenizer.save_pretrained("models/tokenizer/")

('models/tokenizer/tokenizer_config.json',
 'models/tokenizer/special_tokens_map.json',
 'models/tokenizer/vocab.txt',
 'models/tokenizer/added_tokens.json',
 'models/tokenizer/tokenizer.json')