In [1]:
!pip install transformers
!pip install datasets

import os
import random
from sklearn.metrics import accuracy_score
import numpy as np
import torch
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler

import pandas as pd
from tqdm import tqdm

from tensorflow.keras.preprocessing.sequence import pad_sequences

from transformers import BertTokenizerFast
from transformers import BertForSequenceClassification

# for graphing
import seaborn as sns
import matplotlib.pyplot as plt



2023-02-16 10:11:20.498766: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [8]:
# Load the "ynat" configuration of the "klue" dataset via Hugging Face `datasets`.
from datasets import load_dataset

dataset = load_dataset("klue", "ynat")
# The notebook evaluates on the 'validation' split and refers to it as `test` from here on.
test = dataset['validation']

Found cached dataset klue (/Users/hwangtaegyeong/.cache/huggingface/datasets/klue/ynat/1.0.0/e0fc3bc3de3eb03be2c92d72fd04a60ecc71903f821619cb28ca0e1e29e4233e)


  0%|          | 0/2 [00:00<?, ?it/s]

In [9]:
# Argument settings
# Checkpoint directory used for both tokenizer and model.
# Alternatives tried previously: "./FP32", "bert-base-multilingual-cased".
model_checkpoint = "./4bit"
batch_size_per_device = 32  # examples per evaluation batch
max_length = 64             # width of the padded/truncated id sequences
# The remaining values are not referenced by the evaluation cells visible in this notebook.
n_epochs = 3
warmup_ratio = 0.2
lr = 5e-5

In [10]:
# Device setup: use CUDA when it is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

No GPU available, using the CPU instead.


In [11]:
# Load the tokenizer from the same checkpoint path that the model is later loaded from.
tokenizer = BertTokenizerFast.from_pretrained(model_checkpoint)

In [12]:
def get_input_ids(data):
  """Convert raw texts into a fixed-width matrix of BERT token ids.

  Every text is word-piece tokenized, clipped so that the sequence *including*
  the [CLS] and [SEP] markers fits into ``max_length``, mapped to vocabulary
  ids and right-padded with 0 up to ``max_length``.

  Clipping is done before the markers are attached: truncating the finished
  sequence instead would cut the closing [SEP] off every over-long text.

  Args:
    data: iterable of texts; each item is passed through ``str()``.

  Returns:
    The array produced by ``pad_sequences`` (dtype 'long'), one row of
    ``max_length`` ids per input text.
  """
  # Room left for real word pieces once [CLS] and [SEP] are accounted for.
  body_budget = max(max_length - 2, 0)

  tokenized_texts = []
  for text in tqdm(data, "Tokenizing"):
    pieces = tokenizer.tokenize(str(text))[:body_budget]
    tokenized_texts.append([tokenizer.cls_token] + pieces + [tokenizer.sep_token])

  input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tqdm(tokenized_texts, "Converting tokens to ids")]
  print("Padding sequences...")
  # Sequences are already short enough; pad_sequences now only pads on the right.
  input_ids = pad_sequences(input_ids, maxlen=max_length, dtype='long', truncating='post', padding='post')
  return input_ids

def get_attention_masks(input_ids):
  """Build one attention mask per id sequence.

  A position gets 1.0 when its token id is greater than 0 and 0.0 otherwise.

  Args:
    input_ids: iterable of id sequences (one row per example).

  Returns:
    List of lists of floats, parallel to ``input_ids``.
  """
  return [
      [1.0 if token_id > 0 else 0.0 for token_id in row]
      for row in tqdm(input_ids, "Generating attention masks")
  ]

In [13]:
# Turn the titles into a padded id tensor.
test_ids = torch.tensor(get_input_ids(test['title']))
# Masks are derived from the id tensor itself (1.0 for ids > 0, 0.0 otherwise).
# NOTE(review): iterating a torch tensor element by element is slow; passing the
# array returned by get_input_ids instead would presumably be faster — confirm.
test_masks = torch.tensor(get_attention_masks(test_ids))
# Labels taken from the 'label' column of the evaluation split.
test_labels = torch.tensor(test['label'])

Tokenizing: 100%|████████████████████████| 9107/9107 [00:00<00:00, 10293.37it/s]
Converting tokens to ids: 100%|██████████| 9107/9107 [00:00<00:00, 71396.18it/s]


Padding sequences...


Generating attention masks: 100%|█████████| 9107/9107 [00:02<00:00, 4223.18it/s]


In [14]:
# Bundle ids, masks and labels so every batch yields an (ids, mask, labels) triple.
test_data = TensorDataset(test_ids, test_masks, test_labels)
# Sequential sampling keeps the evaluation order fixed (dataset index order).
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size_per_device)

In [15]:
# Load the classifier from the checkpoint with a 7-way output head
# (presumably the number of YNAT topic classes — confirm against the label set).
model = BertForSequenceClassification.from_pretrained(model_checkpoint, num_labels=7)
# Move the model to the selected device; as the cell's last expression this also
# displays the module structure.
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [16]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

def predict(model, data_loader):
    """Run ``model`` over every batch of ``data_loader`` and score the predictions.

    The model is switched to eval mode and called without gradient tracking.
    The predicted class of each example is the argmax over the first element
    of the model output.

    Args:
        model: classifier called as ``model(input_ids, token_type_ids=None,
            attention_mask=mask)``; ``outputs[0]`` is treated as the logits.
        data_loader: iterable of ``(input_ids, attention_mask, labels)`` tensor
            batches.

    Returns:
        Tuple ``(f1, avg)``: macro-averaged F1 score and accuracy multiplied
        by 100.
    """
    print('start predict')

    model.eval()

    total_preds = []
    total_labels = []

    # Inference only, so the whole loop runs without autograd bookkeeping.
    with torch.no_grad():
        # Wrap the loader itself (not enumerate(...)) so tqdm can read its
        # length and show a total / ETA instead of a bare iteration counter.
        for b_input_ids, b_input_mask, b_labels in tqdm(data_loader):
            # Only the model inputs need to live on the compute device; the
            # labels are consumed on the host.
            b_input_ids = b_input_ids.to(device)
            b_input_mask = b_input_mask.to(device)

            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask)
            logit = outputs[0].cpu().numpy()

            total_preds += np.argmax(logit, axis=1).tolist()
            total_labels += b_labels.cpu().numpy().tolist()

    f1 = f1_score(total_labels, total_preds, average='macro')
    avg = accuracy_score(total_labels, total_preds) * 100
    return f1, avg

In [17]:
# Dynamic quantization: request int8 (qint8) replacements for the torch.nn.Linear
# layers of `model`; other layer types are not listed and so are not targeted.
model_dynamic_quantized = torch.quantization.quantize_dynamic(
    model, {torch.nn.Linear}, dtype=torch.qint8,
)
# NOTE(review): dynamically quantized ops presumably run on CPU only — confirm
# before running this cell with a CUDA `device`.
model_dynamic_quantized.to(device)
# Bare last expression: display the quantized module structure.
model_dynamic_quantized

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): DynamicQuantizedLinear(in_features=768, out_features=768, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
              (key): DynamicQuantizedLinear(in_features=768, out_features=768, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
              (value): DynamicQuantizedLinear(in_features=768, out_features=768, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
              (dropout): Dropout(p=0.1, inplace=False)
            )
     

In [18]:
def print_size_of_model(model, label=""):
    """Print and return the serialized size of ``model``'s state dict.

    The state dict is saved to a file inside a private temporary directory,
    measured, and removed again. Nothing is written to the working directory,
    so an existing ``temp.p`` there is never overwritten or deleted, and the
    scratch file is cleaned up even if saving or measuring raises.

    Args:
        model: object exposing ``state_dict()`` (e.g. a ``torch.nn.Module``).
        label: free-form tag included in the printed line.

    Returns:
        Size of the serialized state dict in bytes.
    """
    import tempfile  # local import: only this helper needs it

    with tempfile.TemporaryDirectory() as scratch_dir:
        # Keep the historical file name so the serialized archive is unchanged.
        scratch_path = os.path.join(scratch_dir, "temp.p")
        torch.save(model.state_dict(), scratch_path)
        size = os.path.getsize(scratch_path)

    print("model: ",label,' \t','Size (KB):', size/1e3)
    return size

# compare the sizes
# Serialized state-dict size (bytes) and parameter count of the original model.
f=print_size_of_model(model,"fp32")
print(sum(p.numel() for p in model.parameters()))

# Same two measurements for the dynamically quantized model.
# NOTE(review): the lower parameter count presumably reflects quantized Linear
# weights no longer being exposed through parameters() — confirm.
q=print_size_of_model(model_dynamic_quantized,"int8")
print(sum(p.numel() for p in model_dynamic_quantized.parameters()))

# Ratio of the two serialized sizes (fp32 bytes / int8 bytes).
print("{0:.2f} times smaller".format(f/q))

model:  fp32  	 Size (KB): 711504.585
177858823
model:  int8  	 Size (KB): 454972.325
92245248
1.56 times smaller


In [None]:
import time

def time_model_evaluation(model, tokenizer):
    """Evaluate ``model`` on ``test_dataloader`` and print accuracy, F1 and elapsed time.

    The duration is measured with ``time.perf_counter()``, a monotonic clock
    intended for intervals, so system clock adjustments during a multi-minute
    run cannot distort the result.

    Args:
        model: classifier handed to ``predict``.
        tokenizer: unused; kept so existing calls keep working.
    """
    eval_start_time = time.perf_counter()
    f1, avg_test_accuracy = predict(model, test_dataloader)
    eval_duration_time = time.perf_counter() - eval_start_time

    print("Accuracy : {0:.4f}".format(avg_test_accuracy))
    print("f1 score : {0:.4f}".format(f1))
    print("Evaluate total time (seconds) : {0:.1f}".format(eval_duration_time))

# Evaluate the original FP32 BERT model
time_model_evaluation(model, tokenizer)

# Evaluate the INT8 BERT model after the dynamic quantization
time_model_evaluation(model_dynamic_quantized, tokenizer)

start predict


285it [05:47,  1.22s/it]


Accuracy : 72.4607
f1 score : 0.7229
Evaluate total time (seconds) : 347.6
start predict


141it [02:30,  1.18s/it]

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

def sample(n, random_state=None):
  """Draw ``n`` random (title, label) rows from the evaluation split.

  Args:
    n: number of rows to draw.
    random_state: optional seed forwarded to ``DataFrame.sample`` so the same
      subset can be drawn again; ``None`` (the default) keeps the previous
      unseeded behaviour.

  Returns:
    DataFrame with columns 'title' and 'label' holding the sampled rows.
  """
  frame = pd.DataFrame({
      'title': test['title'],
      'label': test['label'],
  })
  return frame.sample(n, random_state=random_state)

def eval(model, d):
  """Classify the titles in ``d`` one at a time and print accuracy, macro F1 and elapsed time.

  NOTE: the name shadows the built-in ``eval``; it is kept so existing calls
  keep working.

  Args:
    model: classifier called with a single-row id tensor; its ``.logits``
      attribute is read.
    d: mapping/dataset with parallel 'title' and 'label' columns.
  """
  # Do not rely on an earlier cell having switched the model to eval mode.
  model.eval()

  titles = d['title']
  total_labels = d['label']
  total_preds = []

  # perf_counter is monotonic, which makes it the right clock for intervals.
  eval_start_time = time.perf_counter()
  with torch.no_grad():
    # Iterate the titles directly so tqdm can show a total when they have a length.
    for txt in tqdm(titles):
      # Clip to max_length like the batched path does; this also keeps
      # over-long inputs within a bounded sequence length.
      encoded = tokenizer.encode(txt, truncation=True, max_length=max_length)
      input_ids = torch.tensor([encoded]).to(device)
      logits = model(input_ids).logits
      total_preds.append(logits.argmax(dim=1).item())
  eval_duration_time = time.perf_counter() - eval_start_time

  f1 = f1_score(total_labels, total_preds, average='macro')
  avg = accuracy_score(total_labels, total_preds) * 100

  print("Accuracy : {0:.4f}".format(avg))
  print("f1 score : {0:.4f}".format(f1))
  print("Evaluate total time (seconds) : {0:.1f}".format(eval_duration_time))

In [None]:
# Per-sample evaluation over the whole evaluation split (no subsampling here).
d = test
# Original model first, then the dynamically quantized one, on the same data.
eval(model, d)
eval(model_dynamic_quantized, d)