In [1]:
!pip install datasets
!pip install transformers
!pip install sentencepiece
!pip install transformers[sentencepiece]
!pip install transformers[torch]
!pip install accelerate -U
!pip install sacremoses

Collecting datasets
  Downloading datasets-2.14.4-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.3/519.3 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0.0,>=0.14.0 (from datasets)
  Downloading huggingface_hub-0.16.4-py3-none-a

In [2]:
import sys
from textwrap import TextWrapper
import datasets
import huggingface_hub
import matplotlib.font_manager as font_manager
import matplotlib.pyplot as plt
import torch
import transformers
import pandas as pd
import numpy as np
import sentencepiece
from transformers import pipeline


In [3]:
emotions = datasets.load_dataset("emotion")

Downloading builder script:   0%|          | 0.00/3.97k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/3.28k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/8.78k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/592k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/74.0k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/74.9k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/16000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [4]:
emotions.set_format(type="pandas")
df = emotions["train"][:]
len(df)
#

16000

In [5]:
def label2str(row):
  return emotions["train"].features["label"].int2str(row)
df["label_name"] = df["label"].apply(label2str)
df.label_name.value_counts()

joy         5362
sadness     4666
anger       2159
fear        1937
love        1304
surprise     572
Name: label_name, dtype: int64

In [6]:
emotions.reset_format()

In [7]:
from transformers import AutoTokenizer
model_ckpt = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
text = """
Impossible is nothing.
"""
encoded_text = tokenizer(text)
print(encoded_text)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

{'input_ids': [101, 5263, 2003, 2498, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1]}


In [8]:
tokenizer.vocab_size

30522

In [9]:
tokenizer.model_max_length

512

In [10]:
def tokenize(batch):
  return tokenizer(batch["text"], padding=True, truncation=True)
print(tokenize(emotions["train"][:3]))

{'input_ids': [[101, 1045, 2134, 2102, 2514, 26608, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 1045, 2064, 2175, 2013, 3110, 2061, 20625, 2000, 2061, 9636, 17772, 2074, 2013, 2108, 2105, 2619, 2040, 14977, 1998, 2003, 8300, 102], [101, 10047, 9775, 1037, 3371, 2000, 2695, 1045, 2514, 20505, 3308, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]}


In [11]:
token2ids = list(zip(tokenizer.all_special_tokens, tokenizer.all_special_ids))
special_tokens = sorted(token2ids, key=lambda x: x[-1])
df = pd.DataFrame(special_tokens, columns=["Special Token", "Special Token ID"])
df.T
# PAD: padding
# UNK: unknown (사전에 없음)
# CLS: classification (보통 문장의 시작을 마킹함 SOS로 표기하는 경우도 있음)
# SEP: separator (문장이 끝나는 지점; EOS로 표기하는 경우도 있음)
# MASK: 빈칸 채우기 태스크를 풀 때 빈 칸을 [MASK]로 표기함

Unnamed: 0,0,1,2,3,4
Special Token,[PAD],[UNK],[CLS],[SEP],[MASK]
Special Token ID,0,100,101,102,103


In [12]:
emotions_encoded = emotions.map(tokenize, batched=True, batch_size=None)
print(emotions_encoded["train"].column_names)

Map:   0%|          | 0/16000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

['text', 'label', 'input_ids', 'attention_mask']


In [13]:
# 텍스트 분류기 만들기
# 사전학습된 distilbert 가져다 쓰는 중
# BERT는 인코더니까 자연어의 적절한 벡터/텐서 표현을 반환함

# 그렇게 반환된 텐서를 입력값으로 받는 분류기를 이후에 이어붙임

# 방법 1
# 트랜스포머는 그대로; 우리가 이어붙이는 분류기만 학습
# 트랜스포머는 신경망의 일종
# 트랜스포머에도 weights가 있음
# -> 이 weight는 건드리지 않음 (업데이트 안 함) weights를 freeze했다

# 방법 2 (미세조정 fine tuning)
# 트랜스포머와 우리가 이어붙이는 분류기 둘 다 학습
# 트랜스포머의 weights와 우리 모델 둘 다 학습

In [14]:
from transformers import AutoModel
model_ckpt = "distilbert-base-uncased"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModel.from_pretrained(model_ckpt).to(device)

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [15]:
device

device(type='cuda')

In [16]:
def extract_hidden_states(batch):
  inputs = {k:v.to(device) for k, v in batch.items()
            if k in tokenizer.model_input_names}
  with torch.no_grad(): # 기울기 계산 안 함
    last_hidden_state = model(**inputs).last_hidden_state
  return {"hidden_state": last_hidden_state[:,0].cpu().numpy()}

emotions_encoded.set_format("torch",
                            columns=["input_ids", "attention_mask", "label"])
emotions_hidden = emotions_encoded.map(extract_hidden_states, batched=True)

Map:   0%|          | 0/16000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [17]:
# 이 이후는 sklearn에서 했던 흐름과 똑같음
X_train = np.array(emotions_hidden["train"]["hidden_state"])
y_train = np.array(emotions_hidden["train"]["label"])

X_val= np.array(emotions_hidden["validation"]["hidden_state"])
y_val= np.array(emotions_hidden["validation"]["label"])

X_test= np.array(emotions_hidden["test"]["hidden_state"])
y_test= np.array(emotions_hidden["test"]["label"])

In [18]:
# 원하는 분류기 가져오세요
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
clf.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [19]:
clf.score(X_val, y_val)

0.6075

In [None]:
# 방법 2 Fine tuning
# base model을 epoch 2~3번만 추가학습시키기

In [20]:
from transformers import AutoModelForSequenceClassification

num_labels = 6
model = (AutoModelForSequenceClassification
         .from_pretrained(model_ckpt, num_labels=num_labels)
         .to(device))

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
  labels = pred.label_ids
  preds = pred.predictions.argmax(-1)
  f1 = f1_score(labels, preds, average="weighted")
  acc = accuracy_score(labels, preds)
  return {"accuracy": acc, "f1": f1}

In [22]:
from transformers import Trainer, TrainingArguments

batch_size = 64

model_name = "{}-finetuned-emotions".format(model_ckpt)

In [23]:
# 학습 관련 하이퍼파라미터
training_args = TrainingArguments(output_dir=model_name,
                                  num_train_epochs=2,
                                  learning_rate=1e-5,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  weight_decay=0.01, # 과적합 방지 수단 W의 vector norm을 줄이는 규제
                                  evaluation_strategy="epoch",
                                  disable_tqdm=False
                                  )

In [24]:
trainer = Trainer(model=model,
                  args=training_args,
                  compute_metrics=compute_metrics,
                  train_dataset=emotions_encoded["train"],
                  eval_dataset=emotions_encoded["validation"],
                  tokenizer=tokenizer)

In [25]:
trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.632661,0.78,0.732003
2,0.809600,0.429201,0.8745,0.866321


TrainOutput(global_step=500, training_loss=0.8096135864257813, metrics={'train_runtime': 229.2084, 'train_samples_per_second': 139.611, 'train_steps_per_second': 2.181, 'total_flos': 720342861696000.0, 'train_loss': 0.8096135864257813, 'epoch': 2.0})