In [1]:
from datasets import load_dataset, load_metric
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, AdamW, get_scheduler

import torch
from torch.utils.data import DataLoader

from tqdm.auto import tqdm

2023-01-27 08:33:49.880458: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# 데이터 셋 적재
raw_datasets = load_dataset("nsmc")
# 사전학습 언어모델 checkpoint 이름 지정
checkpoint = "monologg/koelectra-base-v2-discriminator"
# 지정된 사전학습 언어모델에서 토크나이저 인스턴스화
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


# 토크나이저 함수 사용자 정의화 (sentence1, sentence2 컬럼에 대해서만 토크나이징 수행)
def tokenize_function(example):
    return tokenizer(example["document"], truncation=True)


# 토크나이징 수행
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
# 배치(batch)별 패딩(padding)을 위한 data collator 정의
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# 불필요한 입력 컬럼을 제거하고 사전학습 언어모델에 필요한 입력만 남김.
tokenized_datasets = tokenized_datasets.remove_columns(["id", "document"])
# 데이터셋의 label 컬럼명을 labels로 변경
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
# 데이터셋의 유형을 PyTorch tensor로 변경
tokenized_datasets.set_format("torch")

# 변경된 컬럼 출력
print(tokenized_datasets["train"].column_names)


# 각 종류별 데이터 로더 생성
train_dataloader = DataLoader(tokenized_datasets["train"], 
                              shuffle=True, 
                              batch_size=128, 
                              collate_fn=data_collator)
eval_dataloader = DataLoader(tokenized_datasets["test"],
                             shuffle=True,
                             batch_size=128,
                             collate_fn=data_collator)

# 사전학습 언어모델 인스턴스화
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
# 최적화 함수 정의
optimizer = AdamW(model.parameters(), lr=5e-5)

# 에포크 개수 설정
num_epochs = 1
# 학습 스텝 수 계산
num_training_steps = num_epochs * len(train_dataloader)
# 학습 스케쥴러 설정
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

Found cached dataset nsmc (/Users/sudong/.cache/huggingface/datasets/nsmc/default/1.1.0/bfd4729bf1a67114e5267e6916b9e4807010aeb238e4a3c2b95fbfa3a014b5f3)


  0%|          | 0/2 [00:00<?, ?it/s]

Loading cached processed dataset at /Users/sudong/.cache/huggingface/datasets/nsmc/default/1.1.0/bfd4729bf1a67114e5267e6916b9e4807010aeb238e4a3c2b95fbfa3a014b5f3/cache-724edfd1a8053cff.arrow


  0%|          | 0/50 [00:00<?, ?ba/s]

['labels', 'input_ids', 'token_type_ids', 'attention_mask']


Some weights of the model checkpoint at monologg/koelectra-base-v2-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.weight']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-base-v2-discriminator and are newly initialized: 

In [3]:
# GPU로 모델을 이동
device  = torch.device("mps")
model.to(device)

# 진행 상황바 정의
progress_bar = tqdm(range(num_training_steps))

# 모델을 학습 모드로 전환
model.train()
# 학습 루프 시작
for epoch in range(num_epochs):
    for batch in train_dataloader:
        # 현재 배치 중에서 입력값을 모두 GPU로 이동.
        batch = {k: v.to(device) for k, v in batch.items()}
        # 모델 실행
        outputs = model(**batch)
        # 손실값 가져오기
        loss = outputs.loss
        # 역전파 수행
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

  0%|          | 0/1172 [00:00<?, ?it/s]

You're using a ElectraTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [9]:
# 평가 메트릭 가져오기
metric = load_metric('f1')
# 모델을 평가 모드로 전환
model.eval()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

# 평가 결과 계산 및 출력 
metric.compute()

Downloading builder script:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

KeyboardInterrupt: 

In [11]:
# 모델 저장 변수 이름.save_pretrained(원하는 디렉토리) 형태
model.save_pretrained('./models_param/model.pt')
# torch.save(model, PATH)

In [19]:
model = AutoModelForSequenceClassification.from_pretrained('./models_param/model.pt')
# model = torch.load(PATH)
# from transformers import AutoTokenizer, AutoModelForSequenceClassification
# MODEL_NAME = "monologg/koelectra-base-generator"

tokenizer = AutoTokenizer.from_pretrained("monologg/koelectra-base-generator")

# model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
from transformers import pipeline,TextClassificationPipeline

classifier = TextClassificationPipeline(model = model, tokenizer = tokenizer)

In [21]:
print(classifier("이쁘고 좋아요~~~씻기도 편하고 아이고 이쁘다고 자기방에 갖다놓고 잘써요~^^"))
print(classifier("와 너 진짜 별로다"))
print(classifier("그냥저냥 보통인데?"))
print(classifier("이보다 재미있을 수 없다"))
print(classifier("와 진짜 진짜 재미 없다"))
print(classifier("아 더빙 ...진짜 짜증나네 목소리"))

[{'label': 'LABEL_0', 'score': 0.5516111850738525}]
[{'label': 'LABEL_0', 'score': 0.6502959728240967}]
[{'label': 'LABEL_0', 'score': 0.6575488448143005}]
[{'label': 'LABEL_0', 'score': 0.5142531394958496}]
[{'label': 'LABEL_1', 'score': 0.5147390365600586}]
[{'label': 'LABEL_1', 'score': 0.5049659609794617}]
