# 네이버 영화리뷰 sentiment analysis

In [18]:
# https://github.com/kiyoungkim1/LMkor

In [None]:
!pip install transformers datasets evaluate

In [20]:
!pip install accelerate -U



In [21]:
import os
import torch
import random
import numpy as np
import pandas as pd

In [22]:
def set_seeds(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy, and PyTorch (CPU and CUDA),
    records the seed in ``PYTHONHASHSEED``, and puts cuDNN into its
    deterministic configuration.

    Args:
        seed: Integer seed handed to every generator. Defaults to 42.
    """
    # Environment variables only hold text, hence the str() conversion.
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Every seeding function below accepts the seed as its only argument.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(seed)

    # benchmark=False turns cuDNN autotuning off and deterministic=True asks
    # for deterministic kernels: reproducible runs, possibly slower training.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

set_seeds(seed=42)

In [37]:
from huggingface_hub import notebook_login

# Display the Hugging Face login widget inside the notebook.
# NOTE(review): presumably required because the training cell sets
# push_to_hub=True - confirm before removing.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [24]:
from datasets import load_dataset

# Load the dataset registered under the name "nsmc". The next cell's output
# shows a DatasetDict with 'train' and 'test' splits, each with the columns
# 'id', 'document' and 'label'.
nsmc = load_dataset("nsmc")

In [25]:
nsmc

DatasetDict({
    train: Dataset({
        features: ['id', 'document', 'label'],
        num_rows: 150000
    })
    test: Dataset({
        features: ['id', 'document', 'label'],
        num_rows: 50000
    })
})

In [26]:
nsmc['train']['document'][6]

'원작의 긴장감을 제대로 살려내지못했다.'

In [27]:
# # data clean
# for i in range(1, len(nsmc['train']['document'])+1):
#   print(nsmc['train']['document'][i])

In [28]:
# ElectraTokenizerFast: tokenizer class used for the ELECTRA checkpoint chosen below.
# AutoModelForSequenceClassification: loads the pretrained checkpoint with a
# sequence-classification head; that head is newly initialized (see the warning
# printed below) and is fine-tuned later in this notebook.

from transformers import AutoTokenizer, AutoModelForSequenceClassification, BertForSequenceClassification, FunnelTokenizerFast, ElectraTokenizerFast

model_path = 'kykim/electra-kor-base'
# model_path = 'kykim/funnel-kor-base'

# Use the GPU when one is available and fall back to the CPU otherwise, so the
# cell no longer crashes on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

tokenizer = ElectraTokenizerFast.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path).to(device)

Downloading (…)okenizer_config.json:   0%|          | 0.00/80.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/344k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/620 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/473M [00:00<?, ?B/s]

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at kykim/electra-kor-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [29]:
def preprocess_function(examples):
    """Tokenize the review text of a batch of examples.

    Args:
        examples: Mapping whose "document" entry holds the text(s) to encode.

    Returns:
        The result of calling the module-level ``tokenizer`` on those texts
        with truncation enabled.
    """
    documents = examples["document"]
    # Only truncation is requested here; no padding argument is passed.
    encoded = tokenizer(documents, truncation=True)
    return encoded

In [30]:
# Run preprocess_function over the dataset in batches (batched=True); the
# progress bars below show both the 150000-row and the 50000-row split being mapped.
tokenized_nsmc = nsmc.map(preprocess_function, batched=True)

Map:   0%|          | 0/150000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [31]:
# Show how the first training review is split into tokens.
print(tokenizer.tokenize(tokenized_nsmc['train'][0]['document']))
# attention_mask: 1 marks positions that hold real input tokens, 0 marks padding tokens.

['아', '더', '##빙', '.', '.', '진짜', '짜증', '##나네요', '목소리']


In [32]:
# DataCollatorWithPadding: used to group examples into batches for model training.
from transformers import DataCollatorWithPadding

# Built from the same tokenizer that encoded the dataset.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [33]:
import evaluate

# Load the "accuracy" metric; compute_metrics calls accuracy.compute(...) on it.
accuracy = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [34]:
import numpy as np

def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level ``accuracy`` metric.

    Args:
        eval_pred: Two-item iterable ``(predictions, labels)``; the
            predictions are reduced with an argmax over axis 1.

    Returns:
        Whatever ``accuracy.compute`` returns for the predicted classes
        versus the reference labels.
    """
    scores, references = eval_pred
    # Pick the highest-scoring class for every row.
    predicted_classes = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_classes, references=references)

In [38]:
import torch
from transformers import TrainingArguments, Trainer

# Fine-tuning hyperparameters, gathered in one place so they are easy to tune.
OUTPUT_DIR = "ai.keepit/test22"
LEARNING_RATE = 5e-6
BATCH_SIZE = 16
NUM_EPOCHS = 3
WEIGHT_DECAY = 0.01

# Evaluate and save a checkpoint once per epoch, reload the best checkpoint
# when training ends, and push to the Hugging Face Hub.
# NOTE(review): no metric_for_best_model is set; per the TrainingArguments
# docs the best checkpoint is then selected by evaluation loss - confirm that
# is intended, given that compute_metrics reports accuracy.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    lr_scheduler_type="cosine",
    weight_decay=WEIGHT_DECAY,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=True,
)

# NOTE(review): the "test" split doubles as the evaluation set during
# training, so checkpoint selection sees the same data the accuracy is
# reported on - consider holding out a separate validation split.
train_split = tokenized_nsmc["train"]
eval_split = tokenized_nsmc["test"]

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=eval_split,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Last expression of the cell, so the returned TrainOutput is displayed.
trainer.train()

Cloning https://huggingface.co/jiiyy/test22 into local empty directory.
You're using a ElectraTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2496,0.23852,0.90722
2,0.2108,0.249619,0.91272
3,0.1879,0.26783,0.91296


TrainOutput(global_step=28125, training_loss=0.22856841586642795, metrics={'train_runtime': 5435.9151, 'train_samples_per_second': 82.783, 'train_steps_per_second': 5.174, 'total_flos': 1.209810121428768e+16, 'train_loss': 0.22856841586642795, 'epoch': 3.0})

In [None]:
# Upload the trainer's model to the Hugging Face Hub (needs the login from notebook_login()).
trainer.push_to_hub()

## 참고자료
### Huggingface documents
https://huggingface.co/docs/transformers/tasks/sequence_classification  
https://huggingface.co/docs/transformers/training#train-with-pytorch-trainer  

### DACON 분류 예제
#### [코드공유]의 다양한 코드를 보며 여러가지 방법을 직접 적용해보세요!
https://dacon.io/competitions/official/236037/overview/description  
https://dacon.io/competitions/official/235875/overview/description