# kcbert Fine-tune with Naver Shopping Review Data

- kcbert 모델을 네이버 쇼핑 리뷰 데이터를 이용해 파인튜닝합니다.

In [1]:
import pandas as pd
import torch
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, ElectraForSequenceClassification, AdamW, BertForSequenceClassification
from tqdm.notebook import tqdm

In [2]:
if torch.cuda.is_available():    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: GeForce RTX 2070


# data load 
- fine-tuning 할 데이터를 불러옵니다.
- 네이버 쇼핑 리뷰 데이터
    - 언어: 한국어
    - 출처: 네이버 쇼핑 (https://shopping.naver.com/)
    - 수집 기간: 2020.06~2020.07
    - 데이터 건수: 20만 건
    - 데이터 출처: (https://github.com/bab2min/corpus/tree/master/sentiment)

In [3]:
import os 
import csv

file_path = os.getenv('HOME') + '/Projects/crawling_data'

In [4]:
ns_data = pd.read_csv(file_path + '/naver_shopping.txt', delimiter = '\t', names =['label', 'text'])

In [5]:
len(ns_data)

200000

In [6]:
ns_data.head()

Unnamed: 0,label,text
0,5,배공빠르고 굿
1,2,택배가 엉망이네용 저희집 밑에층에 말도없이 놔두고가고
2,5,아주좋아요 바지 정말 좋아서2개 더 구매했어요 이가격에 대박입니다. 바느질이 조금 ...
3,2,선물용으로 빨리 받아서 전달했어야 하는 상품이었는데 머그컵만 와서 당황했습니다. 전...
4,5,민트색상 예뻐요. 옆 손잡이는 거는 용도로도 사용되네요 ㅎㅎ


- label 5, 4 = 긍정    -> 1
- label 1, 2 = 부정    -> 0

In [7]:
ns_data['label'] = ns_data['label'].apply(lambda row : 1 if row == 5 or row == 4 else 0)

In [8]:
ns_data.head()

Unnamed: 0,label,text
0,1,배공빠르고 굿
1,0,택배가 엉망이네용 저희집 밑에층에 말도없이 놔두고가고
2,1,아주좋아요 바지 정말 좋아서2개 더 구매했어요 이가격에 대박입니다. 바느질이 조금 ...
3,0,선물용으로 빨리 받아서 전달했어야 하는 상품이었는데 머그컵만 와서 당황했습니다. 전...
4,1,민트색상 예뻐요. 옆 손잡이는 거는 용도로도 사용되네요 ㅎㅎ


In [9]:
ns_data.iloc[4, :].values

array([1, '민트색상 예뻐요. 옆 손잡이는 거는 용도로도 사용되네요 ㅎㅎ'], dtype=object)

In [10]:
ns_data.label.unique()

array([1, 0])

In [11]:
class createDataset(Dataset):
  
    def __init__(self, csv_file):
        # 일부 값중에 NaN이 있음...
        self.dataset = csv_file.dropna(axis=0) 
        # 중복제거
        self.dataset.drop_duplicates(subset=['text'], inplace=True)
        self.tokenizer = AutoTokenizer.from_pretrained("beomi/kcbert-base")

        print(self.dataset.describe())
  
    def __len__(self):
        return len(self.dataset)
  
    def __getitem__(self, idx):
        row = self.dataset.iloc[idx, :].values
        y = row[0]
        text = row[1]

        inputs = self.tokenizer(
            text, 
            return_tensors='pt',
            truncation=True,
            max_length=64,
            pad_to_max_length=True,
            add_special_tokens=True
            )

        input_ids = inputs['input_ids'][0]
        attention_mask = inputs['attention_mask'][0]

        return input_ids, attention_mask, y

In [12]:
train_dataset = createDataset(ns_data)

               label
count  199908.000000
mean        0.499995
std         0.500001
min         0.000000
25%         0.000000
50%         0.000000
75%         1.000000
max         1.000000


In [13]:
model = BertForSequenceClassification.from_pretrained(
    "beomi/kcbert-base", num_labels = 3).to(device)

Some weights of the model checkpoint at beomi/kcbert-base were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initiali

In [50]:
# 모델 레이어 보기
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30000, 768, padding_idx=0)
      (position_embeddings): Embedding(300, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [19]:
epochs = 5
batch_size = 64

In [20]:
optimizer = AdamW(model.parameters(), lr=1e-5)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

In [21]:
losses = []
accuracies = []

for i in range(epochs):
    total_loss = 0.0
    correct = 0
    total = 0
    batches = 0

    model.train()

    for input_ids_batch, attention_masks_batch, y_batch in tqdm(train_loader):
        optimizer.zero_grad()
        y_batch = y_batch.to(device)
        y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
        loss = F.cross_entropy(y_pred, y_batch)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        _, predicted = torch.max(y_pred, 1)
        correct += (predicted == y_batch).sum()
        total += len(y_batch)

        batches += 1
        if batches % 100 == 0:
            print("Batch Loss:", total_loss, "Accuracy:", correct.float() / total)
  
    losses.append(total_loss)
    accuracies.append(correct.float() / total)
    print("Train Loss:", total_loss, "Accuracy:", correct.float() / total)
    acc = correct.float() / total
    
    # 모델 저장하기
    torch.save(model.state_dict(), file_path + f"/model_{i}_{acc}acc.pt")

  0%|          | 0/3124 [00:00<?, ?it/s]

Batch Loss: 22.41069170832634 Accuracy: tensor(0.9155, device='cuda:0')
Batch Loss: 45.24523111432791 Accuracy: tensor(0.9159, device='cuda:0')
Batch Loss: 66.17004200816154 Accuracy: tensor(0.9182, device='cuda:0')
Batch Loss: 86.33263068646193 Accuracy: tensor(0.9210, device='cuda:0')
Batch Loss: 108.22352690249681 Accuracy: tensor(0.9208, device='cuda:0')
Batch Loss: 128.34162107110023 Accuracy: tensor(0.9217, device='cuda:0')
Batch Loss: 148.72233647853136 Accuracy: tensor(0.9226, device='cuda:0')
Batch Loss: 169.24662873148918 Accuracy: tensor(0.9230, device='cuda:0')
Batch Loss: 189.3186566606164 Accuracy: tensor(0.9231, device='cuda:0')
Batch Loss: 208.03979951515794 Accuracy: tensor(0.9239, device='cuda:0')
Batch Loss: 226.78177179023623 Accuracy: tensor(0.9245, device='cuda:0')
Batch Loss: 246.4026281349361 Accuracy: tensor(0.9249, device='cuda:0')
Batch Loss: 265.6738530881703 Accuracy: tensor(0.9253, device='cuda:0')
Batch Loss: 284.33616726472974 Accuracy: tensor(0.9258, de

  0%|          | 0/3124 [00:00<?, ?it/s]

Batch Loss: 15.535484116524458 Accuracy: tensor(0.9445, device='cuda:0')
Batch Loss: 30.850275671109557 Accuracy: tensor(0.9457, device='cuda:0')
Batch Loss: 45.80500068701804 Accuracy: tensor(0.9465, device='cuda:0')
Batch Loss: 61.41094231419265 Accuracy: tensor(0.9467, device='cuda:0')
Batch Loss: 77.2916609402746 Accuracy: tensor(0.9455, device='cuda:0')
Batch Loss: 92.56482781097293 Accuracy: tensor(0.9453, device='cuda:0')
Batch Loss: 108.13161433115602 Accuracy: tensor(0.9458, device='cuda:0')
Batch Loss: 123.26839564926922 Accuracy: tensor(0.9454, device='cuda:0')
Batch Loss: 138.81923618353903 Accuracy: tensor(0.9456, device='cuda:0')
Batch Loss: 154.8282688651234 Accuracy: tensor(0.9455, device='cuda:0')
Batch Loss: 169.97794582135975 Accuracy: tensor(0.9456, device='cuda:0')
Batch Loss: 184.3544636555016 Accuracy: tensor(0.9460, device='cuda:0')
Batch Loss: 201.51122590899467 Accuracy: tensor(0.9455, device='cuda:0')
Batch Loss: 217.35330041870475 Accuracy: tensor(0.9455, de

  0%|          | 0/3124 [00:00<?, ?it/s]

Batch Loss: 13.101665252819657 Accuracy: tensor(0.9581, device='cuda:0')
Batch Loss: 26.02288237400353 Accuracy: tensor(0.9570, device='cuda:0')
Batch Loss: 38.490024676546454 Accuracy: tensor(0.9574, device='cuda:0')
Batch Loss: 50.440011670812964 Accuracy: tensor(0.9577, device='cuda:0')
Batch Loss: 61.586201997473836 Accuracy: tensor(0.9588, device='cuda:0')
Batch Loss: 73.49722432158887 Accuracy: tensor(0.9585, device='cuda:0')
Batch Loss: 84.40155727043748 Accuracy: tensor(0.9591, device='cuda:0')
Batch Loss: 96.59836361557245 Accuracy: tensor(0.9591, device='cuda:0')
Batch Loss: 109.77462154999375 Accuracy: tensor(0.9585, device='cuda:0')
Batch Loss: 121.69824912026525 Accuracy: tensor(0.9584, device='cuda:0')
Batch Loss: 133.5923726912588 Accuracy: tensor(0.9583, device='cuda:0')
Batch Loss: 145.53878804296255 Accuracy: tensor(0.9582, device='cuda:0')
Batch Loss: 158.25548366643488 Accuracy: tensor(0.9578, device='cuda:0')
Batch Loss: 170.17795721068978 Accuracy: tensor(0.9577, 

  0%|          | 0/3124 [00:00<?, ?it/s]

Batch Loss: 9.88232403434813 Accuracy: tensor(0.9692, device='cuda:0')
Batch Loss: 19.2182597797364 Accuracy: tensor(0.9688, device='cuda:0')
Batch Loss: 28.695704546757042 Accuracy: tensor(0.9682, device='cuda:0')
Batch Loss: 37.68656265642494 Accuracy: tensor(0.9682, device='cuda:0')
Batch Loss: 47.34520173911005 Accuracy: tensor(0.9680, device='cuda:0')
Batch Loss: 56.70309564564377 Accuracy: tensor(0.9683, device='cuda:0')
Batch Loss: 65.66183421108872 Accuracy: tensor(0.9685, device='cuda:0')
Batch Loss: 75.62898162472993 Accuracy: tensor(0.9684, device='cuda:0')
Batch Loss: 85.59384157322347 Accuracy: tensor(0.9683, device='cuda:0')
Batch Loss: 95.50813751854002 Accuracy: tensor(0.9683, device='cuda:0')
Batch Loss: 106.0159612474963 Accuracy: tensor(0.9678, device='cuda:0')
Batch Loss: 115.517677645199 Accuracy: tensor(0.9677, device='cuda:0')
Batch Loss: 125.4143646331504 Accuracy: tensor(0.9676, device='cuda:0')
Batch Loss: 134.55193828884512 Accuracy: tensor(0.9677, device='cu

  0%|          | 0/3124 [00:00<?, ?it/s]

Batch Loss: 7.446292203851044 Accuracy: tensor(0.9770, device='cuda:0')
Batch Loss: 16.399508429691195 Accuracy: tensor(0.9737, device='cuda:0')
Batch Loss: 22.999107289128006 Accuracy: tensor(0.9749, device='cuda:0')
Batch Loss: 30.496809924952686 Accuracy: tensor(0.9747, device='cuda:0')
Batch Loss: 37.78622882813215 Accuracy: tensor(0.9747, device='cuda:0')
Batch Loss: 45.36345586832613 Accuracy: tensor(0.9748, device='cuda:0')
Batch Loss: 51.712055205367506 Accuracy: tensor(0.9755, device='cuda:0')
Batch Loss: 59.698407934978604 Accuracy: tensor(0.9751, device='cuda:0')
Batch Loss: 66.54481840599328 Accuracy: tensor(0.9752, device='cuda:0')
Batch Loss: 73.58150220476091 Accuracy: tensor(0.9753, device='cuda:0')
Batch Loss: 81.37639909144491 Accuracy: tensor(0.9752, device='cuda:0')
Batch Loss: 90.43283574096859 Accuracy: tensor(0.9747, device='cuda:0')
Batch Loss: 98.5108848027885 Accuracy: tensor(0.9745, device='cuda:0')
Batch Loss: 106.78875660896301 Accuracy: tensor(0.9742, devi

In [22]:
losses, accuracies

([606.3243498317897,
  484.34553881548345,
  390.2278171814978,
  306.77541410923004,
  242.86972376238555],
 [tensor(0.9292, device='cuda:0'),
  tensor(0.9451, device='cuda:0'),
  tensor(0.9565, device='cuda:0'),
  tensor(0.9668, device='cuda:0'),
  tensor(0.9738, device='cuda:0')])