<a href="https://colab.research.google.com/github/JeehwanLim/202002_NLP_FIN/blob/main/%EA%B8%B0%EB%A7%90%EA%B3%BC%EC%A0%9C_%ED%95%9C%EA%B8%805.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 사전 설정

In [1]:
# HuggingFace transformers 설치 및 NSMC 데이터셋 다운로드
!pip install transformers

import pandas as pd
import torch
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, ElectraForSequenceClassification, AdamW
from tqdm.notebook import tqdm

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/50/0c/7d5950fcd80b029be0a8891727ba21e0cd27692c407c51261c3c921f6da3/transformers-4.1.1-py3-none-any.whl (1.5MB)
[K     |████████████████████████████████| 1.5MB 5.3MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 12.8MB/s 
Collecting tokenizers==0.9.4
[?25l  Downloading https://files.pythonhosted.org/packages/0f/1c/e789a8b12e28be5bc1ce2156cf87cb522b379be9cadc7ad8091a4cc107c4/tokenizers-0.9.4-cp36-cp36m-manylinux2010_x86_64.whl (2.9MB)
[K     |████████████████████████████████| 2.9MB 14.2MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp36-none-any.whl size=893261 sha256=44ff1a26ccb85

In [2]:
from google.colab import drive
drive.mount('/gdrive', force_remount=True)

Mounted at /gdrive


In [3]:
##GPU 사용 시
device = torch.device("cuda:0")

In [4]:
!git clone https://github.com/JeehwanLim/202002_NLP_FIN.git

Cloning into '202002_NLP_FIN'...
remote: Enumerating objects: 32, done.[K
remote: Counting objects: 100% (32/32), done.[K
remote: Compressing objects: 100% (31/31), done.[K
remote: Total 32 (delta 14), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (32/32), done.


In [5]:
!git clone https://github.com/e9t/nsmc.git

Cloning into 'nsmc'...
remote: Enumerating objects: 14763, done.[K
remote: Total 14763 (delta 0), reused 0 (delta 0), pack-reused 14763[K
Receiving objects: 100% (14763/14763), 56.19 MiB | 7.50 MiB/s, done.
Resolving deltas: 100% (1749/1749), done.
Checking out files: 100% (14737/14737), done.


# 데이터 전처리

In [6]:
class NSMCDataset(Dataset):
  
  def __init__(self, csv_file):
    # 일부 값중에 NaN이 있음...
    self.dataset = pd.read_csv(csv_file, sep='\t').dropna(axis=0) 
# 한글과 빈칸만 남기고 나머지 제거 
#    self.dataset['document'] = self.dataset['document'].str.replace("[^가-힣ㄱ-ㅎㅏ-ㅣ ]","")
# 빈값 제거
#    self.dataset = self.dataset.dropna(how = 'any')
    self.tokenizer = AutoTokenizer.from_pretrained("monologg/koelectra-small-v3-discriminator")

    print(self.dataset.describe())
  
  def __len__(self):
    return len(self.dataset)
  
  def __getitem__(self, idx):
    row = self.dataset.iloc[idx, 1:3].values
    text = row[0]
    y = row[1]

    inputs = self.tokenizer(
        text, 
        return_tensors='pt',
        truncation=True,
        max_length=256,
        pad_to_max_length=True,
        add_special_tokens=True
        )
    
    input_ids = inputs['input_ids'][0]
    attention_mask = inputs['attention_mask'][0]

    return input_ids, attention_mask, y

In [7]:
class NSMCDataset_test(Dataset):
  
  def __init__(self, csv_file):
    # 일부 값중에 NaN이 있음...
    self.dataset = pd.read_csv(csv_file, sep=',', encoding='cp949')
# 한글과 빈칸만 남기고 나머지 제거 
#    self.dataset["Sentence"] = self.dataset["Sentence"].str.replace("[^가-힣ㄱ-ㅎㅏ-ㅣ ]","")
    self.tokenizer = AutoTokenizer.from_pretrained("monologg/koelectra-small-v3-discriminator")

    print(self.dataset.describe())
  
  def __len__(self):
    return len(self.dataset)
  
  def __getitem__(self, idx):
    row = self.dataset.iloc[idx, 1:2].values
    text = row[0]

    inputs = self.tokenizer(
        text, 
        return_tensors='pt',
        truncation=True,
        max_length=256,
        pad_to_max_length=True,
        add_special_tokens=True
        )
    
    input_ids = inputs['input_ids'][0]
    attention_mask = inputs['attention_mask'][0]

    return input_ids, attention_mask

In [8]:
train_dataset = NSMCDataset("nsmc/ratings_train.txt")
validation_dataset = NSMCDataset("nsmc/ratings_test.txt")
test_dataset = NSMCDataset_test("202002_NLP_FIN/ko_data.csv")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=458.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=263326.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=61.0, style=ProgressStyle(description_w…


                 id         label
count  1.499950e+05  149995.00000
mean   6.743648e+06       0.49885
std    2.919001e+06       0.50000
min    3.300000e+01       0.00000
25%    4.767467e+06       0.00000
50%    7.526885e+06       0.00000
75%    9.249448e+06       1.00000
max    1.027815e+07       1.00000
                 id         label
count  4.999700e+04  49997.000000
mean   6.728781e+06      0.503450
std    2.936634e+06      0.499993
min    6.010000e+02      0.000000
25%    4.736525e+06      0.000000
50%    7.530895e+06      1.000000
75%    9.246512e+06      1.000000
max    1.027809e+07      1.000000
                 Id
count  11187.000000
mean    5593.000000
std     3229.553065
min        0.000000
25%     2796.500000
50%     5593.000000
75%     8389.500000
max    11186.000000


# 모델 생성 및 학습

In [21]:
model = ElectraForSequenceClassification.from_pretrained("monologg/koelectra-small-v3-discriminator").to(device)

Some weights of the model checkpoint at monologg/koelectra-small-v3-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-small-v3-discriminator and are newly initialized

In [22]:
# model.load_state_dict(torch.load("koelectra-small_4.pt"))
# model.to(device)

ElectraForSequenceClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(35000, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (embeddings_project): Linear(in_features=128, out_features=256, bias=True)
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0): ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=256, out_features=256, bias=True)
              (key): Linear(in_features=256, out_features=256, bias=True)
              (value): Linear(in_features=256, out_features=256, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_

In [23]:
epochs = 8
batch_size = 128
learning_rate = 2e-5

In [24]:
optimizer = AdamW(model.parameters(), lr=learning_rate)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
validation_loader = DataLoader(validation_dataset, batch_size=16)
test_loader = DataLoader(test_dataset)

In [None]:
losses = []
accuracies = []

for i in range(epochs):
  total_loss = 0.0
  correct = 0
  total = 0
  batches = 0

  model.train()

  for input_ids_batch, attention_masks_batch, y_batch in tqdm(train_loader):
    optimizer.zero_grad()
    y_batch = y_batch.to(device)
    y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
    loss = F.cross_entropy(y_pred, y_batch)
    loss.backward()
    optimizer.step()

    total_loss += loss.item()

    _, predicted = torch.max(y_pred, 1)
    correct += (predicted == y_batch).sum()
    total += len(y_batch)

    batches += 1
    if batches % 100 == 0:
      print("Batch Loss:", total_loss, "Accuracy:", correct.float() / total)
  
  losses.append(total_loss)
  accuracies.append(correct.float() / total)
  print("Train Loss:", total_loss, "Accuracy:", correct.float() / total)

# Validation set 정확도 확인
  model.eval()

  test_correct = 0
  test_total = 0

  for input_ids_batch, attention_masks_batch, y_batch in tqdm(validation_loader):
    y_batch = y_batch.to(device)
    y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
    _, predicted = torch.max(y_pred, 1)
    test_correct += (predicted == y_batch).sum()
    test_total += len(y_batch)

  print("Accuracy:", test_correct.float() / test_total)

HBox(children=(FloatProgress(value=0.0, max=1172.0), HTML(value='')))



Batch Loss: 67.51781737804413 Accuracy: tensor(0.6121, device='cuda:0')
Batch Loss: 117.68248838186264 Accuracy: tensor(0.7014, device='cuda:0')
Batch Loss: 160.73041751980782 Accuracy: tensor(0.7389, device='cuda:0')
Batch Loss: 200.29025700688362 Accuracy: tensor(0.7625, device='cuda:0')
Batch Loss: 238.70119455456734 Accuracy: tensor(0.7766, device='cuda:0')
Batch Loss: 274.7589765936136 Accuracy: tensor(0.7875, device='cuda:0')
Batch Loss: 311.3776843100786 Accuracy: tensor(0.7951, device='cuda:0')
Batch Loss: 346.49002565443516 Accuracy: tensor(0.8016, device='cuda:0')
Batch Loss: 379.74806198477745 Accuracy: tensor(0.8078, device='cuda:0')
Batch Loss: 413.0254458785057 Accuracy: tensor(0.8128, device='cuda:0')
Batch Loss: 446.0561658143997 Accuracy: tensor(0.8167, device='cuda:0')

Train Loss: 469.4898682832718 Accuracy: tensor(0.8193, device='cuda:0')


HBox(children=(FloatProgress(value=0.0, max=3125.0), HTML(value='')))


Accuracy: tensor(0.8650, device='cuda:0')


HBox(children=(FloatProgress(value=0.0, max=1172.0), HTML(value='')))

Batch Loss: 31.163569942116737 Accuracy: tensor(0.8678, device='cuda:0')
Batch Loss: 62.13106107711792 Accuracy: tensor(0.8686, device='cuda:0')
Batch Loss: 93.1138311624527 Accuracy: tensor(0.8688, device='cuda:0')
Batch Loss: 123.73970374464989 Accuracy: tensor(0.8691, device='cuda:0')
Batch Loss: 154.4095882177353 Accuracy: tensor(0.8692, device='cuda:0')
Batch Loss: 184.20616514980793 Accuracy: tensor(0.8702, device='cuda:0')
Batch Loss: 213.87646593153477 Accuracy: tensor(0.8713, device='cuda:0')
Batch Loss: 243.61544539034367 Accuracy: tensor(0.8713, device='cuda:0')
Batch Loss: 273.23567190766335 Accuracy: tensor(0.8715, device='cuda:0')
Batch Loss: 302.8237336575985 Accuracy: tensor(0.8717, device='cuda:0')
Batch Loss: 331.80491714179516 Accuracy: tensor(0.8724, device='cuda:0')

Train Loss: 352.69267167150974 Accuracy: tensor(0.8726, device='cuda:0')


HBox(children=(FloatProgress(value=0.0, max=3125.0), HTML(value='')))


Accuracy: tensor(0.8783, device='cuda:0')


HBox(children=(FloatProgress(value=0.0, max=1172.0), HTML(value='')))

Batch Loss: 27.93405720591545 Accuracy: tensor(0.8845, device='cuda:0')
Batch Loss: 55.257594764232635 Accuracy: tensor(0.8854, device='cuda:0')
Batch Loss: 83.46253299713135 Accuracy: tensor(0.8838, device='cuda:0')
Batch Loss: 110.5992774516344 Accuracy: tensor(0.8844, device='cuda:0')
Batch Loss: 138.2295883744955 Accuracy: tensor(0.8843, device='cuda:0')
Batch Loss: 165.16664023697376 Accuracy: tensor(0.8852, device='cuda:0')
Batch Loss: 192.70865805447102 Accuracy: tensor(0.8851, device='cuda:0')
Batch Loss: 219.24484087526798 Accuracy: tensor(0.8858, device='cuda:0')
Batch Loss: 245.59590135514736 Accuracy: tensor(0.8865, device='cuda:0')
Batch Loss: 271.9146990031004 Accuracy: tensor(0.8866, device='cuda:0')
Batch Loss: 298.5512617379427 Accuracy: tensor(0.8867, device='cuda:0')

Train Loss: 317.60789158940315 Accuracy: tensor(0.8866, device='cuda:0')


HBox(children=(FloatProgress(value=0.0, max=3125.0), HTML(value='')))


Accuracy: tensor(0.8833, device='cuda:0')


HBox(children=(FloatProgress(value=0.0, max=1172.0), HTML(value='')))

Batch Loss: 24.97001101076603 Accuracy: tensor(0.8984, device='cuda:0')
Batch Loss: 50.38802683353424 Accuracy: tensor(0.8964, device='cuda:0')
Batch Loss: 75.78401626646519 Accuracy: tensor(0.8959, device='cuda:0')
Batch Loss: 101.28569076955318 Accuracy: tensor(0.8952, device='cuda:0')
Batch Loss: 126.53315897285938 Accuracy: tensor(0.8952, device='cuda:0')
Batch Loss: 152.24990399181843 Accuracy: tensor(0.8946, device='cuda:0')
Batch Loss: 176.72514324635267 Accuracy: tensor(0.8949, device='cuda:0')
Batch Loss: 201.66904152184725 Accuracy: tensor(0.8953, device='cuda:0')
Batch Loss: 226.01036744564772 Accuracy: tensor(0.8959, device='cuda:0')
Batch Loss: 250.31381728500128 Accuracy: tensor(0.8963, device='cuda:0')
Batch Loss: 275.62917274981737 Accuracy: tensor(0.8961, device='cuda:0')

Train Loss: 293.5753981694579 Accuracy: tensor(0.8961, device='cuda:0')


HBox(children=(FloatProgress(value=0.0, max=3125.0), HTML(value='')))


Accuracy: tensor(0.8884, device='cuda:0')


HBox(children=(FloatProgress(value=0.0, max=1172.0), HTML(value='')))

Batch Loss: 23.90875492990017 Accuracy: tensor(0.9013, device='cuda:0')
Batch Loss: 47.713162653148174 Accuracy: tensor(0.9021, device='cuda:0')
Batch Loss: 70.13828787952662 Accuracy: tensor(0.9043, device='cuda:0')
Batch Loss: 93.8296270892024 Accuracy: tensor(0.9031, device='cuda:0')
Batch Loss: 117.18943288177252 Accuracy: tensor(0.9036, device='cuda:0')
Batch Loss: 140.300767429173 Accuracy: tensor(0.9040, device='cuda:0')
Batch Loss: 163.75103927403688 Accuracy: tensor(0.9037, device='cuda:0')
Batch Loss: 186.80484993755817 Accuracy: tensor(0.9040, device='cuda:0')
Batch Loss: 210.26538603007793 Accuracy: tensor(0.9040, device='cuda:0')
Batch Loss: 232.89975741505623 Accuracy: tensor(0.9046, device='cuda:0')
Batch Loss: 255.8469576984644 Accuracy: tensor(0.9048, device='cuda:0')

Train Loss: 273.3288594484329 Accuracy: tensor(0.9044, device='cuda:0')


HBox(children=(FloatProgress(value=0.0, max=3125.0), HTML(value='')))


Accuracy: tensor(0.8877, device='cuda:0')


HBox(children=(FloatProgress(value=0.0, max=1172.0), HTML(value='')))

Batch Loss: 20.674531921744347 Accuracy: tensor(0.9152, device='cuda:0')
Batch Loss: 42.7574348449707 Accuracy: tensor(0.9121, device='cuda:0')
Batch Loss: 63.87871254980564 Accuracy: tensor(0.9130, device='cuda:0')
Batch Loss: 84.80242496728897 Accuracy: tensor(0.9133, device='cuda:0')
Batch Loss: 106.37891831994057 Accuracy: tensor(0.9130, device='cuda:0')
Batch Loss: 128.17916472256184 Accuracy: tensor(0.9131, device='cuda:0')
Batch Loss: 149.74462220817804 Accuracy: tensor(0.9130, device='cuda:0')
Batch Loss: 171.354287058115 Accuracy: tensor(0.9129, device='cuda:0')
Batch Loss: 193.99634851515293 Accuracy: tensor(0.9120, device='cuda:0')
Batch Loss: 216.16551364958286 Accuracy: tensor(0.9121, device='cuda:0')
Batch Loss: 238.82402943074703 Accuracy: tensor(0.9116, device='cuda:0')

Train Loss: 254.85273630917072 Accuracy: tensor(0.9115, device='cuda:0')


HBox(children=(FloatProgress(value=0.0, max=3125.0), HTML(value='')))


Accuracy: tensor(0.8909, device='cuda:0')


HBox(children=(FloatProgress(value=0.0, max=1172.0), HTML(value='')))

Batch Loss: 20.15785203874111 Accuracy: tensor(0.9210, device='cuda:0')
Batch Loss: 40.59214938431978 Accuracy: tensor(0.9191, device='cuda:0')
Batch Loss: 61.05867328494787 Accuracy: tensor(0.9184, device='cuda:0')
Batch Loss: 81.02025801688433 Accuracy: tensor(0.9185, device='cuda:0')
Batch Loss: 102.12934067100286 Accuracy: tensor(0.9174, device='cuda:0')
Batch Loss: 122.52237788587809 Accuracy: tensor(0.9176, device='cuda:0')
Batch Loss: 142.74385908991098 Accuracy: tensor(0.9176, device='cuda:0')
Batch Loss: 162.65537051856518 Accuracy: tensor(0.9180, device='cuda:0')
Batch Loss: 182.9702299311757 Accuracy: tensor(0.9177, device='cuda:0')
Batch Loss: 203.48217893391848 Accuracy: tensor(0.9176, device='cuda:0')
Batch Loss: 223.95699451863766 Accuracy: tensor(0.9177, device='cuda:0')

Train Loss: 238.52126350998878 Accuracy: tensor(0.9176, device='cuda:0')


HBox(children=(FloatProgress(value=0.0, max=3125.0), HTML(value='')))


Accuracy: tensor(0.8927, device='cuda:0')


HBox(children=(FloatProgress(value=0.0, max=1172.0), HTML(value='')))

Batch Loss: 19.34702306985855 Accuracy: tensor(0.9243, device='cuda:0')
Batch Loss: 38.43065541237593 Accuracy: tensor(0.9248, device='cuda:0')
Batch Loss: 57.13880566507578 Accuracy: tensor(0.9247, device='cuda:0')
Batch Loss: 75.43701507151127 Accuracy: tensor(0.9253, device='cuda:0')
Batch Loss: 93.708944670856 Accuracy: tensor(0.9256, device='cuda:0')
Batch Loss: 111.7077522277832 Accuracy: tensor(0.9266, device='cuda:0')
Batch Loss: 130.59697571396828 Accuracy: tensor(0.9263, device='cuda:0')
Batch Loss: 150.8973452374339 Accuracy: tensor(0.9255, device='cuda:0')
Batch Loss: 169.84778137505054 Accuracy: tensor(0.9250, device='cuda:0')
Batch Loss: 189.20723965764046 Accuracy: tensor(0.9248, device='cuda:0')
Batch Loss: 207.78429460525513 Accuracy: tensor(0.9247, device='cuda:0')

Train Loss: 221.877219080925 Accuracy: tensor(0.9244, device='cuda:0')


HBox(children=(FloatProgress(value=0.0, max=3125.0), HTML(value='')))


Accuracy: tensor(0.8920, device='cuda:0')


In [None]:
losses, accuracies

([469.4898682832718,
  352.69267167150974,
  317.60789158940315,
  293.5753981694579,
  273.3288594484329,
  254.85273630917072,
  238.52126350998878,
  221.877219080925],
 [tensor(0.8193, device='cuda:0'),
  tensor(0.8726, device='cuda:0'),
  tensor(0.8866, device='cuda:0'),
  tensor(0.8961, device='cuda:0'),
  tensor(0.9044, device='cuda:0'),
  tensor(0.9115, device='cuda:0'),
  tensor(0.9176, device='cuda:0'),
  tensor(0.9244, device='cuda:0')])

In [None]:
# 모델 저장하기
torch.save(model.state_dict(), "koelectra-small_4.pt")

# 제출 파일 생성

In [None]:
model.eval()

num = 0
f = open("ko_pred_4.csv", "w")
f.write("Id" + ',' + "Predicted" + '\n')

for input_ids_batch, attention_masks_batch in tqdm(test_loader):
  y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
  _, predicted = torch.max(y_pred, 1)
  f.write(str(num) + ',' + str(predicted[0].cpu().numpy()) + '\n')

  num += 1

f.close()