<a href="https://colab.research.google.com/github/JeehwanLim/202002_NLP_FIN/blob/main/%EA%B8%B0%EB%A7%90%EA%B3%BC%EC%A0%9C_%ED%95%9C%EA%B8%806.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 사전 설정

In [1]:
# HuggingFace transformers 설치 및 NSMC 데이터셋 다운로드
!pip install transformers

import pandas as pd
import torch
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, ElectraForSequenceClassification, AdamW
from tqdm.notebook import tqdm

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/50/0c/7d5950fcd80b029be0a8891727ba21e0cd27692c407c51261c3c921f6da3/transformers-4.1.1-py3-none-any.whl (1.5MB)
[K     |████████████████████████████████| 1.5MB 8.3MB/s 
Collecting tokenizers==0.9.4
[?25l  Downloading https://files.pythonhosted.org/packages/0f/1c/e789a8b12e28be5bc1ce2156cf87cb522b379be9cadc7ad8091a4cc107c4/tokenizers-0.9.4-cp36-cp36m-manylinux2010_x86_64.whl (2.9MB)
[K     |████████████████████████████████| 2.9MB 14.9MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 36.9MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp36-none-any.whl size=893261 sha256=f5bbf47036031e943b6

In [None]:
from google.colab import drive
drive.mount('/gdrive', force_remount=True)

Mounted at /gdrive


In [2]:
##GPU 사용 시
device = torch.device("cuda:0")

In [3]:
!git clone https://github.com/JeehwanLim/202002_NLP_FIN.git

Cloning into '202002_NLP_FIN'...
remote: Enumerating objects: 37, done.[K
remote: Counting objects: 100% (37/37), done.[K
remote: Compressing objects: 100% (36/36), done.[K
remote: Total 37 (delta 17), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (37/37), done.


In [4]:
!git clone https://github.com/e9t/nsmc.git

Cloning into 'nsmc'...
remote: Enumerating objects: 14763, done.[K
remote: Total 14763 (delta 0), reused 0 (delta 0), pack-reused 14763[K
Receiving objects: 100% (14763/14763), 56.19 MiB | 22.73 MiB/s, done.
Resolving deltas: 100% (1749/1749), done.
Checking out files: 100% (14737/14737), done.


# 데이터 전처리

In [5]:
class NSMCDataset(Dataset):
  
  def __init__(self, csv_file):
    # 일부 값중에 NaN이 있음...
    self.dataset = pd.read_csv(csv_file, sep='\t').dropna(axis=0) 
# 한글과 빈칸만 남기고 나머지 제거 
#    self.dataset['document'] = self.dataset['document'].str.replace("[^가-힣ㄱ-ㅎㅏ-ㅣ ]","")
# 빈값 제거
#    self.dataset = self.dataset.dropna(how = 'any')
    self.tokenizer = AutoTokenizer.from_pretrained("monologg/koelectra-base-v3-discriminator")

    print(self.dataset.describe())
  
  def __len__(self):
    return len(self.dataset)
  
  def __getitem__(self, idx):
    row = self.dataset.iloc[idx, 1:3].values
    text = row[0]
    y = row[1]

    inputs = self.tokenizer(
        text, 
        return_tensors='pt',
        truncation=True,
        max_length=256,
        pad_to_max_length=True,
        add_special_tokens=True
        )
    
    input_ids = inputs['input_ids'][0]
    attention_mask = inputs['attention_mask'][0]

    return input_ids, attention_mask, y

In [6]:
class NSMCDataset_test(Dataset):
  
  def __init__(self, csv_file):
    # 일부 값중에 NaN이 있음...
    self.dataset = pd.read_csv(csv_file, sep=',', encoding='cp949')
# 한글과 빈칸만 남기고 나머지 제거 
#    self.dataset["Sentence"] = self.dataset["Sentence"].str.replace("[^가-힣ㄱ-ㅎㅏ-ㅣ ]","")
    self.tokenizer = AutoTokenizer.from_pretrained("monologg/koelectra-base-v3-discriminator")

    print(self.dataset.describe())
  
  def __len__(self):
    return len(self.dataset)
  
  def __getitem__(self, idx):
    row = self.dataset.iloc[idx, 1:2].values
    text = row[0]

    inputs = self.tokenizer(
        text, 
        return_tensors='pt',
        truncation=True,
        max_length=256,
        pad_to_max_length=True,
        add_special_tokens=True
        )
    
    input_ids = inputs['input_ids'][0]
    attention_mask = inputs['attention_mask'][0]

    return input_ids, attention_mask

In [7]:
train_dataset = NSMCDataset("nsmc/ratings_train.txt")
validation_dataset = NSMCDataset("nsmc/ratings_test.txt")
test_dataset = NSMCDataset_test("202002_NLP_FIN/ko_data.csv")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=467.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=263326.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=61.0, style=ProgressStyle(description_w…


                 id         label
count  1.499950e+05  149995.00000
mean   6.743648e+06       0.49885
std    2.919001e+06       0.50000
min    3.300000e+01       0.00000
25%    4.767467e+06       0.00000
50%    7.526885e+06       0.00000
75%    9.249448e+06       1.00000
max    1.027815e+07       1.00000
                 id         label
count  4.999700e+04  49997.000000
mean   6.728781e+06      0.503450
std    2.936634e+06      0.499993
min    6.010000e+02      0.000000
25%    4.736525e+06      0.000000
50%    7.530895e+06      1.000000
75%    9.246512e+06      1.000000
max    1.027809e+07      1.000000
                 Id
count  11187.000000
mean    5593.000000
std     3229.553065
min        0.000000
25%     2796.500000
50%     5593.000000
75%     8389.500000
max    11186.000000


# 모델 생성 및 학습

In [8]:
model = ElectraForSequenceClassification.from_pretrained("monologg/koelectra-base-v3-discriminator").to(device)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=451776329.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at monologg/koelectra-base-v3-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-base-v3-discriminator and are newly initialized: 

In [None]:
# model.load_state_dict(torch.load("koelectra-base_5.pt"))
# model.to(device)

In [9]:
epochs = 4
batch_size = 32
learning_rate = 1e-5

In [10]:
optimizer = AdamW(model.parameters(), lr=learning_rate)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
validation_loader = DataLoader(validation_dataset, batch_size=8)
test_loader = DataLoader(test_dataset)

In [11]:
losses = []
accuracies = []

for i in range(epochs):
  total_loss = 0.0
  correct = 0
  total = 0
  batches = 0

  model.train()

  for input_ids_batch, attention_masks_batch, y_batch in tqdm(train_loader):
    optimizer.zero_grad()
    y_batch = y_batch.to(device)
    y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
    loss = F.cross_entropy(y_pred, y_batch)
    loss.backward()
    optimizer.step()

    total_loss += loss.item()

    _, predicted = torch.max(y_pred, 1)
    correct += (predicted == y_batch).sum()
    total += len(y_batch)

    batches += 1
    if batches % 100 == 0:
      print("Batch Loss:", total_loss, "Accuracy:", correct.float() / total)
  
  losses.append(total_loss)
  accuracies.append(correct.float() / total)
  print("Train Loss:", total_loss, "Accuracy:", correct.float() / total)

# Validation set 정확도 확인
  model.eval()

  test_correct = 0
  test_total = 0

  for input_ids_batch, attention_masks_batch, y_batch in tqdm(validation_loader):
    y_batch = y_batch.to(device)
    y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
    _, predicted = torch.max(y_pred, 1)
    test_correct += (predicted == y_batch).sum()
    test_total += len(y_batch)

  print("Accuracy:", test_correct.float() / test_total)

HBox(children=(FloatProgress(value=0.0, max=4688.0), HTML(value='')))



Batch Loss: 64.21300512552261 Accuracy: tensor(0.6500, device='cuda:0')
Batch Loss: 105.7320316284895 Accuracy: tensor(0.7359, device='cuda:0')
Batch Loss: 141.83828619122505 Accuracy: tensor(0.7730, device='cuda:0')
Batch Loss: 174.1862806379795 Accuracy: tensor(0.7954, device='cuda:0')
Batch Loss: 204.48665761202574 Accuracy: tensor(0.8111, device='cuda:0')
Batch Loss: 234.618557728827 Accuracy: tensor(0.8215, device='cuda:0')
Batch Loss: 265.4269367977977 Accuracy: tensor(0.8276, device='cuda:0')
Batch Loss: 296.7165076434612 Accuracy: tensor(0.8325, device='cuda:0')
Batch Loss: 325.3267736211419 Accuracy: tensor(0.8377, device='cuda:0')
Batch Loss: 353.69534834474325 Accuracy: tensor(0.8413, device='cuda:0')
Batch Loss: 381.65957563370466 Accuracy: tensor(0.8453, device='cuda:0')
Batch Loss: 409.75675600767136 Accuracy: tensor(0.8485, device='cuda:0')
Batch Loss: 438.503969155252 Accuracy: tensor(0.8507, device='cuda:0')
Batch Loss: 468.296050645411 Accuracy: tensor(0.8524, device=

HBox(children=(FloatProgress(value=0.0, max=6250.0), HTML(value='')))


Accuracy: tensor(0.9045, device='cuda:0')


HBox(children=(FloatProgress(value=0.0, max=4688.0), HTML(value='')))

Batch Loss: 20.681840494275093 Accuracy: tensor(0.9191, device='cuda:0')
Batch Loss: 41.345000894740224 Accuracy: tensor(0.9156, device='cuda:0')
Batch Loss: 61.98749587871134 Accuracy: tensor(0.9140, device='cuda:0')
Batch Loss: 83.31489460729063 Accuracy: tensor(0.9149, device='cuda:0')
Batch Loss: 102.50794657506049 Accuracy: tensor(0.9160, device='cuda:0')
Batch Loss: 123.30252478085458 Accuracy: tensor(0.9163, device='cuda:0')
Batch Loss: 143.61038267053664 Accuracy: tensor(0.9162, device='cuda:0')
Batch Loss: 165.4011120032519 Accuracy: tensor(0.9160, device='cuda:0')
Batch Loss: 185.03692144341767 Accuracy: tensor(0.9164, device='cuda:0')
Batch Loss: 206.8564520534128 Accuracy: tensor(0.9159, device='cuda:0')
Batch Loss: 227.16465857438743 Accuracy: tensor(0.9160, device='cuda:0')
Batch Loss: 248.49585045315325 Accuracy: tensor(0.9154, device='cuda:0')
Batch Loss: 269.9068093765527 Accuracy: tensor(0.9155, device='cuda:0')
Batch Loss: 289.9381752666086 Accuracy: tensor(0.9155, d

HBox(children=(FloatProgress(value=0.0, max=6250.0), HTML(value='')))


Accuracy: tensor(0.9085, device='cuda:0')


HBox(children=(FloatProgress(value=0.0, max=4688.0), HTML(value='')))

Batch Loss: 15.594406185671687 Accuracy: tensor(0.9406, device='cuda:0')
Batch Loss: 31.58941170759499 Accuracy: tensor(0.9387, device='cuda:0')
Batch Loss: 47.916379099711776 Accuracy: tensor(0.9388, device='cuda:0')
Batch Loss: 63.510479509830475 Accuracy: tensor(0.9383, device='cuda:0')
Batch Loss: 79.15804393123835 Accuracy: tensor(0.9386, device='cuda:0')
Batch Loss: 95.27785582561046 Accuracy: tensor(0.9384, device='cuda:0')
Batch Loss: 110.7306326655671 Accuracy: tensor(0.9385, device='cuda:0')
Batch Loss: 126.7785809719935 Accuracy: tensor(0.9386, device='cuda:0')
Batch Loss: 144.09819186944515 Accuracy: tensor(0.9376, device='cuda:0')
Batch Loss: 160.27226483915 Accuracy: tensor(0.9373, device='cuda:0')
Batch Loss: 177.32470167707652 Accuracy: tensor(0.9366, device='cuda:0')
Batch Loss: 193.05986158829182 Accuracy: tensor(0.9370, device='cuda:0')
Batch Loss: 208.41534213069826 Accuracy: tensor(0.9370, device='cuda:0')
Batch Loss: 224.33340300712734 Accuracy: tensor(0.9372, dev

HBox(children=(FloatProgress(value=0.0, max=6250.0), HTML(value='')))


Accuracy: tensor(0.9072, device='cuda:0')


HBox(children=(FloatProgress(value=0.0, max=4688.0), HTML(value='')))

Batch Loss: 11.449177294969559 Accuracy: tensor(0.9584, device='cuda:0')
Batch Loss: 21.456816618330777 Accuracy: tensor(0.9622, device='cuda:0')
Batch Loss: 31.025388149544597 Accuracy: tensor(0.9620, device='cuda:0')
Batch Loss: 43.66652592364699 Accuracy: tensor(0.9601, device='cuda:0')
Batch Loss: 55.18145865947008 Accuracy: tensor(0.9596, device='cuda:0')
Batch Loss: 68.79546484351158 Accuracy: tensor(0.9574, device='cuda:0')
Batch Loss: 80.87390547432005 Accuracy: tensor(0.9567, device='cuda:0')
Batch Loss: 92.51025325804949 Accuracy: tensor(0.9562, device='cuda:0')
Batch Loss: 105.23035385087132 Accuracy: tensor(0.9558, device='cuda:0')
Batch Loss: 117.75163915660232 Accuracy: tensor(0.9556, device='cuda:0')
Batch Loss: 129.80927911680192 Accuracy: tensor(0.9557, device='cuda:0')
Batch Loss: 141.4507578611374 Accuracy: tensor(0.9560, device='cuda:0')
Batch Loss: 154.65150877647102 Accuracy: tensor(0.9555, device='cuda:0')
Batch Loss: 167.86287904251367 Accuracy: tensor(0.9550, d

HBox(children=(FloatProgress(value=0.0, max=6250.0), HTML(value='')))


Accuracy: tensor(0.9078, device='cuda:0')


In [12]:
losses, accuracies

([1322.3479239866138, 969.9114787094295, 766.7504205773585, 595.1889339843765],
 [tensor(0.8802, device='cuda:0'),
  tensor(0.9164, device='cuda:0'),
  tensor(0.9364, device='cuda:0'),
  tensor(0.9516, device='cuda:0')])

In [13]:
# 모델 저장하기
torch.save(model.state_dict(), "koelectra-base_5.pt")

# 제출 파일 생성

In [14]:
model.eval()

num = 0
f = open("ko_pred_5.csv", "w")
f.write("Id" + ',' + "Predicted" + '\n')

for input_ids_batch, attention_masks_batch in tqdm(test_loader):
  y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
  _, predicted = torch.max(y_pred, 1)
  f.write(str(num) + ',' + str(predicted[0].cpu().numpy()) + '\n')

  num += 1

f.close()

HBox(children=(FloatProgress(value=0.0, max=11187.0), HTML(value='')))




