# 파이토치 라이트닝 자연어 처리 태스크

In [None]:
!pip install pytorch-lightning
!pip install transformers
!pip install kiwipiepy
!pip install wget
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
cd /content/drive/MyDrive/wjdckdtn15@hashscraper.com/HASHNLP/model

/content/drive/MyDrive/wjdckdtn15@hashscraper.com/HASHNLP/model


In [None]:
ls

[0m[01;34mcheckpoint[0m/      hash-koelectra-small.pt     model_test.ipynb
cs_kobigbird.pt  hash-koelectra-small-v6.pt  [01;34m__pycache__[0m/
[01;34mhash[0m/            [01;34mlightning_logs[0m/             pytorch_lightning.ipynb


In [None]:
from datetime import datetime
from typing import Optional

import torch
import datasets

import pandas as pd
import numpy as np

from collections import OrderedDict
from pytorch_lightning import LightningDataModule, LightningModule, Trainer, seed_everything
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from torch.utils.data import DataLoader, Dataset, random_split
from transformers import (
    AutoConfig,
    ElectraForSequenceClassification,
    ElectraTokenizer,
    get_linear_schedule_with_warmup,
)

In [None]:
class HashClassification(LightningModule):
  """LightningModule that fine-tunes an ELECTRA sequence-classification model.

  The wrapped Hugging Face model is stored as ``self.model``; every
  constructor argument is recorded on ``self.hparams``.
  """

  def __init__(
      self,
      model_name_or_path: str,
      num_labels: int,
      learning_rate: float = 5e-5,
      adam_epsilon: float = 1e-8,
      warmup_steps: int = 0,
      weight_decay: float = 0.0,
      train_batch_size: int = 64,
      eval_batch_size: int = 64,
      eval_splits: Optional[list] = None,
      **kwargs,
  ):
    """Load the pretrained config/model and the accuracy metric.

    Args:
      model_name_or_path: Hugging Face model id or local path.
      num_labels: Size of the classification head (1 is handled as a
        single-output head in ``validation_step``).
      learning_rate: AdamW learning rate.
      adam_epsilon: AdamW epsilon.
      warmup_steps: Number of linear warm-up steps for the scheduler.
      weight_decay: Weight decay applied to all parameters except biases
        and LayerNorm weights.
      train_batch_size: Stored in ``hparams`` only; not read in this class.
      eval_batch_size: Stored in ``hparams`` only; not read in this class.
      eval_splits: Stored in ``hparams`` only; not read in this class.
      **kwargs: Extra arguments, also captured by ``save_hyperparameters``.
    """
    super().__init__()

    # Record every constructor argument on self.hparams.
    self.save_hyperparameters()

    self.config = AutoConfig.from_pretrained(model_name_or_path, num_labels=num_labels)
    self.model = ElectraForSequenceClassification.from_pretrained(model_name_or_path, config=self.config)
    # NOTE(review): `datasets.load_metric` is reported as deprecated in newer
    # `datasets` releases (moved to the `evaluate` package) - confirm against
    # the installed version before upgrading.
    self.metric = datasets.load_metric('accuracy')

  def forward(self, **inputs):
    """Run the wrapped model on keyword inputs and return its raw output."""
    return self.model(**inputs)

  def training_step(self, batch, batch_idx):
    """Return the training loss (first element of the model output)."""
    outputs = self(**batch)
    return outputs[0]

  def validation_step(self, batch, batch_idx, dataloader_idx=0):
    """Compute loss and predictions for one validation batch.

    Returns:
      Dict with ``loss``, ``preds`` and ``labels`` for
      ``validation_epoch_end`` to aggregate.
    """
    outputs = self(**batch)
    val_loss, logits = outputs[:2]

    if self.hparams.num_labels > 1:
      preds = torch.argmax(logits, dim=1)
    else:
      # Drop only the trailing label dimension. A bare squeeze() would also
      # remove the batch dimension for a batch of one, producing a 0-dim
      # tensor that torch.cat in validation_epoch_end cannot concatenate.
      preds = logits.squeeze(-1)

    return {'loss': val_loss, 'preds': preds, 'labels': batch['labels']}

  def validation_epoch_end(self, outputs):
    """Aggregate per-batch results and log ``val_loss`` plus the metric."""
    # cat joins the per-batch 1-D tensors along the existing dimension;
    # stack builds a new dimension out of the scalar losses.
    preds = torch.cat([step['preds'] for step in outputs]).detach().cpu().numpy()
    labels = torch.cat([step['labels'] for step in outputs]).detach().cpu().numpy()
    loss = torch.stack([step['loss'] for step in outputs]).mean()
    self.log('val_loss', loss, prog_bar=True)
    self.log_dict(self.metric.compute(predictions=preds, references=labels), prog_bar=True)

  def configure_optimizers(self):
    """Prepare optimizer and schedule (linear warmup and decay)."""
    # Biases and LayerNorm weights are excluded from weight decay.
    no_decay = ['bias', 'LayerNorm.weight']
    decay_params, no_decay_params = [], []
    for name, param in self.model.named_parameters():
      if any(marker in name for marker in no_decay):
        no_decay_params.append(param)
      else:
        decay_params.append(param)

    optimizer_grouped_parameters = [
        {'params': decay_params, 'weight_decay': self.hparams.weight_decay},
        {'params': no_decay_params, 'weight_decay': 0.0},
    ]

    optimizer = torch.optim.AdamW(
        optimizer_grouped_parameters,
        lr=self.hparams.learning_rate,
        eps=self.hparams.adam_epsilon,
    )

    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=self.hparams.warmup_steps,
        num_training_steps=self.trainer.estimated_stepping_batches,
    )
    # Step the scheduler after every optimizer step rather than every epoch.
    scheduler = {'scheduler': scheduler, 'interval': 'step', 'frequency': 1}
    return [optimizer], [scheduler]

In [None]:
class HashDataModule(LightningDataModule):
  """LightningDataModule for the sentiment-analysis CSV splits.

  Each CSV is read with a ``sentence`` text column and a ``label`` column;
  sentences are tokenized with the ELECTRA tokenizer and the label is
  exposed to the model as ``labels``.
  """

  # Columns that may be handed to the DataLoader; anything else is hidden.
  loader_columns = [
    'datasets_idx',
    'input_ids',
    'token_type_ids',
    'attention_mask',
    'start_positions',
    'end_positions',
    'labels',
  ]

  # Single source of truth for the CSV locations used by setup/prepare_data.
  data_dir = '/content/drive/MyDrive/wjdckdtn15@hashscraper.com/HASHNLP/data/sentiment_analysis'
  data_files = {
    'train': f'{data_dir}/train_clean.csv',
    'validation': f'{data_dir}/val_clean.csv',
    'test': f'{data_dir}/test_clean.csv',
  }

  def __init__(
      self,
      model_name_or_path: str,
      max_seq_length: int = 128,
      train_batch_size: int = 64,
      eval_batch_size: int = 64,
      **kwargs,
  ):
    """Store the configuration and load the tokenizer.

    Args:
      model_name_or_path: Hugging Face model id or local path of the tokenizer.
      max_seq_length: Length every sequence is padded/truncated to.
      train_batch_size: Batch size of the training DataLoader.
      eval_batch_size: Batch size of the validation/test DataLoaders.
      **kwargs: Accepted for call-site compatibility and ignored.
    """
    super().__init__()
    self.model_name_or_path = model_name_or_path
    self.max_seq_length = max_seq_length
    self.train_batch_size = train_batch_size
    self.eval_batch_size = eval_batch_size

    self.text_fields = ['sentence']
    self.num_labels = 2
    self.tokenizer = ElectraTokenizer.from_pretrained(self.model_name_or_path)

  def setup(self, stage: Optional[str] = None):
    """Load the CSV splits, tokenize them and expose them as torch tensors.

    ``stage`` is accepted for the Lightning hook signature but not used:
    all splits are always prepared.
    """
    self.dataset = datasets.load_dataset('csv', data_files=dict(self.data_files))
    for split in list(self.dataset):
      # batched=True hands convert_to_features a batch of rows at a time.
      self.dataset[split] = self.dataset[split].map(
          self.convert_to_features,
          batched=True,
          remove_columns=['label'],
      )
      # Keep only the columns listed in loader_columns.
      self.columns = [c for c in self.dataset[split].column_names if c in self.loader_columns]
      # Make __getitem__ return torch tensors for those columns.
      self.dataset[split].set_format(type='torch', columns=self.columns)

    self.eval_splits = [name for name in self.dataset if 'validation' in name]
    self.test_splits = [name for name in self.dataset if 'test' in name]

  def prepare_data(self):
    """Load the dataset and tokenizer once; the results are discarded."""
    datasets.load_dataset('csv', data_files=dict(self.data_files))
    ElectraTokenizer.from_pretrained(self.model_name_or_path)

  def train_dataloader(self):
    """Return the shuffled training DataLoader."""
    return DataLoader(self.dataset['train'], batch_size=self.train_batch_size, shuffle=True)

  def val_dataloader(self):
    """Return one validation DataLoader, or a list when several splits exist."""
    return self._loaders_for(self.eval_splits)

  def test_dataloader(self):
    """Return one test DataLoader, or a list when several test splits exist."""
    # Uses the test splits; previously this branched on (and could return)
    # the validation splits.
    return self._loaders_for(self.test_splits)

  def _loaders_for(self, splits):
    """Build evaluation DataLoaders: None for no split, a loader for one, else a list."""
    loaders = [DataLoader(self.dataset[name], batch_size=self.eval_batch_size) for name in splits]
    if not loaders:
      return None
    return loaders[0] if len(loaders) == 1 else loaders

  def convert_to_features(self, example_batch, indices=None):
    """Tokenize a batch of rows and attach their labels.

    Args:
      example_batch: Mapping of column name to a list of values for the batch.
      indices: Unused; kept for signature compatibility.

    Returns:
      The tokenizer output with an added ``labels`` entry.
    """
    texts = example_batch[self.text_fields[0]]

    # padding='max_length' (the replacement for pad_to_max_length=True) makes
    # every row exactly max_seq_length long. padding=True only pads to the
    # longest row of the current map batch, so rows coming from different
    # map batches could differ in length and fail to collate once shuffled.
    features = self.tokenizer(
        texts, max_length=self.max_seq_length, padding='max_length', truncation=True
    )

    # Rename label to labels to make it easier to pass to model forward
    features['labels'] = example_batch['label']

    return features

In [None]:
# Fix the global random seed so runs are repeatable.
seed_everything(42)

Global seed set to 42


42

In [None]:
# Build the data module; the tokenizer is loaded from the same KoELECTRA checkpoint.
dm = HashDataModule(model_name_or_path='monologg/koelectra-small-v3-discriminator')

In [None]:
# Run setup up front so dm.eval_splits exists before the model is constructed.
dm.setup('fit')



  0%|          | 0/3 [00:00<?, ?it/s]



In [None]:
# Build the Lightning module; label count and eval splits come from the data module.
model = HashClassification(
    model_name_or_path='monologg/koelectra-small-v3-discriminator',
    num_labels=dm.num_labels,
    eval_splits=dm.eval_splits,
)

Some weights of the model checkpoint at monologg/koelectra-small-v3-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-small-v3-discriminator and are newly initialized

In [None]:
# Keep only the single best checkpoint, judged by the lowest val_loss.
checkpoint_callback = ModelCheckpoint(
    dirpath='/content/drive/MyDrive/wjdckdtn15@hashscraper.com/HASHNLP/model/checkpoint',
    filename='{epoch:02d}-{val_loss:.3f}',
    monitor='val_loss',
    mode='min',
    save_top_k=1,
    save_last=False,
    verbose=True,
)

# Stop training after 5 consecutive validation checks without a val_loss improvement.
early_stopping = EarlyStopping(monitor='val_loss', mode='min', patience=5)

In [None]:
# Train for at most 30 epochs; request one device when CUDA is present,
# otherwise leave the device choice to Lightning.
trainer = Trainer(
    accelerator='auto',
    devices=None if not torch.cuda.is_available() else 1,
    max_epochs=30,
    callbacks=[checkpoint_callback, early_stopping],
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [None]:
# Fine-tune; checkpointing and early stopping are driven by the callbacks given to the Trainer.
trainer.fit(model, datamodule=dm)



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/284 [00:00<?, ?ba/s]

  0%|          | 0/16 [00:00<?, ?ba/s]

  0%|          | 0/16 [00:00<?, ?ba/s]

  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name  | Type                             | Params
-----------------------------------------------------------
0 | model | ElectraForSequenceClassification | 14.1 M
-----------------------------------------------------------
14.1 M    Trainable params
0         Non-trainable params
14.1 M    Total params
56.490    Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]



Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Epoch 0, global step 4428: 'val_loss' reached 0.20123 (best 0.20123), saving model to '/content/drive/MyDrive/wjdckdtn15@hashscraper.com/HASHNLP/model/checkpoint/epoch=00-val_loss=0.201.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 1, global step 8856: 'val_loss' reached 0.17903 (best 0.17903), saving model to '/content/drive/MyDrive/wjdckdtn15@hashscraper.com/HASHNLP/model/checkpoint/epoch=01-val_loss=0.179.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 2, global step 13284: 'val_loss' reached 0.17590 (best 0.17590), saving model to '/content/drive/MyDrive/wjdckdtn15@hashscraper.com/HASHNLP/model/checkpoint/epoch=02-val_loss=0.176.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 3, global step 17712: 'val_loss' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 4, global step 22140: 'val_loss' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 5, global step 26568: 'val_loss' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 6, global step 30996: 'val_loss' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 7, global step 35424: 'val_loss' was not in top 1


In [None]:
# Evaluate on the validation split using the weights currently held by `model`.
# NOTE(review): these are presumably the last-epoch weights, not the best
# checkpoint saved by ModelCheckpoint - confirm whether ckpt_path should be passed.
trainer.validate(model, dm)



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/284 [00:00<?, ?ba/s]

  0%|          | 0/16 [00:00<?, ?ba/s]

  0%|          | 0/16 [00:00<?, ?ba/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Validation: 0it [00:00, ?it/s]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
     Validate metric           DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        accuracy            0.9335577487945557
        val_loss            0.21011854708194733
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'accuracy': 0.9335577487945557, 'val_loss': 0.21011854708194733}]

In [None]:
# Rebuild the plain Hugging Face model (architecture plus pretrained weights).
config = AutoConfig.from_pretrained('monologg/koelectra-small-v3-discriminator', num_labels=2)
model = ElectraForSequenceClassification.from_pretrained('monologg/koelectra-small-v3-discriminator', config=config)

# Load the Lightning checkpoint written during training.
# Checkpoint keys: 'epoch', 'global_step', 'pytorch-lightning_version', 'state_dict', 'loops',
# 'callbacks', 'optimizer_states', 'lr_schedulers', 'hparams_name', 'hyper_parameters'
# map_location='cpu' lets a checkpoint saved on a GPU load on a CPU-only machine.
checkpoint = torch.load('checkpoint/epoch=02-val_loss=0.176.ckpt', map_location='cpu')

# The checkpoint's state_dict keys do not match the bare model's keys:
# HashClassification keeps the network in its `model` attribute, so every
# key is saved as 'model.<name>'. Strip that prefix (and only that prefix)
# so the weights can be loaded into the bare model.
state_dict = checkpoint['state_dict']

prefix = 'model.'
new_state_dict = OrderedDict(
    (key[len(prefix):] if key.startswith(prefix) else key, value)
    for key, value in state_dict.items()
)

model.load_state_dict(new_state_dict)

Some weights of the model checkpoint at monologg/koelectra-small-v3-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-small-v3-discriminator and are newly initialized

<All keys matched successfully>

In [None]:
# Load the tokenizer that matches the fine-tuned model, for use at inference time.
tokenizer = ElectraTokenizer.from_pretrained('monologg/koelectra-small-v3-discriminator')

In [None]:
# Test with real data: wrap the model and tokenizer in a text-classification pipeline.
from transformers import TextClassificationPipeline
pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer)

In [None]:
# Classify one sample sentence; the result is a list of {'label', 'score'} dicts.
pipe('배송이 매우 빠릅니다')

[{'label': 'LABEL_0', 'score': 0.8974786400794983}]

In [None]:
# Save the loaded model with save_pretrained under hash/pytorch-lightning-test.
model.save_pretrained('hash/pytorch-lightning-test')