In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
from torch.nn import functional as F
from tqdm.notebook import tqdm
%matplotlib inline

In [3]:
import warnings
warnings.filterwarnings(action='ignore')

In [4]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 7.2 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 47.7 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 56.3 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 47.7 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 3.4 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found ex

In [5]:
from transformers import ElectraModel, ElectraTokenizer
from transformers import ElectraForSequenceClassification, AdamW
from transformers import get_cosine_schedule_with_warmup


tokenizer = ElectraTokenizer.from_pretrained("monologg/koelectra-small-v2-discriminator")

Downloading:   0%|          | 0.00/249k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/486 [00:00<?, ?B/s]

In [181]:
PATH = '/content/gdrive/MyDrive/A3_datasets/KoElectra/koelectra_datasets/'
cate = 'clean'
df = pd.read_csv(PATH + f'{cate}.csv')

In [182]:
df = df[['문장', f'{cate}']]

In [183]:
from torch.utils.data import Dataset, TensorDataset, DataLoader, random_split
from torch import torch
from sklearn.model_selection import train_test_split

def dataSplit(dataset, y_label):
  X_train, X_val= train_test_split(dataset, test_size = 0.2, stratify = dataset[y_label], random_state =427)
  return X_train, X_val

In [184]:
X_train, X_test = dataSplit(df, cate)

# validation 추가
X_train, X_val = dataSplit(X_train, cate)

In [185]:
class LoadDataset(Dataset):
    def __init__(self, df, tk):
        self.df = df
        self.tokenizer = tk

    def __len__(self):
        return len(self.df)
  
    def __getitem__(self, idx):
        row = self.df.iloc[idx, :].values
        # target이 없는경우 (즉, 문장만 입력된 경우)
        if len(row) <= 1:
            text = row[0]

            inputs = self.tokenizer(
                text, 
                return_tensors='pt',
                truncation=True,
                max_length=50,
                pad_to_max_length=True,
                add_special_tokens=True
                )
            
            input_ids = inputs['input_ids'][0]
            attention_mask = inputs['attention_mask'][0]

            return input_ids, attention_mask     
            
        # target이 있는 경우 (원래 코드)
        else:
            text = row[0]
            y = row[1]

            inputs = self.tokenizer(
                text, 
                return_tensors='pt',
                truncation=True,
                max_length=50,
                pad_to_max_length=True,
                add_special_tokens=True
                )
            
            input_ids = inputs['input_ids'][0]
            attention_mask = inputs['attention_mask'][0]

            return input_ids, attention_mask, y

In [186]:
train_set = LoadDataset(X_train, tokenizer)
val_set = LoadDataset(X_val, tokenizer)
test_set = LoadDataset(X_test, tokenizer)

In [187]:
model = ElectraForSequenceClassification.from_pretrained('monologg/koelectra-small-v2-discriminator')


Some weights of the model checkpoint at monologg/koelectra-small-v2-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-small-v2-discriminator and are newly initialized

In [188]:
model.classifier.out_proj =  nn.Sequential( nn.Linear(256, 1),
                                           nn.Sigmoid() )

# model.classifier.out_proj = nn.Linear(256, 1) # model 변경

In [189]:
device = torch.device("cuda:0")
model.to(device)

ElectraForSequenceClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(32200, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (embeddings_project): Linear(in_features=128, out_features=256, bias=True)
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0): ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=256, out_features=256, bias=True)
              (key): Linear(in_features=256, out_features=256, bias=True)
              (value): Linear(in_features=256, out_features=256, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_

In [190]:
epochs = 300 # epochs 증가
batch_size = 128
warmup_ratio=0.1
t_total = len(train_set) * epochs
optimizer = AdamW(model.parameters(), lr=1e-5, eps = 1e-8) # lr 1/10으로 변경
train_loader = DataLoader(train_set, batch_size=batch_size)
val_loader = DataLoader(val_set, batch_size=batch_size) # val loader 추가
test_loader = DataLoader(test_set, batch_size = batch_size) # test loader 추가
loss_f = nn.BCEWithLogitsLoss() # loss f 변경

scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=1, num_training_steps=t_total)

In [191]:
# https://github.com/Bjarten/early-stopping-pytorch
class EarlyStopping:
    def __init__(self, patience=7, verbose=False, delta=0, path=f'/content/gdrive/MyDrive/A3_datasets/KoElectra/koelectra_datasets/koelectra_model5/checkpoint_{cate}.pt'):
        """
        Args:
            patience (int): validation loss가 개선된 후 기다리는 기간
                            Default: 7
            verbose (bool): True일 경우 각 validation loss의 개선 사항 메세지 출력
                            Default: False
            delta (float): 개선되었다고 인정되는 monitered quantity의 최소 변화
                            Default: 0
            path (str): checkpoint저장 경로
                            Default: 'checkpoint.pt'
        """
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.val_loss_min = np.Inf
        self.delta = delta
        self.path = path

    def __call__(self, val_loss, model):

        score = -val_loss

        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
        elif score < self.best_score + self.delta:
            self.counter += 1
            print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0

    def save_checkpoint(self, val_loss, model):
        if self.verbose:
            print(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
        torch.save(model.state_dict(), self.path)
        self.val_loss_min = val_loss

In [192]:
early_stopping = EarlyStopping(patience = 7, verbose = True)

In [193]:
from sklearn.metrics import roc_auc_score

for i in tqdm(range(epochs)):
    train_loss_list = [] # 변수 변경
    val_loss_list = []
    val_score_list = []

    epoch_train_loss = []
    epoch_val_loss = []
    epoch_val_score = []
    # train
    model.train()
    for input_ids_batch, attention_masks_batch, y_batch in train_loader:
        input_ids_batch = input_ids_batch.to(device)
        attention_masks_batch = attention_masks_batch.to(device)
        y_batch = y_batch.to(device)
        optimizer.zero_grad()
        y_pred = model(input_ids_batch, attention_mask=attention_masks_batch).logits.reshape(-1)
#        print(y_pred)
        loss = loss_f(y_pred.type(torch.FloatTensor), y_batch.type(torch.FloatTensor))
#        print(loss)
        loss.backward()
        optimizer.step()
        scheduler.step()
        train_loss_list.append(loss.item())

    # validation loss
    model.eval()
    for input_ids_batch_val, attention_masks_batch_val, y_batch_val in val_loader:
        input_ids_batch_val = input_ids_batch_val.to(device)
        attention_masks_batch_val = attention_masks_batch_val.to(device)
        y_batch_val = y_batch_val.to(device)
        y_pred_val = model(input_ids_batch_val, attention_mask = attention_masks_batch_val).logits.reshape(-1)
        loss = loss_f(y_pred_val.type(torch.FloatTensor), y_batch_val.type(torch.FloatTensor))
        val_score = roc_auc_score(y_batch_val.tolist(), y_pred_val.tolist())
        val_loss_list.append(loss.item())
        val_score_list.append(val_score)

    # epoch당 loss 계산 (for early stopping)
    train_loss = np.average(train_loss_list)
    val_loss = np.average(val_loss_list)
    val_score = np.average(val_score_list)

    epoch_train_loss.append(train_loss)
    epoch_val_loss.append(val_loss)
    epoch_val_score.append(val_score)
    epoch_len = len(str(epochs))

    print_msg = (f'[{i:>{epoch_len}}/{epochs:>{epoch_len}}] ' +
                 f'train_loss: {train_loss:.5f} ' +
                 f'valid_loss: {val_loss:.5f} ' +
                 f'valid_score: {val_score:.5f}')

    print(print_msg)
    
    # clear lists to track next epoch
    train_loss_list = []
    val_loss_list = []
    val_score_list = []
    early_stopping(val_loss, model)
    if early_stopping.early_stop:
        print('early stopping')
        break
    
model.load_state_dict(torch.load(f'/content/gdrive/MyDrive/A3_datasets/KoElectra/koelectra_datasets/koelectra_model5/checkpoint_{cate}.pt'))

  0%|          | 0/300 [00:00<?, ?it/s]

[  0/300] train_loss: 0.71909 valid_loss: 0.71327 valid_score: 0.69627
Validation loss decreased (inf --> 0.713266).  Saving model ...
[  1/300] train_loss: 0.70929 valid_loss: 0.70408 valid_score: 0.74599
Validation loss decreased (0.713266 --> 0.704080).  Saving model ...
[  2/300] train_loss: 0.69929 valid_loss: 0.69004 valid_score: 0.77241
Validation loss decreased (0.704080 --> 0.690042).  Saving model ...
[  3/300] train_loss: 0.68232 valid_loss: 0.66970 valid_score: 0.81478
Validation loss decreased (0.690042 --> 0.669705).  Saving model ...
[  4/300] train_loss: 0.66176 valid_loss: 0.65137 valid_score: 0.82529
Validation loss decreased (0.669705 --> 0.651373).  Saving model ...
[  5/300] train_loss: 0.64651 valid_loss: 0.63843 valid_score: 0.84254
Validation loss decreased (0.651373 --> 0.638426).  Saving model ...
[  6/300] train_loss: 0.63346 valid_loss: 0.62953 valid_score: 0.84128
Validation loss decreased (0.638426 --> 0.629530).  Saving model ...
[  7/300] train_loss: 0.6

<All keys matched successfully>

In [194]:
model.eval()
valid_loader = DataLoader(val_set, batch_size=batch_size)
score_list = []
for input_ids_batch, attention_masks_batch, y_batch in tqdm(test_loader):
    input_ids_batch = input_ids_batch.to(device)
    attention_masks_batch = attention_masks_batch.to(device)
    y_batch = y_batch.to(device)
    y_pred = model(input_ids_batch, attention_mask=attention_masks_batch).logits.reshape(-1)
#    print(y_pred)
    try:
        score = roc_auc_score(y_batch.tolist(), y_pred.tolist())
        score_list.append(score)
    except: pass


print("epoch roc_auc:", np.mean(score_list))

  0%|          | 0/15 [00:00<?, ?it/s]

epoch roc_auc: 0.8466938368146065


In [195]:
# score 기록하기
final_score = np.mean(score_list)

In [196]:
#korean_col_name_list = ['clean','local','religion_model','']
#eng_model_name_list = ['clean','지역','종교','인종국적','연령','악플욕설','성소수자','성별','기타혐오','개인지칭']

#translate_dict = {}

torch.save(model.state_dict(), PATH + f'{cate}.pth')

In [197]:
print(final_score, cate)

0.8466938368146065 clean
