In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 11.8 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 46.1 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 39.3 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 6.0 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 45.8 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    F

In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [1]:
import pandas as pd
import numpy as np

from transformers import ElectraModel, ElectraTokenizer, AdamW, ElectraForSequenceClassification

import torch
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
from tqdm.notebook import tqdm

In [2]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Data load

In [224]:
PATH = '/content/gdrive/MyDrive/A3_datasets/KoElectra/koelectra_datasets/'
cate = 'clean'
df = pd.read_csv(PATH + f'{cate}.csv')

In [225]:
df.head()

Unnamed: 0,문장,성소수자,인종국적,연령,지역,종교,기타혐오,악플욕설,clean,개인지칭,성별
0,그러면 좆국이 웜련들의 보지놀이터가 될 것이다.,0,1,0,0,0,0,0,0,0,1
1,자세히좀써라 자칭 전문직이라는 게이가 글이 이게 뭐냐?인상찌푸려지네,0,0,0,0,0,0,1,0,0,0
2,말없이 진료만 하는 의사 out,0,0,0,0,0,0,1,0,0,0
3,정말이지 이 나라가 뭔가를 하긴하는데.. 왜이리 속는 기분일까나 누구를 위한 우쭈쭈...,0,0,0,0,0,0,0,1,0,0
4,세상에 여자들 상대로 장사하면서 여자 까는 심보는 뭐노?,0,0,0,0,0,0,1,0,0,0


In [226]:
# train valid split
from sklearn.model_selection import train_test_split

X_train, X_valid= train_test_split(df, test_size = 0.2, stratify = df['성별'], random_state =427)

In [227]:
class LoadDataset(Dataset):
    def __init__(self, df, tk):
        self.df = df
        self.tokenizer = tk
        
    def __len__(self):
        return len(self.df)
  
    def __getitem__(self, idx):
        row = self.df.iloc[idx, :].values
        # target이 없는경우 (즉, 문장만 입력된 경우)
        if len(row) <= 1:
            text = row[0]

            inputs = self.tokenizer(
                text, 
                return_tensors='pt',
                truncation=True,
                max_length=200,
                pad_to_max_length=True,
                add_special_tokens=True
                )
            
            input_ids = inputs['input_ids'][0].to(device)
            attention_mask = inputs['attention_mask'][0].to(device)

            return input_ids, attention_mask     
            
        # target이 있는 경우 (원래 코드)
        else:
            text = row[0]
            y = row[1]

            inputs = self.tokenizer(
                text, 
                return_tensors='pt',
                truncation=True,
                max_length=200,
                pad_to_max_length=True,
                add_special_tokens=True
                )
            
            input_ids = inputs['input_ids'][0].to(device)
            attention_mask = inputs['attention_mask'][0].to(device)

            return input_ids, attention_mask, y

In [228]:
# load tokenizer
tk = ElectraTokenizer.from_pretrained("monologg/koelectra-small-v2-discriminator")

In [229]:
train_dataset = LoadDataset(X_train, tk)
valid_dataset = LoadDataset(X_valid, tk)

# modeling

In [230]:
model = ElectraForSequenceClassification.from_pretrained('monologg/koelectra-small-v2-discriminator')

Some weights of the model checkpoint at monologg/koelectra-small-v2-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.weight']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-small-v2-discriminator and are newly initialized

In [231]:
# 마지막 layer 변경
model.classifier.out_proj = nn.Linear(256, 1)

In [232]:
# freeze
#for param in model.parameters():
#    param.requires_grad = True

In [233]:
model.classifier.add_module('sigmoid', nn.Sigmoid())

In [234]:
model.to(device)

ElectraForSequenceClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(32200, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (embeddings_project): Linear(in_features=128, out_features=256, bias=True)
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0): ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=256, out_features=256, bias=True)
              (key): Linear(in_features=256, out_features=256, bias=True)
              (value): Linear(in_features=256, out_features=256, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_

# fine tuning

In [235]:
epochs = 5
batch_size = 16

In [236]:
optimizer = AdamW(model.parameters(), lr=1e-5)
train_loader = DataLoader(train_dataset, batch_size=batch_size)
loss_f = nn.BCEWithLogitsLoss()

In [237]:
import warnings
warnings.filterwarnings(action='ignore')

In [238]:
from sklearn.metrics import roc_auc_score

for i in range(epochs):
    total_loss = 0.0
    batches = 0
#    score_list = []
    model.train()

    for input_ids_batch, attention_masks_batch, y_batch in tqdm(train_loader):
        optimizer.zero_grad()
        y_pred = model(input_ids_batch, attention_mask=attention_masks_batch).logits.reshape(-1)
#        print(y_pred)
        loss = loss_f(y_pred.type(torch.FloatTensor), y_batch.type(torch.FloatTensor))
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
#        print(y_batch)
        try:
            score = roc_auc_score(y_batch.tolist(), y_pred.tolist())
#            score_list.append(score)
        except: pass
        batches += 1
        if batches % 30 == 0:
            print("Batch Loss:", total_loss, "ROC_AUC:",score)


    print("Train Loss:", total_loss, "epoch roc_auc:", score)

  0%|          | 0/468 [00:00<?, ?it/s]

Batch Loss: 20.30484127998352 ROC_AUC: 1.0
Batch Loss: 37.99096989631653 ROC_AUC: 0.10714285714285715
Batch Loss: 52.79811033606529 ROC_AUC: 0.7333333333333334
Batch Loss: 65.32078063488007 ROC_AUC: 1.0
Batch Loss: 76.6987632215023 ROC_AUC: 0.3333333333333333
Batch Loss: 86.45765468478203 ROC_AUC: 1.0
Batch Loss: 96.2860412299633 ROC_AUC: 0.4
Batch Loss: 105.0431268364191 ROC_AUC: 0.8
Batch Loss: 112.86226919293404 ROC_AUC: 0.1333333333333333
Batch Loss: 120.57160496711731 ROC_AUC: 0.3928571428571429
Batch Loss: 128.3305622190237 ROC_AUC: 1.0
Batch Loss: 136.08413273096085 ROC_AUC: 0.5333333333333333
Batch Loss: 141.95763850212097 ROC_AUC: 0.5714285714285714
Batch Loss: 148.58117248862982 ROC_AUC: 0.8
Batch Loss: 155.36209981143475 ROC_AUC: 0.6666666666666667
Train Loss: 158.69334295392036 epoch roc_auc: 0.4666666666666667


  0%|          | 0/468 [00:00<?, ?it/s]

Batch Loss: 6.900894016027451 ROC_AUC: 0.6
Batch Loss: 14.545883312821388 ROC_AUC: 0.75
Batch Loss: 19.66627485305071 ROC_AUC: 0.06666666666666665
Batch Loss: 24.513817980885506 ROC_AUC: 0.4666666666666667
Batch Loss: 30.96471055597067 ROC_AUC: 0.6666666666666667
Batch Loss: 36.16732316464186 ROC_AUC: 0.9333333333333333
Batch Loss: 43.51970209926367 ROC_AUC: 0.8666666666666667
Batch Loss: 49.87739332765341 ROC_AUC: 0.1333333333333333
Batch Loss: 55.36252275109291 ROC_AUC: 0.06666666666666665
Batch Loss: 61.55847962573171 ROC_AUC: 0.4642857142857143
Batch Loss: 68.33462183177471 ROC_AUC: 0.4666666666666667
Batch Loss: 75.31224356219172 ROC_AUC: 0.0
Batch Loss: 80.18414948135614 ROC_AUC: 0.0357142857142857
Batch Loss: 86.11908722668886 ROC_AUC: 0.6666666666666667
Batch Loss: 92.44863341376185 ROC_AUC: 0.06666666666666665
Train Loss: 95.46273235604167 epoch roc_auc: 0.06666666666666665


  0%|          | 0/468 [00:00<?, ?it/s]

Batch Loss: 6.55431005358696 ROC_AUC: 0.9333333333333333
Batch Loss: 13.814559787511826 ROC_AUC: 0.8214285714285714
Batch Loss: 18.48158462718129 ROC_AUC: 0.6
Batch Loss: 22.735876452177763 ROC_AUC: 1.0
Batch Loss: 28.16636896878481 ROC_AUC: 0.9487179487179487
Batch Loss: 32.13393563777208 ROC_AUC: 1.0
Batch Loss: 37.144101805984974 ROC_AUC: 1.0
Batch Loss: 42.02897946536541 ROC_AUC: 1.0
Batch Loss: 45.67632453516126 ROC_AUC: 1.0
Batch Loss: 49.94716924801469 ROC_AUC: 0.8928571428571428
Batch Loss: 54.04815035313368 ROC_AUC: 1.0
Batch Loss: 59.47469122335315 ROC_AUC: 0.06666666666666665
Batch Loss: 62.42360087670386 ROC_AUC: 0.5714285714285714
Batch Loss: 66.18044548109174 ROC_AUC: 0.8
Batch Loss: 70.32505594752729 ROC_AUC: 0.8
Train Loss: 72.0030718408525 epoch roc_auc: 0.6666666666666667


  0%|          | 0/468 [00:00<?, ?it/s]

Batch Loss: 4.591203087940812 ROC_AUC: 1.0
Batch Loss: 7.590862404555082 ROC_AUC: 1.0
Batch Loss: 10.572640806436539 ROC_AUC: 1.0
Batch Loss: 12.660093627870083 ROC_AUC: 1.0
Batch Loss: 15.622341968119144 ROC_AUC: 0.923076923076923
Batch Loss: 17.428219191730022 ROC_AUC: 1.0
Batch Loss: 21.231236768886447 ROC_AUC: 1.0
Batch Loss: 24.26565120741725 ROC_AUC: 1.0
Batch Loss: 26.764735912904143 ROC_AUC: 1.0
Batch Loss: 29.745681066066027 ROC_AUC: 0.9285714285714286
Batch Loss: 32.99474278278649 ROC_AUC: 0.9333333333333333
Batch Loss: 36.630210284143686 ROC_AUC: 0.8
Batch Loss: 38.492601530626416 ROC_AUC: 1.0
Batch Loss: 41.513588602654636 ROC_AUC: 0.9333333333333333
Batch Loss: 44.61877438146621 ROC_AUC: 1.0
Train Loss: 45.89379145670682 epoch roc_auc: 0.19999999999999996


  0%|          | 0/468 [00:00<?, ?it/s]

Batch Loss: 3.8128458922728896 ROC_AUC: 1.0
Batch Loss: 5.80713398847729 ROC_AUC: 1.0
Batch Loss: 8.230783363804221 ROC_AUC: 1.0
Batch Loss: 9.74655247014016 ROC_AUC: 1.0
Batch Loss: 12.05292728729546 ROC_AUC: 0.7948717948717948
Batch Loss: 13.58952412661165 ROC_AUC: 1.0
Batch Loss: 16.91159504838288 ROC_AUC: 1.0
Batch Loss: 19.48291348479688 ROC_AUC: 1.0
Batch Loss: 21.474714084528387 ROC_AUC: 1.0
Batch Loss: 23.812171014025807 ROC_AUC: 0.9642857142857143
Batch Loss: 26.70770290028304 ROC_AUC: 1.0
Batch Loss: 30.136446989141405 ROC_AUC: 0.06666666666666665
Batch Loss: 31.551200236193836 ROC_AUC: 1.0
Batch Loss: 34.559027802199125 ROC_AUC: 0.8666666666666667
Batch Loss: 36.99443934485316 ROC_AUC: 1.0
Train Loss: 38.186975149437785 epoch roc_auc: 0.8


# evaluate

In [239]:
model.eval()
valid_loader = DataLoader(valid_dataset, batch_size=batch_size)
score_list = []
for input_ids_batch, attention_masks_batch, y_batch in tqdm(valid_loader):
    y_pred = model(input_ids_batch, attention_mask=attention_masks_batch).logits.reshape(-1)
#    print(y_pred)
    try:
        score = roc_auc_score(y_batch.tolist(), y_pred.tolist())
        score_list.append(score)
    except: pass


print("epoch roc_auc:", np.mean(score_list))

  0%|          | 0/117 [00:00<?, ?it/s]

epoch roc_auc: 0.9405035263244218


# save

In [240]:
torch.save(model.state_dict(), PATH + f'{cate}.pth')