##### 1. Settings

In [1]:
import pandas as pd
import numpy as np
from collections import Counter
import random

import torch
from torch.utils.data import Dataset
from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from sklearn.model_selection import StratifiedKFold, train_test_split

# Use the first CUDA GPU when one is available; otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

  from .autonotebook import tqdm as notebook_tqdm


##### 2. Load Dataset

(1) preprocessing

In [2]:
def preparing_data_1(df:pd.DataFrame) -> pd.DataFrame:
    """Reduce a raw frame to ``sentence``/``class`` columns with unified label names.

    Only the "발화문" (Korean: utterance) and "상황" (Korean: situation) columns
    are kept. They are renamed to ``sentence`` and ``class``, the index is reset
    to 0..n-1, and the labels "sad" and "anger" are rewritten to "sadness" and
    "angry". Every other label value passes through unchanged.

    Args:
        df: Raw frame containing at least the two columns named above.

    Returns:
        A new two-column frame; ``df`` itself is not modified.
    """
    # (alias, canonical name) pairs; labels outside this table are kept as-is.
    label_aliases = (("sad", "sadness"), ("anger", "angry"))

    def unify(label):
        for alias, canonical in label_aliases:
            if label == alias:
                return canonical
        return label

    # Work on an explicit copy of the selection so the caller's frame is untouched
    # and the column assignment below cannot raise SettingWithCopyWarning.
    subset = df[["발화문", "상황"]].copy()
    subset.columns = ["sentence", "class"]
    subset = subset.reset_index(drop=True)
    subset["class"] = subset["class"].apply(unify)

    return subset

In [3]:
def preparing_data_2(df:pd.DataFrame, n_remove:int=33786, seed=None) -> pd.DataFrame:
    """Clean the dialogue frame and randomly undersample its "neutral" rows.

    The columns "Unnamed: 1" and "Unnamed: 2" are kept and renamed to
    ``sentence`` and ``class``. The Korean emotion labels are translated to the
    English class names used in the rest of the notebook; any other label
    becomes NaN, and rows containing NaN are dropped. Finally up to
    ``n_remove`` randomly chosen "neutral" rows are removed to reduce class
    imbalance.

    Args:
        df: Raw frame containing the columns "Unnamed: 1" and "Unnamed: 2".
        n_remove: Number of "neutral" rows to drop. The default is the value
            the notebook was written with. If fewer neutral rows exist, all of
            them are dropped instead of raising.
        seed: Optional seed for the undersampling. ``None`` (default) draws
            from the global ``random`` generator, as before.

    Returns:
        A new frame with columns ``sentence`` and ``class``. The index is not
        renumbered after rows are dropped, so it may contain gaps.

    Raises:
        ValueError: If ``n_remove`` is negative.
    """
    if n_remove < 0:
        raise ValueError("n_remove must be non-negative")

    # Korean emotion label -> English class name.
    label_map = {
        "분노": "angry",
        "혐오": "disgust",
        "중립": "neutral",
        "놀람": "surprise",
        "행복": "happiness",
        "공포": "fear",
        "슬픔": "sadness",
    }

    # cleaning data
    # Copy the selection so the caller's frame is untouched and the column
    # assignment below cannot raise SettingWithCopyWarning.
    data = df[["Unnamed: 1", "Unnamed: 2"]].copy()
    data.columns = ["sentence", "class"]
    data = data.reset_index(drop=True)
    # Labels missing from label_map map to NaN and are removed by dropna().
    data["class"] = data["class"].map(label_map)
    data = data.dropna()

    # imbalanced data
    # Per the original author's notes, the source has >= 50000 rows of which
    # >= 40000 are "neutral"; random undersampling brings that class down to
    # roughly 10000 rows.
    neutral_index = list(data[data["class"] == "neutral"].index)
    # random.sample raises if asked for more items than exist, so clamp the count.
    n_drop = min(n_remove, len(neutral_index))
    sampler = random if seed is None else random.Random(seed)
    remove_index = sampler.sample(neutral_index, n_drop)

    return data.drop(remove_index)

In [4]:
# Dataset group 1: three CSV files read with cp949 encoding and cleaned by preparing_data_1.
dataset_1_1 = "./data/year_4.csv"
dataset_1_2 = "./data/year_5_1.csv"
dataset_1_3 = "./data/year_5_2.csv"

train1 = preparing_data_1(pd.read_csv(dataset_1_1, index_col=0, encoding="cp949"))
train2 = preparing_data_1(pd.read_csv(dataset_1_2, index_col=0, encoding="cp949"))
train3 = preparing_data_1(pd.read_csv(dataset_1_3, index_col=0, encoding="cp949"))

# Dataset 2: Excel workbook (file name is Korean for "Korean continuous dialogue dataset").
dataset_2_0 = "./한국어_연속적_대화_데이터셋.xlsx"

# preparing_data_2 undersamples with the global `random` generator. Seed it here so that
# Restart Kernel -> Run All keeps the same rows (and therefore the same training set).
random.seed(1234)
train4 = preparing_data_2(pd.read_excel(dataset_2_0, index_col=0))

# concat
# One concat over all parts; starting from an empty DataFrame and appending
# frame by frame is unnecessary and a deprecated pattern in recent pandas.
train = pd.concat([train1, train2, train3, train4], axis=0, ignore_index=True)

In [5]:
# Show how many rows each emotion class has in the combined data.
Counter(train['class'])

Counter({'angry': 15263,
         'sadness': 15972,
         'fear': 4229,
         'disgust': 4880,
         'neutral': 13262,
         'happiness': 5578,
         'surprise': 6621})

(2) label encoding

In [6]:
# label encoding: emotion name -> integer class id
d = {"sadness":0,
     "fear":1,
     "disgust":2,
     "neutral":3,
     "happiness":4,
     "angry":5,
     "surprise":6}

# Series.map turns every value that is missing from `d` into NaN without complaint.
# That would silently corrupt the labels if an unexpected class appears, or if this
# cell is run a second time (the column then holds ids, not names), so fail loudly.
unknown_labels = set(train['class'].unique()) - set(d)
if unknown_labels:
    raise ValueError(
        f"labels missing from the encoding table (was this cell already run?): {unknown_labels!r}"
    )

train['class'] = train['class'].map(d)
train.head()

Unnamed: 0,sentence,class
0,"어, 청소 니가 대신 해 줘!",5
1,둘 다 청소 하기 싫어. 귀찮아.,5
2,둘 다 하기 싫어서 화내.,5
3,그럼 방세는 어떡해.,5
4,권태긴줄 알았는데 다른 사람이 생겼나보더라고.,0


(3) split train / test

In [7]:
# Stratified 80/20 split: both parts keep the class proportions, and the fixed
# random_state makes the split repeatable.
# Note: `train` is rebound to the training part, so re-running this cell splits it again.
train, test = train_test_split(
    train,
    test_size=0.2,
    shuffle=True,
    stratify=train['class'],
    random_state=1234,
)
# Integer class ids of each part, used later as dataset labels.
train_label, test_label = train['class'].values, test['class'].values

(4) tokenizing

In [8]:
# Name of the pretrained checkpoint whose tokenizer is loaded below.
TOKENIZER_NAME = "monologg/koelectra-base-v3-discriminator"
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)

In [9]:
MAX_LEN = 64


def _encode_sentences(sentences):
    """Tokenize a collection of sentences into PyTorch tensors.

    The same settings are used for the train and the test split.
    """
    return tokenizer(
        list(sentences),
        return_tensors="pt",       # return PyTorch tensors
        max_length=MAX_LEN,        # upper bound on tokens per sentence
        padding=True,              # pad shorter sentences within this call
        truncation=True,           # cut sentences longer than max_length
        add_special_tokens=True,   # add the model's special tokens
    )


tokenized_train_sentences = _encode_sentences(train["sentence"])
tokenized_test_sentences = _encode_sentences(test["sentence"])

(5) build the PyTorch datasets

In [10]:
class MakeDataset(Dataset):
    """Dataset pairing tokenizer encodings with their class labels.

    Args:
        encodings: Mapping from field name (e.g. ``input_ids``) to a per-sample
            indexable collection, such as the output of a tokenizer call.
        labels: Indexable collection with one label per sample.

    Each item is a dict holding one tensor per encoding field plus a
    ``labels`` tensor.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    @staticmethod
    def _as_tensor(value):
        """Return an independent tensor copy of ``value``."""
        # torch.tensor() on an existing tensor emits a copy-construct UserWarning
        # for every field of every sample; clone().detach() is the recommended copy.
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        item = {key: self._as_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._as_tensor(self.labels[idx])
        return item

In [11]:
# Pair each split's encodings with its labels.
train_dataset = MakeDataset(encodings=tokenized_train_sentences, labels=train_label)
test_dataset = MakeDataset(encodings=tokenized_test_sentences, labels=test_label)

##### 3. Model Training

(1) load model

In [12]:
MODEL_NAME = "monologg/koelectra-base-v3-discriminator"

# Load the pretrained checkpoint with a 7-class sequence-classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=7,
)
# Move the model to the selected device; the trailing expression displays its structure.
model = model.to(device)
model

Some weights of the model checkpoint at monologg/koelectra-base-v3-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.weight']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-base-v3-discriminator and are newly initialized: 

ElectraForSequenceClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(35000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0-11): 12 x ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): L

(2) training arguments

In [13]:
EPOCHS = 1
TRAIN_BATCH_SIZE = 8
EVAL_BATCH_SIZE = 64

training_args = TrainingArguments(
    output_dir='./',                                # output goes to the working directory
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=EVAL_BATCH_SIZE,
    logging_dir='./logs',
    logging_steps=500,
    save_total_limit=2,
)

(3) evaluation metric

In [14]:
# Four evaluation metrics are reported:
# accuracy, precision, recall and F1 score
def compute_metrics(pred):
    """Compute accuracy and weighted precision/recall/F1 for a prediction object.

    Args:
        pred: Object exposing ``label_ids`` (true class ids) and
            ``predictions`` (per-class scores; the argmax over the last axis
            is taken as the predicted class).

    Returns:
        Dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    # Result tuple is (precision, recall, f1, support), here weighted-averaged.
    weighted = precision_recall_fscore_support(y_true, y_pred, average='weighted')
    return {
        'accuracy' : accuracy_score(y_true, y_pred),
        'f1' : weighted[2],
        'precision' : weighted[0],
        'recall' : weighted[1],
    }

(4) training

In [15]:
# Bundle the model, the training arguments, both datasets and the metric function.
# Note: the held-out test split is passed as the evaluation dataset.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

In [None]:
# Fine-tune the model on train_dataset using the settings in training_args.
trainer.train()

In [None]:
# Evaluate on the test split; the result includes the values from compute_metrics.
trainer.evaluate(eval_dataset=test_dataset)