# 1. Load DataSet

In [1]:
from datasets import load_dataset
unsmile_labels = ["여성/가족","남성","성소수자","인종/국적","연령","지역","종교","기타 혐오","악플/욕설","clean"]
dataset = load_dataset('smilegate-ai/kor_unsmile')
dataset["train"][0]

  from .autonotebook import tqdm as notebook_tqdm
Using custom data configuration smilegate-ai--kor_unsmile-e0f75c6e3be1af78
Reusing dataset parquet (/home/bell/.cache/huggingface/datasets/parquet/smilegate-ai--kor_unsmile-e0f75c6e3be1af78/0.0.0/1638526fd0e8d960534e2155dc54fdff8dce73851f21f031d2fb9c2cf757c121)
100%|██████████| 2/2 [00:00<00:00, 126.70it/s]


{'문장': '일안하는 시간은 쉬고싶어서 그런게 아닐까',
 '여성/가족': 0,
 '남성': 0,
 '성소수자': 0,
 '인종/국적': 0,
 '연령': 0,
 '지역': 0,
 '종교': 0,
 '기타 혐오': 0,
 '악플/욕설': 0,
 'clean': 1,
 '개인지칭': 0,
 'labels': [0, 0, 0, 0, 0, 0, 0, 0, 0, 1]}

# 2. Load Model

In [2]:
from transformers import BertForSequenceClassification, TrainingArguments, Trainer, AutoTokenizer
import torch
import numpy as np

In [3]:
model_name = 'beomi/kcbert-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [4]:
def preprocess_function(examples):
    tokenized_examples = tokenizer(str(examples["문장"]))
    tokenized_examples['labels'] = torch.tensor(examples["labels"], dtype=torch.float)
    # multi label classification 학습을 위해선 label이 float 형태로 변형되어야 합니다.
    # huggingface datasets 최신 버전에는 'map' 함수에 버그가 있어서 변형이 올바르게 되지 않습니다.
    
    return tokenized_examples

In [5]:
tokenized_dataset = dataset.map(preprocess_function)
tokenized_dataset.set_format(type='torch', columns=['input_ids', 'labels', 'attention_mask', 'token_type_ids'])

Loading cached processed dataset at /home/bell/.cache/huggingface/datasets/parquet/smilegate-ai--kor_unsmile-e0f75c6e3be1af78/0.0.0/1638526fd0e8d960534e2155dc54fdff8dce73851f21f031d2fb9c2cf757c121/cache-b23fc448b6317d8d.arrow
Loading cached processed dataset at /home/bell/.cache/huggingface/datasets/parquet/smilegate-ai--kor_unsmile-e0f75c6e3be1af78/0.0.0/1638526fd0e8d960534e2155dc54fdff8dce73851f21f031d2fb9c2cf757c121/cache-e35146b5b2321e0a.arrow


In [6]:
tokenized_dataset['train'][0]

{'labels': tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]),
 'input_ids': tensor([    2,  2458, 15751, 24930, 24351, 29278, 17038, 11631,     3]),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1])}

In [7]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [8]:
num_labels=len(unsmile_labels) # Label 갯수

model = BertForSequenceClassification.from_pretrained(
    model_name, 
    num_labels=num_labels, 
    problem_type="multi_label_classification"
)
model.config.id2label = {i: label for i, label in zip(range(num_labels), unsmile_labels)}
model.config.label2id = {label: i for i, label in zip(range(num_labels), unsmile_labels)}

Some weights of the model checkpoint at beomi/kcbert-base were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initiali

In [9]:
model.config.label2id

{'여성/가족': 0,
 '남성': 1,
 '성소수자': 2,
 '인종/국적': 3,
 '연령': 4,
 '지역': 5,
 '종교': 6,
 '기타 혐오': 7,
 '악플/욕설': 8,
 'clean': 9}

# 3. Model training

In [10]:
from sklearn.metrics import label_ranking_average_precision_score

In [11]:
def compute_metrics(x):
    return {
        'lrap': label_ranking_average_precision_score(x.label_ids, x.predictions),
    }

In [12]:
batch_size = 8 # 64 batch는 colab pro에서 테스트되었습니다.

In [13]:
args = TrainingArguments(
    output_dir="model_output",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='lrap',
    greater_is_better=True,
)

trainer = Trainer(
    model=model, 
    args=args, 
    train_dataset=tokenized_dataset["train"], 
    eval_dataset=tokenized_dataset["valid"], 
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    data_collator=data_collator
)

In [14]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: 개인지칭, 지역, 종교, 연령, clean, 인종/국적, 성소수자, 여성/가족, 악플/욕설, 문장, 남성, 기타 혐오. If 개인지칭, 지역, 종교, 연령, clean, 인종/국적, 성소수자, 여성/가족, 악플/욕설, 문장, 남성, 기타 혐오 are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 15005
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 9380


Epoch,Training Loss,Validation Loss,Lrap
1,0.1423,0.131905,0.864497
2,0.0923,0.126896,0.879188
3,0.0521,0.143911,0.877595
4,0.0305,0.160567,0.877642
5,0.0175,0.17128,0.878049


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: 개인지칭, 지역, 종교, 연령, clean, 인종/국적, 성소수자, 여성/가족, 악플/욕설, 문장, 남성, 기타 혐오. If 개인지칭, 지역, 종교, 연령, clean, 인종/국적, 성소수자, 여성/가족, 악플/욕설, 문장, 남성, 기타 혐오 are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3737
  Batch size = 8
Saving model checkpoint to model_output/checkpoint-1876
Configuration saved in model_output/checkpoint-1876/config.json
Model weights saved in model_output/checkpoint-1876/pytorch_model.bin
tokenizer config file saved in model_output/checkpoint-1876/tokenizer_config.json
Special tokens file saved in model_output/checkpoint-1876/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: 개인지칭, 지역, 종교, 연령, clean, 인종/국적, 성소수자, 여성/가족, 악플/욕설

TrainOutput(global_step=9380, training_loss=0.07335278357524098, metrics={'train_runtime': 9856.9573, 'train_samples_per_second': 7.611, 'train_steps_per_second': 0.952, 'total_flos': 1631470471321992.0, 'train_loss': 0.07335278357524098, 'epoch': 5.0})

In [15]:
trainer.save_model()

Saving model checkpoint to model_output
Configuration saved in model_output/config.json
Model weights saved in model_output/pytorch_model.bin
tokenizer config file saved in model_output/tokenizer_config.json
Special tokens file saved in model_output/special_tokens_map.json


# 4. Model test

In [17]:
from transformers import TextClassificationPipeline

pipe = TextClassificationPipeline(
    model = model,
    tokenizer = tokenizer,
    device=-1, # 0>= GPU | 0 < CPU
    return_all_scores=True,
    function_to_apply='sigmoid'
    )

In [18]:
# 기학습된 모델을 사용하실 경우, 아래 코드로 실행해주세요

# from transformers import TextClassificationPipeline, BertForSequenceClassification, AutoTokenizer

# model_name = 'smilegate-ai/kor_unsmile'

# model = BertForSequenceClassification.from_pretrained(model_name)
# tokenizer = AutoTokenizer.from_pretrained(model_name)

# pipe = TextClassificationPipeline(
#     model=model,
#     tokenizer=tokenizer,
#     device=0,     # cpu: -1, gpu: gpu number
#     return_all_scores=True,
#     function_to_apply='sigmoid'
#     )

In [19]:
for result in pipe("이래서 여자는 게임을 하면 안된다")[0]:
    print(result)

{'label': '여성/가족', 'score': 0.8613083958625793}
{'label': '남성', 'score': 0.014456871896982193}
{'label': '성소수자', 'score': 0.0016029539983719587}
{'label': '인종/국적', 'score': 0.012013331986963749}
{'label': '연령', 'score': 0.003350969636812806}
{'label': '지역', 'score': 0.003601288655772805}
{'label': '종교', 'score': 0.002437349408864975}
{'label': '기타 혐오', 'score': 0.004123682156205177}
{'label': '악플/욕설', 'score': 0.034082479774951935}
{'label': 'clean', 'score': 0.10020577907562256}


# 5. model evaluation

In [20]:
def get_predicated_label(output_labels, min_score):
    labels = []
    for label in output_labels:
        if label['score'] > min_score:
            labels.append(1)
        else:
            labels.append(0)
    return labels

In [21]:
import tqdm
from transformers.pipelines.base import KeyDataset

predicated_labels = []

for out in tqdm.tqdm(pipe(KeyDataset(dataset['valid'], '문장'))):
    predicated_labels.append(get_predicated_label(out, 0.5))

Disabling tokenizer parallelism, we're using DataLoader multithreading already
100%|██████████| 3737/3737 [01:46<00:00, 35.01it/s]


In [22]:
from sklearn.metrics import classification_report

print(classification_report(dataset['valid']['labels'], predicated_labels))

              precision    recall  f1-score   support

           0       0.76      0.81      0.79       394
           1       0.83      0.86      0.85       334
           2       0.87      0.79      0.83       280
           3       0.83      0.85      0.84       426
           4       0.89      0.86      0.88       146
           5       0.86      0.92      0.88       260
           6       0.87      0.87      0.87       290
           7       0.65      0.53      0.58       134
           8       0.73      0.63      0.67       786
           9       0.78      0.73      0.75       935

   micro avg       0.80      0.77      0.78      3985
   macro avg       0.81      0.79      0.79      3985
weighted avg       0.79      0.77      0.78      3985
 samples avg       0.78      0.77      0.77      3985



  _warn_prf(average, modifier, msg_start, len(result))
