In [26]:
from datasets import load_dataset

In [27]:
# Load the smilegate-ai/kor_unsmile dataset through Hugging Face `datasets`.
dataset = load_dataset('smilegate-ai/kor_unsmile')

Using custom data configuration smilegate-ai--kor_unsmile-e0f75c6e3be1af78
Reusing dataset parquet (C:\Users\Owner\.cache\huggingface\datasets\parquet\smilegate-ai--kor_unsmile-e0f75c6e3be1af78\0.0.0\1638526fd0e8d960534e2155dc54fdff8dce73851f21f031d2fb9c2cf757c121)


  0%|          | 0/2 [00:00<?, ?it/s]

In [28]:
# Peek at the first training row: the sentence, one 0/1 column per category,
# and the combined 'labels' vector.
dataset["train"][0]

{'문장': '일안하는 시간은 쉬고싶어서 그런게 아닐까',
 '여성/가족': 0,
 '남성': 0,
 '성소수자': 0,
 '인종/국적': 0,
 '연령': 0,
 '지역': 0,
 '종교': 0,
 '기타 혐오': 0,
 '악플/욕설': 0,
 'clean': 1,
 '개인지칭': 0,
 'labels': [0, 0, 0, 0, 0, 0, 0, 0, 0, 1]}

In [29]:
# Category names in the order later used for the model's label indices.
unsmile_labels = [
    "여성/가족",
    "남성",
    "성소수자",
    "인종/국적",
    "연령",
    "지역",
    "종교",
    "기타 혐오",
    "악플/욕설",
    "clean",
]

In [30]:
from transformers import BertForSequenceClassification, TrainingArguments, Trainer, AutoTokenizer
import torch
import numpy as np

In [31]:
# Pretrained checkpoint used for both the tokenizer and the classifier.
model_name = 'beomi/kcbert-base'

In [32]:
# Tokenizer belonging to the pretrained checkpoint named by model_name.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [33]:
def preprocess_function(examples):
    """Tokenize one dataset row and attach its label vector as floats.

    Args:
        examples: A single dataset row (``dataset.map`` is called without
            ``batched=True``). The "문장" (sentence) and "labels" keys are read.

    Returns:
        The tokenizer output with an added ``labels`` entry holding a float
        tensor built from the row's label vector.
    """
    # truncation=True caps the sequence at the tokenizer's configured maximum
    # length, so an over-long sentence cannot exceed what the model accepts.
    tokenized_examples = tokenizer(str(examples["문장"]), truncation=True)
    # Multi-label classification training requires the labels to be floats.
    # The latest huggingface datasets release has a bug in `map` that prevents
    # this conversion from being applied correctly.
    tokenized_examples['labels'] = torch.tensor(examples["labels"], dtype=torch.float)

    return tokenized_examples

In [34]:
# Tokenize the dataset row by row, then expose only the model inputs and the
# labels, formatted as torch tensors.
tokenized_dataset = dataset.map(preprocess_function)
tokenized_dataset.set_format(
    type='torch',
    columns=['input_ids', 'labels', 'attention_mask', 'token_type_ids'],
)

Loading cached processed dataset at C:\Users\Owner\.cache\huggingface\datasets\parquet\smilegate-ai--kor_unsmile-e0f75c6e3be1af78\0.0.0\1638526fd0e8d960534e2155dc54fdff8dce73851f21f031d2fb9c2cf757c121\cache-1c80317fa3b1799d.arrow
Loading cached processed dataset at C:\Users\Owner\.cache\huggingface\datasets\parquet\smilegate-ai--kor_unsmile-e0f75c6e3be1af78\0.0.0\1638526fd0e8d960534e2155dc54fdff8dce73851f21f031d2fb9c2cf757c121\cache-bdd640fb06671ad1.arrow


In [35]:
# Inspect the first tokenized training row; only the columns selected by
# set_format are returned.
tokenized_dataset['train'][0]

{'labels': tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]),
 'input_ids': tensor([    2,  2458, 15751, 24930, 24351, 29278, 17038, 11631,     3]),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1])}

In [36]:
from transformers import DataCollatorWithPadding
# Collator later handed to the Trainer; built from the tokenizer so batches
# of different-length sequences can be padded.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [37]:
# Number of labels: one model output per unsmile category.
num_labels = len(unsmile_labels)

# Load the pretrained checkpoint with a classification head configured for
# multi-label classification.
model = BertForSequenceClassification.from_pretrained(
    model_name,
    problem_type="multi_label_classification",
    num_labels=num_labels,
)

# Store index <-> name mappings on the config, following the list order.
model.config.id2label = dict(enumerate(unsmile_labels))
model.config.label2id = {label: index for index, label in enumerate(unsmile_labels)}

Some weights of the model checkpoint at beomi/kcbert-base were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initiali

In [38]:
# Check the label-name -> index mapping stored on the model config.
model.config.label2id

{'여성/가족': 0,
 '남성': 1,
 '성소수자': 2,
 '인종/국적': 3,
 '연령': 4,
 '지역': 5,
 '종교': 6,
 '기타 혐오': 7,
 '악플/욕설': 8,
 'clean': 9}

In [39]:
from sklearn.metrics import label_ranking_average_precision_score

In [40]:
def compute_metrics(x):
    """Compute the label ranking average precision (LRAP) for an evaluation.

    Args:
        x: Evaluation result exposing ``label_ids`` and ``predictions``.

    Returns:
        A dict with the single key ``'lrap'``.
    """
    lrap = label_ranking_average_precision_score(x.label_ids, x.predictions)
    return {'lrap': lrap}

In [41]:
# Per-device batch size, used for both training and evaluation in
# TrainingArguments.
batch_size = 32

In [42]:
# Training configuration: evaluate and checkpoint once per epoch, then reload
# the checkpoint with the highest validation LRAP when training ends.
args = TrainingArguments(
    output_dir="model_output",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # 10 epochs, matching the recorded training log (epochs 1-10) and the
    # "epoch10" suffix of the model file saved after training.
    num_train_epochs=10,
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='lrap',
    greater_is_better=True,
)

# Train on the "train" split and evaluate on the "valid" split.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["valid"],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

In [43]:
# Run fine-tuning; evaluation and checkpointing happen once per epoch, as
# configured in `args`.
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Lrap
1,No log,0.134221,0.871349
2,0.191000,0.124139,0.875649
3,0.103600,0.131697,0.876602
4,0.066500,0.139687,0.876388
5,0.042800,0.15758,0.868638
6,0.028200,0.159045,0.874837
7,0.020500,0.167233,0.876591
8,0.014900,0.176109,0.873434
9,0.011100,0.181788,0.873198
10,0.009900,0.182051,0.873457


TrainOutput(global_step=4690, training_loss=0.05245240481931772, metrics={'train_runtime': 900.3655, 'train_samples_per_second': 166.655, 'train_steps_per_second': 5.209, 'total_flos': 4367333849582004.0, 'train_loss': 0.05245240481931772, 'epoch': 10.0})

In [44]:
# Persist the trainer's model. No path is passed, so the Trainer's configured
# output directory is used.
trainer.save_model()

In [45]:
# Additionally serialise the whole model object with torch.save.
# NOTE(review): the ".h5" extension suggests HDF5, but torch.save writes a
# pickle-based PyTorch file. Consider renaming, and confirm the "epoch10"
# suffix matches the epochs actually trained.
torch.save(model,"hate_speech_model_batch32_epoch10.h5")

In [46]:
from transformers import TextClassificationPipeline

# Wrap the fine-tuned model for inference. A sigmoid is applied to the outputs
# so every label gets its own score.
pipe = TextClassificationPipeline(
    model=model,
    tokenizer=tokenizer,
    # Use the first GPU when CUDA is available; otherwise fall back to the
    # CPU (-1) instead of failing on machines without a GPU.
    device=0 if torch.cuda.is_available() else -1,
    # Kept as-is: later cells rely on receiving a score for every label, in
    # label order.
    return_all_scores=True,
    function_to_apply='sigmoid',
)



In [47]:
# Score one example sentence and print each label's score on its own line.
sample_scores = pipe("이래서 여자는 게임을 하면 안된다")[0]
for label_score in sample_scores:
    print(label_score)

{'label': '여성/가족', 'score': 0.9120720028877258}
{'label': '남성', 'score': 0.01649666018784046}
{'label': '성소수자', 'score': 0.0041069090366363525}
{'label': '인종/국적', 'score': 0.00719029176980257}
{'label': '연령', 'score': 0.005593992304056883}
{'label': '지역', 'score': 0.005298885516822338}
{'label': '종교', 'score': 0.0032735627610236406}
{'label': '기타 혐오', 'score': 0.006116476375609636}
{'label': '악플/욕설', 'score': 0.01938716322183609}
{'label': 'clean', 'score': 0.08117157220840454}


In [48]:
def get_predicated_label(output_labels, min_score):
    """Binarise per-label scores against a threshold.

    Args:
        output_labels: Iterable of mappings, each with a ``'score'`` entry.
        min_score: Threshold; a score must be strictly greater to count.

    Returns:
        A list with 1 where the score exceeds ``min_score`` and 0 otherwise,
        in the same order as ``output_labels``.
    """
    return [int(entry['score'] > min_score) for entry in output_labels]

In [49]:
import tqdm
from transformers.pipelines.base import KeyDataset

# Score every validation sentence and binarise each label at a 0.5 threshold.
predicated_labels = [
    get_predicated_label(out, 0.5)
    for out in tqdm.tqdm(pipe(KeyDataset(dataset['valid'], '문장')))
]

100%|█████████████████████████████████████████████████████████████████████████████| 3737/3737 [00:37<00:00, 100.48it/s]


In [50]:
from sklearn.metrics import classification_report

# Per-label precision/recall/F1 of the thresholded predictions on the
# validation split.
print(
    classification_report(
        y_true=dataset['valid']['labels'],
        y_pred=predicated_labels,
    )
)

              precision    recall  f1-score   support

           0       0.81      0.81      0.81       394
           1       0.85      0.86      0.86       334
           2       0.89      0.81      0.85       280
           3       0.81      0.86      0.83       426
           4       0.88      0.88      0.88       146
           5       0.89      0.93      0.91       260
           6       0.88      0.88      0.88       290
           7       0.58      0.55      0.57       134
           8       0.78      0.55      0.64       786
           9       0.73      0.80      0.76       935

   micro avg       0.80      0.77      0.78      3985
   macro avg       0.81      0.79      0.80      3985
weighted avg       0.80      0.77      0.78      3985
 samples avg       0.77      0.77      0.77      3985



  _warn_prf(average, modifier, msg_start, len(result))
