## Modeling Baseline

In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import os
import random
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from transformers import TrainingArguments, Trainer
from transformers import AutoModelForSequenceClassification, AutoConfig, AutoTokenizer

## Seed 고정 Gpu 설정

In [2]:
def seed_everything(seed:int = 1004):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)  # type: ignore
    torch.backends.cudnn.deterministic = True  # type: ignore
    torch.backends.cudnn.benchmark = True  # type: ignore

seed_everything(42)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


### Load Tokenizer, Model
Hugging Face Hub에 존재하는 Pretrained Tokenizer와 Model 및 Model Config를 불러옵니다.

In [3]:
MODEL_NAME = 'tunib/electra-ko-base'

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
config = AutoConfig.from_pretrained(MODEL_NAME)
config.num_labels = 2

model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, config = config)

Some weights of the model checkpoint at tunib/electra-ko-base were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at tunib/electra-ko-base and are newly initialized: ['classifier.dense.weight', 'classifie

In [4]:
config

ElectraConfig {
  "_name_or_path": "tunib/electra-ko-base",
  "architectures": [
    "ElectraForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "embedding_size": 768,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "electra",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "summary_activation": "gelu",
  "summary_last_dropout": 0.1,
  "summary_type": "first",
  "summary_use_proj": true,
  "transformers_version": "4.16.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 32000
}

### Tokenizing
Train Data를 Train과 Validation Dataset으로 나누고 각각 데이터를 Tokenizer를 통해 Tokenizing을 합니다.

In [5]:
train_dataset = pd.read_csv("/home/egg2018037024/Interlink_Project/Data/train_data.csv", encoding='cp949', index_col=0)
eval_dataset = pd.read_csv("/home/egg2018037024/Interlink_Project/Data/eval_data.csv", encoding='cp949', index_col=0)

tokenized_train = tokenizer(
    list(train_dataset['Sentence']),
    return_tensors="pt",  # pytorch.Tensor로 리턴
    max_length=128, # Max_Length = 75
    padding=True,  # batch 중 가장 긴 시퀀스를 기준으로 pad 채움.
    truncation=True,  # max_length 넘어가면 버림
    add_special_tokens=True
)

tokenized_eval = tokenizer(
    list(eval_dataset['Sentence']),
    return_tensors="pt",
    max_length=128,
    padding=True,
    truncation=True,
    add_special_tokens=True
)

print(tokenized_train['input_ids'][1])
print(tokenizer.decode(tokenized_train['input_ids'][1]))

tensor([    2,    34,  4818,    34, 12249, 26804,    34, 13859, 24988,     5,
           34, 18935,  6364, 19555,     5, 11408, 15658, 28660,    62,    56,
         6033,  6064,    63,     3,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0])
[CLS]? 카? 활동쿠폰? 지급완료!? 쿠폰함확인! 좋은하루 되세요 [ URL ] [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PA

## Dataset 클래스 정의

In [6]:
class Elec_Dataset(torch.utils.data.Dataset):
    def __init__(self, dataset, label):  # 전처리된 데이터 셋이 들어옴
        self.dataset = dataset
        self.label = label

    def __getitem__(self, idx):
        # gradient 계산에 영향을 주지 않게 clone().detach() 실행
        
        item = {key: val[idx].clone().detach() for key, val in self.dataset.items()}
        item['label'] = torch.tensor(self.label[idx])
        
        return item

    def __len__(self):  # 샘플 수
        return len(self.label)

In [7]:
train_label = train_dataset['Label']
eval_label = eval_dataset['Label']

In [8]:
train_dataset = Elec_Dataset(tokenized_train, train_label)
eval_dataset = Elec_Dataset(tokenized_eval, eval_label)

print(train_dataset.__len__())
print(eval_dataset.__len__())
print(train_dataset.__getitem__(19997))
print(tokenizer.decode(train_dataset.__getitem__(19997)['input_ids']))

48030
5337
{'input_ids': tensor([    2,    34,  4818, 28889,    34,    34,  3158,    16,  1824,    34,
           34, 18273,    34,    14,    14,    14,    34, 12394,     9,    34,
        28366,  6140,  6261,    34,    62,    56,  6033,  6064,    63,     3,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0]), 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [9]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    auc = roc_auc_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'auroc': auc
    }

## Trainer 정의

In [10]:
# model

training_ars = TrainingArguments(
    output_dir='./result',
    num_train_epochs=3,
    max_steps=10000,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    save_total_limit=5,
    save_steps=10000,
    learning_rate=1e-5,
    weight_decay=0.01,
    evaluation_strategy='steps',
    eval_steps = 1000,
    load_best_model_at_end = True,
)

trainer = Trainer(
    model=model,
    args=training_ars,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

max_steps is given, it will override any value given in num_train_epochs


In [11]:
trainer.train()
model.save_pretrained('./result/best_model')

***** Running training *****
  Num examples = 48030
  Num Epochs = 54
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 256
  Gradient Accumulation steps = 1
  Total optimization steps = 10000


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Auroc
1000,0.6849,0.717125,0.504778,0.500284,0.504577,0.496063,0.504773
2000,0.5532,0.931692,0.494098,0.489796,0.493714,0.485939,0.494093
3000,0.446,1.148312,0.493723,0.476762,0.492992,0.461567,0.493705
4000,0.3913,1.383973,0.486228,0.440636,0.483221,0.404949,0.486183
5000,0.3611,1.525117,0.486603,0.466926,0.485241,0.449944,0.486582
6000,0.3476,1.601268,0.491662,0.477163,0.49088,0.464192,0.491647
7000,0.3351,1.667313,0.48754,0.452891,0.48542,0.424447,0.487504
8000,0.3262,1.724138,0.492411,0.481531,0.49179,0.471691,0.4924
9000,0.3217,1.759537,0.490163,0.473389,0.4892,0.458568,0.490145
10000,0.3184,1.789663,0.490912,0.473348,0.489968,0.457818,0.490894


***** Running Evaluation *****
  Num examples = 5337
  Batch size = 256
***** Running Evaluation *****
  Num examples = 5337
  Batch size = 256
***** Running Evaluation *****
  Num examples = 5337
  Batch size = 256
***** Running Evaluation *****
  Num examples = 5337
  Batch size = 256
***** Running Evaluation *****
  Num examples = 5337
  Batch size = 256
***** Running Evaluation *****
  Num examples = 5337
  Batch size = 256
***** Running Evaluation *****
  Num examples = 5337
  Batch size = 256
***** Running Evaluation *****
  Num examples = 5337
  Batch size = 256
***** Running Evaluation *****
  Num examples = 5337
  Batch size = 256
***** Running Evaluation *****
  Num examples = 5337
  Batch size = 256
Saving model checkpoint to ./result/checkpoint-10000
Configuration saved in ./result/checkpoint-10000/config.json
Model weights saved in ./result/checkpoint-10000/pytorch_model.bin
tokenizer config file saved in ./result/checkpoint-10000/tokenizer_config.json
Special tokens file 

## 추론

In [102]:
test_sen = ["아니 왜 반대로 해"]

In [103]:
test = pd.DataFrame(test_sen, columns=['Sentence'])

In [104]:
test['Label'] = 0

test

Unnamed: 0,Sentence,Label
0,아니 왜 반대로 해,0


In [105]:
MODEL_NAME = './result/best_model'
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
model.resize_token_embeddings(tokenizer.vocab_size)
model.to(device)

loading configuration file ./result/best_model/config.json
Model config ElectraConfig {
  "_name_or_path": "./result/best_model",
  "architectures": [
    "ElectraForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "embedding_size": 768,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "electra",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "summary_activation": "gelu",
  "summary_last_dropout": 0.1,
  "summary_type": "first",
  "summary_use_proj": true,
  "torch_dtype": "float32",
  "transformers_version": "4.16.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 32000
}

loading weights file ./result/best_model/pytorch_model.bin
All model c

ElectraForSequenceClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0): ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm

In [106]:
test_label = test['Label']

In [107]:
tokenized_test = tokenizer(
    list(test['Sentence']),
    return_tensors="pt",
    max_length=128,
    padding=True,
    truncation=True,
    add_special_tokens=True
)

test_dataset = Elec_Dataset(tokenized_test,test_label)

print(test_dataset.__len__())
#print(test_dataset.__getitem__(1665))
#print(tokenizer.decode(test_dataset.__getitem__(6)['input_ids']))

1


In [108]:
dataloader = DataLoader(test_dataset, batch_size=128, shuffle=False)

model.eval()
output_pred = []
output_prob = []

for i, data in enumerate(tqdm(dataloader)):
    with torch.no_grad():
        outputs = model(
            input_ids=data['input_ids'].to(device),
            attention_mask=data['attention_mask'].to(device),
            token_type_ids=data['token_type_ids'].to(device)
        )
    logits = outputs[0]
    prob = F.softmax(logits, dim=-1).detach().cpu().numpy()
    logits = logits.detach().cpu().numpy()
    result = np.argmax(logits, axis=-1)

    output_pred.append(result)
    output_prob.append(prob)
  
pred_answer, output_prob = np.concatenate(output_pred).tolist(), np.concatenate(output_prob, axis=0).tolist()
print(pred_answer)
print(output_prob)

100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 93.21it/s]

[0]
[[0.7885516881942749, 0.21144837141036987]]



