In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m55.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 KB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m68.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.4 tokenizers-0.13.3 transformers-4.27.4


In [27]:
import transformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
import pandas as pd
import numpy as np
from transformers import TrainingArguments, Trainer

In [70]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print("device:", device)

device: cuda:0


In [71]:
tokenizer = AutoTokenizer.from_pretrained("beomi/KcELECTRA-base-v2022")
model = AutoModelForSequenceClassification.from_pretrained("beomi/KcELECTRA-base-v2022", num_labels=5)

Some weights of the model checkpoint at beomi/KcELECTRA-base-v2022 were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense.weight']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at beomi/KcELECTRA-base-v2022 and are newly initialized: ['classifier.out_proj.weight

In [72]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [73]:
path_train = '/content/drive/MyDrive/에이블스쿨/실습파일/2023.04.03_미니프로젝트4차_실습자료/data/train.csv'

In [74]:
data = pd.read_csv(path_train)

In [75]:
label_dict = {
    '코드1': 0,
    '코드2': 0,
    '웹': 1,
    '이론': 2,
    '시스템 운영': 3,
    '원격': 4
}

In [76]:
data['label'] = data['label'].replace(label_dict)

In [77]:
target = 'label'
x = data.drop(target, axis=1, inplace=False)
y = data.loc[:,target]

In [78]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=2023, stratify=y)

In [79]:
tokenized_train_sentences = tokenizer(
    list(x_train["text"]),
    return_tensors="pt",                # pytorch의 tensor 형태로 return
    max_length=200,                     # 최대 토큰길이 설정
    padding=True,                       # 제로패딩 설정
    truncation=True,                    # max_length 초과 토큰 truncate
    add_special_tokens=True,            # special token 추가
    )

In [80]:
tokenized_test_sentences = tokenizer(
    list(x_test["text"]),
    return_tensors="pt",
    max_length=200,
    padding=True,
    truncation=True,
    add_special_tokens=True,
    )

In [81]:
class Q_Ko_Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [82]:
Y_train = y_train.values

In [83]:
Y_test = y_test.values

In [84]:
train_dataset = Q_Ko_Dataset(tokenized_train_sentences, Y_train)
test_dataset = Q_Ko_Dataset(tokenized_test_sentences, Y_test)

In [85]:
model.to(device)

ElectraForSequenceClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(54343, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0-11): 12 x ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): L

In [99]:
training_args = TrainingArguments(
    output_dir='/content/drive/MyDrive/에이블스쿨/실습파일/2023.04.03_미니프로젝트4차_실습자료/KoELECTRA_model_full',                    # 학습결과 저장경로
    num_train_epochs=10,                # 학습 epoch 설정
    per_device_train_batch_size=32,      # train batch_size 설정
    per_device_eval_batch_size=64,      # test batch_size 설정
    logging_dir='./logs',               # 학습log 저장경로
    logging_steps=20,                  # 학습log 기록 단위
    save_total_limit=2,                 # 학습결과 저장 최대갯수 
)

In [87]:
from sklearn.metrics import *
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [100]:
trainer = Trainer(
    model=model,                         # 학습하고자하는 🤗 Transformers model
    args=training_args,                  # 위에서 정의한 Training Arguments
    train_dataset=train_dataset,         # 학습 데이터셋
    eval_dataset=test_dataset,           # 평가 데이터셋
    compute_metrics=compute_metrics,     # 평가지표
)

In [103]:
trainer.train()

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Step,Training Loss
20,0.0054
40,0.0006
60,0.0007
80,0.0013
100,0.0008
120,0.0107
140,0.0064
160,0.0108
180,0.0007
200,0.0007


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


TrainOutput(global_step=1050, training_loss=0.004379203290279423, metrics={'train_runtime': 1081.3664, 'train_samples_per_second': 30.841, 'train_steps_per_second': 0.971, 'total_flos': 3427730488740000.0, 'train_loss': 0.004379203290279423, 'epoch': 10.0})

In [104]:
trainer.evaluate(eval_dataset=test_dataset)

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'eval_loss': 1.1176002025604248,
 'eval_accuracy': 0.8598382749326146,
 'eval_f1': 0.8651408430766926,
 'eval_precision': 0.8501496226906063,
 'eval_recall': 0.8904223436596143,
 'eval_runtime': 4.2773,
 'eval_samples_per_second': 86.738,
 'eval_steps_per_second': 1.403,
 'epoch': 10.0}

---
# TEST

In [None]:
model.load_state_dict(torch.load('KcELECTRA_model_full.pt', map_location=torch.device('cpu')))
model.eval()

In [105]:
test_path = '/content/drive/MyDrive/에이블스쿨/실습파일/2023.04.03_미니프로젝트4차_실습자료/data/test.csv'
test = pd.read_csv(test_path)

In [106]:
model.to('cpu')
model.eval()

result = []
for i in range(len(test)):
    temp = model(**tokenizer(test["text"][i],
                             return_tensors="pt",
                             max_length=200,
                             padding=True,
                             truncation=True,
                             add_special_tokens=True))
    logits = temp.logits
    preds = logits.argmax(dim=-1)
    if preds == 0:
        result.append(0)
    elif preds == 1:
        result.append(1)
    elif preds == 2:
        result.append(2)
    elif preds == 3:
        result.append(3)
    elif preds == 4:
        result.append(4)

print(result)

[3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 3, 0, 0, 1, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 3, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 3, 0, 0, 0, 0, 2, 3, 1, 3, 0, 0, 3, 2, 0, 2, 3, 2, 4, 4, 0, 2, 2, 0, 2, 0, 0, 0, 0, 3, 3, 2, 0, 3, 1, 0, 0, 0, 0, 0, 0, 0, 0, 3, 2, 3, 3, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 3, 0, 2, 0, 0, 0, 3, 3, 3, 0, 0, 0, 1, 0, 0, 0, 0, 3, 4, 0, 0, 0, 0, 2, 2, 2, 0, 0, 3, 0, 2, 0, 2, 2, 2, 0, 2, 0, 2, 2, 0, 2, 2, 2, 2, 0, 0, 0, 0, 2, 2, 2, 2, 2, 0, 2, 2, 2, 0, 0, 2, 2, 0, 0, 3, 0, 2, 0, 0, 0, 3, 2, 0, 0, 0, 0, 0, 3, 0, 3, 2, 2, 2, 3, 3, 0, 2, 2, 2, 0, 3, 2, 2, 2, 3, 3, 0, 2, 3, 2, 0, 2, 0, 0, 0, 4, 2, 2, 2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 3, 3, 0, 0, 0, 1, 2, 0, 2, 0, 2, 0, 0, 0, 2, 4, 0, 4, 2, 0, 0, 0, 0, 0, 2, 0, 0, 1, 0, 0, 0, 4, 3, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 3, 2, 0, 3, 2, 2, 3, 0, 3, 0, 0, 2, 4, 0, 0, 0, 0, 3, 0, 0, 0, 0, 3, 0, 3, 0, 0, 2, 0, 0, 0, 3, 3, 2, 1, 2, 0, 1, 0, 2, 1, 2, 3, 

In [121]:
temp = pd.DataFrame({'label' : result})

In [122]:
temp.reset_index(inplace=True)

In [123]:
temp = temp.rename(columns={'index' : 'id'})

In [124]:
temp.to_csv('result_kc_.csv', index=False)