<a href="https://colab.research.google.com/github/Jeremy-su1/ai-algorithm/blob/main/sobase_tag_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%capture
!pip install datasets>=2.18.0 transformers>=4.38.2 sentence-transformers>=2.5.1 setfit>=1.0.3 accelerate>=0.27.2 seqeval>=1.2.2

In [None]:
# Mount Google Drive at /content/drive; the CSV files, the label list and the
# saved model used by this notebook all live under '/content/drive/My Drive/'.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the training and validation datasets

from datasets import load_dataset

# Concat_Text column: the Title and Body columns of the original dataset,
# concatenated.
# Filtered_Tag column: the original Tags column with the <> delimiters removed
# and every tag outside the top-100 dropped; when no tag survives that
# filtering, the 'other' tag is added instead.


def _load_csv_split(csv_path):
    """Read a single CSV file and return it as one dataset object."""
    # split='train' is requested for both files: each CSV is loaded on its own,
    # and its rows are retrieved through the 'train' split name.
    return load_dataset('csv', data_files=csv_path, split='train')


dataset_train = _load_csv_split('/content/drive/My Drive/AiExpertCource/pj/tag/convert_train.csv')
dataset_valid = _load_csv_split('/content/drive/My Drive/AiExpertCource/pj/tag/convert_valid.csv')

In [None]:
# Load the list of the top-100 tags and append the catch-all 'other' tag

import json

with open('/content/drive/My Drive/AiExpertCource/pj/tag/top_100_labels.json', 'r') as label_file:
    classes = json.load(label_file)

classes.append('other')

# Two-way lookup between tag strings and their integer indices:
# a tag's index is its position in `classes`.
class2id = dict(zip(classes, range(len(classes))))
id2class = {index: tag for tag, index in class2id.items()}

In [None]:
from transformers import AutoTokenizer

# The model is SOBertBase, which was pre-trained on Stack Overflow data: https://arxiv.org/abs/2306.03268

# Hub identifier of the checkpoint; reused below when the classifier is loaded.
model_path = 'mmukh/SOBertBase'
# Tokenizer published under the same checkpoint identifier.
tokenizer = AutoTokenizer.from_pretrained(model_path)



In [None]:
def preprocess_function(example):
    """Tokenize one dataset row and attach its multi-hot tag vector.

    Args:
        example: a row providing 'Filtered_Tag' (tags separated by whitespace)
            and 'Concat_Text' (the text to tokenize).

    Returns:
        The tokenizer output for 'Concat_Text' with an extra 'labels' entry:
        a list of ``len(classes)`` floats, 1.0 at the index of every tag
        present in the row and 0.0 everywhere else.

    Raises:
        KeyError: if a tag of the row is not a key of ``class2id``.
    """
    # split() without an argument collapses runs of whitespace and ignores
    # leading/trailing whitespace, so a doubled or trailing space can no longer
    # produce an empty-string "tag" that fails the class2id lookup.
    tag_names = example['Filtered_Tag'].split()

    # Labels are kept as floats (not ints), as in the original implementation;
    # the classifier is configured for multi-label classification.
    labels = [0.0] * len(classes)
    for tag_name in tag_names:
        labels[class2id[tag_name]] = 1.0

    # The tokenizer output is kept in its own variable instead of rebinding
    # the `example` argument.
    encoded = tokenizer(example['Concat_Text'], truncation=True)
    encoded['labels'] = labels
    return encoded

# Apply the preprocessing row by row, first to the training split and then to
# the validation split.
tokenized_train_dataset, tokenized_valid_dataset = (
    split.map(preprocess_function) for split in (dataset_train, dataset_valid)
)

Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

In [None]:
from transformers import DataCollatorWithPadding

# Register '[PAD]' only when the tokenizer has no padding token yet, so that a
# pad token shipped with the checkpoint is never overwritten.
# NOTE(review): if '[PAD]' is not already in the vocabulary this adds a new
# token id; the model's embedding matrix must be large enough to contain it.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Collator that pads the examples of a batch with the tokenizer's pad token.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
import evaluate
import numpy as np

# Bundle accuracy, F1, precision and recall into one object so that a single
# compute() call in compute_metrics returns all four scores.
clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])

def sigmoid(x):
    """Element-wise logistic function 1 / (1 + exp(-x)), computed stably.

    Args:
        x: a scalar or array-like of real numbers (for example raw logits).

    Returns:
        The sigmoid of ``x`` with the same shape as ``x``; a NumPy scalar when
        ``x`` is a scalar.
    """
    x = np.asarray(x)
    # exp(-|x|) always lies in (0, 1], so it cannot overflow; the naive
    # exp(-x) overflows (with a RuntimeWarning) for large negative inputs.
    decay = np.exp(-np.abs(x))
    # For x >= 0 use 1 / (1 + e^-x); for x < 0 use the algebraically equal
    # e^x / (1 + e^x). Both are expressed through the same `decay` term.
    result = np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))
    # Indexing with () unwraps a 0-d result to a scalar and leaves arrays as is.
    return result[()]

def compute_metrics(eval_pred):
    """Score thresholded predictions against the reference labels.

    Args:
        eval_pred: a pair ``(predictions, labels)`` of arrays; the predictions
            are passed through a sigmoid before thresholding.

    Returns:
        The result of ``clf_metrics.compute`` on the flattened predictions and
        references.
    """
    logits, label_matrix = eval_pred

    # Turn scores into hard 0/1 decisions with a 0.5 cut-off.
    probabilities = sigmoid(logits)
    hard_predictions = (probabilities > 0.5).astype(int).reshape(-1)

    # Both arrays are flattened to 1-D, so every (example, tag) slot is scored
    # as one independent binary decision.
    flat_references = label_matrix.astype(int).reshape(-1)
    return clf_metrics.compute(predictions=hard_predictions, references=flat_references)


In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch

# Load the pretrained checkpoint with a classification head that has one
# output per tag, configured for multi-label classification.
model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    num_labels=len(classes),
    id2label=id2class,
    label2id=class2id,
    problem_type="multi_label_classification",
)

# A pad token was registered on the tokenizer in an earlier cell. If that grew
# the vocabulary beyond the number of rows in the embedding matrix, enlarge the
# matrix so every token id the tokenizer can emit has an embedding. When the
# matrix is already large enough, nothing is changed.
if len(tokenizer) > model.get_input_embeddings().num_embeddings:
    model.resize_token_embeddings(len(tokenizer))

# Use the GPU when one is available, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

Some weights of MegatronBertForSequenceClassification were not initialized from the model checkpoint at mmukh/SOBertBase and are newly initialized: ['bert.embeddings.token_type_embeddings.weight', 'bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MegatronBertForSequenceClassification(
  (bert): MegatronBertModel(
    (embeddings): MegatronBertEmbeddings(
      (word_embeddings): Embedding(50048, 768, padding_idx=0)
      (position_embeddings): Embedding(2048, 768)
      (token_type_embeddings): Embedding(2, 768)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): MegatronBertEncoder(
      (layer): ModuleList(
        (0-11): 12 x MegatronBertLayer(
          (attention): MegatronBertAttention(
            (ln): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (self): MegatronBertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): MegatronBertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bi

In [None]:
# Training configuration: one epoch with batch size 1, evaluating and
# checkpointing at the end of the epoch; checkpoints go to the "model" directory.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir="model",
    # optimisation
    num_train_epochs=1,
    learning_rate=2e-5,
    weight_decay=0.01,
    # batching
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    # evaluation / checkpointing
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Wire the model, the tokenized splits, the padding collator and the metric
# function into one Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_valid_dataset,
    compute_metrics=compute_metrics,
)

# Run fine-tuning; the returned training summary is displayed as the cell output.
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.0265,0.027748,0.991527,0.689862,0.838384,0.586043


TrainOutput(global_step=45000, training_loss=0.03920855623881022, metrics={'train_runtime': 2590.4512, 'train_samples_per_second': 17.371, 'train_steps_per_second': 17.371, 'total_flos': 8079544345917702.0, 'train_loss': 0.03920855623881022, 'epoch': 1.0})

In [None]:
# Evaluate on the validation split that was given to the Trainer as
# eval_dataset and display the resulting metrics as the cell output.
trainer.evaluate()

{'eval_loss': 0.02774805575609207,
 'eval_accuracy': 0.9915273927392739,
 'eval_f1': 0.6898617956895718,
 'eval_precision': 0.8383838383838383,
 'eval_recall': 0.5860426929392447,
 'eval_runtime': 252.7326,
 'eval_samples_per_second': 59.351,
 'eval_steps_per_second': 59.351,
 'epoch': 1.0}

In [None]:
# Store the fine-tuned model and its tokenizer side by side in one Drive
# directory.
saved_model_dir = '/content/drive/My Drive/AiExpertCource/pj/tag/multi-label-bert'
model.save_pretrained(saved_model_dir)
tokenizer.save_pretrained(saved_model_dir)

('/content/drive/My Drive/AiExpertCource/pj/tag/multi-label-bert/tokenizer_config.json',
 '/content/drive/My Drive/AiExpertCource/pj/tag/multi-label-bert/special_tokens_map.json',
 '/content/drive/My Drive/AiExpertCource/pj/tag/multi-label-bert/tokenizer.json')