<a href="https://colab.research.google.com/github/Jeremy-su1/ai-algorithm/blob/main/sobert_base_tag_ebinna.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%capture
!pip install datasets>=2.18.0 transformers>=4.38.2 sentence-transformers>=2.5.1 setfit>=1.0.3 accelerate>=0.27.2 seqeval>=1.2.2

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset CSVs and the saved model
# directory used later in this notebook live under "/content/drive/My Drive".
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
from datasets import load_dataset

# Locations of the training and validation CSV files on the mounted Drive.
train_csv_path = '/content/drive/My Drive/AiExpertCource/project/dataset/rev_tag_training_samples.csv'
valid_csv_path = '/content/drive/My Drive/AiExpertCource/project/dataset/rev_tag_validation_samples.csv'

# Each CSV is loaded on its own, so both are requested through split='train'
# (the only split the call is asked for), including the validation file.
dataset_train = load_dataset('csv', data_files=train_csv_path, split='train')
dataset_valid = load_dataset('csv', data_files=valid_csv_path, split='train')

In [None]:
# Target categories; a category's position in this list is its label index.
classes = ['Algorithms', 'Backend', 'Data Science', 'Databases', 'Dev Tools', 'Frontend', 'Mobile', 'Systems', 'iOS/macOS']
# Name -> index and index -> name lookups, both derived from `classes`.
class2id = {name: index for index, name in enumerate(classes)}
id2class = dict(enumerate(classes))

In [None]:
from transformers import AutoTokenizer

# Checkpoint identifier; used here for the tokenizer and again later in the
# notebook when the sequence-classification model is created.
model_path = 'mmukh/SOBertBase'
tokenizer = AutoTokenizer.from_pretrained(model_path)



In [None]:
import ast

def preprocess_function(example):
    """Turn one raw dataset row into tokenized inputs plus a multi-hot label vector.

    Args:
        example: A row mapping with at least the keys ``'Title'``, ``'Body'`` and
            ``'Tags_new'``. ``'Tags_new'`` is a string containing a Python list
            literal of category names.

    Returns:
        The tokenizer output for ``Title + ' ' + Body`` (flat per-example lists)
        with an added ``'labels'`` entry: a list of ``len(classes)`` floats that
        is 1.0 at the index of every category in ``'Tags_new'`` and 0.0 elsewhere.

    Raises:
        KeyError: If a tag in ``'Tags_new'`` is not a key of ``class2id``.
    """
    labels = [0.0] * len(classes)
    for label in ast.literal_eval(example['Tags_new']):
        labels[class2id[label]] = 1.0

    # A missing CSV cell may arrive as None; fall back to an empty string so the
    # concatenation below cannot raise TypeError.
    title = example['Title'] or ''
    body = example['Body'] or ''

    # Without return_tensors the tokenizer already yields flat lists for a single
    # text, so no tensor conversion and per-key squeeze is needed afterwards.
    encoded = tokenizer(title + ' ' + body, truncation=True)
    encoded['labels'] = labels
    return encoded


# Tokenize and label every row of both splits.
tokenized_train_dataset = dataset_train.map(preprocess_function)
tokenized_valid_dataset = dataset_valid.map(preprocess_function)

# Raw text and tag columns from the CSV are removed after tokenization,
# leaving the tokenizer outputs and `labels` produced by preprocess_function.
raw_columns = ["Title", "Body", "Tags_filtered", "Tags_list", "Tags_new", "Algorithms", "Backend", "Data Science", "Databases", "Dev Tools", "Frontend", "Mobile", "Systems", "iOS/macOS"]
tokenized_train_dataset = tokenized_train_dataset.remove_columns(raw_columns)
tokenized_valid_dataset = tokenized_valid_dataset.remove_columns(raw_columns)


In [None]:
from transformers import DataCollatorWithPadding
# Register '[PAD]' as the tokenizer's padding token; the collator below pads
# batches through this tokenizer.
# NOTE(review): if '[PAD]' is not already in the vocabulary this adds a new id;
# the model's embedding matrix must be large enough to cover it - confirm.
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
# Collator that pads the examples of a batch when the batch is assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_recall_fscore_support
import numpy as np

def sigmoid(x):
    """Return the element-wise logistic function 1 / (1 + e^(-x)) of ``x``."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator


def compute_metrics(eval_pred):
    """Compute multi-label evaluation metrics from raw logits and reference labels.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. ``predictions`` holds raw
            logits and ``labels`` the 0/1 targets; both are presumably NumPy
            arrays of shape (n_samples, n_classes) - verify against the caller.

    Returns:
        A dict with:
            ``flat_accuracy``: accuracy over all individual label decisions
                (both arrays flattened to 1-D);
            ``accuracy``: ``accuracy_score`` on the un-flattened arrays;
            ``precision``, ``recall``, ``f1_score``: micro-averaged scores.
    """
    logits, labels = eval_pred

    # Logits -> probabilities -> hard 0/1 decisions at a 0.5 threshold.
    predictions = (sigmoid(logits) > 0.5).astype(int)

    accuracy = accuracy_score(labels, predictions)
    precision, recall, f1_score_result, _ = precision_recall_fscore_support(
        labels, predictions, average='micro'
    )
    flat_accuracy = accuracy_score(labels.reshape(-1), predictions.reshape(-1))

    return {
        'flat_accuracy': flat_accuracy,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1_score_result,
    }

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Sequence-classification head on top of the pretrained encoder, with one
# output per category and the label names stored in the model config.
model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    num_labels=len(classes),
    id2label=id2class,
    label2id=class2id,
    problem_type="multi_label_classification",
)

# A '[PAD]' token is added to the tokenizer earlier in the notebook. If that
# pushed the vocabulary past the embedding matrix, grow the matrix so the new
# id is valid. The guard only ever grows it, so an embedding table that is
# already large enough is left untouched.
if len(tokenizer) > model.get_input_embeddings().num_embeddings:
    model.resize_token_embeddings(len(tokenizer))

import torch
# Use the GPU when one is available, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

Some weights of MegatronBertForSequenceClassification were not initialized from the model checkpoint at mmukh/SOBertBase and are newly initialized: ['bert.embeddings.token_type_embeddings.weight', 'bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MegatronBertForSequenceClassification(
  (bert): MegatronBertModel(
    (embeddings): MegatronBertEmbeddings(
      (word_embeddings): Embedding(50048, 768, padding_idx=0)
      (position_embeddings): Embedding(2048, 768)
      (token_type_embeddings): Embedding(2, 768)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): MegatronBertEncoder(
      (layer): ModuleList(
        (0-11): 12 x MegatronBertLayer(
          (attention): MegatronBertAttention(
            (ln): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (self): MegatronBertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): MegatronBertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bi

In [None]:
# Hyperparameters and checkpointing policy for fine-tuning.
training_args = TrainingArguments(
   # First positional argument: directory where outputs/checkpoints are written.
   "model",
   learning_rate=2e-5,
   per_device_train_batch_size=4,
   per_device_eval_batch_size=4,
   num_train_epochs=5,
   weight_decay=0.01,
   # Evaluate and save once per epoch; the two strategies are kept identical
   # because the best checkpoint is reloaded when training finishes.
   evaluation_strategy="epoch",
   save_strategy="epoch",
   load_best_model_at_end=True,
)

# Wire the model, data, collator and metric function into a single Trainer.
trainer = Trainer(

   model=model,
   args=training_args,
   train_dataset=tokenized_train_dataset,
   eval_dataset=tokenized_valid_dataset,
   tokenizer=tokenizer,
   # Pads each batch at collation time.
   data_collator=data_collator,
   # Called on evaluation outputs to produce the accuracy/precision/recall/F1 values.
   compute_metrics=compute_metrics,
)




In [None]:
# Run fine-tuning with the arguments configured on the trainer.
trainer.train()

Epoch,Training Loss,Validation Loss,Flat Accuracy,Accuracy,Precision,Recall,F1 Score
1,0.1496,0.113344,0.960752,0.717692,0.848308,0.863567,0.855869
2,0.1074,0.113787,0.959282,0.701846,0.821737,0.89169,0.855286
3,0.0754,0.118472,0.958701,0.697077,0.81834,0.891943,0.853558
4,0.0575,0.127632,0.958479,0.696462,0.817622,0.891056,0.852761
5,0.0433,0.132513,0.958821,0.699692,0.819974,0.890296,0.85369


TrainOutput(global_step=25000, training_loss=0.10023628662109375, metrics={'train_runtime': 5736.0642, 'train_samples_per_second': 17.434, 'train_steps_per_second': 4.358, 'total_flos': 4.131525019816022e+16, 'train_loss': 0.10023628662109375, 'epoch': 5.0})

In [None]:
# Evaluate the model currently held by the trainer on the validation dataset.
trainer.evaluate()

{'eval_loss': 0.11334449797868729,
 'eval_flat_accuracy': 0.9607521367521368,
 'eval_accuracy': 0.7176923076923077,
 'eval_precision': 0.8483076157292185,
 'eval_recall': 0.8635672662781859,
 'eval_f1_score': 0.8558694287507846,
 'eval_runtime': 106.7768,
 'eval_samples_per_second': 60.875,
 'eval_steps_per_second': 15.219,
 'epoch': 5.0}

In [None]:
tuned_model_path = '/content/drive/My Drive/AiExpertCource/project/sobert_base_sto_tag'  # path where the model and tokenizer are saved

In [None]:
# Persist the fine-tuned weights/config and the tokenizer files side by side
# so both can be reloaded from the same directory.
model.save_pretrained(tuned_model_path)
tokenizer.save_pretrained(tuned_model_path)

('/content/drive/My Drive/AiExpertCource/project/sobert_base_sto_tag/tokenizer_config.json',
 '/content/drive/My Drive/AiExpertCource/project/sobert_base_sto_tag/special_tokens_map.json',
 '/content/drive/My Drive/AiExpertCource/project/sobert_base_sto_tag/tokenizer.json')

In [None]:
# Reload the tokenizer and classifier from the saved directory for inference.
tuned_tokenizer = AutoTokenizer.from_pretrained(tuned_model_path)
tuned_model = AutoModelForSequenceClassification.from_pretrained(tuned_model_path)

In [None]:
# Prediction helper
def predict(texts):
    """Classify one or more texts with the fine-tuned model.

    Args:
        texts: A string or a list of strings to classify.

    Returns:
        A ``(predictions, probabilities)`` tuple of NumPy arrays with one row
        per input text. ``probabilities`` holds the per-label sigmoid outputs;
        ``predictions`` is the same array thresholded at 0.5 (values 0 or 1).
    """
    # Pad only to the longest text in the batch instead of always padding to
    # 2048 tokens; truncation still caps inputs at max_length=2048.
    inputs = tuned_tokenizer(texts, padding=True, truncation=True, max_length=2048, return_tensors='pt')
    # Keep the inputs on whatever device the model lives on.
    inputs = inputs.to(tuned_model.device)

    # Inference only: no gradients needed.
    with torch.no_grad():
        logits = tuned_model(**inputs).logits

    # Independent per-label probabilities; moved to the CPU before converting
    # to NumPy so this also works when the model is on a GPU.
    probabilities = torch.sigmoid(logits).cpu().numpy()

    # A label is predicted when its probability exceeds 0.5.
    predictions = (probabilities > 0.5).astype(int)

    return predictions, probabilities

In [None]:
# Sample input: a question title and body joined by a single space, the same
# way preprocess_function builds the training text (Title + ' ' + Body).
sample_texts = [
  "FFmpeg hevc rtsp stream decoding with frame loss" + " " +
  "I need to decode my rtsp stream. When i`m using default HEVC and have some corrupted frames, my screen looks like this: Corrupted frame using HEVC, Grey image. But instead of this, I want to have a corrupted picture with pixels issue, like this: Same corrupted frame, but using HEVC_QSV."
]

# Run the prediction
predictions, probabilities = predict(sample_texts)

In [None]:
# 0/1 prediction row for the first sample text.
arr = np.array(predictions[0])

# Positions holding a 1 are the predicted label ids; print the category name
# mapped to each of them.
indices = np.nonzero(arr == 1)[0]
for idx in indices:
    print(id2class[idx])