In [None]:
import os
import torch

os.environ["CUDA_VISIBLE_DEVICES"] = "1"  # 只使用 GPU 0
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
import pandas as pd


df = pd.read_csv("./dataset/KETI_nopreprocessed_labeled_dropdup+seperate_ver0.3.csv")

In [7]:
df = df[['transcription', 'OCEAN']]
print(df.head())

                   transcription  \
0  저희가 지금 봐야 되는 게 설치 장소랑 뭘 팔지네요.   
1            그러니까 계절이 10월이니까 쌀쌀한   
2                          좀 쌀쌀한   
3         날씨 그럼 뭐 각자 좀 생각을 해볼까요?   
4      아니면 그냥 저희끼리 얘기를 하는 게 편하신지   

                                               OCEAN  
0  [0.39583333333333337, 0.7708333333333333, 0.72...  
1  [0.39583333333333337, 0.7708333333333333, 0.72...  
2  [0.7708333333333333, 0.5416666666666666, 0.770...  
3  [0.39583333333333337, 0.7708333333333333, 0.72...  
4  [0.39583333333333337, 0.7708333333333333, 0.72...  


In [8]:
import ast 

# OCEAN 컬럼을 리스트로 변환 후 소숫점 4자리로 반올림
def round_ocean_values(ocean_string):
    ocean_list = ast.literal_eval(ocean_string)  # 문자열을 리스트로 변환
    return [round(value, 4) for value in ocean_list]  # 각 값을 소숫점 4자리로 반올림

In [None]:
# OCEAN 값 처리
df['OCEAN'] = df['OCEAN'].apply(round_ocean_values)
df['OCEAN'] = df['OCEAN'].apply(lambda lst: ', '.join(['{:.4f}'.format(num) for num in lst]))

print(df[['transcription', 'OCEAN']].head())

                   transcription                                   OCEAN
0  저희가 지금 봐야 되는 게 설치 장소랑 뭘 팔지네요.  0.3958, 0.7708, 0.7292, 0.8333, 0.4375
1            그러니까 계절이 10월이니까 쌀쌀한  0.3958, 0.7708, 0.7292, 0.8333, 0.4375
2                          좀 쌀쌀한  0.7708, 0.5417, 0.7708, 0.8125, 0.3542
3         날씨 그럼 뭐 각자 좀 생각을 해볼까요?  0.3958, 0.7708, 0.7292, 0.8333, 0.4375
4      아니면 그냥 저희끼리 얘기를 하는 게 편하신지  0.3958, 0.7708, 0.7292, 0.8333, 0.4375


In [10]:
instruction = "주어진 대화를 보고, 성격 5요인을 예측하세요."

# Instruction + Transcription 결합
df['input_text'] = df['transcription'].apply(lambda x: f"Instruction: {instruction} Transcription: {x}")

# 필요 없는 컬럼 제거
df = df[['input_text', 'OCEAN']]

print(df.head())  # 데이터 확인

                                          input_text  \
0  Instruction: 주어진 대화를 보고, 성격 5요인을 예측하세요. Transc...   
1  Instruction: 주어진 대화를 보고, 성격 5요인을 예측하세요. Transc...   
2  Instruction: 주어진 대화를 보고, 성격 5요인을 예측하세요. Transc...   
3  Instruction: 주어진 대화를 보고, 성격 5요인을 예측하세요. Transc...   
4  Instruction: 주어진 대화를 보고, 성격 5요인을 예측하세요. Transc...   

                                    OCEAN  
0  0.3958, 0.7708, 0.7292, 0.8333, 0.4375  
1  0.3958, 0.7708, 0.7292, 0.8333, 0.4375  
2  0.7708, 0.5417, 0.7708, 0.8125, 0.3542  
3  0.3958, 0.7708, 0.7292, 0.8333, 0.4375  
4  0.3958, 0.7708, 0.7292, 0.8333, 0.4375  


In [11]:
from datasets import Dataset

# Hugging Face Dataset으로 변환
dataset = Dataset.from_pandas(df)
dataset = dataset.shuffle(seed=42)  # 데이터 섞기
dataset = dataset.train_test_split(test_size=0.2)  # 80% 학습, 20% 검증

# 데이터 확인
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['input_text', 'OCEAN'],
        num_rows: 9045
    })
    test: Dataset({
        features: ['input_text', 'OCEAN'],
        num_rows: 2262
    })
})


In [12]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# 选择一个 Seq2Seq 的 XLM-R 类似模型（XLM-R 本身没有 Seq2Seq 版本）
model_name = "facebook/mbart-large-50"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to("cuda")

2025-03-21 16:42:33.529566: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-21 16:42:33.533782: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-03-21 16:42:33.541025: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1742542953.552889 3028160 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1742542953.556258 3028160 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been regist

In [13]:

# 토크나이징 함수 정의
def preprocess_data(examples):
    inputs = tokenizer(examples["input_text"], padding="max_length", truncation=True, max_length=256)
    
    # OCEAN 값을 문자열로 변환하여 Target 생성
    targets = [" ".join(map(str, ocean)) for ocean in examples["OCEAN"]]
    labels = tokenizer(targets, padding="max_length", truncation=True, max_length=256)
    
    # 반환할 딕셔너리 생성
    inputs["labels"] = labels["input_ids"]
    return inputs

In [None]:
# 데이터셋 전처리 적용
train_dataset = dataset["train"].map(preprocess_data, batched=True)
eval_dataset = dataset["test"].map(preprocess_data, batched=True)


In [16]:
from sklearn.metrics import mean_absolute_error

def compute_metrics(p):
    # p.predictions와 p.label_ids의 형태를 확인
    predictions, labels = p.predictions, p.label_ids

    # 예측값이 logits인 경우, argmax를 통해 클래스 예측값을 얻을 수 있음
    # (예: 다중 클래스 문제라면, 각 샘플에 대해 최대 확률을 가진 클래스를 예측)
    if predictions.ndim > 1:
        predictions = predictions.argmax(axis=-1)

    # 예측값과 실제값의 길이를 확인
    if len(predictions) != len(labels):
        print(f"Length mismatch: {len(predictions)} vs {len(labels)}")

    # MAE 계산
    mae = mean_absolute_error(labels, predictions)

    # 1-MAE 값을 반환
    return {"eval_1-MAE": 1 - mae}

In [17]:
import os

os.environ["NCCL_P2P_DISABLE"] = "1"
os.environ["NCCL_IB_DISABLE"] = "1"

In [None]:
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback
from torch.nn import BCEWithLogitsLoss  
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        # ensure no `num_items_in_batch`
        labels = inputs.pop("labels").float()
        outputs = model(**inputs)
        logits = outputs.logits  # get logits

        loss_fct = BCEWithLogitsLoss()
        loss = loss_fct(logits, labels)  # get loss

        return (loss, outputs) if return_outputs else loss

In [None]:
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback

# 학습 설정
training_args = TrainingArguments(
    output_dir="./model/mbart-large-50/epoch100",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=100,  # epoch
    weight_decay=0.01,
    save_total_limit=2,
    load_best_model_at_end=True,
    push_to_hub=False,
    learning_rate=5e-5
)

# Trainer 정의
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer
)
print(train_dataset)
# 모델 학습 시작
trainer.train()

  trainer = Trainer(


Dataset({
    features: ['input_text', 'OCEAN', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 9045
})


Epoch,Training Loss,Validation Loss
1,0.0181,0.016928
2,0.0171,0.016568
3,0.1652,0.01657
4,0.0164,0.016166
5,0.0169,0.018491
6,0.0168,0.016148
7,0.0171,0.015949
8,0.0171,0.017248
9,0.016,0.015776
10,0.0162,0.015696


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=226200, training_loss=0.009685572743363131, metrics={'train_runtime': 45835.508, 'train_samples_per_second': 19.734, 'train_steps_per_second': 4.935, 'total_flos': 4.90042218774528e+17, 'train_loss': 0.009685572743363131, 'epoch': 100.0})

In [21]:
savefile = "./mbart-large-50/epoch100"
model.save_pretrained(savefile)
tokenizer.save_pretrained(savefile)

('./mbart-large-50/epoch100/tokenizer_config.json',
 './mbart-large-50/epoch100/special_tokens_map.json',
 './mbart-large-50/epoch100/sentencepiece.bpe.model',
 './mbart-large-50/epoch100/added_tokens.json',
 './mbart-large-50/epoch100/tokenizer.json')

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch
savefile = "/home/oem/qx/instruction_tuning/model/mbart-large-50/epoch100/checkpoint-226200"
tokenizer = AutoTokenizer.from_pretrained(savefile)
tuned_model = AutoModelForSeq2SeqLM.from_pretrained(savefile).to(device) 

In [None]:
import re
def generate_ocean_score(text):
    prompt = f"### Instruction:\n주어진 대화를 보고, 성격 5요인을 예측하세요.\n\n### Input:\n{text}\n\n### Response:"

    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to("cuda")

    output = tuned_model.generate(input_ids=input_ids).to(device)

    return tokenizer.decode(output[0], skip_special_tokens=True)
print()
# 테스트
test_text = "좋은 사람인데"
print("============ 결과 출력 ============")
print(re.sub(r'\s+', '', generate_ocean_score(test_text)))


0.6458,0.6458,0.5208,0.7292,0.6250


In [None]:
from tqdm import tqdm
from sklearn.metrics import mean_absolute_error
from torch.utils.data import DataLoader
import re
device = "cuda" if torch.cuda.is_available() else "cpu"

tuned_model.to(device)

# eval_dataset을 DataLoader로 감싸기
eval_dataloader = DataLoader(eval_dataset, batch_size=2)  # batch_size는 적절히 설정

# 모델을 평가 모드로 설정
tuned_model.eval()

# 예측값과 실제값 저장할 리스트 초기화
all_predictions = []
all_labels = []
def extract_transcription(text):
    match = re.search(r"Transcription:\s*(.*)", text)
    return match.group(1) if match else text  # 只返回 Transcription 后面的内容
# 테스트 데이터에 대해 반복문 돌리기
with torch.no_grad():  # 평가 시에는 기울기 계산을 하지 않음
    for idx, batch in enumerate(tqdm(eval_dataloader)):
        # 'input_text'를 'input_ids'와 'attention_mask'로 변환
        input_texts = batch['input_text']
        labels = batch['OCEAN']
        for input_the_text, label in zip(input_texts, labels):
            input_the_text = extract_transcription(input_the_text)
            prediction_str = generate_ocean_score(input_the_text)
            prediction_str = re.sub(r'\s+', '', prediction_str)

            try:
                # try to transform the list to float 
                prediction_list = list(map(float, prediction_str.split(',')))
                label_list = list(map(float, label.split(', ')))

                # lenth 확인
                if len(prediction_list) != len(label_list):
                    continue  

                all_predictions.extend(prediction_list)
                all_labels.extend(label_list)

            except ValueError:
                # can`t transform to float type, skip this data 
                print(f"skip this data: {prediction_str}")
                continue

# MAE 계산
mae_score = mean_absolute_error(all_labels, all_predictions)

# 1-MAE 출력
one_minus_mae = 1 - mae_score
print(f"1-MAE: {one_minus_mae}")

 10%|█         | 115/1131 [01:36<14:04,  1.20it/s]

跳过预测值: 0.0.83333,0.6042,0.5625,0.6667


 34%|███▎      | 380/1131 [05:20<10:25,  1.20it/s]

跳过预测值: 0.7708.708,0.5833,0.6875,0.583333


100%|██████████| 1131/1131 [15:50<00:00,  1.19it/s]

1-MAE: 0.9052419964507542





In [24]:
print(all_predictions)
print(all_labels)
print(eval_dataset["OCEAN"])

[0.6042, 0.625, 0.4583, 0.7083, 0.6458, 0.7917, 0.5625, 0.8542, 0.7083, 0.375, 0.8125, 0.75, 0.6042, 0.7083, 0.3125, 0.9375, 0.9583, 0.875, 0.3542, 0.1875, 0.4792, 0.5625, 0.4167, 0.625, 0.3958, 0.8125, 0.75, 0.6042, 0.7083, 0.3125, 0.3958, 0.7708, 0.7292, 0.8333, 0.4375, 0.7292, 0.4792, 0.5, 0.4583, 0.7292, 0.5417, 0.6042, 0.3542, 0.625, 0.5833, 0.8125, 0.75, 0.6042, 0.7083, 0.3125, 0.6042, 0.7917, 0.7083, 0.6042, 0.3125, 0.9375, 0.4583, 0.875, 0.75, 0.5833, 0.5833, 0.4583, 0.8125, 0.5417, 0.4583, 0.6458, 0.625, 0.7708, 0.625, 0.2955, 0.5, 0.4375, 0.4375, 0.7292, 0.4583, 0.4792, 0.5625, 0.4167, 0.625, 0.3958, 0.6458, 0.5208, 0.7917, 0.6667, 0.2292, 0.5625, 0.8333, 0.8125, 0.3125, 0.2708, 0.5833, 0.5625, 0.4792, 0.4583, 0.3125, 0.3958, 0.7708, 0.7292, 0.8333, 0.4375, 0.6875, 0.4792, 0.4375, 0.7708, 0.4375, 0.6042, 0.7917, 0.7083, 0.6042, 0.3125, 0.5625, 0.3125, 0.6875, 0.6042, 0.1042, 0.6042, 0.7917, 0.7083, 0.6042, 0.3125, 0.8333, 0.6042, 0.5625, 0.6667, 0.3333, 0.375, 0.6042, 0.2292,