In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('/home/user/wylee/ kogpt_new/datasets/KETI_nopreprocessed_labeled_dropdup+seperate_ver0.3.csv')

In [3]:
df = df[['transcription', 'OCEAN']]
print(df.head())

                   transcription  \
0  저희가 지금 봐야 되는 게 설치 장소랑 뭘 팔지네요.   
1            그러니까 계절이 10월이니까 쌀쌀한   
2                          좀 쌀쌀한   
3         날씨 그럼 뭐 각자 좀 생각을 해볼까요?   
4      아니면 그냥 저희끼리 얘기를 하는 게 편하신지   

                                               OCEAN  
0  [0.39583333333333337, 0.7708333333333333, 0.72...  
1  [0.39583333333333337, 0.7708333333333333, 0.72...  
2  [0.7708333333333333, 0.5416666666666666, 0.770...  
3  [0.39583333333333337, 0.7708333333333333, 0.72...  
4  [0.39583333333333337, 0.7708333333333333, 0.72...  


In [4]:
import ast 

# OCEAN 컬럼을 리스트로 변환 후 소숫점 4자리로 반올림
def round_ocean_values(ocean_string):
    ocean_list = ast.literal_eval(ocean_string)  # 문자열을 리스트로 변환
    return [round(value, 4) for value in ocean_list]  # 각 값을 소숫점 4자리로 반올림

In [5]:
import ast

# OCEAN 컬럼을 리스트로 변환
def parse_ocean_values(ocean_string):
    ocean_list = ast.literal_eval(ocean_string)  # 문자열을 리스트로 변환
    return ocean_list  # 리스트 반환

# OCEAN 값 처리 - 리스트 형태로 유지
df['OCEAN'] = df['OCEAN'].apply(parse_ocean_values)

# 리스트 형태 확인
print(f"OCEAN 첫 번째 행 타입: {type(df['OCEAN'][0])}")
print(f"OCEAN 첫 번째 행 값: {df['OCEAN'][0]}")
print(f"OCEAN 첫 번째 행의 첫 번째 값 타입: {type(df['OCEAN'][0][0])}")
print(f"OCEAN 컬럼 타입: {type(df['OCEAN'])}")

# 확인
print(df[['transcription', 'OCEAN']].head())

OCEAN 첫 번째 행 타입: <class 'list'>
OCEAN 첫 번째 행 값: [0.39583333333333337, 0.7708333333333333, 0.7291666666666666, 0.8333333333333333, 0.4375]
OCEAN 첫 번째 행의 첫 번째 값 타입: <class 'float'>
OCEAN 컬럼 타입: <class 'pandas.core.series.Series'>
                   transcription  \
0  저희가 지금 봐야 되는 게 설치 장소랑 뭘 팔지네요.   
1            그러니까 계절이 10월이니까 쌀쌀한   
2                          좀 쌀쌀한   
3         날씨 그럼 뭐 각자 좀 생각을 해볼까요?   
4      아니면 그냥 저희끼리 얘기를 하는 게 편하신지   

                                               OCEAN  
0  [0.39583333333333337, 0.7708333333333333, 0.72...  
1  [0.39583333333333337, 0.7708333333333333, 0.72...  
2  [0.7708333333333333, 0.5416666666666666, 0.770...  
3  [0.39583333333333337, 0.7708333333333333, 0.72...  
4  [0.39583333333333337, 0.7708333333333333, 0.72...  


In [6]:
instruction = "주어진 대화를 보고, 성격 5요인을 예측하세요."

# Instruction + Transcription 결합
df['input_text'] = df['transcription'].apply(lambda x: f"Instruction: {instruction} Transcription: {x}")

# 필요 없는 컬럼 제거
df = df[['input_text', 'OCEAN']]

print(df.head())  # 데이터 확인

                                          input_text  \
0  Instruction: 주어진 대화를 보고, 성격 5요인을 예측하세요. Transc...   
1  Instruction: 주어진 대화를 보고, 성격 5요인을 예측하세요. Transc...   
2  Instruction: 주어진 대화를 보고, 성격 5요인을 예측하세요. Transc...   
3  Instruction: 주어진 대화를 보고, 성격 5요인을 예측하세요. Transc...   
4  Instruction: 주어진 대화를 보고, 성격 5요인을 예측하세요. Transc...   

                                               OCEAN  
0  [0.39583333333333337, 0.7708333333333333, 0.72...  
1  [0.39583333333333337, 0.7708333333333333, 0.72...  
2  [0.7708333333333333, 0.5416666666666666, 0.770...  
3  [0.39583333333333337, 0.7708333333333333, 0.72...  
4  [0.39583333333333337, 0.7708333333333333, 0.72...  


In [7]:
from datasets import Dataset

# Hugging Face Dataset으로 변환
dataset = Dataset.from_pandas(df)
dataset = dataset.shuffle(seed=42)  # 데이터 섞기
dataset = dataset.train_test_split(test_size=0.2)  # 80% 학습, 20% 검증

# 데이터 확인
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['input_text', 'OCEAN'],
        num_rows: 9045
    })
    test: Dataset({
        features: ['input_text', 'OCEAN'],
        num_rows: 2262
    })
})


In [8]:
import torch
import torch.nn as nn
from transformers import GPT2PreTrainedModel, GPT2Model

class KoGPT2ForRegression(GPT2PreTrainedModel):
    """
    KoGPT2를 Regression Task(OCEAN 예측) 가능하도록 수정한 모델.
    """
    def __init__(self, config):
        super().__init__(config)
        self.transformer = GPT2Model(config)  # 기존 KoGPT2 구조 유지
        self.regression_head = nn.Linear(config.hidden_size, 5)  # 🔹 768 → 5로 변경

        self.init_weights()

    def forward(self, input_ids, attention_mask=None, labels=None):
        """
        Forward pass
        """
        outputs = self.transformer(input_ids, attention_mask=attention_mask)
        hidden_states = outputs.last_hidden_state[:, -1, :]  # 🔹 마지막 토큰의 hidden state 사용
        logits = self.regression_head(hidden_states)  # 🔹 Linear Layer 통과
        
        loss = None
        if labels is not None:
            loss_fn = nn.SmoothL1Loss()  # 🔹 Regression을 위한 MSE Loss 사용
            loss = loss_fn(logits, labels)  # MSE 계산
        
        return {"loss": loss, "logits": logits}

2025-04-01 00:12:13.685086: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1743433933.696107    6458 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1743433933.699445    6458 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-01 00:12:13.712865: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [9]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model = KoGPT2ForRegression.from_pretrained('skt/kogpt2-base-v2')
tokenizer = AutoTokenizer.from_pretrained(
    'skt/kogpt2-base-v2',
    bos_token='</s>', eos_token='</s>', unk_token='<unk>', pad_token='<pad>',
    padding_side="right",
    model_max_length=512,
)

Some weights of KoGPT2ForRegression were not initialized from the model checkpoint at skt/kogpt2-base-v2 and are newly initialized: ['regression_head.bias', 'regression_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
print(model)

KoGPT2ForRegression(
  (transformer): GPT2Model(
    (wte): Embedding(51200, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (regression_head): Linear(in_features=768, out_features=5, bias=True)
)


In [11]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
from transformers import  pipeline
from transformers import Trainer, TrainingArguments
from copy import deepcopy
from dataclasses import dataclass


import torch
from torch.utils.data import Dataset
import pandas as pd
import transformers
import logging
from typing import Dict, Sequence

In [12]:
class SFT_dataset(Dataset):
    def __init__(self, df: pd.DataFrame, tokenizer: transformers.PreTrainedTokenizer, verbose=False):
        logging.warning("Loading data...")

        PROMPT_DICT = {
            "prompt_input": (
                "### Instruction(명령어):\n주어진 대화를 보고, 성격 5요인을 예측하세요.\n\n"
                "### Transcription(대화):\n{transcription}\n\n"
                "### Response(응답):"
            )
        }

        sources = []
        targets = []
        
        for _, row in df.iterrows():
            # 입력 프롬프트
            prompt = PROMPT_DICT["prompt_input"].format(
                transcription=row["input_text"]
            )
            sources.append(prompt)
            
            # OCEAN 값을 Float Tensor로 변환
            if isinstance(row['OCEAN'], str):
                try:
                    ocean_values = eval(row['OCEAN'])
                except:
                    ocean_values = [float(val.strip()) for val in row['OCEAN'].split(',')]
            else:
                ocean_values = row['OCEAN']
                
            targets.append(torch.tensor(ocean_values, dtype=torch.float32))
        
        # 입력 토큰화
        sources_tokenized = self._tokenize_fn(sources, tokenizer)
        self.input_ids = sources_tokenized["input_ids"]
        
        # Float Tensor 레이블 저장
        self.labels = targets
        
        logging.warning("Loading data done!!: %d" % len(self.labels))

    def _tokenize_fn(self, strings, tokenizer):
        tokenized_list = tokenizer(
            strings,
            return_tensors="pt",
            padding="longest",
            max_length=tokenizer.model_max_length,
            truncation=True,
        )
        input_ids = tokenized_list["input_ids"]
        
        return dict(
            input_ids=input_ids
        )

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, i):
        return dict(input_ids=self.input_ids[i], labels=self.labels[i])

In [13]:
from dataclasses import dataclass
from typing import Dict, Sequence
import torch
import transformers

@dataclass
class DataCollatorForSupervisedDataset:
    tokenizer: transformers.PreTrainedTokenizer
    
    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
        input_ids = [instance["input_ids"] for instance in instances]
        labels = [instance["labels"] for instance in instances]
        
        # input_ids 패딩
        input_ids = torch.nn.utils.rnn.pad_sequence(
            input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id
        )
        
        # labels을 batch로 stack
        labels = torch.stack(labels)
        
        return dict(
            input_ids=input_ids,
            labels=labels,
            attention_mask=input_ids.ne(self.tokenizer.pad_token_id),
        )

In [14]:
from transformers import DataCollatorWithPadding
from sklearn.model_selection import train_test_split

train_df, eval_df = train_test_split(df, test_size=0.2, random_state=42)

print(train_df.shape, eval_df.shape)

train_dataset = SFT_dataset(df=train_df, tokenizer=tokenizer)
eval_dataset = SFT_dataset(df=eval_df, tokenizer=tokenizer)

# 🔹 Data Collator 설정
data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)



(9045, 2) (2262, 2)




In [15]:
import torch

def decode_kogpt2_output(logits, tokenizer):
    """
    KoGPT2의 output(logits)을 디코딩하는 함수.
    logits을 tokenizer를 사용하여 사람이 읽을 수 있는 텍스트로 변환.
    """
    # 🔹 Logits에서 가장 확률이 높은 토큰 선택 (argmax)
    predicted_token_ids = torch.argmax(logits, dim=-1).tolist()  # 🔹 .tolist() 추가

    # 🔹 선택된 토큰을 텍스트로 변환
    decoded_text = tokenizer.decode(predicted_token_ids, skip_special_tokens=True)
    
    return decoded_text

In [16]:
class KoGPT2ForTextGeneration(nn.Module):
    def __init__(self, model_name):
        super().__init__()
        self.model = AutoModelForCausalLM.from_pretrained(model_name)
        
    def forward(self, input_ids, attention_mask=None, labels=None):
        # 언어 모델링을 위한 forward 메서드
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels  # 이미 토큰화된 레이블 사용
        )
        
        return outputs

In [17]:
# MSELoss를 사용하도록 Trainer 수정
class RegressionTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kargs):
        labels = inputs.pop("labels")  # 정답 라벨 추출
        outputs = model(**inputs)  # 모델 실행
        logits = outputs["logits"]  # 🔹 (batch_size, 5)

        loss_fn = torch.nn.SmoothL1Loss()
        loss = loss_fn(logits, labels)  # 🔹 MSE Loss 적용

        return (loss, outputs) if return_outputs else loss

In [18]:
training_args = TrainingArguments(
    output_dir="./kogpt_sft/test/dt_col/smooth/",
    overwrite_output_dir=True,
    num_train_epochs=50,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=5,
    prediction_loss_only=False,  # 회귀에서는 False
    fp16=True
)

trainer = Trainer(  # 또는 사용자 정의 RegressionTrainer
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer
)

trainer.train()

  trainer = Trainer(  # 또는 사용자 정의 RegressionTrainer


Step,Training Loss
500,0.0238
1000,0.016
1500,0.0144
2000,0.0144
2500,0.0136
3000,0.0135
3500,0.0134
4000,0.0131
4500,0.0137
5000,0.0127


TrainOutput(global_step=113100, training_loss=0.003832423251606425, metrics={'train_runtime': 3768.8606, 'train_samples_per_second': 119.996, 'train_steps_per_second': 30.009, 'total_flos': 6.162624047180243e+16, 'train_loss': 0.003832423251606425, 'epoch': 50.0})

In [19]:
import torch

def predict_ocean(transcription, model, tokenizer):
    """
    KoGPT2 Regression 모델을 사용하여 OCEAN 점수를 예측하는 함수.
    """
    input_text = (
        "### Instruction(명령어):\n주어진 대화를 보고, 성격 5요인을 예측하세요.\n\n"
        f"### Transcription(대화):\n{transcription}\n\n"
        "### Response(응답):"
    )
    
    # Move model to the correct device (GPU if available, otherwise CPU)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    
    # Tokenize input and move input tensors to the same device
    inputs = tokenizer(input_text, return_tensors="pt").to(device)

    # 모델 실행 (generate() 대신 forward() 호출)
    with torch.no_grad():  # 🔹 Inference 시 Gradient 계산 비활성화
        outputs = model(**inputs)
        logits = outputs["logits"]  # 🔹 OCEAN 값 예측 (batch_size, 5)

    # logits을 0~1 사이 값으로 변환 (Regression Scaling)
    predicted_ocean = torch.sigmoid(logits).squeeze().tolist()  # 🔹 1D 리스트 변환

    return predicted_ocean  # 🔹 5개의 float 값을 리스트로 반환

# ✅ 테스트 실행
sample_transcription = "아."
predicted_ocean = predict_ocean(sample_transcription, model, tokenizer)

print(f"Predicted OCEAN Values: {predicted_ocean}")


Predicted OCEAN Values: [0.6486889719963074, 0.6672560572624207, 0.6590760946273804, 0.6210238933563232, 0.606789767742157]


In [20]:
import numpy as np
import itertools
from sklearn.metrics import mean_absolute_error
from tqdm import tqdm
from torch.utils.data import DataLoader

device = "cuda" if torch.cuda.is_available() else "cpu"

model.to(device)

# eval_dataset을 DataLoader로 감싸기
eval_dataloader = DataLoader(eval_dataset, batch_size=1)

# 모델을 평가 모드로 설정
model.eval()


KoGPT2ForRegression(
  (transformer): GPT2Model(
    (wte): Embedding(51200, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (regression_head): Linear(in_features=768, out_features=5, bias=True)
)

In [21]:

# 예측값과 실제값 저장할 리스트 초기화
all_predictions = []
all_labels = []

# 테스트 데이터에 대해 반복문 돌리기
with torch.no_grad():
    for idx, batch in enumerate(tqdm(eval_dataloader)):
        # 데이터 준비
        input_ids = batch['input_ids'].to(device)
        labels_tensor = batch['labels'].to(device)  # 🔹 실제 정답 (OCEAN 값)

        # ✅ 모델 예측 (generate() 대신 forward() 사용)
        outputs = model(input_ids)
        logits = outputs["logits"]  # 🔹 OCEAN 예측값 (batch_size, 5)

         # ✅ 예측값을 0~1 범위로 변환 (sigmoid 사용)
        predicted_ocean = torch.sigmoid(logits).squeeze().tolist()  # 🔹 (batch_size, 5) → 리스트 변환
        # ✅ 실제 정답값 저장
        all_labels.append(labels_tensor.squeeze().tolist())  # 🔹 리스트 변환
        # ✅ 예측값 저장
        all_predictions.append(predicted_ocean)

  0%|          | 0/2262 [00:00<?, ?it/s]We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


100%|██████████| 2262/2262 [00:14<00:00, 156.98it/s]


In [22]:
print(all_predictions)


[[0.6525735855102539, 0.6644314527511597, 0.6700687408447266, 0.6423202753067017, 0.5970206260681152], [0.6544531583786011, 0.6658452153205872, 0.6728693246841431, 0.642432451248169, 0.5993679165840149], [0.6484663486480713, 0.6695287823677063, 0.6686639189720154, 0.6491339206695557, 0.5933736562728882], [0.6505782604217529, 0.6681228280067444, 0.6740504503250122, 0.6450081467628479, 0.5950808525085449], [0.6581978797912598, 0.6746938228607178, 0.6752294301986694, 0.6433292627334595, 0.5981361269950867], [0.6525735855102539, 0.6659538745880127, 0.670284628868103, 0.6433292627334595, 0.5970206260681152], [0.6530162692070007, 0.6668223142623901, 0.6713628172874451, 0.643441379070282, 0.5970206260681152], [0.6523521542549133, 0.6659538745880127, 0.6724392771720886, 0.6428809762001038, 0.5988988280296326], [0.6499119997024536, 0.6681228280067444, 0.6742650270462036, 0.6457903385162354, 0.5941392183303833], [0.6524628400802612, 0.6646491289138794, 0.6698528528213501, 0.6426568031311035, 0.5

In [23]:
print(all_labels)

[[0.6875, 0.4791666567325592, 0.4375, 0.7708333134651184, 0.4375], [0.5625, 0.8333333134651184, 0.8125, 0.3125, 0.2708333432674408], [0.4375, 0.7083333134651184, 0.5625, 0.3958333432674408, 0.4166666567325592], [0.7291666865348816, 0.4791666567325592, 0.5, 0.4583333432674408, 0.7291666865348816], [0.6041666865348816, 0.875, 0.8333333134651184, 0.5, 0.625], [0.7708333134651184, 0.7083333134651184, 0.6666666865348816, 0.5208333134651184, 0.5416666865348816], [0.4166666567325592, 0.5416666865348816, 0.5416666865348816, 0.5, 0.6875], [0.5625, 0.8333333134651184, 0.8125, 0.3125, 0.2708333432674408], [0.6458333134651184, 0.5833333134651184, 0.8541666865348816, 0.5833333134651184, 0.2708333432674408], [0.9791666865348816, 0.9791666865348816, 0.8333333134651184, 0.6458333134651184, 0.2083333283662796], [0.5625, 0.8333333134651184, 0.8125, 0.3125, 0.2708333432674408], [0.5416666865348816, 0.5625, 0.8125, 0.5833333134651184, 0.2916666567325592], [0.8333333134651184, 0.6041666865348816, 0.5625, 0

In [24]:
import numpy as np

# 🔹 numpy 배열 변환
labels_array = np.array(all_labels)
predictions_array = np.array(all_predictions)

# 🔹 소수점 4자리까지 반올림
labels_rounded = np.round(labels_array, 4)
predictions_rounded = np.round(predictions_array, 4)

# ✅ 출력
print("🔹 Rounded Predictions:")
print(predictions_rounded.tolist())  # 리스트 형태로 변환하여 출력

print("\n🔹 Rounded Labels:")
print(labels_rounded.tolist())  # 리스트 형태로 변환하여 출력


🔹 Rounded Predictions:
[[0.6526, 0.6644, 0.6701, 0.6423, 0.597], [0.6545, 0.6658, 0.6729, 0.6424, 0.5994], [0.6485, 0.6695, 0.6687, 0.6491, 0.5934], [0.6506, 0.6681, 0.6741, 0.645, 0.5951], [0.6582, 0.6747, 0.6752, 0.6433, 0.5981], [0.6526, 0.666, 0.6703, 0.6433, 0.597], [0.653, 0.6668, 0.6714, 0.6434, 0.597], [0.6524, 0.666, 0.6724, 0.6429, 0.5989], [0.6499, 0.6681, 0.6743, 0.6458, 0.5941], [0.6525, 0.6646, 0.6699, 0.6427, 0.5969], [0.6538, 0.6716, 0.6731, 0.6427, 0.5919], [0.6494, 0.6673, 0.6679, 0.6449, 0.5956], [0.6567, 0.6747, 0.6716, 0.646, 0.5967], [0.6552, 0.666, 0.6682, 0.6428, 0.5978], [0.6522, 0.6669, 0.6699, 0.6424, 0.595], [0.653, 0.6648, 0.6697, 0.6424, 0.6011], [0.6551, 0.6665, 0.6725, 0.6455, 0.5983], [0.652, 0.6637, 0.6694, 0.6425, 0.599], [0.6541, 0.6645, 0.6676, 0.6428, 0.5999], [0.6537, 0.6646, 0.6676, 0.6436, 0.6006], [0.653, 0.6645, 0.6704, 0.6434, 0.5999], [0.6527, 0.6655, 0.6694, 0.6434, 0.5968], [0.6521, 0.6639, 0.6707, 0.6419, 0.5982], [0.6574, 0.6667, 0.668, 

In [25]:
# 🔹 MAE 계산
mae_score = mean_absolute_error(labels_rounded, predictions_rounded)

In [26]:
# 🔹 1-MAE 출력
one_minus_mae = 1 - mae_score
print(f"✅ 1-MAE Score: {one_minus_mae}")

✅ 1-MAE Score: 0.855655605658709


In [27]:
# 🔹 각 OCEAN 요소별 MAE 계산
mae_per_trait = []
for i in range(5):
    mae = mean_absolute_error(labels_array[:, i], predictions_array[:, i])  # i번째 요소에 대해 MAE 계산
    mae_per_trait.append(mae)

In [28]:
# 🔹 1 - MAE 계산
one_minus_mae_per_trait = [1 - mae for mae in mae_per_trait]

# ✅ 결과 출력
trait_names = ["O", "C", "E", "A", "N"]
for name, score in zip(trait_names, one_minus_mae_per_trait):
    print(f"1-MAE for {name}: {score:.4f}")

# ✅ 평균 1-MAE 출력
mean_one_minus_mae = np.mean(one_minus_mae_per_trait)
print(f"✅ Mean 1-MAE Score: {mean_one_minus_mae:.4f}")

1-MAE for O: 0.8729
1-MAE for C: 0.8785
1-MAE for E: 0.8639
1-MAE for A: 0.8716
1-MAE for N: 0.7913
✅ Mean 1-MAE Score: 0.8557


In [29]:
# ✅ 테스트 실행
sample_transcription = "아 그럼 3시 반."
predicted_ocean = predict_ocean(sample_transcription, model, tokenizer)

print(f"Predicted OCEAN Values: {predicted_ocean}")

Predicted OCEAN Values: [0.671470582485199, 0.6342035531997681, 0.6639958620071411, 0.6787529587745667, 0.6157812476158142]


In [30]:
def convert_ocean_to_string(ocean_values, precision=4, separator=", "):
    """
    OCEAN 값을 문자열로 변환하는 함수
    
    Args:
        ocean_values: 변환할 OCEAN 값 (리스트 또는 문자열)
        precision: 소수점 자릿수 (기본값: 4)
        separator: 구분자 (기본값: ", ")
    
    Returns:
        OCEAN 값을 포맷팅한 문자열
    """
    # 입력이 문자열인 경우 리스트로 변환
    if isinstance(ocean_values, str):
        try:
            ocean_values = eval(ocean_values)
        except:
            ocean_values = [float(val.strip()) for val in ocean_values.split(',')]
    
    # 각 값을 지정된 정밀도로 포맷팅
    formatted_values = [f"{float(val):.{precision}f}" for val in ocean_values]
    
    # 지정된 구분자로 값 결합
    return separator.join(formatted_values)





