In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Read the pre-converted dataset and keep only rows where both the text and
# its MBTI tag are present. 'utf-8-sig' strips a leading BOM if the file has one.
df = pd.read_csv(
    "mbti_grouped_dataset.csv",
    encoding='utf-8-sig',
)[['text', 'mbti']].dropna()

# Turn the MBTI strings into integer class ids, stored next to the text.
le = LabelEncoder()
le.fit(df['mbti'])
df['label'] = le.transform(df['mbti'])

# Reverse lookup: integer class id -> MBTI string.
label2mbti = {
    class_id: mbti_name
    for class_id, mbti_name in zip(le.transform(le.classes_), le.classes_)
}


In [2]:
from sklearn.preprocessing import LabelEncoder

# `le` is already fitted and df['label'] already written by the data-loading
# cell; fitting a second encoder on the same column would only repeat that
# work, so this cell just derives the id -> MBTI lookup from the fitted `le`.
# A fitted LabelEncoder numbers each class by its position in `le.classes_`,
# so enumerate() produces the same ids as le.transform(le.classes_), but as
# plain Python ints rather than np.int64 keys.
label2mbti = dict(enumerate(le.classes_))

print("클래스 매핑:", label2mbti)

클래스 매핑: {np.int64(0): 'enfj', np.int64(1): 'enfp', np.int64(2): 'entj', np.int64(3): 'entp', np.int64(4): 'esfj', np.int64(5): 'esfp', np.int64(6): 'estj', np.int64(7): 'estp', np.int64(8): 'infj', np.int64(9): 'infp', np.int64(10): 'intj', np.int64(11): 'intp', np.int64(12): 'isfj', np.int64(13): 'isfp', np.int64(14): 'istj', np.int64(15): 'istp'}


In [3]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Hugging Face checkpoint used for the tokenizer and for the model loaded below.
MODEL_NAME = "monologg/kobert"

# trust_remote_code=True allows transformers to execute Python code shipped in
# the checkpoint repository; only enable it for repositories you trust.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

# Size the classification head from the label mapping instead of a literal 16,
# so it follows the dataset (the training cell sizes its model the same way).
# NOTE(review): the training cell replaces `model` with "skt/kobert-base-v1"
# while still using this tokenizer — confirm both checkpoints share the same
# vocabulary, or load both from a single checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(label2mbti),
    trust_remote_code=True,
)


  from .autonotebook import tqdm as notebook_tqdm
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
import torch
from torch.utils.data import Dataset

class MBTIDataset(Dataset):
    """Torch dataset of pre-tokenized texts paired with integer class labels.

    Every text is tokenized once in the constructor and kept as padded
    tensors; ``__getitem__`` slices one row out of each of those tensors.

    Args:
        texts: Iterable of strings (e.g. a list or a pandas Series).
        labels: Iterable of integer class ids, aligned with ``texts`` by
            position. It is copied into a list so that lookups are positional
            even when a pandas Series with a shuffled index is passed in.
        tokenizer: Callable following the Hugging Face tokenizer call
            convention; must return a mapping of field name to a tensor with
            one row per text.
        max_len: Maximum number of tokens per text; longer texts are truncated.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_len=256):
        texts = list(texts)
        # Materialize the labels: indexing a pandas Series with an int is
        # label-based, which returns the wrong row (or raises KeyError) once
        # the index has been shuffled by a train/test split.
        self.labels = list(labels)
        if len(texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(texts)} and {len(self.labels)})"
            )

        # padding=True pads every text to the longest one in this dataset.
        self.encodings = tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=max_len,
            return_tensors='pt'
        )

    def __getitem__(self, idx):
        """Return the tokenizer fields for sample ``idx`` plus its ``labels`` tensor."""
        item = {k: v[idx] for k, v in self.encodings.items()}
        # Class ids are stored as int64 (torch.long).
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        """Return the number of samples."""
        return len(self.labels)


In [5]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation, stratified on the label so every
# MBTI class keeps its share in both splits. The fixed random_state makes the
# split — and therefore the reported validation loss — repeatable across
# kernel restarts.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'],
    df['label'],
    test_size=0.2,
    stratify=df['label'],
    random_state=42,
)

# The split Series keep their original (now shuffled) row index; .tolist()
# drops it so the dataset looks labels up by position.
train_dataset = MBTIDataset(train_texts, train_labels.tolist(), tokenizer)
val_dataset = MBTIDataset(val_texts, val_labels.tolist(), tokenizer)

# Sanity check: report the first training sample (if any) that contains a
# token id at or beyond the tokenizer's vocab_size, with its largest token id.
vocab_size = tokenizer.vocab_size
print("Vocab size:", vocab_size)
for sample_idx in range(len(train_dataset)):
    largest_id = train_dataset[sample_idx]['input_ids'].max()
    if largest_id >= vocab_size:
        print(f"🚨 {sample_idx}번째 샘플에서 문제 발생. max token id: {largest_id}")
        break

# Plain-int id -> MBTI mapping for the model config (label2mbti's keys can be
# NumPy integers, so they are normalized with int()).
id2label = {int(class_id): mbti for class_id, mbti in label2mbti.items()}

# The classification head must have exactly one output per class.
# Passing id2label / label2id stores the class names in the model config, so
# a checkpoint saved from this model carries its own label mapping.
model = AutoModelForSequenceClassification.from_pretrained(
    "skt/kobert-base-v1",
    num_labels=len(id2label),
    id2label=id2label,
    label2id={mbti: class_id for class_id, mbti in id2label.items()},
)

# Settings for the Trainer run. Evaluation and checkpointing share the same
# 200-step cadence, and load_best_model_at_end reloads the best checkpoint as
# ranked by eval_loss once training finishes.
training_config = {
    "output_dir": "./kobert_mbti",
    # evaluation / checkpoint / logging cadence
    "eval_strategy": "steps",
    "eval_steps": 200,
    "save_steps": 200,
    "logging_steps": 100,
    # optimization budget
    "num_train_epochs": 3,
    "per_device_train_batch_size": 16,
    "per_device_eval_batch_size": 32,
    # best-checkpoint selection
    "load_best_model_at_end": True,
    "metric_for_best_model": "eval_loss",
    # no external experiment tracker
    "report_to": 'none',
}
training_args = TrainingArguments(**training_config)

def compute_metrics(p):
    """Compute classification accuracy for one evaluation pass.

    Args:
        p: Object exposing ``predictions`` (per-class scores, class on the
            last axis) and ``label_ids`` (the true class ids).

    Returns:
        A dict with a single ``"accuracy"`` entry: the fraction of samples
        whose highest-scoring class equals the true label.
    """
    predicted_ids = p.predictions.argmax(axis=-1)
    is_correct = predicted_ids == p.label_ids
    return {"accuracy": is_correct.mean()}

# Wire the model, the run settings, both datasets and the accuracy metric
# into a Trainer.
trainer = Trainer(
    compute_metrics=compute_metrics,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
    args=training_args,
    model=model,
)

# Run fine-tuning; left as the last expression so the notebook shows the result.
trainer.train()



Vocab size: 8002


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at skt/kobert-base-v1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss


In [None]:
# Write the fine-tuned model and its tokenizer into the same directory so
# they can be loaded back together from one path.
best_model_dir = "./kobert_mbti/best_model"
model.save_pretrained(best_model_dir)
tokenizer.save_pretrained(best_model_dir)


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Directory the fine-tuned model and tokenizer were saved to.
model_path = "./kobert_mbti/best_model"

# Load the tokenizer and model back from disk.
# The tokenizer was first created with trust_remote_code=True, so the same
# flag is passed here; presumably the saved tokenizer config still points at
# that custom tokenizer class — TODO confirm. The flag lets transformers run
# code from the checkpoint, so only use it with sources you trust.
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

# Switch to inference mode. The trailing semicolon stops the notebook from
# printing the model's full module tree as the cell output.
model.eval();


In [None]:
import torch

# Five utterances collected from the user (example data).
utterances = [
    "요즘 감정 기복이 너무 심해.",
    "나는 왜 이렇게 사람 눈치를 볼까?",
    "하고 싶은 일은 많은데 의욕이 없어.",
    "나 자신이 너무 무기력하게 느껴져.",
    "이런 내가 싫어."
]

# Merge the utterances into one input, separated by the tokenizer's own
# separator token; fall back to the literal "[SEP]" if it defines none.
separator = tokenizer.sep_token or "[SEP]"
input_text = f" {separator} ".join(utterances)

# Tokenize.
# NOTE(review): training tokenized with max_len=256 (the MBTIDataset default)
# while this cell allows up to 512 tokens — confirm the longer limit is intended.
inputs = tokenizer(
    input_text,
    return_tensors="pt",
    truncation=True,
    padding=True,
    max_length=512
)
# Keep the input tensors on the same device as the model's weights so the
# forward pass cannot fail on a CPU/GPU mismatch.
inputs = inputs.to(model.device)

# Predict. eval() disables dropout so the prediction is deterministic even if
# the model was left in training mode.
model.eval()
with torch.no_grad():
    outputs = model(**inputs)
    pred = outputs.logits.argmax(dim=-1).item()

# Map the predicted class id back to its MBTI string.
predicted_mbti = label2mbti[pred]
print("예측된 MBTI:", predicted_mbti)
