In [9]:
import pandas as pd

# 저장된 데이터를 불러오기
final_df = pd.read_csv('data/final_df_preprocessed.csv', encoding='utf-8-sig')
test_df = pd.read_csv('data/test_df_preprocessed.csv', encoding='utf-8-sig')


In [11]:
print(final_df.head())
print(test_df.head())

                                                text  persona-id  emotion-id  \
0              해도 없을까 화가 난다 그냥 해결 하는 나아 부담 주고 싶지도 않고           3         368   
1  급여 깎였어 물가 오르는데 월급 자꾸 깎이니까 너무 화가 최대한 지출 억제 해야겠어...           3         368   
2  회사 신입 들어왔는데 말투 거슬려 그런 매일 봐야 한다고 생각 하니까 스트레스 받아...           3         368   
3  직장 막내 이유 온갖 심부름 시켜 일도 많은 정말 분하고 섭섭해 사람 솔직하게 이야...           3         368   
4  입사 신입사원 나를 무시 하는 같아서 너무 화가 상사 먼저 인사 하지 않아서 매일 한다고           3         368   

   emotion  situation  disease  text_pca_1  text_pca_2  
0        8          5        1   -0.045439    0.059560  
1        8          5        1    0.036776    0.019723  
2        8          5        1   -0.196787    0.278481  
3        8          5        1   -0.082461    0.201029  
4        8          5        1    0.063519    0.064646  
                                                text  persona-id  emotion-id  \
0  이번 프로젝트에서 발표를 하는데 내가 실수하는 바람에 우리 팀이 감점을 받았어 너무...           2         381   
1  회사에서 중요한 프로젝트를

In [13]:
from sklearn.preprocessing import LabelEncoder

# 감정 라벨 인코딩
emotion_encoder = LabelEncoder()
final_df["emotion"] = emotion_encoder.fit_transform(final_df["emotion"])
test_df["emotion"] = emotion_encoder.transform(test_df["emotion"])

# 상황 라벨 인코딩
situation_encoder = LabelEncoder()
final_df["situation"] = situation_encoder.fit_transform(final_df["situation"])
test_df["situation"] = situation_encoder.transform(test_df["situation"])

# 질병 라벨 인코딩
disease_encoder = LabelEncoder()
final_df["disease"] = disease_encoder.fit_transform(final_df["disease"])
test_df["disease"] = disease_encoder.transform(test_df["disease"])

# 레이블 인코딩: persona-id, emotion-id 숫자로 변환
persona_encoder = LabelEncoder()
final_df['persona-id'] = persona_encoder.fit_transform(final_df['persona-id'])
test_df['persona-id'] = persona_encoder.transform(test_df['persona-id'])

emotionid_encoder = LabelEncoder()
final_df['emotion-id'] = emotionid_encoder.fit_transform(final_df['emotion-id'])
test_df['emotion-id'] = emotionid_encoder.transform(test_df['emotion-id'])



In [15]:
print(final_df.head())
print(test_df.head())

                                                text  persona-id  emotion-id  \
0              해도 없을까 화가 난다 그냥 해결 하는 나아 부담 주고 싶지도 않고           3         368   
1  급여 깎였어 물가 오르는데 월급 자꾸 깎이니까 너무 화가 최대한 지출 억제 해야겠어...           3         368   
2  회사 신입 들어왔는데 말투 거슬려 그런 매일 봐야 한다고 생각 하니까 스트레스 받아...           3         368   
3  직장 막내 이유 온갖 심부름 시켜 일도 많은 정말 분하고 섭섭해 사람 솔직하게 이야...           3         368   
4  입사 신입사원 나를 무시 하는 같아서 너무 화가 상사 먼저 인사 하지 않아서 매일 한다고           3         368   

   emotion  situation  disease  text_pca_1  text_pca_2  
0        8          5        1   -0.045439    0.059560  
1        8          5        1    0.036776    0.019723  
2        8          5        1   -0.196787    0.278481  
3        8          5        1   -0.082461    0.201029  
4        8          5        1    0.063519    0.064646  
                                                text  persona-id  emotion-id  \
0  이번 프로젝트에서 발표를 하는데 내가 실수하는 바람에 우리 팀이 감점을 받았어 너무...           2         381   
1  회사에서 중요한 프로젝트를

In [29]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA

# 🔹 Step 1: 텍스트 데이터 벡터화 (TF-IDF)
tfidf_vectorizer = TfidfVectorizer(max_features=100)  # 최대 100개 특성 추출

# final_df에서 TF-IDF 벡터화
tfidf_matrix_train = tfidf_vectorizer.fit_transform(final_df['text'])

# test_df에서 TF-IDF 벡터화 (학습된 vectorizer 사용)
tfidf_matrix_test = tfidf_vectorizer.transform(test_df['text'])

# 🔹 Step 2: 텍스트 데이터 차원 축소 (PCA)
pca = PCA(n_components=2)  # 2D로 축소

# final_df에서 PCA 적용
text_pca_train = pca.fit_transform(tfidf_matrix_train.toarray())

# test_df에서 PCA 적용 (학습된 PCA 모델 사용)
text_pca_test = pca.transform(tfidf_matrix_test.toarray())

# 🔹 Step 3: 텍스트 벡터화된 데이터프레임에 추가
final_df['text_pca_1'] = text_pca_train[:, 0]
final_df['text_pca_2'] = text_pca_train[:, 1]

test_df['text_pca_1'] = text_pca_test[:, 0]
test_df['text_pca_2'] = text_pca_test[:, 1]

# 🔹 Step 4: 상관 행렬 계산
# 수치형 변수에 대해서만 상관 관계를 계산합니다.
correlation_matrix_train = final_df[['text_pca_1', 'text_pca_2', 'emotion', 'situation', 'persona-id', 'emotion-id']].apply(pd.to_numeric, errors='coerce').corr()

# # 🔹 Step 5: 상관 행렬 시각화 (final_df)
# plt.figure(figsize=(10, 8))
# sns.heatmap(correlation_matrix_train, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
# plt.title('Correlation Matrix: Text, Persona-id, Emotion-id, Emotion, Situation (Train Data)')
# plt.show()

# # 🔹 Step 6: 상황에 따른 감정 분포 (final_df)
# plt.figure(figsize=(12, 6))
# sns.countplot(data=final_df, x='situation', hue='emotion', palette='Set2')
# plt.title('Emotion Distribution per Situation (Train Data)')
# plt.xlabel('Situation')
# plt.ylabel('Count')
# plt.xticks(rotation=90)
# plt.legend(title='Emotion')
# plt.show()

# # 🔹 Step 7: 페르소나 아이디와 감정 아이디에 따른 감정 분포 (final_df)
# plt.figure(figsize=(12, 6))
# sns.countplot(data=final_df, x='persona-id', hue='emotion', palette='Set1')
# plt.title('Emotion Distribution by Persona-id (Train Data)')
# plt.xlabel('Persona-id')
# plt.ylabel('Count')
# plt.legend(title='Emotion')
# plt.xticks(rotation=90)
# plt.show()


In [17]:
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len, pca_features=None):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.pca_features = pca_features  # PCA 특성 (선택 사항)

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, index):
        text = self.dataframe.iloc[index]['text']
        emotion = self.dataframe.iloc[index]['emotion']
        situation = self.dataframe.iloc[index]['situation']
        persona_id = self.dataframe.iloc[index]['persona-id']
        emotion_id = self.dataframe.iloc[index]['emotion-id']

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=True,
            return_tensors='pt'
        )

        input_ids = encoding['input_ids'].squeeze(0)  # (1, seq_len) -> (seq_len,)
        attention_mask = encoding['attention_mask'].squeeze(0)  # (1, seq_len) -> (seq_len,)
        token_type_ids = encoding['token_type_ids'].squeeze(0) if 'token_type_ids' in encoding else torch.zeros_like(input_ids)

        # pca_features를 포함하도록 수정
        pca_feature = self.pca_features[index] if self.pca_features is not None else torch.zeros(2)  # 2D PCA 특성 예시 (필요한 특성에 맞게 수정)

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'token_type_ids': token_type_ids,
            'emotion': torch.tensor(emotion, dtype=torch.long),
            'situation': torch.tensor(situation, dtype=torch.long),
            'persona-id': torch.tensor(persona_id, dtype=torch.long),
            'emotion-id': torch.tensor(emotion_id, dtype=torch.long),
            'pca_features': pca_feature  # pca_features 추가
        }


NameError: name 'Dataset' is not defined

In [33]:
# PCA 적용 (예시)
tfidf_vectorizer = TfidfVectorizer(max_features=100)
tfidf_matrix = tfidf_vectorizer.fit_transform(final_df['text'])

# PCA 적용 (2D로 축소)
pca = PCA(n_components=2)
pca_features_train = pca.fit_transform(tfidf_matrix.toarray())

# 데이터셋에 PCA 특성 추가
train_dataset = CustomDataset(final_df, tokenizer, max_len=128, pca_features=pca_features_train)
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)

# PCA 특성을 test 데이터에도 적용
tfidf_matrix_test = tfidf_vectorizer.transform(test_df['text'])
pca_features_test = pca.transform(tfidf_matrix_test.toarray())

test_dataset = CustomDataset(test_df, tokenizer, max_len=128, pca_features=pca_features_test)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)


In [47]:
from transformers import BertModel
import torch
import torch.nn as nn

class KoBERTMultiOutputWithEmbedding(nn.Module):
    def __init__(self, model_name, num_emotions, num_situations, num_personas, num_emotion_ids, embedding_dim=10):
        super(KoBERTMultiOutputWithEmbedding, self).__init__()
        self.bert = BertModel.from_pretrained(model_name)  # KoBERT 기본 모델
        self.dropout = nn.Dropout(0.1)
        
        # 감정 분류 레이어 (입력 크기 확인)
        self.emotion_classifier = nn.Linear(768 + 2 * embedding_dim, num_emotions)  # BERT 출력 + 임베딩 차원 크기
        self.situation_classifier = nn.Linear(768, num_situations)  # 상황 예측은 BERT만 사용

        # persona-id와 emotion-id를 위한 임베딩 레이어
        self.persona_embedding = nn.Embedding(num_personas, embedding_dim)  # persona-id 임베딩
        self.emotion_id_embedding = nn.Embedding(num_emotion_ids, embedding_dim)  # emotion-id 임베딩

    def forward(self, input_ids, attention_mask, token_type_ids, persona_id, emotion_id, pca_features):
        # BERT 모델에서 출력값 얻기
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        pooled_output = outputs.pooler_output  # (batch_size, 768)

        # persona-id와 emotion-id 임베딩
        persona_embedding = self.persona_embedding(persona_id)  # (batch_size, embedding_dim)
        emotion_id_embedding = self.emotion_id_embedding(emotion_id)  # (batch_size, embedding_dim)

        # 임베딩 벡터를 BERT 출력과 결합
        combined_output = torch.cat((pooled_output, persona_embedding, emotion_id_embedding), dim=1)  # (batch_size, 788)

        # 감정 및 상황 예측
        emotion_logits = self.emotion_classifier(combined_output)
        situation_logits = self.situation_classifier(pooled_output)

        return emotion_logits, situation_logits


In [53]:
import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score, f1_score

def evaluate_model(model, test_dataloader, loss_fn, device, emotion_encoder, situation_encoder):
    model.eval()
    total_loss = 0
    correct_emotion = 0
    correct_situation = 0
    total_emotion = 0
    total_situation = 0
    all_emotion_preds = []
    all_situation_preds = []
    all_emotion_labels = []
    all_situation_labels = []

    with torch.no_grad():
        for batch in test_dataloader:
            input_ids, attention_mask, token_type_ids, persona_id, emotion_id, pca_features, emotion_labels, situation_labels = batch
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            token_type_ids = token_type_ids.to(device)
            persona_id = persona_id.to(device)
            emotion_id = emotion_id.to(device)
            pca_features = pca_features.to(device)  # PCA feature가 있다면 사용할 수 있음
            emotion_labels = emotion_labels.to(device)
            situation_labels = situation_labels.to(device)

            # 모델에 입력
            emotion_logits, situation_logits = model(input_ids, attention_mask, token_type_ids, persona_id, emotion_id, pca_features)

            # 손실 계산
            loss = loss_fn(emotion_logits, emotion_labels) + loss_fn(situation_logits, situation_labels)
            total_loss += loss.item()

            # 예측값 얻기
            emotion_pred = torch.argmax(emotion_logits, dim=1)
            situation_pred = torch.argmax(situation_logits, dim=1)

            # 정확도 계산
            correct_emotion += (emotion_pred == emotion_labels).sum().item()
            correct_situation += (situation_pred == situation_labels).sum().item()

            total_emotion += emotion_labels.size(0)
            total_situation += situation_labels.size(0)

            # 예측값과 실제값 저장
            all_emotion_preds.extend(emotion_pred.cpu().numpy())
            all_situation_preds.extend(situation_pred.cpu().numpy())
            all_emotion_labels.extend(emotion_labels.cpu().numpy())
            all_situation_labels.extend(situation_labels.cpu().numpy())

    # 평균 손실 계산
    avg_loss = total_loss / len(test_dataloader)

    # 감정과 상황 정확도 계산
    emotion_accuracy = correct_emotion / total_emotion
    situation_accuracy = correct_situation / total_situation

    # F1-Score 계산 (감정, 상황)
    emotion_f1 = f1_score(all_emotion_labels, all_emotion_preds, average='weighted')
    situation_f1 = f1_score(all_situation_labels, all_situation_preds, average='weighted')

    # Exact Match 계산
    emotion_exact_match = (emotion_encoder.inverse_transform(all_emotion_labels) == emotion_encoder.inverse_transform(all_emotion_preds)).mean()
    situation_exact_match = (situation_encoder.inverse_transform(all_situation_labels) == situation_encoder.inverse_transform(all_situation_preds)).mean()

    return avg_loss, emotion_accuracy, situation_accuracy, emotion_f1, situation_f1, emotion_exact_match, situation_exact_match

# 모델 초기화
model_name = "monologg/kobert"  # KoBERT 모델명
num_emotions = len(final_df["emotion"].unique())  # 감정 클래스 개수
num_situations = len(final_df["situation"].unique())  # 상황 클래스 개수
model = KoBERTMultiOutputWithEmbedding(model_name, num_emotions, num_situations, num_personas, num_emotion_ids)

# device 설정
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# best_model.pth 로드
model.load_state_dict(torch.load("kobert_emotion_situation/best_model.pth"))
model.to(device)

# 평가
test_loss, emotion_accuracy, situation_accuracy, emotion_f1, situation_f1, emotion_exact_match, situation_exact_match = evaluate_model(
    model, test_dataloader, nn.CrossEntropyLoss(), device, emotion_encoder, situation_encoder
)

# 결과 출력
print(f"✅ Test Loss: {test_loss:.4f}")
print(f"✅ Emotion Accuracy: {emotion_accuracy:.4f}")
print(f"✅ Situation Accuracy: {situation_accuracy:.4f}")
print(f"✅ Emotion F1-Score: {emotion_f1:.4f}")
print(f"✅ Situation F1-Score: {situation_f1:.4f}")
print(f"✅ Emotion Exact Match: {emotion_exact_match:.4f}")
print(f"✅ Situation Exact Match: {situation_exact_match:.4f}")


ValueError: not enough values to unpack (expected 8, got 7)

In [49]:
# import torch
# import torch.nn as nn
# from transformers import BertModel, AutoTokenizer
# from sklearn.metrics import accuracy_score, f1_score
# import matplotlib.pyplot as plt


# # ✅ 2. 감정과 상황의 클래스 수 계산
# num_emotions = len(final_df["emotion"].unique())  # 감정 클래스 개수
# num_situations = len(final_df["situation"].unique())  # 상황 클래스 개수

# # ✅ 3. 모델 초기화
# model_name = "monologg/kobert"  # KoBERT 모델명
# model = KoBERTMultiOutputWithEmbedding(model_name, num_emotions, num_situations)  # num_emotions, num_situations 정의 완료

# # ✅ 4. device 설정 (GPU/CPU)
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # CUDA (GPU) 사용 가능 여부 확인
# model.to(device)  # 모델을 해당 device로 이동

# # 모델 체크포인트 불러오기
# model.load_state_dict(torch.load("kobert_emotion_situation/best_model.pth"))
# model.to(device)  # 다시 device로 모델 이동

# # ✅ 5. 학습 기록 불러오기
# history = torch.load("kobert_emotion_situation/history.pth", weights_only=True)
# train_loss_history = history['train_loss']
# train_accuracy_history = history['train_accuracy']

# # ✅ 6. 테스트 데이터셋 준비 (test_dataloader)
# test_texts = test_df["text"].tolist()  # 'text' 컬럼에 테스트 데이터가 있다고 가정
# test_encodings = tokenizer(test_texts, padding=True, truncation=True, max_length=128, return_tensors="pt")

# emotion_labels_test = emotion_encoder.transform(test_df['emotion'].values)  # 감정 라벨
# situation_labels_test = situation_encoder.transform(test_df['situation'].values)  # 상황 라벨

# # 텐서로 변환
# emotion_labels_test = torch.tensor(emotion_labels_test)
# situation_labels_test = torch.tensor(situation_labels_test)

# # Test Dataset과 DataLoader 생성
# test_dataset = TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'], test_encodings['token_type_ids'],
#                              emotion_labels_test, situation_labels_test)
# test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)


# # ✅ 7. 모델 평가 함수 (F1-Score와 Exact Match 추가)
# def evaluate_model(model, test_dataloader, loss_fn, device, emotion_encoder, situation_encoder):
#     model.eval()
#     total_loss = 0
#     correct = 0
#     total = 0
#     all_emotion_preds = []
#     all_situation_preds = []
#     all_emotion_labels = []
#     all_situation_labels = []

#     with torch.no_grad():
#         for batch in test_dataloader:
#             input_ids, attention_mask, token_type_ids, emotion_labels, situation_labels = batch
#             input_ids = input_ids.to(device)
#             attention_mask = attention_mask.to(device)
#             token_type_ids = token_type_ids.to(device)
#             emotion_labels = emotion_labels.to(device)
#             situation_labels = situation_labels.to(device)

#             emotion_logits, situation_logits = model(input_ids, attention_mask, token_type_ids)
#             loss = loss_fn(emotion_logits, emotion_labels) + loss_fn(situation_logits, situation_labels)

#             total_loss += loss.item()

#             emotion_pred = torch.argmax(emotion_logits, dim=1)
#             situation_pred = torch.argmax(situation_logits, dim=1)

#             correct += (emotion_pred == emotion_labels).sum().item()
#             correct += (situation_pred == situation_labels).sum().item()
#             total += emotion_labels.size(0) + situation_labels.size(0)

#             all_emotion_preds.extend(emotion_pred.cpu().numpy())
#             all_situation_preds.extend(situation_pred.cpu().numpy())
#             all_emotion_labels.extend(emotion_labels.cpu().numpy())
#             all_situation_labels.extend(situation_labels.cpu().numpy())

#     avg_loss = total_loss / len(test_dataloader)
#     accuracy = correct / total

#     # 감정과 상황에 대해 F1-Score 계산
#     emotion_f1 = f1_score(all_emotion_labels, all_emotion_preds, average='weighted')
#     situation_f1 = f1_score(all_situation_labels, all_situation_preds, average='weighted')

#     # Exact Match 계산 (True는 1.0, False는 0.0으로 변환)
#     # 예측값을 디코딩하여 정확히 일치하는지 확인
#     emotion_exact_match = (emotion_encoder.inverse_transform(all_emotion_labels) == emotion_encoder.inverse_transform(all_emotion_preds)).mean()
#     situation_exact_match = (situation_encoder.inverse_transform(all_situation_labels) == situation_encoder.inverse_transform(all_situation_preds)).mean()

#     # 감정 정확도, 상황 정확도 계산
#     emotion_accuracy = accuracy_score(all_emotion_labels, all_emotion_preds)
#     situation_accuracy = accuracy_score(all_situation_labels, all_situation_preds)

#     return avg_loss, accuracy, emotion_accuracy, situation_accuracy, emotion_f1, situation_f1, emotion_exact_match, situation_exact_match



# # ✅ 8. 손실 함수 정의
# loss_fn = nn.CrossEntropyLoss()

# # ✅ 9. 모델 평가 실행
# test_loss, test_accuracy, emotion_accuracy, situation_accuracy, emotion_f1, situation_f1, emotion_exact_match, situation_exact_match = evaluate_model(
#     model, test_dataloader, loss_fn, device, emotion_encoder, situation_encoder
# )


# # ✅ 10. 결과 출력
# print(f"✅ Test Loss: {test_loss:.4f}")
# print(f"✅ Test Accuracy: {test_accuracy:.4f}")
# print(f"✅ Emotion Accuracy: {emotion_accuracy:.4f}")
# print(f"✅ Situation Accuracy: {situation_accuracy:.4f}")
# print(f"✅ Emotion F1-Score: {emotion_f1:.4f}")
# print(f"✅ Situation F1-Score: {situation_f1:.4f}")
# print(f"✅ Emotion Exact Match: {emotion_exact_match:.4f}")
# print(f"✅ Situation Exact Match: {situation_exact_match:.4f}")


TypeError: KoBERTMultiOutputWithEmbedding.__init__() missing 2 required positional arguments: 'num_personas' and 'num_emotion_ids'