# Bi-LSTM 기반 E/I 이진 분류기 학습 노트북

In [1]:

import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
import pickle


In [2]:
# ------------------------------------------
# 1. 데이터 로딩 및 전처리
df = pd.read_csv('mbti_binary_multi_sentence.csv')
df = df.dropna(subset=['text', 'EI'])

X = df['text']
y = df['EI']
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [3]:
# 2. 텍스트 토큰화 및 시퀀스 생성
tokenizer = Tokenizer(num_words=20000, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq = tokenizer.texts_to_sequences(X_val)

max_len = 300
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding='post')
X_val_pad = pad_sequences(X_val_seq, maxlen=max_len, padding='post')

In [4]:
# 3. 클래스 불균형 보정을 위한 class_weight 계산
classes = np.unique(y_train)
weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
class_weights = dict(zip(classes, weights))
print(f"[클래스 가중치]: {class_weights}")

[클래스 가중치]: {np.int64(0): np.float64(0.7655169432918395), np.int64(1): np.float64(1.4415594986163112)}


In [5]:
# 4. Bi-LSTM 이진 분류기 모델 구성
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=20000, output_dim=128, input_length=max_len),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, dropout=0.3, recurrent_dropout=0.3)),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(
    loss='binary_crossentropy',
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.0003),
    metrics=['accuracy']
)



In [6]:
# 5. 학습
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=2,
    restore_best_weights=True
)

history = model.fit(
    X_train_pad, y_train,
    validation_data=(X_val_pad, y_val),
    epochs=2,
    batch_size=32,
    class_weight=class_weights,  # ⭐ 클래스 가중치 적용
    callbacks=[early_stop]
)


Epoch 1/2
[1m554/554[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m151s[0m 268ms/step - accuracy: 0.5336 - loss: 0.6927 - val_accuracy: 0.5278 - val_loss: 0.6912
Epoch 2/2
[1m273/554[0m [32m━━━━━━━━━[0m[37m━━━━━━━━━━━[0m [1m1:09[0m 246ms/step - accuracy: 0.6030 - loss: 0.6796

In [None]:
# 6. 검증 정확도 출력
val_loss, val_acc = model.evaluate(X_val_pad, y_val)
print(f"\n[검증 정확도]: {val_acc:.4f} / [검증 손실]: {val_loss:.4f}")

[1m139/139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 41ms/step - accuracy: 0.6261 - loss: 0.6902
검증 정확도: 0.6161, 검증 손실: 0.6987


In [None]:
# 7. 토크나이저 및 모델 저장
with open("ei_tokenizer2.pkl", "wb") as f:
    pickle.dump(tokenizer, f)

model.save("ei_bilstm_model2.h5")
print("모델 및 토크나이저 저장 완료")