# Bi-LSTM 기반 E/I 이진 분류기 학습 노트북

In [1]:

import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping


In [2]:

# 데이터 로드
df = pd.read_csv('mbti_binary_multi_sentence.csv')
df = df.dropna(subset=['text', 'EI'])


In [3]:

# 텍스트 및 이진 라벨 준비
X = df['text']
y = df['EI']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)


In [4]:

# 토큰화 및 시퀀스 패딩
tokenizer = Tokenizer(num_words=20000, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq = tokenizer.texts_to_sequences(X_val)

max_len = 300
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding='post')
X_val_pad = pad_sequences(X_val_seq, maxlen=max_len, padding='post')


In [5]:

# Bi-LSTM 이진 분류 모델
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=20000, output_dim=128, input_length=max_len),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, dropout=0.3, recurrent_dropout=0.3)),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy',
              optimizer=tf.keras.optimizers.Adam(learning_rate=0.0003),
              metrics=['accuracy'])




In [7]:

# 학습
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=2,
    restore_best_weights=True
)

history = model.fit(
    X_train_pad, y_train,
    validation_data=(X_val_pad, y_val),
    epochs=10,
    batch_size=32,
    callbacks=[early_stop]
)


Epoch 1/10
[1m554/554[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m176s[0m 318ms/step - accuracy: 0.7650 - loss: 0.5017 - val_accuracy: 0.6161 - val_loss: 0.6987
Epoch 2/10
[1m554/554[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m182s[0m 329ms/step - accuracy: 0.8287 - loss: 0.4046 - val_accuracy: 0.5960 - val_loss: 0.7667
Epoch 3/10
[1m554/554[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m180s[0m 325ms/step - accuracy: 0.8813 - loss: 0.3161 - val_accuracy: 0.5901 - val_loss: 0.8743


In [8]:

# 최종 성능 평가
val_loss, val_acc = model.evaluate(X_val_pad, y_val)
print(f"검증 정확도: {val_acc:.4f}, 검증 손실: {val_loss:.4f}")


[1m139/139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 41ms/step - accuracy: 0.6261 - loss: 0.6902
검증 정확도: 0.6161, 검증 손실: 0.6987


In [10]:
# 토크나이저 저장
import pickle

with open("ei_tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)


In [9]:
# 모델 저장
model.save("ei_bilstm_model.h5")


