In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping

In [2]:
# 데이터 로드
df = pd.read_csv('mbti_binary_multi_sentence.csv')
df = df.dropna(subset=['text', 'JP'])

In [3]:
# 텍스트 및 이진 라벨 준비
X = df['text']
y = df['JP']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [4]:

# 토큰화 및 시퀀스 패딩
tokenizer = Tokenizer(num_words=20000, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq = tokenizer.texts_to_sequences(X_val)

max_len = 300
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding='post')
X_val_pad = pad_sequences(X_val_seq, maxlen=max_len, padding='post')


In [5]:

# Bi-LSTM 이진 분류 모델
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=20000, output_dim=128, input_length=max_len),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, dropout=0.3, recurrent_dropout=0.3)),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy',
              optimizer=tf.keras.optimizers.Adam(learning_rate=0.0003),
              metrics=['accuracy'])




In [6]:

# 학습
early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

history = model.fit(
    X_train_pad, y_train,
    validation_data=(X_val_pad, y_val),
    epochs=2,
    batch_size=32
)



Epoch 1/2
[1m554/554[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m166s[0m 294ms/step - accuracy: 0.5843 - loss: 0.6823 - val_accuracy: 0.5842 - val_loss: 0.6781
Epoch 2/2
[1m554/554[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m177s[0m 319ms/step - accuracy: 0.5923 - loss: 0.6653 - val_accuracy: 0.5840 - val_loss: 0.6811


In [7]:

# 최종 성능 평가
val_loss, val_acc = model.evaluate(X_val_pad, y_val)
print(f"검증 정확도: {val_acc:.4f}, 검증 손실: {val_loss:.4f}")


[1m139/139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 72ms/step - accuracy: 0.5793 - loss: 0.6847
검증 정확도: 0.5840, 검증 손실: 0.6811


In [9]:
# 토크나이저 저장
import pickle

with open("jp_tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)


In [8]:
# 모델 저장
model.save("jp_bilstm_model.h5")

