In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.utils import resample

# Download the Kaggle dataset (requires the kagglehub library)
import kagglehub
path = kagglehub.dataset_download("zeyadkhalid/mbti-personality-types-500-dataset")
print("Path to dataset files:", path)

# Load the CSV and discard rows that lack either the MBTI type or the post text
df = pd.read_csv(f"{path}/MBTI 500.csv").dropna(subset=['type', 'posts'])

# TF-IDF features for every post, densified into a plain array
vectorizer = TfidfVectorizer(max_features=1000, max_df=0.9)
X_full = vectorizer.fit_transform(df['posts']).toarray()

# One binary label per MBTI axis, read from the matching letter position of
# the four-letter type; 1 marks E, S, F and P respectively, 0 the other letter
df['I_E'] = df['type'].map(lambda mbti: int(mbti[0] == 'E'))
df['N_S'] = df['type'].map(lambda mbti: int(mbti[1] == 'S'))
df['T_F'] = df['type'].map(lambda mbti: int(mbti[2] == 'F'))
df['J_P'] = df['type'].map(lambda mbti: int(mbti[3] == 'P'))

# Grid search over tree depth for a single MBTI dimension
def grid_search_for_dimension(X, y, label, max_depth_values):
    """Try several tree depths for one binary MBTI axis and report the best.

    The data is split 80/20 once (``random_state=42``) and every candidate
    depth is trained and scored on that same split. The split used to be
    recomputed inside the loop; because its arguments never change, hoisting
    it yields exactly the same partitions while avoiding repeated work.

    Note that the depth is selected on the same held-out 20% that the
    returned accuracy is measured on, so that accuracy is an optimistic
    estimate rather than an independent test score.

    Args:
        X: Feature matrix accepted by ``train_test_split`` and the classifier.
        y: Binary labels aligned with the rows of ``X``.
        label: Display name of the dimension, used only in printed messages.
        max_depth_values: Iterable of candidate ``max_depth`` values.

    Returns:
        Tuple ``(best_max_depth, best_accuracy)``. On ties the earliest
        candidate in ``max_depth_values`` wins.

    Raises:
        ValueError: If ``max_depth_values`` contains no candidates.
    """
    # Materialise so that one-shot iterables work and emptiness can be checked.
    max_depth_values = list(max_depth_values)
    if not max_depth_values:
        raise ValueError("max_depth_values must contain at least one candidate depth")

    print(f"Grid search for {label} dimension:")

    # Split the data once; the split does not depend on the candidate depth.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    accuracies = []
    for max_depth in max_depth_values:
        # Initialise the model for this candidate depth
        model = GradientBoostingClassifier(
            n_estimators=200,      # number of boosting stages
            learning_rate=0.1,     # learning rate
            max_depth=max_depth,   # tree depth
            random_state=42
        )

        # Train, then score on the held-out split
        model.fit(X_train, y_train)
        accuracy = accuracy_score(y_test, model.predict(X_test))
        accuracies.append((max_depth, accuracy))
        print(f"Max Depth: {max_depth}, Accuracy: {accuracy:.4f}")

    # Pick the depth with the highest accuracy
    best_max_depth, best_accuracy = max(accuracies, key=lambda pair: pair[1])
    print(f"Best Max Depth for {label}: {best_max_depth}, Best Accuracy: {best_accuracy:.4f}")
    print("-" * 50)

    return best_max_depth, best_accuracy

# Run the depth search for every MBTI dimension and record the winning setting
max_depth_values = [2, 3, 5, 7, 9]
best_models = {}

# Each pair is (display name, label column in df)
for dimension, label in (('I/E', 'I_E'), ('N/S', 'N_S'), ('T/F', 'T_F'), ('J/P', 'J_P')):
    y_dimension = df[label]
    best_max_depth, best_accuracy = grid_search_for_dimension(
        X_full, y_dimension, dimension, max_depth_values
    )
    best_models[dimension] = {
        'max_depth': best_max_depth,
        'accuracy': best_accuracy,
    }

# Overall accuracy: a prediction counts only if all four letters are correct
def calculate_total_accuracy(X, df, best_models):
    """Return the fraction of held-out rows whose full MBTI type is predicted.

    The data is split 80/20 (``random_state=42``). One gradient-boosting
    model per axis is fitted on the training part using the ``max_depth``
    stored in ``best_models``, the whole test part is predicted in one call
    per model, and the four binary outputs are decoded into a four-letter
    type that is compared with the ``type`` column.

    Each model is fitted exactly once. Fitting does not depend on the test
    row being scored, so refitting per row only multiplied the training cost
    by the number of test rows without changing any prediction.

    Args:
        X: Feature matrix with one row per row of ``df``.
        df: DataFrame holding the ``type`` column and the label columns
            ``I_E``, ``N_S``, ``T_F`` and ``J_P``.
        best_models: Mapping with the keys ``'I/E'``, ``'N/S'``, ``'T/F'``
            and ``'J/P'``; each value is a mapping that has a ``'max_depth'``
            entry.

    Returns:
        Exact-match accuracy on the test split, as a float.

    Raises:
        KeyError: If ``best_models`` lacks one of the four axis keys.
    """
    X_train, X_test, df_train, df_test = train_test_split(X, df, test_size=0.2, random_state=42)

    # (axis key, letter for label 0, letter for label 1), in type-letter order.
    # Looking models up by key keeps the decoding correct even when
    # best_models was populated in a different order.
    axes = (
        ('I/E', 'I', 'E'),
        ('N/S', 'N', 'S'),
        ('T/F', 'T', 'F'),
        ('J/P', 'J', 'P'),
    )

    letters_per_axis = []
    for dimension, zero_letter, one_letter in axes:
        label = dimension.replace("/", "_")  # e.g. I/E -> I_E
        model = GradientBoostingClassifier(
            n_estimators=200,
            learning_rate=0.1,
            max_depth=best_models[dimension]['max_depth'],
            random_state=42
        )
        model.fit(X_train, df_train[label])

        # Predict the whole test split at once and decode 0/1 into letters
        axis_predictions = model.predict(X_test)
        letters_per_axis.append(
            [one_letter if flag == 1 else zero_letter for flag in axis_predictions]
        )

    # Stitch the per-axis letters into one four-letter type per test row
    predicted_types = [''.join(letters) for letters in zip(*letters_per_axis)]
    actual_types = df_test['type'].tolist()

    correct_predictions = sum(
        predicted == actual for predicted, actual in zip(predicted_types, actual_types)
    )
    total_accuracy = correct_predictions / len(actual_types)
    return total_accuracy

# Compute and print the final total accuracy (all four letters of the type
# must match), using the per-dimension depths stored in best_models
total_accuracy = calculate_total_accuracy(X_full, df, best_models)
print(f"Total MBTI Prediction Accuracy: {total_accuracy:.4f}")


Path to dataset files: /root/.cache/kagglehub/datasets/zeyadkhalid/mbti-personality-types-500-dataset/versions/1
Grid search for I/E dimension:
Max Depth: 2, Accuracy: 0.8670
Max Depth: 3, Accuracy: 0.8790
Max Depth: 5, Accuracy: 0.8847
Max Depth: 7, Accuracy: 0.8889


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
import kagglehub

# Download the Kaggle dataset
path = kagglehub.dataset_download("zeyadkhalid/mbti-personality-types-500-dataset")
print("Path to dataset files:", path)

# Load the CSV and discard rows that lack either the MBTI type or the post text
df = pd.read_csv(f"{path}/MBTI 500.csv").dropna(subset=['type', 'posts'])


# TF-IDF features for every post, densified into a plain array
vectorizer = TfidfVectorizer(max_features=1000, max_df=0.9)
X = vectorizer.fit_transform(df['posts']).toarray()

# One binary label per MBTI axis, read from the matching letter position of
# the four-letter type; 1 marks E, S, F and P respectively, 0 the other letter
df['I_E'] = df['type'].map(lambda mbti: int(mbti[0] == 'E'))
df['N_S'] = df['type'].map(lambda mbti: int(mbti[1] == 'S'))
df['T_F'] = df['type'].map(lambda mbti: int(mbti[2] == 'F'))
df['J_P'] = df['type'].map(lambda mbti: int(mbti[3] == 'P'))

# Downsampling helper
def downsample(df, target_col):
    """Return a copy of ``df`` in which every class of ``target_col`` is equally sized.

    Each class that has more rows than the smallest class is sampled without
    replacement (``random_state=42``) down to the size of the smallest class;
    classes already at that size are kept as they are. The parts are
    concatenated in ``value_counts()`` order, i.e. largest class first.

    Handling each class on its own fixes the tied case: when two classes had
    equal counts, ``idxmax()`` and ``idxmin()`` both named the same class, so
    that class was returned twice and the other one was dropped. It also
    keeps every class when ``target_col`` has more than two distinct values.

    Args:
        df: Source DataFrame.
        target_col: Name of the column holding the class labels.

    Returns:
        A DataFrame with the same columns as ``df`` and equal row counts per
        class. Rows keep their original index labels. An input without any
        labelled rows is returned as an unchanged copy.
    """
    class_counts = df[target_col].value_counts()
    if class_counts.empty:
        # Nothing to balance (no rows, or every label is missing)
        return df.copy()

    minority_class_size = class_counts.min()

    balanced_parts = []
    for class_value, class_size in class_counts.items():
        df_class = df[df[target_col] == class_value]
        if class_size > minority_class_size:
            df_class = resample(
                df_class,
                replace=False,
                n_samples=minority_class_size,
                random_state=42
            )
        balanced_parts.append(df_class)

    df_balanced = pd.concat(balanced_parts)
    return df_balanced

# Downsample separately for each dimension; every call starts from the full
# df, so the four balanced frames contain different subsets of rows
df_ie_balanced = downsample(df, 'I_E')
df_ns_balanced = downsample(df, 'N_S')
df_tf_balanced = downsample(df, 'T_F')
df_jp_balanced = downsample(df, 'J_P')

# Check the per-class row counts after downsampling
print("I_E Balance:\n", df_ie_balanced['I_E'].value_counts())
print("N_S Balance:\n", df_ns_balanced['N_S'].value_counts())
print("T_F Balance:\n", df_tf_balanced['T_F'].value_counts())
print("J_P Balance:\n", df_jp_balanced['J_P'].value_counts())

# Training and evaluation helper (with the adjusted hyperparameters)
def train_and_predict(X_train, y_train, X_test, y_test, label):
    """Fit a gradient-boosting classifier and print its accuracy on the test data.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.
        X_test: Feature matrix the fitted model is scored on.
        y_test: True labels for ``X_test``.
        label: Name shown in the printed accuracy line.

    Returns:
        The fitted ``GradientBoostingClassifier``.
    """
    hyperparameters = {
        'n_estimators': 200,        # number of boosting stages
        'learning_rate': 0.05,      # learning rate
        'max_depth': 2,             # maximum tree depth
        'subsample': 0.9,           # fraction of samples drawn per stage
        'min_samples_split': 5,     # minimum samples needed to split a node
        'min_samples_leaf': 2,      # minimum samples required in a leaf
        'random_state': 42,         # random seed
    }
    model = GradientBoostingClassifier(**hyperparameters)
    model.fit(X_train, y_train)

    # Score the fitted model on the supplied test data and report it
    accuracy = accuracy_score(y_test, model.predict(X_test))
    print(f"Accuracy for {label}: {accuracy}")
    return model

# Per-dimension data split and model training
def prepare_and_train(df_balanced, label):
    """Vectorise a balanced frame, split it 80/20 and train one model on it.

    Uses the module-level ``vectorizer`` (already fitted) to turn the
    ``posts`` column into a dense feature array.

    Args:
        df_balanced: DataFrame with a ``posts`` column and the ``label`` column.
        label: Name of the binary label column to learn.

    Returns:
        Tuple ``(model, X_test, y_test)``: the model returned by
        ``train_and_predict`` plus the test features and labels of the split.
    """
    features = vectorizer.transform(df_balanced['posts']).toarray()
    targets = df_balanced[label]

    split = train_test_split(features, targets, test_size=0.2, random_state=42)
    train_features, holdout_features, train_targets, holdout_targets = split

    fitted_model = train_and_predict(
        train_features, train_targets, holdout_features, holdout_targets, label
    )
    return fitted_model, holdout_features, holdout_targets

# Train one model per dimension, each on its own balanced frame; the test
# features and labels of every split are kept alongside the fitted model
model_ie, X_test_ie, y_test_ie = prepare_and_train(df_ie_balanced, 'I_E')
model_ns, X_test_ns, y_test_ns = prepare_and_train(df_ns_balanced, 'N_S')
model_tf, X_test_tf, y_test_tf = prepare_and_train(df_tf_balanced, 'T_F')
model_jp, X_test_jp, y_test_jp = prepare_and_train(df_jp_balanced, 'J_P')

# Final MBTI prediction and exact-match accuracy
def calculate_final_accuracy(X_test, y_test_full):
    """Return the fraction of rows whose full four-letter type is predicted.

    Uses the module-level models ``model_ie``, ``model_ns``, ``model_tf`` and
    ``model_jp``. Each model predicts all rows of ``X_test`` in a single
    call rather than one call per row, which yields the same labels with
    four ``predict`` calls in total. A label of 1 decodes to E, S, F and P
    respectively, anything else to I, N, T and J.

    Args:
        X_test: Feature matrix with one row per sample.
        y_test_full: True four-letter MBTI types aligned with ``X_test``.

    Returns:
        Exact-match accuracy as a float; ``0.0`` when ``y_test_full`` is
        empty (previously a ``ZeroDivisionError``).
    """
    total = len(y_test_full)
    if total == 0:
        return 0.0

    # (model, letter for any other label, letter for label 1), in letter order
    axis_models = (
        (model_ie, 'I', 'E'),
        (model_ns, 'N', 'S'),
        (model_tf, 'T', 'F'),
        (model_jp, 'J', 'P'),
    )
    letters_per_axis = [
        [one_letter if flag == 1 else zero_letter for flag in model.predict(X_test)]
        for model, zero_letter, one_letter in axis_models
    ]

    # Stitch the per-axis letters into one four-letter type per row
    predictions = [''.join(letters) for letters in zip(*letters_per_axis)]

    correct_predictions = sum(true == pred for true, pred in zip(y_test_full, predictions))
    final_accuracy = correct_predictions / total
    return final_accuracy

# Final prediction over the entire dataset (all MBTI types).
# X already holds the dense TF-IDF matrix of df['posts'] produced by this same
# vectorizer via fit_transform, so it is reused here instead of transforming
# every post a second time and allocating a second full-size array.
# NOTE(review): the four models were trained on subsets of these same rows,
# so this figure is not a held-out estimate and is likely optimistic.
X_full_test, y_full_test = X, df['type']
final_accuracy = calculate_final_accuracy(X_full_test, y_full_test)
print("Final MBTI Prediction Accuracy with tuned parameters:", final_accuracy)

Downloading from https://www.kaggle.com/api/v1/datasets/download/zeyadkhalid/mbti-personality-types-500-dataset?dataset_version_number=1...


100%|██████████| 123M/123M [00:03<00:00, 36.1MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/zeyadkhalid/mbti-personality-types-500-dataset/versions/1
I_E Balance:
 I_E
0    25390
1    25390
Name: count, dtype: int64
N_S Balance:
 N_S
0    9201
1    9201
Name: count, dtype: int64
T_F Balance:
 T_F
0    36864
1    36864
Name: count, dtype: int64
J_P Balance:
 J_P
1    44435
0    44435
Name: count, dtype: int64
Accuracy for I_E: 0.8171524222134698
Accuracy for N_S: 0.8489540885628906
Accuracy for T_F: 0.8567747185677472
Accuracy for J_P: 0.8255316754810397
Final MBTI Prediction Accuracy with tuned parameters: 0.5740428219898743
