In [1]:
import pandas as pd 
import numpy as np
import pickle

DATA_PATH = './data/'

In [2]:
with open(f"{DATA_PATH}train.pkl", 'rb') as f:
    data = pickle.load(f)

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import balanced_accuracy_score
train, valid = train_test_split(data, test_size=0.2, random_state=42, stratify=data['mbti'], shuffle=True)

y_train = train['mbti']
X_train = train.drop('mbti', axis=1)
y_valid = valid['mbti']
X_valid = valid.drop('mbti', axis=1)

# Convert y_train and y_valid to separate binary columns for each MBTI character
y_train_binary = pd.DataFrame({
    'I-E': y_train.apply(lambda x: 1 if x[0] == 'I' else 0),
    'N-S': y_train.apply(lambda x: 1 if x[1] == 'N' else 0),
    'T-F': y_train.apply(lambda x: 1 if x[2] == 'T' else 0),
    'J-P': y_train.apply(lambda x: 1 if x[3] == 'J' else 0)
})

y_valid_binary = pd.DataFrame({
    'I-E': y_valid.apply(lambda x: 1 if x[0] == 'I' else 0),
    'N-S': y_valid.apply(lambda x: 1 if x[1] == 'N' else 0),
    'T-F': y_valid.apply(lambda x: 1 if x[2] == 'T' else 0),
    'J-P': y_valid.apply(lambda x: 1 if x[3] == 'J' else 0)
})

In [4]:
from catboost import CatBoostClassifier

# List to store models and performance metrics
models = {}
accuracy_scores = {}
f1_scores = {}
mcc_scores = {}
balanced_accuracy_scores = {}

print("CatBoost Results")
# Train and evaluate a model for each binary classification problem
for column in y_train_binary.columns:
    # Initialize the CatBoostClassifier
    model = CatBoostClassifier(verbose=0, task_type="GPU", devices='0:1', auto_class_weights="Balanced")
    
    # Fit the model
    model.fit(X_train, y_train_binary[column])
    
    # Predict on the validation set
    y_pred = model.predict(X_valid)
    
    # Calculate accuracy and F1 score
    accuracy = accuracy_score(y_valid_binary[column], y_pred)
    f1 = f1_score(y_valid_binary[column], y_pred)
    mcc = matthews_corrcoef(y_valid_binary[column], y_pred)
    b_accuracy = balanced_accuracy_score(y_valid_binary[column], y_pred)
    
    # Store the model and metrics
    models[column] = model
    accuracy_scores[column] = accuracy
    f1_scores[column] = f1
    mcc_scores[column] = mcc
    balanced_accuracy_scores[column] = b_accuracy
    
    
    # Print the results
    print(f"Results for {column}:")
    print(f"Accuracy: {accuracy}")
    print(f"F1 Score: {f1}")
    print(f"MCC: {mcc}")
    print(f"Balanced Accuracy: {b_accuracy}")
    print("-" * 30)

CatBoost Results
Results for I-E:
Accuracy: 0.5747063254947632
F1 Score: 0.6809974179138345
MCC: 0.12041243593219175
Balanced Accuracy: 0.5734348758982699
------------------------------
Results for N-S:
Accuracy: 0.5811542134703716
F1 Score: 0.7192095502951992
MCC: 0.09714719630339974
Balanced Accuracy: 0.5919778953089392
------------------------------
Results for T-F:
Accuracy: 0.621217723486249
F1 Score: 0.646157881321182
MCC: 0.23870814782814875
Balanced Accuracy: 0.6194664695936424
------------------------------
Results for J-P:
Accuracy: 0.5781092127091214
F1 Score: 0.5182412203261247
MCC: 0.14713168166080184
Balanced Accuracy: 0.5746841826708493
------------------------------


In [5]:
from xgboost import XGBClassifier

# List to store models and performance metrics
models = {}
accuracy_scores = {}
f1_scores = {}
mcc_scores = {}
balanced_accuracy_scores = {}


print("XGBoost Results")
# Train and evaluate a model for each binary classification problem
for column in y_train_binary.columns:
    # Initialize the CatBoostClassifier
    model = XGBClassifier(device="cuda")
    
    # Fit the model
    model.fit(X_train, y_train_binary[column])
    
    # Predict on the validation set
    y_pred = model.predict(X_valid)
    
    # Calculate accuracy and F1 score
    accuracy = accuracy_score(y_valid_binary[column], y_pred)
    f1 = f1_score(y_valid_binary[column], y_pred)
    mcc = matthews_corrcoef(y_valid_binary[column], y_pred)
    b_accuracy = balanced_accuracy_score(y_valid_binary[column], y_pred)
    
    # Store the model and metrics
    models[column] = model
    accuracy_scores[column] = accuracy
    f1_scores[column] = f1
    mcc_scores [column] = mcc
    balanced_accuracy_scores[column] = b_accuracy
    
    # Print the results
    print(f"Results for {column}:")
    print(f"Accuracy: {accuracy}")
    print(f"F1 Score: {f1}")
    print(f"MCC: {mcc}")
    print(f"Balanced Accuracy: {b_accuracy}")
    print("-" * 30)

XGBoost Results
Results for I-E:
Accuracy: 0.789082288179663
F1 Score: 0.8819406399516566
MCC: 0.044971229733140175
Balanced Accuracy: 0.5025019540008622
------------------------------
Results for N-S:
Accuracy: 0.9262813224794215
F1 Score: 0.9617108767072361
MCC: 0.06286912107341662
Balanced Accuracy: 0.5031489930147138
------------------------------
Results for T-F:
Accuracy: 0.6284624525701586
F1 Score: 0.6793414267040276
MCC: 0.2463223996726144
Balanced Accuracy: 0.6199122553433498
------------------------------
Results for J-P:
Accuracy: 0.6161875404105215
F1 Score: 0.32465399789858956
MCC: 0.149019797620079
Balanced Accuracy: 0.5556106788424128
------------------------------


In [6]:
from lightgbm import LGBMClassifier 


# List to store models and performance metrics
models = {}
accuracy_scores = {}
f1_scores = {}
mcc_scores = {}
balanced_accuracy_scores = {}

print("LightGBM Results")
# Train and evaluate a model for each binary classification problem
for column in y_train_binary.columns:
    # Initialize the CatBoostClassifier
    model = LGBMClassifier(verbose=0, is_unbalance=True)
        
    # Fit the model
    model.fit(X_train, y_train_binary[column])
    
    # Predict on the validation set
    y_pred = model.predict(X_valid)
    
    # Calculate accuracy and F1 score
    accuracy = accuracy_score(y_valid_binary[column], y_pred)
    f1 = f1_score(y_valid_binary[column], y_pred)
    mcc = matthews_corrcoef(y_valid_binary[column], y_pred)
    b_accuracy = balanced_accuracy_score(y_valid_binary[column], y_pred)
    
    # Store the model and metrics
    models[column] = model
    accuracy_scores[column] = accuracy
    f1_scores[column] = f1
    mcc_scores[column] = mcc
    balanced_accuracy_scores[column] = b_accuracy
    
    # Print the results
    print(f"Results for {column}:")
    print(f"Accuracy: {accuracy}")
    print(f"F1 Score: {f1}")
    print(f"MCC: {mcc}")
    print(f"Balanced Accuracy: {b_accuracy}")
    print("-" * 30)

LightGBM Results
Results for I-E:
Accuracy: 0.5760607576515531
F1 Score: 0.6826698793468866
MCC: 0.11988924335941593
Balanced Accuracy: 0.5730784104707725
------------------------------
Results for N-S:
Accuracy: 0.5840311460077865
F1 Score: 0.7217824963995382
MCC: 0.09698417816655301
Balanced Accuracy: 0.5917365099954532
------------------------------
Results for T-F:
Accuracy: 0.6203536550884138
F1 Score: 0.6453526709604966
MCC: 0.23697056562078328
Balanced Accuracy: 0.6185967701511876
------------------------------
Results for J-P:
Accuracy: 0.5781368945342237
F1 Score: 0.5158218655822229
MCC: 0.14561633368752608
Balanced Accuracy: 0.5738694659968057
------------------------------
