In [1]:
import pandas as pd
import numpy as np
from sklearn.utils import class_weight
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tensorflow import keras
from tensorflow.keras.models import Sequential, load_model
import json
from joblib import dump

In [2]:
df = pd.read_csv("./csv/combined_mbti_df.csv")

df = df.drop_duplicates().dropna()

df.tail(5)

Unnamed: 0,mbti,function_pair,danceability_mean,danceability_stdev,energy_mean,energy_stdev,loudness_mean,loudness_stdev,mode_mean,mode_stdev,...,Aminor_count,AMajor_count,A#/Bbminor_count,BMajor_count,Dminor_count,D#_Ebminor_count,Gminor_count,A#/BbMajor_count,F#/GbMajor_count,Bminor_count
4076,ESTJ,SJ,0.552889,0.1551,0.595611,0.137195,-7.224889,2.101033,0.5,0.514496,...,2.0,0.0,0.0,1.0,0.0,0.0,2.0,1.0,0.0,2.0
4077,ESTJ,SJ,0.51778,0.142557,0.67494,0.182267,-7.9022,2.822676,0.6,0.494872,...,3.0,3.0,0.0,5.0,1.0,1.0,2.0,2.0,1.0,4.0
4078,ESTJ,SJ,0.585313,0.181908,0.694375,0.173636,-5.307063,1.531874,0.5625,0.512348,...,1.0,1.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4079,ESTJ,SJ,0.63604,0.152382,0.65242,0.176042,-6.55302,2.813042,0.54,0.503457,...,2.0,0.0,3.0,1.0,0.0,1.0,2.0,2.0,1.0,3.0
4080,ESTJ,SJ,0.640733,0.145205,0.678333,0.182482,-5.763733,2.075266,0.555556,0.502519,...,2.0,0.0,4.0,1.0,2.0,0.0,3.0,1.0,2.0,2.0


In [3]:
playlist_features = [
    "danceability_mean",
    "energy_mean",
    "mode_mean",
    "speechiness_mean",
    "liveness_mean",
    "valence_mean",
    "tempo_mean",
    "instrumentalness_mean",
]

### CONVERT TO 4 LETTER COLUMNS

In [4]:
df["IE"] = df["mbti"].str[0].map({'I': 0, 'E': 1})
df["NS"] = df["mbti"].str[1].map({'N': 0, 'S': 1})
df["TF"] = df["mbti"].str[2].map({'T': 0, 'F': 1})
df["JP"] = df["mbti"].str[3].map({'J': 0, 'P': 1})

df.head(5)

Unnamed: 0,mbti,function_pair,danceability_mean,danceability_stdev,energy_mean,energy_stdev,loudness_mean,loudness_stdev,mode_mean,mode_stdev,...,Dminor_count,D#_Ebminor_count,Gminor_count,A#/BbMajor_count,F#/GbMajor_count,Bminor_count,IE,NS,TF,JP
0,INFP,NF,0.557841,0.155011,0.553325,0.225178,-8.352591,3.273317,0.659091,0.479495,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0,1,1
1,INFP,NF,0.587636,0.135644,0.556273,0.191642,-8.215697,3.356867,0.636364,0.488504,...,1.0,1.0,1.0,1.0,0.0,0.0,0,0,1,1
2,INFP,NF,0.677,0.128009,0.85128,0.128336,-5.0461,2.180554,0.58,0.498569,...,1.0,0.0,0.0,1.0,2.0,9.0,0,0,1,1
3,INFP,NF,0.517,0.169477,0.513412,0.258345,-10.172833,4.93514,0.785714,0.4153,...,0.0,1.0,0.0,3.0,0.0,0.0,0,0,1,1
4,INFP,NF,0.5604,0.14145,0.445862,0.242592,-10.57224,5.685179,0.82,0.388088,...,0.0,0.0,0.0,5.0,1.0,1.0,0,0,1,1


In [5]:
x = df[playlist_features]

scaler = StandardScaler()
x = scaler.fit_transform(x)
dump(scaler, "models/mbti_scaler.pkl") 

['models/mbti_scaler.pkl']

### TRAIN MODEL (BY EACH LETTER)

In [6]:
def train_model(x, y):
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, 
        test_size=0.3, 
        random_state=42, 
        shuffle=True)

    model = Sequential([
        keras.layers.Dense(32, activation='relu'),
        keras.layers.Dropout(0.1),

        keras.layers.Dense(64, activation='relu'),
        keras.layers.Dropout(0.2),

        keras.layers.Dense(128, activation='relu', input_dim=x.shape[1]),
        keras.layers.Dropout(0.2),
    
        keras.layers.Dense(1, activation='sigmoid')
    ])

    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )

    model.fit(x_train, y_train,
              epochs = 30, 
              validation_split=0.1)
    
    loss, accuracy = model.evaluate(x_test, y_test, verbose=0)
    
    return model, accuracy, x_test, y_test


In [7]:
ie_model, ie_accuracy, x_ie_test, y_ie_test = train_model(x, df["IE"])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/30
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 22ms/step - accuracy: 0.6370 - loss: 0.6567 - val_accuracy: 0.7071 - val_loss: 0.6017
Epoch 2/30
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.6794 - loss: 0.6166 - val_accuracy: 0.7107 - val_loss: 0.6011
Epoch 3/30
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - accuracy: 0.7038 - loss: 0.5953 - val_accuracy: 0.6929 - val_loss: 0.6061
Epoch 4/30
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.7037 - loss: 0.5916 - val_accuracy: 0.7036 - val_loss: 0.6066
Epoch 5/30
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.7002 - loss: 0.5841 - val_accuracy: 0.6893 - val_loss: 0.6054
Epoch 6/30
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.7106 - loss: 0.5870 - val_accuracy: 0.7036 - val_loss: 0.6085
Epoch 7/30
[1m79/79[0m [32m━━━━

In [8]:
ns_model, ns_accuracy, x_ns_test, y_ns_test = train_model(x, df["NS"])

Epoch 1/30
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 25ms/step - accuracy: 0.5163 - loss: 0.6966 - val_accuracy: 0.5643 - val_loss: 0.6798
Epoch 2/30
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - accuracy: 0.5494 - loss: 0.6886 - val_accuracy: 0.5821 - val_loss: 0.6732
Epoch 3/30
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - accuracy: 0.6028 - loss: 0.6727 - val_accuracy: 0.5929 - val_loss: 0.6643
Epoch 4/30
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.6007 - loss: 0.6633 - val_accuracy: 0.5893 - val_loss: 0.6665
Epoch 5/30
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - accuracy: 0.5965 - loss: 0.6675 - val_accuracy: 0.5929 - val_loss: 0.6691
Epoch 6/30
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.6201 - loss: 0.6574 - val_accuracy: 0.5929 - val_loss: 0.6654
Epoch 7/30
[1m79/79[0m [32m━━━━

In [9]:
tf_model, tf_accuracy, x_tf_test, y_tf_test = train_model(x, df["TF"])

Epoch 1/30
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 22ms/step - accuracy: 0.5655 - loss: 0.6693 - val_accuracy: 0.6821 - val_loss: 0.6282
Epoch 2/30
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.6968 - loss: 0.6102 - val_accuracy: 0.6821 - val_loss: 0.6141
Epoch 3/30
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.6992 - loss: 0.6041 - val_accuracy: 0.6893 - val_loss: 0.6147
Epoch 4/30
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.7112 - loss: 0.5884 - val_accuracy: 0.6821 - val_loss: 0.6091
Epoch 5/30
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.7135 - loss: 0.5901 - val_accuracy: 0.6929 - val_loss: 0.6082
Epoch 6/30
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.7117 - loss: 0.5737 - val_accuracy: 0.6857 - val_loss: 0.6087
Epoch 7/30
[1m79/79[0m [32m━━━━

In [10]:
jp_model, jp_accuracy, x_jp_test, y_jp_test = train_model(x, df["JP"])

Epoch 1/30
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 41ms/step - accuracy: 0.5495 - loss: 0.6897 - val_accuracy: 0.6607 - val_loss: 0.6440
Epoch 2/30
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 24ms/step - accuracy: 0.5989 - loss: 0.6681 - val_accuracy: 0.6429 - val_loss: 0.6479
Epoch 3/30
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 24ms/step - accuracy: 0.6099 - loss: 0.6579 - val_accuracy: 0.6393 - val_loss: 0.6435
Epoch 4/30
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 24ms/step - accuracy: 0.6137 - loss: 0.6549 - val_accuracy: 0.6500 - val_loss: 0.6411
Epoch 5/30
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 22ms/step - accuracy: 0.6145 - loss: 0.6536 - val_accuracy: 0.6571 - val_loss: 0.6335
Epoch 6/30
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 26ms/step - accuracy: 0.6091 - loss: 0.6545 - val_accuracy: 0.6750 - val_loss: 0.6303
Epoch 7/30
[1m79/79[0m [32m━━━

In [11]:
model_info = {
    "ie_model": (ie_model, ie_accuracy),
    "ns_model": (ns_model, ns_accuracy),
    "tf_model": (tf_model, tf_accuracy),
    "jp_model": (jp_model, jp_accuracy),
}

### SAVE MODELS

In [12]:
# load record
with open("./models/accuracy_record.json", "r") as f:
    config = json.load(f)
    
def new_save(model, accuracy, name):
    print(accuracy, config[name])
    if accuracy > config[name]:
        config[name] = accuracy
        model.save(f"./models/{name}.keras")  

for name, (model, accuracy) in model_info.items():
    new_save(model, accuracy, name)

# save record
with open("./models/accuracy_record.json", "w") as f:
    json.dump(config, f, indent=4)

0.7331109046936035 0.7289407849311829
0.6221851706504822 0.6230191588401794
0.7122602462768555 0.7289407849311829
0.606338620185852 0.597998321056366


### LOAD MODEL FROM SAVE

In [13]:
def load_from_save(name):
    return load_model(f"./models/{name}.keras")

# for name, (model, _) in model_info.items():
#     model = load_from_save(name)

ie_model = load_from_save("ie_model")
ns_model = load_from_save("ns_model")
tf_model = load_from_save("tf_model")
jp_model = load_from_save("jp_model")

### TEST PREDICTION

In [14]:
ie_pred = (ie_model.predict(x_ie_test) > 0.5).astype(int).reshape(-1)
ns_pred = (ns_model.predict(x_ns_test) > 0.5).astype(int).reshape(-1)
tf_pred = (tf_model.predict(x_tf_test) > 0.5).astype(int).reshape(-1)
jp_pred = (jp_model.predict(x_jp_test) > 0.5).astype(int).reshape(-1)

# Map to MBTI
def get_mbti(ie, ns, tf, jp):
    return f"{'E' if ie else 'I'}{'S' if ns else 'N'}{'F' if tf else 'T'}{'P' if jp else 'J'}"

pred_mbti = [get_mbti(ie_pred[i], ns_pred[i], tf_pred[i], jp_pred[i]) for i in range(len(ie_pred))]

actual_mbti = [
    get_mbti(y_ie_test.iloc[i], y_ns_test.iloc[i], y_tf_test.iloc[i], y_jp_test.iloc[i])
    for i in range(len(y_ie_test))
]

# So sánh
for p, a in zip(pred_mbti[20:], actual_mbti[20:]):
    print(f"Pred: {p:<10} | Actual: {a}")


[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Pred: ENTP       | Actual: ENTJ
Pred: ESTP       | Actual: ESTP
Pred: INFP       | Actual: ISFP
Pred: INTJ       | Actual: ISTJ
Pred: INFP       | Actual: ISFJ
Pred: INTP       | Actual: ISTP
Pred: INTJ       | Actual: INFP
Pred: INTJ       | Actual: ENTJ
Pred: ESTP       | Actual: ENTP
Pred: ISFJ       | Actual: ISFP
Pred: ISFJ       | Actual: INFJ
Pred: ENTJ       | Actual: ENTJ
Pred: ENFP       | Actual: ENFJ
Pred: ISFP       | Actual: ISFP
Pred: ENTP       | Actual: ENFP
Pred: ESTP       | Actual: ENTP
Pred: INTP       | Actual: INTJ
Pred: INFP       | Actual: INTP
Pred: ENFJ       | Actual: ISTJ
Pred: ESTP       | Actual: ESTP
Pred: ENTP       | Actual: ENTJ
Pred: ISFJ       | Actual: ENFJ
Pred

### MORE TEST

In [15]:
mbti = "INTJ"

test_df = pd.read_csv(f"./csv/{mbti}_df.csv")

test_data = test_df[playlist_features]

ie_pred = (ie_model.predict(test_data) > 0.5).astype(int).reshape(-1)
ns_pred = (ns_model.predict(test_data) > 0.5).astype(int).reshape(-1)
tf_pred = (tf_model.predict(test_data) > 0.5).astype(int).reshape(-1)
jp_pred = (jp_model.predict(test_data) > 0.5).astype(int).reshape(-1)

# Map to MBTI
def get_mbti(ie, ns, tf, jp):
    return f"{'E' if ie else 'I'}{'S' if ns else 'N'}{'F' if tf else 'T'}{'P' if jp else 'J'}"

res = []

for i in range(len(ie_pred)):
    mbti_pred = get_mbti(ie_pred[i], ns_pred[i], tf_pred[i], jp_pred[i])
    res.append(mbti_pred)

table = pd.DataFrame(res).drop_duplicates()
table


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 


Unnamed: 0,0
0,ESFJ
31,INTP


### TRAIN ACCORDING TO FUNCTION PAIR

In [13]:
x = df[playlist_features]
# print(x.corr())
x = scaler.fit_transform(x)
dump(scaler, "models/func_pair_scaler.pkl") 

encoder = LabelEncoder()
y = encoder.fit_transform(df["function_pair"])
y

array([0, 0, 0, ..., 2, 2, 2])

In [14]:
x_test, x_train, y_test, y_train = train_test_split(
    x, y,
    test_size=0.2,
    random_state=42,
    shuffle = True)

In [15]:
func_pair_model = Sequential([
    keras.layers.Dense(32, activation="relu"),
    keras.layers.Dropout(0.1),

    keras.layers.Dense(64, activation="relu"),
    keras.layers.Dropout(0.2),

    keras.layers.Dense(128, activation="relu"),
    keras.layers.Dropout(0.3),

    keras.layers.Dense(256, activation="relu", input_dim=x_train.shape[1]),
    keras.layers.Dropout(0.3),

    keras.layers.Dense(4, activation="softmax")  
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [16]:
func_pair_model.compile(
        optimizer= "adam",
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )

In [17]:
func_pair_model.fit(
        x_train, y_train,
        validation_split = 0.1,
        epochs = 30,
        verbose = 1)

loss, func_pair_accuracy = func_pair_model.evaluate(x_test, y_test, verbose=0)

Epoch 1/30
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 65ms/step - accuracy: 0.3087 - loss: 1.3735 - val_accuracy: 0.4625 - val_loss: 1.3546
Epoch 2/30
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step - accuracy: 0.4496 - loss: 1.3011 - val_accuracy: 0.4500 - val_loss: 1.3184
Epoch 3/30
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 24ms/step - accuracy: 0.3913 - loss: 1.2954 - val_accuracy: 0.4500 - val_loss: 1.2840
Epoch 4/30
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 22ms/step - accuracy: 0.4942 - loss: 1.2317 - val_accuracy: 0.4375 - val_loss: 1.2578
Epoch 5/30
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 27ms/step - accuracy: 0.4327 - loss: 1.2559 - val_accuracy: 0.4250 - val_loss: 1.2562
Epoch 6/30
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 25ms/step - accuracy: 0.4344 - loss: 1.2628 - val_accuracy: 0.4250 - val_loss: 1.2735
Epoch 7/30
[1m23/23[0m [32m━━━

In [18]:
with open("./models/accuracy_record.json") as f:
    config = json.load(f)
    print(func_pair_accuracy, config["func_pair_model"])
    
    if func_pair_accuracy > config.get("func_pair_model", 0):
        config["func_pair_model"] = func_pair_accuracy
        func_pair_model.save("./models/func_pair_model.keras")

with open("./models/accuracy_record.json", "w") as f:
    json.dump(config, f, indent=4)

0.4356807470321655 0.44976526498794556
