In [135]:
import pandas as pd
import numpy as np
from sklearn.utils import class_weight
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tensorflow import keras
from tensorflow.keras.models import Sequential, load_model
import json

In [None]:
df = pd.read_csv("./csv/combined_mbti_df.csv")

df = df.drop_duplicates().dropna()

df.tail(5)

In [97]:
playlist_features = [
    "danceability_mean",
    "energy_mean",
    "loudness_mean",
    "mode_mean",
    "speechiness_mean",
    "acousticness_mean",
    "liveness_mean",
    "valence_mean",
    "tempo_mean",
    "instrumentalness_mean",
]

### CONVERT TO 4 LETTER COLUMNS

In [98]:
df["IE"] = df["mbti"].str[0].map({'I': 0, 'E': 1})
df["NS"] = df["mbti"].str[1].map({'N': 0, 'S': 1})
df["TF"] = df["mbti"].str[2].map({'T': 0, 'F': 1})
df["JP"] = df["mbti"].str[3].map({'J': 0, 'P': 1})

df.head(5)

Unnamed: 0,mbti,function_pair,danceability_mean,danceability_stdev,energy_mean,energy_stdev,loudness_mean,loudness_stdev,mode_mean,mode_stdev,...,Dminor_count,D#_Ebminor_count,Gminor_count,A#/BbMajor_count,F#/GbMajor_count,Bminor_count,IE,NS,TF,JP
0,INFP,NF,0.557841,0.155011,0.553325,0.225178,-8.352591,3.273317,0.659091,0.479495,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0,1,1
1,INFP,NF,0.587636,0.135644,0.556273,0.191642,-8.215697,3.356867,0.636364,0.488504,...,1.0,1.0,1.0,1.0,0.0,0.0,0,0,1,1
2,INFP,NF,0.677,0.128009,0.85128,0.128336,-5.0461,2.180554,0.58,0.498569,...,1.0,0.0,0.0,1.0,2.0,9.0,0,0,1,1
3,INFP,NF,0.517,0.169477,0.513412,0.258345,-10.172833,4.93514,0.785714,0.4153,...,0.0,1.0,0.0,3.0,0.0,0.0,0,0,1,1
4,INFP,NF,0.5604,0.14145,0.445862,0.242592,-10.57224,5.685179,0.82,0.388088,...,0.0,0.0,0.0,5.0,1.0,1.0,0,0,1,1


In [99]:
x = df[playlist_features]

scaler = StandardScaler()
x = scaler.fit_transform(x)

### TRAIN MODEL (BY EACH LETTER)

In [151]:
def train_model(x, y):
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, 
        test_size=0.3, 
        random_state=42, 
        shuffle=True)

    model = Sequential([
        keras.layers.Dense(128, activation='relu', input_dim=x.shape[1]),
        keras.layers.Dropout(0.2),
        
        keras.layers.Dense(64, activation='relu'),
        keras.layers.Dropout(0.2),

        keras.layers.Dense(32, activation='relu'),
        keras.layers.Dropout(0.1),

        keras.layers.Dense(1, activation='sigmoid')
    ])

    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )

    model.fit(x_train, y_train,
              epochs = 30, 
              validation_split=0.1)
    
    loss, accuracy = model.evaluate(x_test, y_test, verbose=0)
    
    return model, accuracy, x_test, y_test


In [152]:
ie_model, ie_accuracy, x_ie_test, y_ie_test = train_model(x, df["IE"])

Epoch 1/30


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.6601 - loss: 0.6460 - val_accuracy: 0.7250 - val_loss: 0.6096
Epoch 2/30
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6878 - loss: 0.6134 - val_accuracy: 0.7071 - val_loss: 0.6070
Epoch 3/30
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6993 - loss: 0.5986 - val_accuracy: 0.6964 - val_loss: 0.5910
Epoch 4/30
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7042 - loss: 0.5828 - val_accuracy: 0.7000 - val_loss: 0.6037
Epoch 5/30
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7093 - loss: 0.5827 - val_accuracy: 0.7000 - val_loss: 0.5915
Epoch 6/30
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7131 - loss: 0.5679 - val_accuracy: 0.7036 - val_loss: 0.5848
Epoch 7/30
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━

In [153]:
ns_model, ns_accuracy, x_ns_test, y_ns_test = train_model(x, df["NS"])

Epoch 1/30
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.5315 - loss: 0.6958 - val_accuracy: 0.5786 - val_loss: 0.6707
Epoch 2/30
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5969 - loss: 0.6774 - val_accuracy: 0.5821 - val_loss: 0.6671
Epoch 3/30
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5869 - loss: 0.6676 - val_accuracy: 0.5964 - val_loss: 0.6664
Epoch 4/30
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6060 - loss: 0.6659 - val_accuracy: 0.6036 - val_loss: 0.6678
Epoch 5/30
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6062 - loss: 0.6619 - val_accuracy: 0.6107 - val_loss: 0.6679
Epoch 6/30
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6016 - loss: 0.6626 - val_accuracy: 0.5786 - val_loss: 0.6672
Epoch 7/30
[1m79/79[0m [32m━━━━━━━━━━

In [154]:
tf_model, tf_accuracy, x_tf_test, y_tf_test = train_model(x, df["TF"])

Epoch 1/30
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.6058 - loss: 0.6653 - val_accuracy: 0.6571 - val_loss: 0.6160
Epoch 2/30
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7107 - loss: 0.6064 - val_accuracy: 0.6786 - val_loss: 0.6003
Epoch 3/30
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7276 - loss: 0.5792 - val_accuracy: 0.6786 - val_loss: 0.6011
Epoch 4/30
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7220 - loss: 0.5738 - val_accuracy: 0.6929 - val_loss: 0.5915
Epoch 5/30
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7256 - loss: 0.5682 - val_accuracy: 0.6964 - val_loss: 0.5904
Epoch 6/30
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7457 - loss: 0.5607 - val_accuracy: 0.6929 - val_loss: 0.5887
Epoch 7/30
[1m79/79[0m [32m━━━━━━━━━━

In [155]:
jp_model, jp_accuracy, x_jp_test, y_jp_test = train_model(x, df["JP"])

Epoch 1/30
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.5584 - loss: 0.6787 - val_accuracy: 0.6821 - val_loss: 0.6300
Epoch 2/30
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6175 - loss: 0.6556 - val_accuracy: 0.7179 - val_loss: 0.6291
Epoch 3/30
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6295 - loss: 0.6502 - val_accuracy: 0.6857 - val_loss: 0.6254
Epoch 4/30
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6022 - loss: 0.6611 - val_accuracy: 0.6750 - val_loss: 0.6242
Epoch 5/30
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6394 - loss: 0.6447 - val_accuracy: 0.6786 - val_loss: 0.6211
Epoch 6/30
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6341 - loss: 0.6416 - val_accuracy: 0.6821 - val_loss: 0.6201
Epoch 7/30
[1m79/79[0m [32m━━━━━━━━━━

In [156]:
model_info = {
    "ie_model": (ie_model, ie_accuracy),
    "ns_model": (ns_model, ns_accuracy),
    "tf_model": (tf_model, tf_accuracy),
    "jp_model": (jp_model, jp_accuracy),
}

### SAVE MODELS

In [157]:
# load record
with open("./models/accuracy_record.json", "r") as f:
    config = json.load(f)
    
def new_save(model, accuracy, name):
    print(accuracy, config[name])
    if accuracy > config[name]:
        config[name] = accuracy
        model.save(f"./models/{name}.keras")  

for name, (model, accuracy) in model_info.items():
    new_save(model, accuracy, name)

# save record
with open("./models/accuracy_record.json", "w") as f:
    json.dump(config, f, indent=4)

0.7314428687095642 0.7314428687095642
0.6196830868721008 0.6280233263969421
0.7222685813903809 0.7306088209152222
0.6280233263969421 0.6288573741912842


### LOAD MODEL FROM SAVE

In [158]:
def load_from_save(name):
    return load_model(f"./models/{name}.keras")

# for name, (model, _) in model_info.items():
#     model = load_from_save(name)

ie_model = load_from_save("ie_model")
ns_model = load_from_save("ns_model")
tf_model = load_from_save("tf_model")
jp_model = load_from_save("jp_model")

### TEST PREDICTION

In [149]:
ie_pred = (ie_model.predict(x_ie_test) > 0.5).astype(int).reshape(-1)
ns_pred = (ns_model.predict(x_ns_test) > 0.5).astype(int).reshape(-1)
tf_pred = (tf_model.predict(x_tf_test) > 0.5).astype(int).reshape(-1)
jp_pred = (jp_model.predict(x_jp_test) > 0.5).astype(int).reshape(-1)

# Map to MBTI
def get_mbti(ie, ns, tf, jp):
    return f"{'E' if ie else 'I'}{'S' if ns else 'N'}{'F' if tf else 'T'}{'P' if jp else 'J'}"

pred_mbti = [get_mbti(ie_pred[i], ns_pred[i], tf_pred[i], jp_pred[i]) for i in range(len(ie_pred))]

actual_mbti = [
    get_mbti(y_ie_test.iloc[i], y_ns_test.iloc[i], y_tf_test.iloc[i], y_jp_test.iloc[i])
    for i in range(len(y_ie_test))
]

# So sánh
for p, a in zip(pred_mbti[20:], actual_mbti[20:]):
    print(f"Pred: {p:<10} | Actual: {a}")


[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 920us/step
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 863us/step
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 891us/step
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
Pred: ENTP       | Actual: ENTJ
Pred: ESTP       | Actual: ESTP
Pred: INFP       | Actual: ISFP
Pred: INTP       | Actual: ISTJ
Pred: INFP       | Actual: ISFJ
Pred: INTJ       | Actual: ISTP
Pred: INTJ       | Actual: INFP
Pred: INTJ       | Actual: ENTJ
Pred: ESTP       | Actual: ENTP
Pred: ISFJ       | Actual: ISFP
Pred: ISFJ       | Actual: INFJ
Pred: ESTJ       | Actual: ENTJ
Pred: ENFP       | Actual: ENFJ
Pred: ISFP       | Actual: ISFP
Pred: ESTP       | Actual: ENFP
Pred: ESFJ       | Actual: ENTP
Pred: INTP       | Actual: INTJ
Pred: INFP       | Actual: INTP
Pred: ENFJ       | Actual: ISTJ
Pred: ESTP       | Actual: ESTP
Pred: ENTP       | Actual: ENTJ
Pred: ENFJ       | Actual: EN

### MORE TEST

In [159]:
mbti = "INTJ"

test_df = pd.read_csv(f"./csv/{mbti}_df.csv")

test_data = test_df[playlist_features]

ie_pred = (ie_model.predict(test_data) > 0.5).astype(int).reshape(-1)
ns_pred = (ns_model.predict(test_data) > 0.5).astype(int).reshape(-1)
tf_pred = (tf_model.predict(test_data) > 0.5).astype(int).reshape(-1)
jp_pred = (jp_model.predict(test_data) > 0.5).astype(int).reshape(-1)

# Map to MBTI
def get_mbti(ie, ns, tf, jp):
    return f"{'E' if ie else 'I'}{'S' if ns else 'N'}{'F' if tf else 'T'}{'P' if jp else 'J'}"

res = []

for i in range(len(ie_pred)):
    mbti_pred = get_mbti(ie_pred[i], ns_pred[i], tf_pred[i], jp_pred[i])
    res.append(mbti_pred)

table = pd.DataFrame(res).drop_duplicates()
table


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step 
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step 
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step 
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step 


Unnamed: 0,0
0,ESFJ
31,INTP
