In [72]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow import keras
from tensorflow.keras.models import Sequential, load_model
import json

In [73]:
df = pd.read_csv("./csv/combined_mbti_df.csv")

df = df.drop_duplicates().dropna()

df.tail(5)

Unnamed: 0,mbti,function_pair,danceability_mean,danceability_stdev,energy_mean,energy_stdev,loudness_mean,loudness_stdev,mode_mean,mode_stdev,...,Aminor_count,AMajor_count,A#/Bbminor_count,BMajor_count,Dminor_count,D#_Ebminor_count,Gminor_count,A#/BbMajor_count,F#/GbMajor_count,Bminor_count
4076,ESTJ,SJ,0.552889,0.1551,0.595611,0.137195,-7.224889,2.101033,0.5,0.514496,...,2.0,0.0,0.0,1.0,0.0,0.0,2.0,1.0,0.0,2.0
4077,ESTJ,SJ,0.51778,0.142557,0.67494,0.182267,-7.9022,2.822676,0.6,0.494872,...,3.0,3.0,0.0,5.0,1.0,1.0,2.0,2.0,1.0,4.0
4078,ESTJ,SJ,0.585313,0.181908,0.694375,0.173636,-5.307063,1.531874,0.5625,0.512348,...,1.0,1.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4079,ESTJ,SJ,0.63604,0.152382,0.65242,0.176042,-6.55302,2.813042,0.54,0.503457,...,2.0,0.0,3.0,1.0,0.0,1.0,2.0,2.0,1.0,3.0
4080,ESTJ,SJ,0.640733,0.145205,0.678333,0.182482,-5.763733,2.075266,0.555556,0.502519,...,2.0,0.0,4.0,1.0,2.0,0.0,3.0,1.0,2.0,2.0


### CONVERT TO 4 LETTER COLUMNS

In [74]:
df["IE"] = df["mbti"].str[0].map({'I': 0, 'E': 1})
df["NS"] = df["mbti"].str[1].map({'N': 0, 'S': 1})
df["TF"] = df["mbti"].str[2].map({'T': 0, 'F': 1})
df["JP"] = df["mbti"].str[3].map({'J': 0, 'P': 1})

### STANDARDIZATION

In [75]:
x = df.drop(columns=["mbti", "function_pair", "IE", "NS", "TF", "JP"])

scaler = StandardScaler()
x = scaler.fit_transform(x)

### TRAIN MODEL (BY EACH LETTER)

In [86]:
from sklearn.utils import class_weight

In [126]:
def train_model(x, y):
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, 
        test_size=0.2, 
        random_state=42, 
        shuffle=True)
    
    class_weights = class_weight.compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
    class_weights = dict(enumerate(class_weights))

    model = Sequential([
        keras.layers.Dense(128, input_dim = x.shape[1], activation='relu'),
        keras.layers.BatchNormalization(),
        keras.layers.Dropout(0.3),

        keras.layers.Dense(64, activation='relu'),
        keras.layers.BatchNormalization(),
        keras.layers.Dropout(0.2),

        keras.layers.Dense(32, activation='relu'),
        keras.layers.Dropout(0.1),

        keras.layers.Dense(1, activation='sigmoid')
    ])

    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )

    model.fit(x_train, y_train,
              epochs=30, 
              validation_split=0.1,
              class_weight=class_weights)
    
    loss, accuracy = model.evaluate(x_test, y_test, verbose=0)
    
    return model, accuracy


In [127]:
ie_model, ie_accuracy = train_model(x, df["IE"])
ns_model, ns_accuracy = train_model(x, df["NS"])
tf_model, tf_accuracy = train_model(x, df["TF"])
jp_model, jp_accuracy = train_model(x, df["JP"])


Epoch 1/30


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - accuracy: 0.5987 - loss: 0.7380 - val_accuracy: 0.6406 - val_loss: 0.6557
Epoch 2/30
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6714 - loss: 0.6202 - val_accuracy: 0.6875 - val_loss: 0.6258
Epoch 3/30
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.7028 - loss: 0.5937 - val_accuracy: 0.7188 - val_loss: 0.6012
Epoch 4/30
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.7234 - loss: 0.5665 - val_accuracy: 0.7031 - val_loss: 0.5995
Epoch 5/30
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.7309 - loss: 0.5443 - val_accuracy: 0.6906 - val_loss: 0.6094
Epoch 6/30
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.7262 - loss: 0.5413 - val_accuracy: 0.7188 - val_loss: 0.5944
Epoch 7/30
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - accuracy: 0.5265 - loss: 0.7756 - val_accuracy: 0.5219 - val_loss: 0.6924
Epoch 2/30
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.5537 - loss: 0.7070 - val_accuracy: 0.5406 - val_loss: 0.6895
Epoch 3/30
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5770 - loss: 0.6841 - val_accuracy: 0.5813 - val_loss: 0.6686
Epoch 4/30
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.5701 - loss: 0.6819 - val_accuracy: 0.6156 - val_loss: 0.6580
Epoch 5/30
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.6094 - loss: 0.6704 - val_accuracy: 0.6062 - val_loss: 0.6569
Epoch 6/30
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.6202 - loss: 0.6460 - val_accuracy: 0.6094 - val_loss: 0.6526
Epoch 7/30
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.6312 - loss: 0.7085 - val_accuracy: 0.6938 - val_loss: 0.6049
Epoch 2/30
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6849 - loss: 0.6073 - val_accuracy: 0.6781 - val_loss: 0.5794
Epoch 3/30
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.7088 - loss: 0.5840 - val_accuracy: 0.6938 - val_loss: 0.5632
Epoch 4/30
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.7218 - loss: 0.5591 - val_accuracy: 0.6906 - val_loss: 0.5653
Epoch 5/30
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.7274 - loss: 0.5435 - val_accuracy: 0.7000 - val_loss: 0.5707
Epoch 6/30
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.7339 - loss: 0.5460 - val_accuracy: 0.7000 - val_loss: 0.5724
Epoch 7/30
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.4792 - loss: 0.8477 - val_accuracy: 0.6156 - val_loss: 0.6614
Epoch 2/30
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5584 - loss: 0.7066 - val_accuracy: 0.6500 - val_loss: 0.6396
Epoch 3/30
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6076 - loss: 0.6650 - val_accuracy: 0.6719 - val_loss: 0.6349
Epoch 4/30
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6258 - loss: 0.6416 - val_accuracy: 0.6438 - val_loss: 0.6364
Epoch 5/30
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6440 - loss: 0.6247 - val_accuracy: 0.6219 - val_loss: 0.6431
Epoch 6/30
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6473 - loss: 0.6225 - val_accuracy: 0.6656 - val_loss: 0.6243
Epoch 7/30
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━

In [128]:
model_info = {
    "ie_model": (ie_model, ie_accuracy),
    "ns_model": (ns_model, ns_accuracy),
    "tf_model": (tf_model, tf_accuracy),
    "jp_model": (jp_model, jp_accuracy),
}

### SAVE MODELS

In [129]:
# load record
with open("./models/accuracy_record.json", "r") as f:
    config = json.load(f)
    
def new_save(model, accuracy, name):
    print(accuracy, config[name])
    if accuracy > config[name]:
        config[name] = accuracy
        model.save(f"./models/{name}.keras")  

for name, (model, accuracy) in model_info.items():
    new_save(model, accuracy, name)

# save record
with open("./models/accuracy_record.json", "w") as f:
    json.dump(config, f, indent=4)

0.7121402025222778 0.7409261465072632
0.5994993448257446 0.6408010125160217
0.72465580701828 0.7334167957305908
0.610763430595398 0.6395494341850281


### LOAD MODEL FROM SAVE

In [139]:
def load_from_save(name):
    return load_model(f"./models/{name}.keras")

# for name, (model, _) in model_info.items():
#     model = load_from_save(name)

ie_model = load_from_save("ie_model")
ns_model = load_from_save("ns_model")
tf_model = load_from_save("tf_model")
jp_model = load_from_save("jp_model")

### TEST PREDICTION

In [140]:
# Predict according to each letter              // flatten
ie_pred = (ie_model.predict(x) > 0.5).astype(int).reshape(-1)
ns_pred = (ns_model.predict(x) > 0.5).astype(int).reshape(-1)
tf_pred = (tf_model.predict(x) > 0.5).astype(int).reshape(-1)
jp_pred = (jp_model.predict(x) > 0.5).astype(int).reshape(-1)

# Map to MBTI
def get_mbti(ie, ns, tf, jp):
    return f"{'E' if ie else 'I'}{'S' if ns else 'N'}{'F' if tf else 'T'}{'P' if jp else 'J'}"

res = []

for i in range(len(ie_pred)):
    mbti_pred = get_mbti(ie_pred[i], ns_pred[i], tf_pred[i], jp_pred[i])
    res.append(mbti_pred)

print(res[:20])


[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
['ISFP', 'INFJ', 'ISTP', 'INFP', 'ISFP', 'INTP', 'INFJ', 'INTP', 'ISFJ', 'INTP', 'INTP', 'INFP', 'INTJ', 'INFJ', 'ISFP', 'ENFP', 'INFP', 'ISFJ', 'INFP', 'INFP']


### MORE TEST

In [150]:
mbti = "INFJ"

test_df = pd.read_csv(f"./csv/{mbti}_df.csv")

test_data = test_df.drop(columns=["mbti", "function_pair", "playlist_name", "playlist_id", "track_count"])

ie_pred = (ie_model.predict(test_data) > 0.5).astype(int).reshape(-1)
ns_pred = (ns_model.predict(test_data) > 0.5).astype(int).reshape(-1)
tf_pred = (tf_model.predict(test_data) > 0.5).astype(int).reshape(-1)
jp_pred = (jp_model.predict(test_data) > 0.5).astype(int).reshape(-1)

# Map to MBTI
def get_mbti(ie, ns, tf, jp):
    return f"{'E' if ie else 'I'}{'S' if ns else 'N'}{'F' if tf else 'T'}{'P' if jp else 'J'}"

res = []

for i in range(len(ie_pred)):
    mbti_pred = get_mbti(ie_pred[i], ns_pred[i], tf_pred[i], jp_pred[i])
    res.append(mbti_pred)

table = pd.DataFrame(res).drop_duplicates()
table


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step 
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 


Unnamed: 0,0
0,ISTJ
1,INTP
3,ISTP
55,INTJ
283,ESTP
