## mbti-SVM

### 1 Import packages and load preprocessed dataframe

In [39]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re
import string
import pickle as pk
import time
from pprint import pprint
from tqdm import tqdm
from joblib import Parallel, delayed
from utilities import clean_posts, postVectorizer
from RF import RandomForest, cross_validation
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.svm import SVC, LinearSVC
from xgboost import XGBClassifier,plot_importance

method_dict = {
    'RF': 'RandomForest',
    'SVM': 'SVM',
    'XGB': 'XGBoost',
    'DL': 'DeepLearning'
}

type_dict = {
    0: ['I', 'N', 'T', 'J'],
    1: ['E', 'S', 'F', 'P']
}

with open('pickles/type_explanation.pk', 'rb') as pkl:
    type_explanation = pk.load(pkl)
    
type_keys = list(type_explanation.keys())

In [2]:
with open('pickles/tfidf_df.pk', 'rb') as pkl:
    tfidf_df = pk.load(pkl)

with open('pickles/df.pk', 'rb') as pkl:
    df = pk.load(pkl)

# with open('pickles/clf_ie.pk', 'rb') as pkl:
#     clf_ie = pk.load(pkl)

# with open('pickles/clf_ns.pk', 'rb') as pkl:
#     clf_ns = pk.load(pkl)

# with open('pickles/clf_tf.pk', 'rb') as pkl:
#     clf_tf = pk.load(pkl)

# with open('pickles/clf_jp.pk', 'rb') as pkl:
#     clf_jp = pk.load(pkl)

### 2 Training and Benchmarking

In [42]:
benchmark_df = pd.DataFrame(np.zeros((3, 6)), index=['Scale','ACC', 'F1'], columns=['IE', 'NS','TF', 'JP', 'Full Type(Strict)', 'Full Type(Loose)'])

In [43]:
# train_by_type
# @params:
#     _type: (string), type to be classified; types:[IE, NS, TF, JP]
#    method: (string), type of the classifier; methods:['RF', 'SVM', 'XGB']
# benchmark: (boolean), whether benchmark on dataset; benchmark:[True, False]
##

def train_by_type(_type, method='RF', benchmark=False):
    print(">>> Training Type {} ".format(_type)+"="*60)
    y = df[_type].values
    if method == 'RF':
        y = y.reshape(-1, 1)
    
    X_train, X_test, y_train, y_test = train_test_split(tfidf_df.values, y, test_size=0.2, train_size=0.8,
                                                        random_state=None, shuffle=True, stratify=y)
    
    print("# @Training START #")
    if method == 'RF':
        clf = RandomForest(n_estimators=100, verbose=0, min_leaf_size=3)
        clf.fit(np.concatenate((X_train, y_train), axis=1))
    
    elif method == 'SVM':
        clf = LinearSVC(tol=1e-5)
#         clf = SVC(kernel='linear', verbose=True, probability=True)
        clf.fit(X_train, y_train)
    
    elif method == 'XGB':
        clf = XGBClassifier()
        clf.fit(X_train, y_train, 
                early_stopping_rounds=10,
                eval_metric="logloss",
                eval_set=[(X_test, y_test)],
                verbose=False)
    else:
        raise ValueError("Invalid Method.")
        
    print("# @Training END #\n")
    
    time.sleep(0.5)
    
    if benchmark:
        print("# @Scoring START # --- {:s}".format(method_dict[method]))
        time.sleep(0.5)
        print("Type: {}: {} : {} = {} : {}".format(_type, _type[0], _type[1], sum(y)/len(y), 1-sum(y)/len(y)))
        benchmark_df.loc['Scale', _type] = "{}% : {}%".format(int(sum(y)/len(y)*100+0.5), int((1 - sum(y)/len(y))*100+0.5))
        time.sleep(0.5)
        pred_train = [clf.predict(i.reshape(1,-1))[0] for i in tqdm(X_train)]
        acc = accuracy_score(y_train, pred_train)
        print("Accuracy on training set - %s" % _type, acc)
        benchmark_df.loc['ACC', _type] = acc
        f1 = f1_score(y_train, pred_train) if len(set(pred_train))>1 else 0.0
        print("F1 Score on training set - %s" % _type, f1)
        benchmark_df.loc['F1', _type] = f1
        time.sleep(0.5)
        pred_test = [clf.predict(i.reshape(1,-1))[0] for i in tqdm(X_test)]
        print("Accuracy on testing set - %s" % _type, accuracy_score(y_test, pred_test))
        print("F1 Score on testing set - %s" % _type, \
              (f1_score(y_test, pred_test) if len(set(pred_test))>1 else 0.0))
        print("# @Scoring END #\n")
    
    return clf

In [44]:
# Separately benchmarking
clf_ie = train_by_type('IE', 'SVM', benchmark=True)
clf_ns = train_by_type('NS', 'SVM', benchmark=True)
clf_tf = train_by_type('TF', 'SVM', benchmark=True)
clf_jp = train_by_type('JP', 'SVM', benchmark=True)

# @Training START #
# @Training END #

# @Scoring START # --- SVM
Type: IE: I : E = 0.2304322766570605 : 0.7695677233429394


100%|██████████| 6940/6940 [00:00<00:00, 19779.33it/s]


Accuracy on training set - IE 0.8223342939481268
F1 Score on training set - IE 0.492383696994648


100%|██████████| 1735/1735 [00:00<00:00, 16758.13it/s]


Accuracy on testing set - IE 0.7585014409221902
F1 Score on testing set - IE 0.3252818035426731
# @Scoring END #

# @Training START #
# @Training END #

# @Scoring START # --- SVM
Type: NS: N : S = 0.13798270893371758 : 0.8620172910662824


100%|██████████| 6940/6940 [00:00<00:00, 18281.32it/s]


Accuracy on training set - NS 0.8829971181556195
F1 Score on training set - NS 0.30716723549488056


100%|██████████| 1735/1735 [00:00<00:00, 15692.10it/s]


Accuracy on testing set - NS 0.8582132564841498
F1 Score on testing set - NS 0.174496644295302
# @Scoring END #

# @Training START #
# @Training END #

# @Scoring START # --- SVM
Type: TF: T : F = 0.5410951008645534 : 0.45890489913544663


100%|██████████| 6940/6940 [00:00<00:00, 20408.52it/s]


Accuracy on training set - TF 0.8243515850144092
F1 Score on training set - TF 0.8390334081605705


100%|██████████| 1735/1735 [00:00<00:00, 18275.86it/s]


Accuracy on testing set - TF 0.7498559077809799
F1 Score on testing set - TF 0.7701271186440677
# @Scoring END #

# @Training START #
# @Training END #

# @Scoring START # --- SVM
Type: JP: J : P = 0.604149855907781 : 0.39585014409221897


100%|██████████| 6940/6940 [00:00<00:00, 20663.34it/s]


Accuracy on training set - JP 0.7342939481268012
F1 Score on training set - JP 0.79285553808133


100%|██████████| 1735/1735 [00:00<00:00, 14812.69it/s]

Accuracy on testing set - JP 0.654178674351585
F1 Score on testing set - JP 0.7326203208556149
# @Scoring END #






In [46]:
# Store clfs
with open('pickles/clf_ie.pk', 'wb') as pkl:
    pk.dump(clf_ie, pkl)

with open('pickles/clf_ns.pk', 'wb') as pkl:
    pk.dump(clf_ns, pkl)

with open('pickles/clf_tf.pk', 'wb') as pkl:
    pk.dump(clf_tf, pkl)

with open('pickles/clf_jp.pk', 'wb') as pkl:
    pk.dump(clf_jp, pkl)

In [37]:
# predict_full_type
# @params:
#      text: (string), text(post) to predict.
#     _type: (string or None), True type; types:[INTJ~ESFP],
#                              if None: predict a non-recorde sample.
#    strict: (boolean), if True:
#                           return a tuple of index (predicted, true).
#                       if False:
#                           return a match rate determined by each subtype;
#                           e.g.: INTJ - INTP : 75% matched.
#                       benchmark:[True, False]
##
def predict_full_type(text, _type=None, strict=True):
#     text = postVectorizer(clean_posts(text))
    IE = clf_ie.predict(text)[0]
    NS = clf_ns.predict(text)[0]
    TF = clf_tf.predict(text)[0]
    JP = clf_jp.predict(text)[0]
    match_rate = 0
    pred_type = type_dict[IE][0] + type_dict[NS][1] + type_dict[TF][2] + type_dict[JP][3]

    if _type is None:
        return pred_type
    
    if strict:
        return type_keys.index(pred_type), type_keys.index(_type)
    else:
        for i in range(4):
            if _type[i] == pred_type[i]:
                match_rate += 25
        print("Predicted Type: {}  |  True Type: {}  |  [{}{}%]Matched".format(
            pred_type, _type, "" if match_rate==100 else " ", match_rate))
        return match_rate

In [49]:
benchmark_df

Unnamed: 0,IE,NS,TF,JP,Full Type(Strict),Full Type(Loose)
Scale,23% : 77%,14% : 86%,54% : 46%,60% : 40%,,0.0
ACC,0.822334,0.882997,0.824352,0.734294,0.421787,0.0
F1,0.492384,0.307167,0.839033,0.792856,0.226198,0.0


In [48]:
# Overall benchmarking strictly

test_size = df.shape[0]
# tests = [postVectorizer(clean_posts(df.sep_posts[i])) for i in range(test_size)]

preds = Parallel(n_jobs=4, prefer="threads")\
(delayed(predict_full_type)(r.reshape(1,-1), df.type[i], strict=True) for i, r in tqdm(enumerate(tfidf_df.values)))

acc = accuracy_score([i[1] for i in preds], [i[0] for i in preds])
f1 = f1_score([i[1] for i in preds], [i[0] for i in preds], average='macro')
print("Overall Accuracy: {}".format(acc))
print("Overall F1 Score: {}".format(f1))

benchmark_df.loc['Scale', 'Full Type(Strict)'] = np.nan
benchmark_df.loc['ACC', 'Full Type(Strict)'] = acc
benchmark_df.loc['F1', 'Full Type(Strict)'] = f1

8675it [00:02, 2961.90it/s]

Overall Accuracy: 0.42178674351585016
Overall F1 Score: 0.22619845195939503





In [50]:
# Overall benchmarking non-strictly
test_size = df.shape[0]

acc = Parallel(n_jobs=4, prefer="threads")\
(delayed(predict_full_type)(r.reshape(1,-1), df.type[i], strict=False) for i, r in tqdm(enumerate(tfidf_df.values)))

acc = np.sum(acc) / (100*test_size)
print("Overall Accuracy: {}".format(acc))

benchmark_df.loc['Scale', 'Full Type(Strict)'] = np.nan
benchmark_df.loc['ACC', 'Full Type(Strict)'] = acc
benchmark_df.loc['F1', 'Full Type(Strict)'] = np.nan

383it [00:00, 1803.98it/s]

Predicted Type: INFJ  |  True Type: INFJ  |  [100%]Matched
Predicted Type: INFP  |  True Type: INTP  |  [ 75%]Matched
Predicted Type: INTP  |  True Type: INTJ  |  [ 75%]Matched
Predicted Type: INTP  |  True Type: ENTJ  |  [ 50%]Matched
Predicted Type: INTJ  |  True Type: INTJ  |  [100%]Matched
Predicted Type: INFJ  |  True Type: INFJ  |  [100%]Matched
Predicted Type: INTJ  |  True Type: INTJ  |  [100%]Matched
Predicted Type: ENTP  |  True Type: ENTP  |  [100%]Matched
Predicted Type: INFJ  |  True Type: INFJ  |  [100%]Matched
Predicted Type: INFJ  |  True Type: ENFJ  |  [ 75%]Matched
Predicted Type: INFJ  |  True Type: INFJ  |  [100%]MatchedPredicted Type: INFP  |  True Type: INFJ  |  [ 75%]MatchedPredicted Type: INTJ  |  True Type: INTJ  |  [100%]Matched
Predicted Type: INTP  |  True Type: INTP  |  [100%]Matched


Predicted Type: INTP  |  True Type: INTP  |  [100%]MatchedPredicted Type: INTP  |  True Type: INTP  |  [100%]Matched

Predicted Type: INFP  |  True Type: INFJ  |  [ 75%]Match

773it [00:00, 1873.50it/s]


Predicted Type: INTP  |  True Type: ESTP  |  [ 50%]Matched

Predicted Type: ENFJ  |  True Type: ENFP  |  [ 75%]Matched
Predicted Type: INTP  |  True Type: INFP  |  [ 75%]MatchedPredicted Type: INFP  |  True Type: INFP  |  [100%]Matched
Predicted Type: INFJ  |  True Type: INFJ  |  [100%]Matched
Predicted Type: INTJ  |  True Type: INTJ  |  [100%]MatchedPredicted Type: ISFJ  |  True Type: ISFP  |  [ 75%]Matched
Predicted Type: INFP  |  True Type: INFJ  |  [ 75%]Matched

Predicted Type: INTP  |  True Type: ENTP  |  [ 75%]MatchedPredicted Type: INFP  |  True Type: ISFP  |  [ 75%]Matched
Predicted Type: INFP  |  True Type: ENFP  |  [ 75%]Matched
Predicted Type: INTP  |  True Type: INTP  |  [100%]Matched
Predicted Type: INFP  |  True Type: INFP  |  [100%]Matched


Predicted Type: INFP  |  True Type: INFJ  |  [ 75%]MatchedPredicted Type: INTP  |  True Type: INFP  |  [ 75%]Matched
Predicted Type: ENFP  |  True Type: ENFP  |  [100%]Matched
Predicted Type: INFP  |  True Type: INFP  |  [100%]Matc

1181it [00:00, 1953.19it/s]



Predicted Type: INTJ  |  True Type: INTP  |  [ 75%]MatchedPredicted Type: INFP  |  True Type: ENFJ  |  [ 50%]Matched
Predicted Type: INFJ  |  True Type: INFJ  |  [100%]Matched
Predicted Type: ENFJ  |  True Type: ENFP  |  [ 75%]Matched
Predicted Type: ISTP  |  True Type: INTP  |  [ 75%]Matched
Predicted Type: ENTJ  |  True Type: INTJ  |  [ 75%]Matched
Predicted Type: INTJ  |  True Type: INFJ  |  [ 75%]MatchedPredicted Type: INFP  |  True Type: INFP  |  [100%]Matched

Predicted Type: ENTP  |  True Type: ENTP  |  [100%]Matched
Predicted Type: ENTP  |  True Type: ENFP  |  [ 75%]Matched
Predicted Type: INFP  |  True Type: INFP  |  [100%]Matched
Predicted Type: INTJ  |  True Type: INTJ  |  [100%]Matched
Predicted Type: INFJ  |  True Type: INFP  |  [ 75%]Matched

Predicted Type: INTP  |  True Type: INTJ  |  [ 75%]MatchedPredicted Type: INTP  |  True Type: INTP  |  [100%]Matched
Predicted Type: ENTJ  |  True Type: ENTJ  |  [100%]MatchedPredicted Type: INTP  |  True Type: INTP  |  [100%]Match

1613it [00:00, 2054.93it/s]


Predicted Type: INTP  |  True Type: ENTP  |  [ 75%]MatchedPredicted Type: INTJ  |  True Type: INFP  |  [ 50%]Matched
Predicted Type: INTP  |  True Type: INTP  |  [100%]Matched
Predicted Type: ENTP  |  True Type: ISTJ  |  [ 25%]Matched
Predicted Type: INFP  |  True Type: INFP  |  [100%]Matched
Predicted Type: INFP  |  True Type: ENFP  |  [ 75%]MatchedPredicted Type: INTP  |  True Type: ENTP  |  [ 75%]Matched

Predicted Type: ENFJ  |  True Type: INFJ  |  [ 75%]Matched

Predicted Type: INTJ  |  True Type: ISTJ  |  [ 75%]MatchedPredicted Type: ENTP  |  True Type: ESTP  |  [ 75%]Matched
Predicted Type: ENTP  |  True Type: ENTP  |  [100%]Matched
Predicted Type: INFJ  |  True Type: INFJ  |  [100%]Matched
Predicted Type: ENTP  |  True Type: ESTP  |  [ 75%]Matched
Predicted Type: INFP  |  True Type: INTJ  |  [ 50%]Matched
Predicted Type: ENFP  |  True Type: ENFP  |  [100%]Matched

Predicted Type: INTJ  |  True Type: ENTP  |  [ 50%]MatchedPredicted Type: ENFP  |  True Type: ENTP  |  [ 75%]Match

2018it [00:01, 2038.25it/s]

Predicted Type: INFJ  |  True Type: INFJ  |  [100%]MatchedPredicted Type: INTP  |  True Type: INTP  |  [100%]Matched



Predicted Type: ISTJ  |  True Type: ISTP  |  [ 75%]Matched
Predicted Type: INFP  |  True Type: INTJ  |  [ 50%]MatchedPredicted Type: INTP  |  True Type: INTJ  |  [ 75%]MatchedPredicted Type: INFP  |  True Type: INFP  |  [100%]Matched
Predicted Type: INFP  |  True Type: ISFP  |  [ 75%]Matched
Predicted Type: INFP  |  True Type: INFJ  |  [ 75%]Matched
Predicted Type: INTJ  |  True Type: INTP  |  [ 75%]Matched
Predicted Type: ENFP  |  True Type: ENFP  |  [100%]Matched


Predicted Type: INFP  |  True Type: INFP  |  [100%]Matched
Predicted Type: INFJ  |  True Type: INFJ  |  [100%]Matched
Predicted Type: INFJ  |  True Type: INTJ  |  [ 75%]MatchedPredicted Type: INTP  |  True Type: INTP  |  [100%]MatchedPredicted Type: ENFP  |  True Type: ENFJ  |  [ 75%]Matched


Predicted Type: INTP  |  True Type: INTP  |  [100%]MatchedPredicted Type: ENTP  |  True Type: ENFP  |  [ 75%]Matc

2415it [00:01, 1960.76it/s]


Predicted Type: INTP  |  True Type: ISTP  |  [ 75%]MatchedPredicted Type: INFP  |  True Type: INFP  |  [100%]Matched
Predicted Type: INTJ  |  True Type: ENTJ  |  [ 75%]Matched
Predicted Type: INFJ  |  True Type: ESFJ  |  [ 50%]Matched
Predicted Type: INTP  |  True Type: INTJ  |  [ 75%]Matched
Predicted Type: INTJ  |  True Type: INTJ  |  [100%]Matched
Predicted Type: INTJ  |  True Type: INTP  |  [ 75%]MatchedPredicted Type: INFP  |  True Type: INTP  |  [ 75%]Matched


Predicted Type: INFP  |  True Type: INFJ  |  [ 75%]Matched
Predicted Type: INFP  |  True Type: INFP  |  [100%]MatchedPredicted Type: INFJ  |  True Type: INFJ  |  [100%]Matched

Predicted Type: INFJ  |  True Type: INFP  |  [ 75%]Matched
Predicted Type: INFP  |  True Type: INFP  |  [100%]Matched
Predicted Type: INFP  |  True Type: INFP  |  [100%]MatchedPredicted Type: INFJ  |  True Type: INFP  |  [ 75%]Matched
Predicted Type: INFP  |  True Type: ENFP  |  [ 75%]MatchedPredicted Type: INFP  |  True Type: INFP  |  [100%]Matche

2807it [00:01, 1944.90it/s]

Predicted Type: INFP  |  True Type: INFP  |  [100%]Matched


Predicted Type: INFP  |  True Type: ENFJ  |  [ 50%]MatchedPredicted Type: INFP  |  True Type: INFP  |  [100%]Matched
Predicted Type: INTP  |  True Type: ENTP  |  [ 75%]Matched
Predicted Type: INFJ  |  True Type: INFJ  |  [100%]MatchedPredicted Type: INFP  |  True Type: INTJ  |  [ 50%]Matched
Predicted Type: ENFP  |  True Type: ESFJ  |  [ 50%]Matched
Predicted Type: INTP  |  True Type: ENTP  |  [ 75%]Matched

Predicted Type: ENTP  |  True Type: ENTP  |  [100%]Matched
Predicted Type: INTJ  |  True Type: INFJ  |  [ 75%]Matched

Predicted Type: ENTP  |  True Type: INTP  |  [ 75%]Matched
Predicted Type: INTJ  |  True Type: INTJ  |  [100%]MatchedPredicted Type: INFP  |  True Type: ENTP  |  [ 50%]MatchedPredicted Type: INFP  |  True Type: INFJ  |  [ 75%]MatchedPredicted Type: INTJ  |  True Type: INTJ  |  [100%]Matched
Predicted Type: INTP  |  True Type: INTP  |  [100%]Matched
Predicted Type: INTP  |  True Type: INTJ  |  [ 75%]Matche

3009it [00:01, 1958.07it/s]




Predicted Type: INFP  |  True Type: INFJ  |  [ 75%]MatchedPredicted Type: INTJ  |  True Type: INFP  |  [ 50%]Matched
Predicted Type: INTP  |  True Type: INTJ  |  [ 75%]Matched
Predicted Type: INTP  |  True Type: INFP  |  [ 75%]Matched
Predicted Type: ISFP  |  True Type: ENTP  |  [ 25%]Matched
Predicted Type: INFP  |  True Type: ENFJ  |  [ 50%]Matched
Predicted Type: INTP  |  True Type: INTJ  |  [ 75%]MatchedPredicted Type: INTP  |  True Type: INFP  |  [ 75%]Matched
Predicted Type: INFJ  |  True Type: INFJ  |  [100%]Matched
Predicted Type: INFJ  |  True Type: INFJ  |  [100%]Matched
Predicted Type: INFP  |  True Type: INFJ  |  [ 75%]Matched
Predicted Type: INTP  |  True Type: INTP  |  [100%]Matched


Predicted Type: INFJ  |  True Type: INFJ  |  [100%]Matched
Predicted Type: INTP  |  True Type: INTJ  |  [ 75%]Matched
Predicted Type: INTP  |  True Type: INTP  |  [100%]Matched
Predicted Type: INTP  |  True Type: ENTP  |  [ 75%]Matched
Predicted Type: INFP  |  True Type: ISFP  |  [ 75%]Ma

3390it [00:01, 1732.03it/s]

Predicted Type: INFP  |  True Type: INFJ  |  [ 75%]Matched
Predicted Type: INFJ  |  True Type: INFP  |  [ 75%]Matched
Predicted Type: INFP  |  True Type: ISFP  |  [ 75%]Matched
Predicted Type: ENFP  |  True Type: INFJ  |  [ 50%]Matched
Predicted Type: ENFP  |  True Type: ENFP  |  [100%]Matched
Predicted Type: INTP  |  True Type: INTP  |  [100%]Matched
Predicted Type: INTJ  |  True Type: ENTJ  |  [ 75%]Matched
Predicted Type: INFJ  |  True Type: INTJ  |  [ 75%]Matched
Predicted Type: INFJ  |  True Type: INTJ  |  [ 75%]Matched
Predicted Type: INFP  |  True Type: INFP  |  [100%]Matched
Predicted Type: INFJ  |  True Type: INTJ  |  [ 75%]Matched
Predicted Type: ENTP  |  True Type: ENTP  |  [100%]Matched
Predicted Type: INFJ  |  True Type: INFJ  |  [100%]Matched
Predicted Type: INFJ  |  True Type: INTP  |  [ 50%]MatchedPredicted Type: ENTP  |  True Type: INFP  |  [ 50%]Matched
Predicted Type: INTP  |  True Type: INTP  |  [100%]Matched

Predicted Type: INTJ  |  True Type: ENTJ  |  [ 75%]Match

3780it [00:01, 1830.45it/s]

Predicted Type: INTP  |  True Type: ISTP  |  [ 75%]Matched
Predicted Type: ENTP  |  True Type: ENTP  |  [100%]Matched
Predicted Type: INFP  |  True Type: ESFJ  |  [ 25%]Matched

Predicted Type: INTP  |  True Type: INFJ  |  [ 50%]Matched
Predicted Type: INFJ  |  True Type: INTP  |  [ 50%]Matched
Predicted Type: INFJ  |  True Type: INFJ  |  [100%]Matched
Predicted Type: INTJ  |  True Type: INFJ  |  [ 75%]Matched
Predicted Type: INTP  |  True Type: INFP  |  [ 75%]Matched

Predicted Type: INFP  |  True Type: INFP  |  [100%]Matched
Predicted Type: ENTP  |  True Type: ENTP  |  [100%]Matched
Predicted Type: INFJ  |  True Type: INFP  |  [ 75%]Matched
Predicted Type: INFP  |  True Type: INFP  |  [100%]Matched
Predicted Type: INFJ  |  True Type: INFJ  |  [100%]Matched
Predicted Type: INFP  |  True Type: INFP  |  [100%]MatchedPredicted Type: INFJ  |  True Type: ISFP  |  [ 50%]MatchedPredicted Type: INTP  |  True Type: ESTP  |  [ 50%]Matched


Predicted Type: INFJ  |  True Type: INFJ  |  [100%]Mat

4182it [00:02, 1910.83it/s]

Predicted Type: ENTJ  |  True Type: ISTJ  |  [ 50%]Matched
Predicted Type: INFP  |  True Type: INTP  |  [ 75%]Matched
Predicted Type: INFP  |  True Type: INFP  |  [100%]Matched
Predicted Type: ENFP  |  True Type: ENFJ  |  [ 75%]MatchedPredicted Type: INFP  |  True Type: INFJ  |  [ 75%]Matched
Predicted Type: INFP  |  True Type: INFP  |  [100%]Matched
Predicted Type: INFJ  |  True Type: ENFP  |  [ 50%]Matched
Predicted Type: INFP  |  True Type: ENFP  |  [ 75%]Matched

Predicted Type: ENTP  |  True Type: ENFP  |  [ 75%]MatchedPredicted Type: INFP  |  True Type: INFJ  |  [ 75%]Matched

Predicted Type: ENTJ  |  True Type: INFJ  |  [ 50%]Matched
Predicted Type: INFP  |  True Type: ISTP  |  [ 50%]Matched
Predicted Type: ENFJ  |  True Type: ESFJ  |  [ 75%]Matched
Predicted Type: INFJ  |  True Type: INFJ  |  [100%]MatchedPredicted Type: ISTP  |  True Type: ISTJ  |  [ 75%]Matched

Predicted Type: INTP  |  True Type: INFJ  |  [ 50%]Matched
Predicted Type: INTP  |  True Type: INTJ  |  [ 75%]Match

4585it [00:02, 1960.02it/s]



Predicted Type: INTP  |  True Type: ISTP  |  [ 75%]Matched
Predicted Type: INTJ  |  True Type: INTJ  |  [100%]Matched
Predicted Type: INFJ  |  True Type: ISFP  |  [ 50%]Matched

Predicted Type: INFP  |  True Type: INTP  |  [ 75%]Matched
Predicted Type: INFP  |  True Type: INTJ  |  [ 50%]Matched
Predicted Type: INTP  |  True Type: INFJ  |  [ 50%]MatchedPredicted Type: ENFP  |  True Type: ENFP  |  [100%]Matched

Predicted Type: INTJ  |  True Type: INTJ  |  [100%]MatchedPredicted Type: INTP  |  True Type: ISTP  |  [ 75%]Matched
Predicted Type: ENTP  |  True Type: INFJ  |  [ 25%]Matched
Predicted Type: INFJ  |  True Type: ENFP  |  [ 50%]Matched
Predicted Type: INFP  |  True Type: ENFP  |  [ 75%]MatchedPredicted Type: INFP  |  True Type: INFP  |  [100%]Matched
Predicted Type: ENFP  |  True Type: ENFP  |  [100%]Matched


Predicted Type: INFP  |  True Type: INFP  |  [100%]MatchedPredicted Type: INFJ  |  True Type: INFJ  |  [100%]MatchedPredicted Type: INTP  |  True Type: INTJ  |  [ 75%]Matc

4999it [00:02, 2003.68it/s]

Predicted Type: INFP  |  True Type: ENFP  |  [ 75%]Matched
Predicted Type: INTP  |  True Type: INTP  |  [100%]MatchedPredicted Type: INFJ  |  True Type: INFJ  |  [100%]Matched


Predicted Type: INFP  |  True Type: INFP  |  [100%]MatchedPredicted Type: INTP  |  True Type: INTP  |  [100%]Matched
Predicted Type: INTP  |  True Type: INTP  |  [100%]Matched
Predicted Type: INTP  |  True Type: ENFP  |  [ 50%]Matched

Predicted Type: INTJ  |  True Type: INTJ  |  [100%]MatchedPredicted Type: INFP  |  True Type: INFP  |  [100%]Matched
Predicted Type: INFP  |  True Type: ENFP  |  [ 75%]Matched
Predicted Type: INTP  |  True Type: INFP  |  [ 75%]Matched
Predicted Type: INFP  |  True Type: INTP  |  [ 75%]Matched

Predicted Type: INTP  |  True Type: INFP  |  [ 75%]MatchedPredicted Type: INTJ  |  True Type: ENTJ  |  [ 75%]Matched

Predicted Type: INFP  |  True Type: INFP  |  [100%]Matched
Predicted Type: INTP  |  True Type: INTP  |  [100%]MatchedPredicted Type: INFP  |  True Type: INFP  |  [100%]Match

5406it [00:02, 1989.28it/s]


Predicted Type: ISTP  |  True Type: ISTP  |  [100%]MatchedPredicted Type: INTJ  |  True Type: ESTJ  |  [ 50%]Matched
Predicted Type: INFJ  |  True Type: INFJ  |  [100%]Matched
Predicted Type: ENTJ  |  True Type: ENTJ  |  [100%]Matched


Predicted Type: INTP  |  True Type: ENFJ  |  [ 25%]MatchedPredicted Type: INFJ  |  True Type: INFJ  |  [100%]Matched
Predicted Type: INTP  |  True Type: INTP  |  [100%]Matched

Predicted Type: INFP  |  True Type: ESTJ  |  [ 0%]Matched
Predicted Type: INFJ  |  True Type: INFP  |  [ 75%]Matched
Predicted Type: INFP  |  True Type: INTP  |  [ 75%]MatchedPredicted Type: INFJ  |  True Type: INFP  |  [ 75%]Matched

Predicted Type: ENFP  |  True Type: INFJ  |  [ 50%]Matched
Predicted Type: INTP  |  True Type: INTJ  |  [ 75%]Matched
Predicted Type: INTP  |  True Type: INTP  |  [100%]MatchedPredicted Type: ENTP  |  True Type: ENTP  |  [100%]Matched
Predicted Type: INFP  |  True Type: INFP  |  [100%]Matched

Predicted Type: INTP  |  True Type: ENTP  |  [ 75%]Matc

5809it [00:02, 1979.41it/s]


Predicted Type: INFP  |  True Type: INTP  |  [ 75%]MatchedPredicted Type: INTP  |  True Type: INFP  |  [ 75%]Matched
Predicted Type: INTP  |  True Type: INFP  |  [ 75%]Matched
Predicted Type: INTP  |  True Type: INTJ  |  [ 75%]Matched
Predicted Type: INTJ  |  True Type: INTJ  |  [100%]MatchedPredicted Type: INTP  |  True Type: INTJ  |  [ 75%]Matched


Predicted Type: INFP  |  True Type: INFP  |  [100%]Matched
Predicted Type: ENTP  |  True Type: ENTP  |  [100%]Matched
Predicted Type: INTP  |  True Type: ENFP  |  [ 50%]Matched
Predicted Type: INTP  |  True Type: INFP  |  [ 75%]Matched
Predicted Type: ENTP  |  True Type: ENTP  |  [100%]Matched
Predicted Type: INFP  |  True Type: INFJ  |  [ 75%]Matched
Predicted Type: ENFP  |  True Type: ENFP  |  [100%]MatchedPredicted Type: INFP  |  True Type: INFP  |  [100%]Matched
Predicted Type: INFP  |  True Type: INFP  |  [100%]MatchedPredicted Type: INFJ  |  True Type: INFJ  |  [100%]Matched


Predicted Type: INFP  |  True Type: INFJ  |  [ 75%]Matc

6206it [00:03, 1956.73it/s]




Predicted Type: INFP  |  True Type: INFP  |  [100%]Matched
Predicted Type: INFP  |  True Type: INFP  |  [100%]Matched
Predicted Type: INFP  |  True Type: INFP  |  [100%]Matched
Predicted Type: INFP  |  True Type: INFP  |  [100%]MatchedPredicted Type: INFP  |  True Type: ENFP  |  [ 75%]Matched
Predicted Type: INFP  |  True Type: ENFP  |  [ 75%]Matched

Predicted Type: INFJ  |  True Type: ENTJ  |  [ 50%]MatchedPredicted Type: INFP  |  True Type: ISTJ  |  [ 25%]MatchedPredicted Type: INTP  |  True Type: ISTJ  |  [ 50%]Matched
Predicted Type: INTP  |  True Type: INTP  |  [100%]Matched
Predicted Type: INTP  |  True Type: INTJ  |  [ 75%]MatchedPredicted Type: INFJ  |  True Type: INTJ  |  [ 75%]Matched



Predicted Type: INFJ  |  True Type: INTP  |  [ 50%]MatchedPredicted Type: INTP  |  True Type: INTP  |  [100%]Matched
Predicted Type: INFP  |  True Type: INTP  |  [ 75%]Matched
Predicted Type: INTP  |  True Type: INTP  |  [100%]Matched
Predicted Type: INFJ  |  True Type: ENTJ  |  [ 50%]Mat

6603it [00:03, 1964.89it/s]



Predicted Type: INFP  |  True Type: INFJ  |  [ 75%]Matched
Predicted Type: ENTP  |  True Type: INTJ  |  [ 50%]MatchedPredicted Type: INFJ  |  True Type: INFP  |  [ 75%]MatchedPredicted Type: INTP  |  True Type: ENTP  |  [ 75%]MatchedPredicted Type: INFP  |  True Type: ESFP  |  [ 50%]Matched
Predicted Type: INTP  |  True Type: INTP  |  [100%]Matched
Predicted Type: INFP  |  True Type: INFP  |  [100%]Matched
Predicted Type: INTJ  |  True Type: ISTP  |  [ 50%]Matched
Predicted Type: ISFJ  |  True Type: ENFP  |  [ 25%]Matched



Predicted Type: INFJ  |  True Type: ENFJ  |  [ 75%]Matched
Predicted Type: INFJ  |  True Type: INFJ  |  [100%]MatchedPredicted Type: INFP  |  True Type: ENFP  |  [ 75%]Matched
Predicted Type: INTP  |  True Type: INTP  |  [100%]MatchedPredicted Type: INFP  |  True Type: INFJ  |  [ 75%]Matched
Predicted Type: INTP  |  True Type: INTP  |  [100%]Matched
Predicted Type: INTJ  |  True Type: INFJ  |  [ 75%]Matched
Predicted Type: INTP  |  True Type: ISTP  |  [ 75%]Match

7026it [00:03, 2034.90it/s]





Predicted Type: INFP  |  True Type: INFP  |  [100%]Matched
Predicted Type: INTP  |  True Type: INTJ  |  [ 75%]MatchedPredicted Type: INFP  |  True Type: INFP  |  [100%]Matched
Predicted Type: INFP  |  True Type: ENFP  |  [ 75%]Matched
Predicted Type: INFJ  |  True Type: ENFP  |  [ 50%]Matched
Predicted Type: INFP  |  True Type: INFP  |  [100%]MatchedPredicted Type: INTP  |  True Type: ENTP  |  [ 75%]Matched

Predicted Type: INFP  |  True Type: INFJ  |  [ 75%]Matched

Predicted Type: INTJ  |  True Type: ENTJ  |  [ 75%]Matched
Predicted Type: INTP  |  True Type: INTJ  |  [ 75%]MatchedPredicted Type: INFP  |  True Type: INFP  |  [100%]Matched
Predicted Type: INFJ  |  True Type: INFJ  |  [100%]MatchedPredicted Type: ENTJ  |  True Type: ENTJ  |  [100%]Matched

Predicted Type: INTP  |  True Type: INFJ  |  [ 50%]Matched
Predicted Type: INTP  |  True Type: INTP  |  [100%]Matched

Predicted Type: INTP  |  True Type: INTJ  |  [ 75%]Matched
Predicted Type: INFP  |  True Type: INFP  |  [100%]M

7446it [00:03, 2052.11it/s]


Predicted Type: INTP  |  True Type: INTP  |  [100%]Matched
Predicted Type: INFP  |  True Type: INFP  |  [100%]Matched
Predicted Type: ISTJ  |  True Type: ISTJ  |  [100%]Matched

Predicted Type: INTP  |  True Type: INTP  |  [100%]Matched
Predicted Type: INFJ  |  True Type: INFP  |  [ 75%]Matched

Predicted Type: INTJ  |  True Type: INFJ  |  [ 75%]MatchedPredicted Type: INTJ  |  True Type: INTJ  |  [100%]Matched
Predicted Type: INFP  |  True Type: INFP  |  [100%]Matched

Predicted Type: INFJ  |  True Type: ISFJ  |  [ 75%]Matched
Predicted Type: INTJ  |  True Type: INTJ  |  [100%]Matched
Predicted Type: ESTP  |  True Type: ENTP  |  [ 75%]Matched
Predicted Type: ENTP  |  True Type: INTP  |  [ 75%]MatchedPredicted Type: INFP  |  True Type: INFJ  |  [ 75%]Matched

Predicted Type: INFJ  |  True Type: INFJ  |  [100%]MatchedPredicted Type: INTP  |  True Type: INTP  |  [100%]Matched
Predicted Type: ISFJ  |  True Type: ISFP  |  [ 75%]Matched
Predicted Type: INTJ  |  True Type: INTJ  |  [100%]Mat

7846it [00:04, 1777.88it/s]

Predicted Type: INFP  |  True Type: INFP  |  [100%]Matched
Predicted Type: INTJ  |  True Type: INTJ  |  [100%]Matched
Predicted Type: ENTP  |  True Type: INTJ  |  [ 50%]Matched

Predicted Type: INFJ  |  True Type: INFJ  |  [100%]Matched
Predicted Type: ISTP  |  True Type: ISTP  |  [100%]MatchedPredicted Type: INTP  |  True Type: INFJ  |  [ 50%]Matched

Predicted Type: INTJ  |  True Type: INTJ  |  [100%]MatchedPredicted Type: INFP  |  True Type: INFP  |  [100%]MatchedPredicted Type: INTJ  |  True Type: INTP  |  [ 75%]Matched
Predicted Type: ENFJ  |  True Type: INFJ  |  [ 75%]Matched
Predicted Type: INFP  |  True Type: INFP  |  [100%]Matched


Predicted Type: INTP  |  True Type: INFJ  |  [ 50%]Matched
Predicted Type: INFJ  |  True Type: INTJ  |  [ 75%]Matched
Predicted Type: INTP  |  True Type: ENTP  |  [ 75%]MatchedPredicted Type: ENFJ  |  True Type: ENFJ  |  [100%]MatchedPredicted Type: INTP  |  True Type: ENTP  |  [ 75%]Matched
Predicted Type: INFP  |  True Type: INFP  |  [100%]Matche

8213it [00:04, 1797.12it/s]


Predicted Type: INTP  |  True Type: INTP  |  [100%]Matched
Predicted Type: INFJ  |  True Type: INTJ  |  [ 75%]Matched
Predicted Type: ENFP  |  True Type: ENFP  |  [100%]Matched
Predicted Type: INTJ  |  True Type: INTP  |  [ 75%]Matched

Predicted Type: INTJ  |  True Type: INTJ  |  [100%]Matched
Predicted Type: INTP  |  True Type: INTP  |  [100%]Matched
Predicted Type: INTP  |  True Type: INTP  |  [100%]Matched
Predicted Type: INTJ  |  True Type: ENTP  |  [ 50%]Matched
Predicted Type: ENFP  |  True Type: ENFP  |  [100%]Matched
Predicted Type: ENFJ  |  True Type: INFJ  |  [ 75%]Matched
Predicted Type: INTP  |  True Type: INTP  |  [100%]Matched
Predicted Type: INFJ  |  True Type: INFJ  |  [100%]Matched
Predicted Type: ENTP  |  True Type: ENTJ  |  [ 75%]MatchedPredicted Type: INTJ  |  True Type: INTJ  |  [100%]Matched
Predicted Type: INFP  |  True Type: INFP  |  [100%]Matched

Predicted Type: INFP  |  True Type: ENTJ  |  [ 25%]Matched
Predicted Type: INTP  |  True Type: ENTJ  |  [ 50%]Mat

8675it [00:04, 1938.44it/s]


Predicted Type: INTP  |  True Type: INTP  |  [100%]Matched
Predicted Type: INTJ  |  True Type: INTP  |  [ 75%]Matched
Predicted Type: ENTP  |  True Type: ESTP  |  [ 75%]Matched
Predicted Type: INFP  |  True Type: INTP  |  [ 75%]Matched
Predicted Type: ISTP  |  True Type: ISTP  |  [100%]MatchedPredicted Type: INTP  |  True Type: ENTP  |  [ 75%]MatchedPredicted Type: INFJ  |  True Type: INFJ  |  [100%]Matched
Predicted Type: INFJ  |  True Type: INTP  |  [ 50%]Matched

Predicted Type: INFP  |  True Type: ISTP  |  [ 50%]Matched
Predicted Type: INTP  |  True Type: ISTP  |  [ 75%]Matched
Predicted Type: INTP  |  True Type: ISTP  |  [ 75%]Matched
Predicted Type: ENTP  |  True Type: ISTP  |  [ 50%]Matched

Predicted Type: INTJ  |  True Type: INFJ  |  [ 75%]Matched
Predicted Type: INTJ  |  True Type: INTJ  |  [100%]Matched
Predicted Type: INTJ  |  True Type: ENTJ  |  [ 75%]Matched
Predicted Type: INFP  |  True Type: INTP  |  [ 75%]Matched
Predicted Type: INTP  |  True Type: INTP  |  [100%]Matc




Overall Accuracy: 0.8038328530259365


### 3 Further study on Kaggle ForumMessage

In [27]:
k_data = pd.read_csv('../dataset/ForumMessages.csv').Message.dropna().values
k_texts = []
for i in k_data:
    if len(i) > 1000:
        k_texts.append(i)

In [28]:
def k_clean(texts):
    texts = [re.sub(r'<code>.*</code>', " ", s) for s in texts]
    texts = [re.sub(r'<[/]?[a-z]+>', "", s) for s in texts]
    return texts

k_texts = k_clean(k_texts)

In [32]:
res = predict_full_type(postVectorizer(clean_posts(k_texts[1])), _type=None)
pprint(clean_posts(k_texts[1]))
print(res)
pprint(type_explanation[res])

('hi all i have a question about crossvalidation i am fitting a glm r model to '
 'my training dataset and it gives me an auc numbernumber then i do a '
 'numberfold crossvalidation each of my cross validations comes out with a auc '
 'numbernumber but then when i submit my model it has a aucnumbernumber on '
 'leaderboard what am i doing wrong QST if i am overfitting so badly on '
 'training set i dont understand why doesnt cross validation show that QST '
 'here is my code for crossvalidation data is a dataframe k is number of folds '
 'kfoldglmltfunctiondatak nltasintegernrowdatak errvectltrepnak for i in '
 'numberk snumberltinumbernnumbernumber snumberltin subsetltsnumbersnumber '
 'trainltdatasubset testltdatasubset fit lt glmaction '
 'datatrainfamilyquotbinomialquot prediction lt '
 'predictfitnewdatatesttypequotresponsequot '
 'labelsltasnumericascharactertestnumber err lt rocarealabelspredictiona '
 'errvectilterr returnerrvect cheers anna')
ISTP
['1.冷静旁观者—安静、预留余地、弹性及会以无偏见的好奇

## END
@ Karl