# (MBTI) Myers-Briggs Personality Type Prediction

* Extroversion vs. Introversion
    * I - 0
    * E - 1 
    
* Sensing vs. Intuition 
    * N - 0 
    * S - 1
    
* Thinking vs. Feeling
    * F - 0
    * T - 1
    
* Judging vs. Perceiving
    * P - 0
    * J - 1 

In [1]:
# importing dependencies here
import numpy as np
import pandas as pd

# visualizations
import seaborn as sns
import matplotlib.pyplot as plt

# feature engineering
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

nltk.download("stopwords")

# sentiment scoring
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# pos tagging
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

# accuracy scores
from sklearn.metrics import (
    f1_score,
    accuracy_score,
)

# performance check
import time

# sparse to dense
from sklearn.base import TransformerMixin


class DenseTransformer(TransformerMixin):
    """Stateless pipeline step that turns a sparse feature matrix into a dense array.

    NOTE(review): presumably the saved ``*.joblib`` pipelines loaded later in
    this notebook reference this class by name, which is why it is defined
    here before loading - confirm against the training notebook.
    """

    def fit(self, X, y=None, **fit_params):
        """Nothing to learn; return the transformer itself."""
        return self

    def transform(self, X, y=None, **fit_params):
        """Return ``X`` as a dense ``numpy.ndarray``.

        Parameters
        ----------
        X : sparse matrix or array-like
            Anything exposing ``toarray()`` is densified through it; any other
            input is passed through ``numpy.asarray`` unchanged in content.
        y, **fit_params
            Ignored; accepted for pipeline compatibility.
        """
        # ``todense()`` produces a ``numpy.matrix``, which recent scikit-learn
        # releases refuse as estimator input. ``toarray()`` yields a plain
        # ndarray with the same values.
        if hasattr(X, "toarray"):
            return X.toarray()
        # Already dense: avoid failing on a missing sparse-only method.
        return np.asarray(X)


# importing model
from joblib import load

# code formatter
%load_ext nb_black

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\eshom\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


<IPython.core.display.Javascript object>

In [2]:
# Read the hold-out CSV into `df`; a later cell attaches the predicted types to
# this same frame as a `result` column.
df = pd.read_csv("data_ekta/df_holdout.csv")

<IPython.core.display.Javascript object>

In [3]:
# Preview the first two rows to confirm the `type` and `posts` columns loaded.
df.head(2)

Unnamed: 0,type,posts
0,INFJ,'I have never seen so many poorly used memes.....
1,INFJ,'Wow! You are obviously her muse... Be flatter...


<IPython.core.display.Javascript object>

In [4]:
def categorize_types(personality_data):
    """Add one binary indicator column per MBTI axis, derived from ``type``.

    The four-letter code in ``personality_data["type"]`` is read by position:

    * ``is_Extrovert`` - 1 when the 1st letter is "E", otherwise 0
    * ``is_Sensing``   - 1 when the 2nd letter is "S", otherwise 0
    * ``is_Thinking``  - 1 when the 3rd letter is "T", otherwise 0
    * ``is_Judging``   - 1 when the 4th letter is "J", otherwise 0

    The indicator columns are written into ``personality_data`` in place,
    appended after its existing columns. The column order of the caller's
    frame is NOT changed.

    Parameters
    ----------
    personality_data : pandas.DataFrame
        Must contain the columns ``type`` and ``posts``.

    Returns
    -------
    pandas.DataFrame
        A new frame with only ``type``, the four indicators and ``posts``, in
        that order. This selection used to be built and then thrown away
        (it was bound to a local name only); it is now returned so the
        rearranged view is actually available. Callers that ignore the return
        value see exactly the same in-place result as before.
    """
    # (indicator column, letter position in the type code, letter that maps to 1)
    axis_rules = (
        ("is_Extrovert", 0, "E"),
        ("is_Sensing", 1, "S"),
        ("is_Thinking", 2, "T"),
        ("is_Judging", 3, "J"),
    )

    for column, position, letter in axis_rules:
        # apply() runs immediately, so reading the loop variables is safe here.
        personality_data[column] = personality_data["type"].apply(
            lambda code: 1 if code[position] == letter else 0
        )

    # rearranged selection of the dataframe columns
    return personality_data[
        ["type", "is_Extrovert", "is_Sensing", "is_Thinking", "is_Judging", "posts"]
    ]


########################################################################################################


def clean_posts(personality_data):
    """Derive the ``clean_posts`` and ``tag_posts`` columns from raw ``posts``.

    Both columns are written into ``personality_data`` in place; nothing is
    returned.

    ``clean_posts`` (one string per row) is built by, in order: lower-casing,
    turning the ``|||`` post separator into a space, removing URLs, removing
    e-mail-like tokens, replacing every character that is not ``a-z`` or
    whitespace with a space, removing the MBTI codes found in the ``type``
    column, then dropping English stopwords and lemmatizing what is left.

    ``tag_posts`` (one list of strings per row) keeps the original casing and
    punctuation: each URL is replaced by the name captured from its host part,
    and the text is then split on ``|||`` into individual posts.

    Parameters
    ----------
    personality_data : pandas.DataFrame
        Must contain the columns ``posts`` and ``type``.
    """
    url_pattern = re.compile(r"https?:\/\/(www)?.?([A-Za-z_0-9-]+)([\S])*")

    # converting posts into lower case
    cleaned = personality_data["posts"].str.lower()

    # NOTE: regex=True is passed explicitly on every pattern-based replace.
    # From pandas 2.0 the default is regex=False, and a compiled pattern or a
    # callable replacement then raises instead of being applied.

    # replacing ||| with space
    cleaned = cleaned.str.replace(re.compile(r"\|\|\|"), " ", regex=True)

    # dropping urls entirely (the domain-preserving variant is only used for
    # tag_posts further down)
    cleaned = cleaned.str.replace(url_pattern, "", regex=True)

    # dropping emails
    cleaned = cleaned.str.replace(re.compile(r"\S+@\S+"), "", regex=True)

    # dropping punctuations (anything that is not a-z or whitespace)
    cleaned = cleaned.str.replace(re.compile(r"[^a-z\s]"), " ", regex=True)

    # dropping MBTIs mentioned in the posts. There are quite a few mentions of
    # these types in the posts.
    # NOTE(review): only the types present in THIS frame are stripped;
    # presumably the training data contained all 16 - confirm, since a small
    # input could leave some type mentions in place.
    for type_word in personality_data["type"].unique():
        cleaned = cleaned.str.replace(type_word.lower(), "", regex=False)

    # lemmatizing
    lemmatizer = WordNetLemmatizer()
    # Load the stopword list once; calling stopwords.words() for every token
    # re-reads the corpus each time and dominates the preprocessing time.
    english_stopwords = set(stopwords.words("english"))

    personality_data["clean_posts"] = cleaned.apply(
        lambda text: " ".join(
            lemmatizer.lemmatize(word)
            for word in text.split(" ")
            if word not in english_stopwords
        )
    )

    # tag_posts will be a list of posts per user. need it for word stats
    # (per post for each user)
    # replacing urls with domain name
    tag_posts = personality_data["posts"].str.replace(
        url_pattern, lambda match: match.group(2), regex=True
    )
    # splitting on ||| (raw string: "\|" is not a valid str escape sequence)
    personality_data["tag_posts"] = tag_posts.str.split(r"\|\|\|")


#  )

#################################################################################################################


def sentiment_score(personality_data):
    """Score every row's ``clean_posts`` text with VADER and store the result.

    Writes the ``compound`` value returned by
    ``SentimentIntensityAnalyzer.polarity_scores`` into a new
    ``compound_sentiment`` column of ``personality_data`` (in place).
    Returns nothing.
    """
    vader = SentimentIntensityAnalyzer()

    personality_data["compound_sentiment"] = [
        vader.polarity_scores(text)["compound"]
        for text in personality_data["clean_posts"]
    ]


###############################################################################################################


def pos_tagging(personality_data):
    """Part-of-speech tag each post and summarise tag-group counts per row.

    Reads ``personality_data["tag_posts"]`` (a list of post strings per row)
    and writes, in place:

    * ``tagged_words`` - for each post, the ``nltk.pos_tag`` output of its
      ``word_tokenize`` tokens;
    * ``S_<group>_med`` / ``S_<group>_std`` - for every key of ``tags_dict``
      below, the median and standard deviation across the row's posts of the
      number of tokens whose tag belongs to that group.

    Returns nothing.
    """
    personality_data["tagged_words"] = personality_data["tag_posts"].apply(
        lambda posts: [nltk.pos_tag(word_tokenize(post)) for post in posts]
    )

    # grouping pos tags based on stanford list
    tags_dict = {
        "ADJ": ["JJ", "JJR", "JJS"],
        "ADP": ["EX", "TO"],
        "ADV": ["RB", "RBR", "RBS", "WRB"],
        "CONJ": ["CC", "IN"],
        "DET": ["DT", "PDT", "WDT"],
        "NOUN": ["NN", "NNS", "NNP", "NNPS"],
        "NUM": ["CD"],
        "PRT": ["RP"],
        "PRON": ["PRP", "PRP$", "WP", "WP$"],
        "VERB": ["MD", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ"],
        ".": ["#", "$", "''", "(", ")", ",", ".", ":"],
        "X": ["FW", "LS", "UH"],
    }

    def stanford_tag(tagged_posts, tag):
        """Return, for each tagged post, how many tokens fall in group ``tag``."""
        # Use the ``tag`` argument itself. The helper previously ignored it and
        # read the enclosing loop variable, which only worked by coincidence.
        members = tags_dict[tag]
        return [
            len([token for token in tagged_post if token[1] in members])
            for tagged_post in tagged_posts
        ]

    for col in tags_dict.keys():
        # Count once per row, then derive both statistics from the same list.
        per_post_counts = personality_data["tagged_words"].apply(
            lambda tagged_posts: stanford_tag(tagged_posts, col)
        )
        personality_data["S_" + col + "_med"] = per_post_counts.apply(
            lambda counts: np.median(counts)
        )
        personality_data["S_" + col + "_std"] = per_post_counts.apply(
            lambda counts: np.std(counts)
        )


###############################################################################################################


def get_counts(personality_data):
    """Append simple count features computed from the raw ``posts`` text.

    Columns written in place (in this order): ``qm`` ("?" count), ``em`` ("!"
    count), ``colons``, ``emojis``, ``word_count`` (spaces + 1),
    ``unique_words``, ``upper`` (all-uppercase tokens), ``link_count``
    ("http" occurrences), ``ellipses`` ("... " occurrences) and ``img_count``
    (image file extensions). Returns nothing.
    """
    ellipsis_re = re.compile(r"\.\.\.\ ")
    image_re = re.compile(r"(\.jpg)|(\.jpeg)|(\.gif)|(\.png)")

    def non_link_tokens(text):
        # whitespace-separated tokens, skipping anything that contains a link
        return [token for token in text.split() if "http" not in token]

    def count_colons(text):
        # Includes colons used in emojis
        return sum(token.count(":") for token in non_link_tokens(text))

    def count_emojis(text):
        # does not include emojis made purely from symbols, only :word:
        return sum(1 for token in non_link_tokens(text) if token.count(":") == 2)

    def count_unique(text):
        return len(set(text.split(" ")))

    def count_upper(text):
        return sum(1 for token in text.split() if token.isupper())

    raw_posts = personality_data["posts"]

    personality_data["qm"] = raw_posts.apply(lambda text: text.count("?"))
    personality_data["em"] = raw_posts.apply(lambda text: text.count("!"))
    personality_data["colons"] = raw_posts.apply(count_colons)
    personality_data["emojis"] = raw_posts.apply(count_emojis)

    personality_data["word_count"] = raw_posts.apply(
        lambda text: text.count(" ") + 1
    )
    personality_data["unique_words"] = raw_posts.apply(count_unique)

    personality_data["upper"] = raw_posts.apply(count_upper)
    personality_data["link_count"] = raw_posts.apply(
        lambda text: text.count("http")
    )
    personality_data["ellipses"] = [
        len(ellipsis_re.findall(text)) for text in raw_posts
    ]
    personality_data["img_count"] = [
        len(image_re.findall(text)) for text in raw_posts
    ]


<IPython.core.display.Javascript object>

In [5]:
def prep_data(personality_data):
    """Run every preprocessing step and split the frame into features and targets.

    Calls, in order, ``categorize_types``, ``clean_posts``, ``sentiment_score``,
    ``pos_tagging`` and ``get_counts``; each of them adds columns to
    ``personality_data`` in place. The elapsed time is printed.

    Parameters
    ----------
    personality_data : pandas.DataFrame
        Raw data with at least the columns ``type`` and ``posts``.

    Returns
    -------
    X : pandas.DataFrame
        The cleaned text, the sentiment score, the POS-group statistics and
        the count features, in the column order listed below.
    y : pandas.DataFrame
        The four binary targets ``is_Extrovert``, ``is_Sensing``,
        ``is_Thinking`` and ``is_Judging``.
    """
    t = time.perf_counter()

    categorize_types(personality_data)

    clean_posts(personality_data)

    sentiment_score(personality_data)

    pos_tagging(personality_data)

    get_counts(personality_data)

    # POS groups used as features, each contributing a _med and a _std column
    pos_groups = (
        "ADJ",
        "ADP",
        "ADV",
        "CONJ",
        "DET",
        "NOUN",
        "NUM",
        "PRT",
        "PRON",
        "VERB",
    )
    feature_columns = (
        ["clean_posts", "compound_sentiment"]
        + [f"S_{group}_{stat}" for group in pos_groups for stat in ("med", "std")]
        + [
            "qm",
            "em",
            "colons",
            "emojis",
            "word_count",
            "unique_words",
            "upper",
            "link_count",
            "ellipses",
            "img_count",
        ]
    )
    target_columns = ["is_Extrovert", "is_Sensing", "is_Thinking", "is_Judging"]

    X = personality_data[feature_columns]
    # Select the targets by name. A positional slice (columns 2 to 5) only
    # matches the targets when the input has exactly two leading columns, and
    # picks the wrong ones for a CSV with an extra column such as a saved index.
    y = personality_data[target_columns]

    print(f"Total Preprocessing Time: {time.perf_counter()-t} seconds\n")

    return X, y

<IPython.core.display.Javascript object>

### Modelling

In [6]:
def combine_classes(y_pred1, y_pred2, y_pred3, y_pred4):
    """Merge four per-axis predictions into one MBTI type string per row.

    For every position up to ``len(y_pred1)``, the four predicted labels are
    converted with ``str`` and concatenated in argument order; the resulting
    strings are then decoded by ``trace_back``, whose output is returned.
    """
    axis_predictions = (y_pred1, y_pred2, y_pred3, y_pred4)

    encoded = [
        "".join(str(prediction[row]) for prediction in axis_predictions)
        for row in range(len(y_pred1))
    ]

    return trace_back(encoded)
    

def trace_back(combined):
    """Decode strings of "0"/"1" digits into MBTI letter codes.

    The digit at position ``k`` is looked up in the ``k``-th mapping below
    (I/E, N/S, F/T, P/J), and the letters are joined in order.

    Parameters
    ----------
    combined : iterable of str
        Digit strings such as ``"0101"``.

    Returns
    -------
    list of str
        One decoded string per input, e.g. ``"ISFJ"`` for ``"0101"``.
    """
    letter_for_digit = (
        {"0": "I", "1": "E"},
        {"0": "N", "1": "S"},
        {"0": "F", "1": "T"},
        {"0": "P", "1": "J"},
    )

    return [
        "".join(letter_for_digit[position][digit] for position, digit in enumerate(code))
        for code in combined
    ]


<IPython.core.display.Javascript object>

In [7]:
def predict(path_to_csv):
    """Predict an MBTI type for every row of a CSV file.

    Reads ``path_to_csv``, preprocesses it with ``prep_data``, loads the four
    saved per-axis classifiers from the working directory, prints the accuracy
    together with the true and predicted labels for each axis, and returns the
    combined four-letter predictions from ``combine_classes``.
    """
    holdout = pd.read_csv(path_to_csv)

    X, y = prep_data(holdout)

    # (printed label, target column, saved model file), in MBTI letter order
    axes = (
        ("Extrovert vs Introvert Accuracy: ", "is_Extrovert", "clf_is_Extrovert.joblib"),
        ("\nSensing vs Intuition Accuracy: ", "is_Sensing", "clf_is_Sensing.joblib"),
        ("\nThinking vs Feeling Accuracy: ", "is_Thinking", "clf_is_Thinking.joblib"),
        ("\nJudging vs Perceiving Accuracy: ", "is_Judging", "clf_is_Judging.joblib"),
    )

    # loading the 4 models up front, before any prediction is made
    # NOTE(review): joblib files are pickle-based; only load trusted artifacts.
    models = [load(model_file) for _, _, model_file in axes]

    # predicting, one axis at a time
    axis_predictions = []
    for (label, target, _), model in zip(axes, models):
        predicted = model.predict(X)
        print(label, accuracy_score(y[target], predicted))
        print("y_true", y[target].values)
        print("preds", predicted)
        axis_predictions.append(predicted)

    # combining the predictions from the 4 models
    return combine_classes(*axis_predictions)

<IPython.core.display.Javascript object>

In [8]:
if __name__ == "__main__":

    # Run the whole pipeline on the hold-out file; predict() prints per-axis
    # accuracies and returns the combined type string for each row.
    predictions = predict("data_ekta/df_holdout.csv")
    # True type labels re-read from the same file; not used within this cell.
    y_truth = pd.read_csv("data_ekta/df_holdout.csv")["type"].values


Total Preprocessing Time: 53.677297830581665 seconds

Extrovert vs Introvert Accuracy:  0.632183908045977
y_true [0 0 1 1 0 0 0 1 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0
 1 0 0 0 1 1 0 0 1 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 0 1 1 0 0 1 0 0 0 0 0
 0 1 1 0 1 0 0 0 0 0 0 0 0]
preds [0 1 1 1 1 1 1 1 1 0 0 1 0 1 0 0 0 1 0 0 1 0 0 1 1 0 0 1 0 0 1 0 0 1 0 0 0
 0 1 0 0 1 1 0 1 1 0 0 0 0 0 1 0 0 1 0 1 1 0 1 1 0 1 0 0 1 0 0 0 1 1 1 1 0
 1 1 0 0 1 0 0 0 1 0 0 0 0]

Sensing vs Intuition Accuracy:  0.735632183908046
y_true [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0
 0 1 0 0 0 0 1 0 1 0 0 0 1]
preds [0 0 1 0 0 1 0 0 0 1 0 1 0 0 1 1 0 0 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0 0 0 0 1
 0 0 0 0 1 0 1 0 1 0 0 1 0 0 1 1 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 0 0 0
 0 1 0 0 0 0 0 0 0 1 0 1 1]

Thinking vs Feeling Accuracy:  0.7701149425287356
y_true [0 0 0 1 0 0 1 1 0 0 0 0 1 0 1 0 1 0 1 1 0 1 0 

<IPython.core.display.Javascript object>

In [10]:
# Put the predicted types next to the true `type` column for a side-by-side look.
# NOTE(review): this relies on `df` (read in an earlier cell) holding the same
# rows, in the same order, as the file passed to predict() - both read
# "data_ekta/df_holdout.csv".
df["result"] = predictions
df

Unnamed: 0,type,posts,result
0,INFJ,'I have never seen so many poorly used memes.....,INFP
1,INFJ,'Wow! You are obviously her muse... Be flatter...,ENFJ
2,ENFP,'Are you shitting me? He's so type 7 its not e...,ESTP
3,ENTP,"'Oh man, this is serious. Good luck with her! ...",ENTJ
4,INFP,'Haha. Thank you! You guys have been so nice. ...,ENFP
...,...,...,...
82,ISTP,"'Hey, so incidentally, that's the exact same t...",ENTP
83,INTJ,'I would answer your questions but I don't kno...,ISTP
84,INFJ,"'I am the same. What I do, is send them occas...",INFJ
85,INTP,'Important that i'm attracted to her at least ...,ISTP


<IPython.core.display.Javascript object>