# 1. Data Preparation

In [18]:
import os
import sys
sys.path.insert(0, os.path.abspath('../'))
from mbti_type_from_text.db_utils import create_connection
import pandas as pd
import plotly.express as px
import re
import numpy as np

In [19]:
db_connection = create_connection("../data/reddit.db")

In [20]:
comments_df = pd.read_sql(sql="SELECT * FROM Comments", con=db_connection)

In [21]:
comments_df.head()

Unnamed: 0,id,user_id,parent_comment_id,title,content,created_datetime,upvotes,subreddit
0,ltmgwm,6i0rnp1p,,Always felt different and misunderstood by family,"I am an INFJ female, with no INFJ’s in my fami...",2021-02-27 12:45:56,271,infj
1,gozeenz,1s8dnq6p,ltmgwm,,Yupppp. No one understands me in my family. I ...,2021-02-27 13:02:48,85,infj
2,gp0l91l,b708k,gozeenz,,Same. Doing my own thing whether it's outside ...,2021-02-27 16:04:27,30,infj
3,gp0ffo6,mkfu3,ltmgwm,,I moved across the country straight out of col...,2021-02-27 15:40:01,46,infj
4,gp0g9eh,vh9kmmx,gp0ffo6,,Are you me? I thought I was the only weirdo w...,2021-02-27 15:43:01,18,infj


In [22]:
users_df = pd.read_sql(sql="SELECT * FROM Users", con=db_connection)

In [23]:
users_df.head()

Unnamed: 0,id,name,flair_text
0,6i0rnp1p,igid221,
1,1s8dnq6p,Sheilaahmad,
2,b708k,lzkbloodmage,INFJ-T 4w5 HSP HSS Empath | 26M SG
3,mkfu3,ShannyPantsxo,
4,vh9kmmx,lala2love,


In [24]:
def unbold_text(string):
    unbolded_string = ""
    for c in string:
        int_code = ord(c)
        if int_code >= ord("𝐀") and int_code <= ord("𝐙"):
            unbolded_string += chr(int_code - 119743)
        else:
            unbolded_string += c
    return unbolded_string


def extract_mbti_from_flair_text(df):
    regex = r"((I|E|X)(S|N|X)(F|T|X)(J|P|X))"
    return df["flair_text"].str.upper().apply(unbold_text).str.extract(regex)[0]


users_df["mbti_type"] = extract_mbti_from_flair_text(df=users_df)

In [25]:
users_df

Unnamed: 0,id,name,flair_text,mbti_type
0,6i0rnp1p,igid221,,
1,1s8dnq6p,Sheilaahmad,,
2,b708k,lzkbloodmage,INFJ-T 4w5 HSP HSS Empath | 26M SG,INFJ
3,mkfu3,ShannyPantsxo,,
4,vh9kmmx,lala2love,,
...,...,...,...,...
4806,2k93qy2z,krystalzhhf,INFP,INFP
4807,pktgyjt,Squamply,,
4808,76fgxdug,LoviEnthusiast,,
4809,sh56t,sotaponi,,


In [26]:
# Some users mark two types, by convention we keep the first one for now...
users_df[users_df["id"] == "mpa2p"]

Unnamed: 0,id,name,flair_text,mbti_type
2304,mpa2p,Mortallyinsane21,INFJ masking as INTP,INFJ


In [27]:
# Bold font is supported!
users_df[users_df["id"] == "1vtv4bck"]

Unnamed: 0,id,name,flair_text,mbti_type
2533,1vtv4bck,yahgirlamberrr,𝐈𝐒𝐅𝐉,ISFJ


In [28]:
count_by_mbti_df = users_df.groupby("mbti_type")["id"].count()
px.bar(count_by_mbti_df.sort_values(ascending=False))

In [29]:
def extract_mbti_from_message(df):
    mbti_regex = "(I|E|X)(S|N|X)(F|T|X)(J|P|X)"
    regex_dict = {
        "i_am_mbti_regex": {"regex": "I('m| am)(?: (an|a))? ({})".format(mbti_regex), "group_index": 2},
        "my_mbti_regex": {"regex": "(M|m)y ({}) (personality|experience)".format(mbti_regex), "group_index": 1},
        "mbti_here_regex": {"regex": "({})(?: \((m|f)\))? here".format(mbti_regex), "group_index": 0},
        "fellow_mbti_regex": {"regex": "(F|f)ellow ({})".format(mbti_regex), "group_index": 1},
        "i_mbti_regex": {"regex": "(Me|I)(?: )?\(({})\)".format(mbti_regex), "group_index": 1},
        "looking_for_regex": {"regex": "({}) looking for".format(mbti_regex), "group_index": 0}
    }
    result_df = pd.DataFrame()
    for regex_name, regex_item in regex_dict.items():
        result_df["{}__on__title".format(regex_name)] = df["title"].str.extract(regex_item["regex"])[regex_item["group_index"]]
        result_df["{}__on__content".format(regex_name)] = df["content"].str.extract(regex_item["regex"])[regex_item["group_index"]]
    return result_df


extract_mbti_from_message_df = extract_mbti_from_message(comments_df)

In [30]:
(~extract_mbti_from_message_df.isna()).sum(axis=0)

i_am_mbti_regex__on__title          6
i_am_mbti_regex__on__content      232
my_mbti_regex__on__title            1
my_mbti_regex__on__content          0
mbti_here_regex__on__title          6
mbti_here_regex__on__content       70
fellow_mbti_regex__on__title        3
fellow_mbti_regex__on__content     43
i_mbti_regex__on__title             3
i_mbti_regex__on__content           2
looking_for_regex__on__title        5
looking_for_regex__on__content      1
dtype: int64

In [31]:
# Several regex can match for one user, but they seem to give the same results
extract_mbti_from_message_df[(~extract_mbti_from_message_df.isna()).sum(axis=1) == 2]

Unnamed: 0,i_am_mbti_regex__on__title,i_am_mbti_regex__on__content,my_mbti_regex__on__title,my_mbti_regex__on__content,mbti_here_regex__on__title,mbti_here_regex__on__content,fellow_mbti_regex__on__title,fellow_mbti_regex__on__content,i_mbti_regex__on__title,i_mbti_regex__on__content,looking_for_regex__on__title,looking_for_regex__on__content
52,,,,,ENTP,ENTP,,,,,,
6395,,INTP,,,,,,INTP,,,,
6710,,INFP,,,,INFP,,,,,,
9056,,ENFP,,,,,,,,,ENFP,
9563,ENTJ,ENTJ,,,,,,,,,,
10512,,ENFP,,,,ENFP,,,,,,
10977,,ENFJ,,,,,,ENFJ,,,,
14123,,ENFP,,,,,,,,,ENFP,
14225,,INFP,,,,INFP,,,,,,
14531,,ENFP,,,,,,,,,ENFP,


In [32]:
def get_extracted_mbti_if_exists(row):
    if row.last_valid_index() is None:
        return np.nan
    else:
        return row[row.last_valid_index()]

    
comments_df["extracted_mbti"] = extract_mbti_from_message_df.apply(get_extracted_mbti_if_exists, axis=1)

In [33]:
comments_df

Unnamed: 0,id,user_id,parent_comment_id,title,content,created_datetime,upvotes,subreddit,extracted_mbti
0,ltmgwm,6i0rnp1p,,Always felt different and misunderstood by family,"I am an INFJ female, with no INFJ’s in my fami...",2021-02-27 12:45:56,271,infj,INFJ
1,gozeenz,1s8dnq6p,ltmgwm,,Yupppp. No one understands me in my family. I ...,2021-02-27 13:02:48,85,infj,
2,gp0l91l,b708k,gozeenz,,Same. Doing my own thing whether it's outside ...,2021-02-27 16:04:27,30,infj,
3,gp0ffo6,mkfu3,ltmgwm,,I moved across the country straight out of col...,2021-02-27 15:40:01,46,infj,
4,gp0g9eh,vh9kmmx,gp0ffo6,,Are you me? I thought I was the only weirdo w...,2021-02-27 15:43:01,18,infj,
...,...,...,...,...,...,...,...,...,...
15172,gp7jyjl,3c5ent8f,lunkpn,,I do this as well. It is definitely not becaus...,2021-02-28 20:11:41,10,mbti,
15173,gp7n1dp,681obxgg,lunkpn,,I do this too. I think it's just an Fi thing t...,2021-02-28 20:27:52,6,mbti,
15174,gp7tp8p,sh56t,lunkpn,,Sometimes? Not sure how simply using yourself ...,2021-02-28 21:02:51,2,mbti,
15175,gp8q0wi,1jceigvp,lunkpn,,It can appear selfish since it might look like...,2021-03-01 00:24:43,1,mbti,


In [34]:
users_df

Unnamed: 0,id,name,flair_text,mbti_type
0,6i0rnp1p,igid221,,
1,1s8dnq6p,Sheilaahmad,,
2,b708k,lzkbloodmage,INFJ-T 4w5 HSP HSS Empath | 26M SG,INFJ
3,mkfu3,ShannyPantsxo,,
4,vh9kmmx,lala2love,,
...,...,...,...,...
4806,2k93qy2z,krystalzhhf,INFP,INFP
4807,pktgyjt,Squamply,,
4808,76fgxdug,LoviEnthusiast,,
4809,sh56t,sotaponi,,


In [35]:
# Merge users and comments because we want to inject the extracted type from comments into the users table
merged_comment_df = comments_df.merge(users_df[["id", "mbti_type"]], left_on="user_id", right_on="id")

In [36]:
extracted_mbti_by_user_df = comments_df[~comments_df["extracted_mbti"].isna()][["user_id", "extracted_mbti"]].groupby("user_id")["extracted_mbti"].unique()

In [37]:
extracted_mbti_by_user_df

user_id
101tuq      [INTJ]
115zer      [ENTP]
11bj6s      [INFP]
11nzmg      [INTJ]
123994is    [ESTJ]
             ...  
x6twfh2     [ENTP]
xrxhv       [INFJ]
yelqs       [ENFJ]
zf59s       [ESTJ]
zfqgy       [INFJ]
Name: extracted_mbti, Length: 317, dtype: object

In [38]:
extracted_mbti_by_user_df[extracted_mbti_by_user_df.str.len() > 1]

user_id
64ga0       [ENFJ, ENTP]
6ddnv9ui    [ENFJ, ESTP]
870mk3j8    [ENFJ, INFP]
9p9p2axq    [INTP, ESTP]
9piy9onx    [ISTP, ESTP]
Name: extracted_mbti, dtype: object

In [39]:
# It is better to remove these users with ambiguous types
extracted_mbti_by_user_df = extracted_mbti_by_user_df[extracted_mbti_by_user_df.str.len() == 1].str[0].reset_index()
extracted_mbti_by_user_df

Unnamed: 0,user_id,extracted_mbti
0,101tuq,INTJ
1,115zer,ENTP
2,11bj6s,INFP
3,11nzmg,INTJ
4,123994is,ESTJ
...,...,...
307,x6twfh2,ENTP
308,xrxhv,INFJ
309,yelqs,ENFJ
310,zf59s,ESTJ


In [40]:
users_df = users_df.merge(extracted_mbti_by_user_df, left_on="id", right_on="user_id", how="left")

In [41]:
users_df = users_df.rename(columns={"mbti_type": "mbti_type_from_flair_text", "extracted_mbti": "mbti_type_from_comments"})

In [42]:
# Now we have two mbti types: mbti_type_from_flair_text and mbti_type_from_comments
users_df

Unnamed: 0,id,name,flair_text,mbti_type_from_flair_text,user_id,mbti_type_from_comments
0,6i0rnp1p,igid221,,,6i0rnp1p,INFJ
1,1s8dnq6p,Sheilaahmad,,,,
2,b708k,lzkbloodmage,INFJ-T 4w5 HSP HSS Empath | 26M SG,INFJ,,
3,mkfu3,ShannyPantsxo,,,,
4,vh9kmmx,lala2love,,,,
...,...,...,...,...,...,...
4806,2k93qy2z,krystalzhhf,INFP,INFP,,
4807,pktgyjt,Squamply,,,,
4808,76fgxdug,LoviEnthusiast,,,76fgxdug,ENFP
4809,sh56t,sotaponi,,,,


In [43]:
# Let's take mbti_type_from_flair_text as a default value for the MBTI type of a user
users_df["mbti_type"] = users_df["mbti_type_from_flair_text"]

In [44]:
# However, if mbti_type_from_flair_text is not set, we use mbti_type_from_comments
no_mbti_type_from_flair_text = users_df["mbti_type_from_flair_text"].isna()
users_df.loc[no_mbti_type_from_flair_text, "mbti_type"] = users_df[no_mbti_type_from_flair_text]["mbti_type_from_comments"]

In [45]:
users_df[["id", "name", "mbti_type"]]

Unnamed: 0,id,name,mbti_type
0,6i0rnp1p,igid221,INFJ
1,1s8dnq6p,Sheilaahmad,
2,b708k,lzkbloodmage,INFJ
3,mkfu3,ShannyPantsxo,
4,vh9kmmx,lala2love,
...,...,...,...
4806,2k93qy2z,krystalzhhf,INFP
4807,pktgyjt,Squamply,
4808,76fgxdug,LoviEnthusiast,ENFP
4809,sh56t,sotaponi,


In [46]:
comments_df[["id", "user_id", "parent_comment_id", "subreddit", "title", "content"]]

Unnamed: 0,id,user_id,parent_comment_id,subreddit,title,content
0,ltmgwm,6i0rnp1p,,infj,Always felt different and misunderstood by family,"I am an INFJ female, with no INFJ’s in my fami..."
1,gozeenz,1s8dnq6p,ltmgwm,infj,,Yupppp. No one understands me in my family. I ...
2,gp0l91l,b708k,gozeenz,infj,,Same. Doing my own thing whether it's outside ...
3,gp0ffo6,mkfu3,ltmgwm,infj,,I moved across the country straight out of col...
4,gp0g9eh,vh9kmmx,gp0ffo6,infj,,Are you me? I thought I was the only weirdo w...
...,...,...,...,...,...,...
15172,gp7jyjl,3c5ent8f,lunkpn,mbti,,I do this as well. It is definitely not becaus...
15173,gp7n1dp,681obxgg,lunkpn,mbti,,I do this too. I think it's just an Fi thing t...
15174,gp7tp8p,sh56t,lunkpn,mbti,,Sometimes? Not sure how simply using yourself ...
15175,gp8q0wi,1jceigvp,lunkpn,mbti,,It can appear selfish since it might look like...


In [55]:
# saving the users_df with mbti_types as a feather file
users_df[["id", "name", "mbti_type"]].to_feather("../data/users_df_with_mbti_type.feather")

# 2. Vectorize texts

In [30]:
import tensorflow_hub as hub
import numpy as np
import tensorflow_text

In [31]:
muse_model = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual/3")

In [43]:
batch_size = 100
n_batches = len(comments_df["content"]) // batch_size
if len(comments_df["content"]) % batch_size > 0:
    n_batches += 1

vectors = []
for i in range(n_batches):
    print("Prepare batch {}/{}".format(i + 1, n_batches))
    batch_start_index = i * batch_size
    batch_end_index = min(batch_start_index + batch_size, len(comments_df))
    sentences = comments_df.iloc[batch_start_index:batch_end_index]["content"].tolist()
    vectors.append(muse_model(sentences))
vectors = np.concatenate(vectors)

Prepare batch 1/152
Prepare batch 2/152
Prepare batch 3/152
Prepare batch 4/152
Prepare batch 5/152
Prepare batch 6/152
Prepare batch 7/152
Prepare batch 8/152
Prepare batch 9/152
Prepare batch 10/152
Prepare batch 11/152
Prepare batch 12/152
Prepare batch 13/152
Prepare batch 14/152
Prepare batch 15/152
Prepare batch 16/152
Prepare batch 17/152
Prepare batch 18/152
Prepare batch 19/152
Prepare batch 20/152
Prepare batch 21/152
Prepare batch 22/152
Prepare batch 23/152
Prepare batch 24/152
Prepare batch 25/152
Prepare batch 26/152
Prepare batch 27/152
Prepare batch 28/152
Prepare batch 29/152
Prepare batch 30/152
Prepare batch 31/152
Prepare batch 32/152
Prepare batch 33/152
Prepare batch 34/152
Prepare batch 35/152
Prepare batch 36/152
Prepare batch 37/152
Prepare batch 38/152
Prepare batch 39/152
Prepare batch 40/152
Prepare batch 41/152
Prepare batch 42/152
Prepare batch 43/152
Prepare batch 44/152
Prepare batch 45/152
Prepare batch 46/152
Prepare batch 47/152
Prepare batch 48/152
P

In [44]:
vectors.shape

(15177, 512)

In [48]:
distance_matrix = np.inner(vectors, vectors)

In [55]:
distance_matrix.shape

(15177, 15177)

In [52]:
labels = comments_df[["user_id"]].merge(users_df[["id", "mbti_type"]], left_on="user_id", right_on="id")["mbti_type"].values

In [53]:
labels.shape

(15177,)

In [54]:
labels

array(['INFJ', nan, nan, ..., 'ENFP', nan, nan], dtype=object)

# 3. Plot vectors

In [58]:
from umap import UMAP

In [59]:
reducer = UMAP(n_components=2, metric="cosine", verbose=True)
vectors_2d = reducer.fit_transform(vectors)

UMAP(angular_rp_forest=True, dens_frac=0.0, dens_lambda=0.0, metric='cosine',
     verbose=True)
Construct fuzzy simplicial set
Wed Mar  3 23:07:39 2021 Finding Nearest Neighbors
Wed Mar  3 23:07:39 2021 Building RP forest with 11 trees
Wed Mar  3 23:07:43 2021 NN descent for 14 iterations
	 1  /  14
	 2  /  14
	 3  /  14
	 4  /  14
	 5  /  14
	 6  /  14
	 7  /  14
	Stopping threshold met -- exiting after 7 iterations
Wed Mar  3 23:08:01 2021 Finished Nearest Neighbor Search
Wed Mar  3 23:08:04 2021 Construct embedding
	completed  0  /  200 epochs
	completed  20  /  200 epochs
	completed  40  /  200 epochs
	completed  60  /  200 epochs
	completed  80  /  200 epochs
	completed  100  /  200 epochs
	completed  120  /  200 epochs
	completed  140  /  200 epochs
	completed  160  /  200 epochs
	completed  180  /  200 epochs
Wed Mar  3 23:08:14 2021 Finished embedding


In [60]:
vectors_2d.shape

(15177, 2)

In [71]:
plot_df = pd.DataFrame({
    "x_1": vectors_2d[:, 0],
    "x_2": vectors_2d[:, 1],
    "content": comments_df["content"],
    "label": labels
})

In [72]:
plot_df["label"] = plot_df["label"].fillna("NA")

In [73]:
plot_df

Unnamed: 0,x_1,x_2,content,label
0,4.481791,7.138180,"I am an INFJ female, with no INFJ’s in my fami...",INFJ
1,6.696858,9.541062,Yupppp. No one understands me in my family. I ...,
2,7.421050,9.256389,Same. Doing my own thing whether it's outside ...,
3,6.996467,8.909230,I moved across the country straight out of col...,INFJ
4,5.969614,8.778481,Are you me? I thought I was the only weirdo w...,INFJ
...,...,...,...,...
15172,6.707380,8.991220,I do this as well. It is definitely not becaus...,
15173,6.702337,9.051413,I do this too. I think it's just an Fi thing t...,
15174,7.318375,6.146318,Sometimes? Not sure how simply using yourself ...,ENFP
15175,6.456370,9.005738,It can appear selfish since it might look like...,


In [75]:
fig = px.scatter(plot_df, x="x_1", y="x_2", color="label", hover_data=["content"])

In [76]:
fig.write_html("projected_muse.html")

In [None]:
# Suggestions of things to fix in comments:
# - Remove when people quote someone
# - Remove [View Poll]
# - Replace abbreviations like 'bc' -> 'because'
# - Remove URLs