In [6]:
import os
import sys
sys.path.insert(0, os.path.abspath('../'))

In [7]:
from mbti_type_from_text.db_utils import create_connection

In [8]:
db_connection = create_connection("../data/reddit.db")

In [9]:
import pandas as pd

comments_df = pd.read_sql(sql="SELECT * FROM Comments", con=db_connection)

In [10]:
comments_df.head()

Unnamed: 0,id,user_id,parent_comment_id,title,content,created_datetime,upvotes,subreddit
0,ltmgwm,6i0rnp1p,,Always felt different and misunderstood by family,"I am an INFJ female, with no INFJ’s in my fami...",2021-02-27 12:45:56,271,infj
1,gozeenz,1s8dnq6p,ltmgwm,,Yupppp. No one understands me in my family. I ...,2021-02-27 13:02:48,85,infj
2,gp0l91l,b708k,gozeenz,,Same. Doing my own thing whether it's outside ...,2021-02-27 16:04:27,30,infj
3,gp0ffo6,mkfu3,ltmgwm,,I moved across the country straight out of col...,2021-02-27 15:40:01,46,infj
4,gp0g9eh,vh9kmmx,gp0ffo6,,Are you me? I thought I was the only weirdo w...,2021-02-27 15:43:01,18,infj


In [11]:
users_df = pd.read_sql(sql="SELECT * FROM Users", con=db_connection)

In [12]:
users_df.head()

Unnamed: 0,id,name,flair_text
0,6i0rnp1p,igid221,
1,1s8dnq6p,Sheilaahmad,
2,b708k,lzkbloodmage,INFJ-T 4w5 HSP HSS Empath | 26M SG
3,mkfu3,ShannyPantsxo,
4,vh9kmmx,lala2love,


In [70]:
len(users_df)

4811

In [52]:
def unbold_text(string):
    unbolded_string = ""
    for c in string:
        int_code = ord(c)
        if int_code >= ord("𝐀") and int_code <= ord("𝐙"):
            unbolded_string += chr(int_code - 119743)
        else:
            unbolded_string += c
    return unbolded_string

In [53]:
def extract_mbti_from_flair_text(df):
    regex = r"((I|E|X)(S|N|X)(F|T|X)(J|P|X))"
    return df["flair_text"].str.upper().apply(unbold_text).str.extract(regex)[0]

users_df["mbti_type"] = extract_mbti_from_flair_text(df=users_df)

In [54]:
users_df

Unnamed: 0,id,name,flair_text,mbti_type,unbolded_flair_text
0,6i0rnp1p,igid221,,,
1,1s8dnq6p,Sheilaahmad,,,
2,b708k,lzkbloodmage,INFJ-T 4w5 HSP HSS Empath | 26M SG,INFJ,INFJ-T 4w5 HSP HSS Empath | 26M SG
3,mkfu3,ShannyPantsxo,,,
4,vh9kmmx,lala2love,,,
...,...,...,...,...,...
4806,2k93qy2z,krystalzhhf,INFP,INFP,INFP
4807,pktgyjt,Squamply,,,
4808,76fgxdug,LoviEnthusiast,,,
4809,sh56t,sotaponi,,,


In [55]:
# Some users mark two types, by convention we keep the first one for now...
users_df[users_df["id"] == "mpa2p"]

Unnamed: 0,id,name,flair_text,mbti_type,unbolded_flair_text
2304,mpa2p,Mortallyinsane21,INFJ masking as INTP,INFJ,INFJ masking as INTP


In [57]:
# Bold font is supported!
users_df[users_df["id"] == "1vtv4bck"]

Unnamed: 0,id,name,flair_text,mbti_type,unbolded_flair_text
2533,1vtv4bck,yahgirlamberrr,𝐈𝐒𝐅𝐉,ISFJ,ISFJ


In [58]:
users_df["mbti_type"].unique()

array([nan, 'INFJ', 'ENFP', 'ENTP', 'ENTJ', 'INTJ', 'ESTP', 'INTP',
       'INFX', 'INFP', 'ISFP', 'ENFJ', 'ISTJ', 'ISTP', 'ESFP', 'ISFJ',
       'ESTJ', 'INTX', 'XNTP', 'IXTP', 'ESFJ'], dtype=object)

In [59]:
users_df["mbti_type"].nunique()

20

In [63]:
import plotly.express as px

In [66]:
count_by_mbti_df = users_df.groupby("mbti_type")["id"].count()

In [69]:
px.bar(count_by_mbti_df.sort_values(ascending=False))

In [128]:
import re

mbti_regex = "(I|E|X)(S|N|X)(F|T|X)(J|P|X)"
i_am_regex = "I('m| am)(?: (an|a))? ({})".format(mbti_regex)
my_mbti_regex = "(M|m)y ({}) (personality|experience)".format(mbti_regex)
mbti_here_regex = "({})(?: \((m|f)\))? here".format(mbti_regex)
fellow_mbti_regex = "(F|f)ellow ({})".format(mbti_regex)
i_mbti_regex = "(Me|I)(?: )?\(({})\)".format(mbti_regex)
looking_for_regex = "({}) looking for".format(mbti_regex)

match = re.search(i_am_regex, "I am an INFJ female, with no INFJ’s in my family. I’ve always felt like they never understood me and that they think I am \"too much\" and really weird when I share parts of \"my world\" and inner thoughts with them. It’s made me feel disconnected from them. \n\nIs this common for INFJ’s?")
if match:
    print(match.group(4))

I


In [135]:
def extract_mbti_from_message(df):
    mbti_regex = "(I|E|X)(S|N|X)(F|T|X)(J|P|X)"
    regex_dict = {
        "i_am_mbti_regex": {"regex": "I('m| am)(?: (an|a))? ({})".format(mbti_regex), "group_index": 2},
        "my_mbti_regex": {"regex": "(M|m)y ({}) (personality|experience)".format(mbti_regex), "group_index": 1},
        "mbti_here_regex": {"regex": "({})(?: \((m|f)\))? here".format(mbti_regex), "group_index": 0},
        "fellow_mbti_regex": {"regex": "(F|f)ellow ({})".format(mbti_regex), "group_index": 1},
        "i_mbti_regex": {"regex": "(Me|I)(?: )?\(({})\)".format(mbti_regex), "group_index": 1},
        "looking_for_regex": {"regex": "({}) looking for".format(mbti_regex), "group_index": 0}
    }
    result_df = pd.DataFrame()
    for regex_name, regex_item in regex_dict.items():
        result_df["{}__on__title".format(regex_name)] = df["title"].str.extract(regex_item["regex"])[regex_item["group_index"]]
        result_df["{}__on__content".format(regex_name)] = df["content"].str.extract(regex_item["regex"])[regex_item["group_index"]]
    return result_df


extract_mbti_from_message_df = extract_mbti_from_message(comments_df)

In [141]:
(~extract_mbti_from_message_df.isna()).sum(axis=0)

i_am_mbti_regex__on__title          6
i_am_mbti_regex__on__content      232
my_mbti_regex__on__title            1
my_mbti_regex__on__content          0
mbti_here_regex__on__title          6
mbti_here_regex__on__content       70
fellow_mbti_regex__on__title        3
fellow_mbti_regex__on__content     43
i_mbti_regex__on__title             3
i_mbti_regex__on__content           2
looking_for_regex__on__title        5
looking_for_regex__on__content      1
dtype: int64

In [144]:
extract_mbti_from_message_df[(~extract_mbti_from_message_df.isna()).sum(axis=1) == 2]

Unnamed: 0,i_am_mbti_regex__on__title,i_am_mbti_regex__on__content,my_mbti_regex__on__title,my_mbti_regex__on__content,mbti_here_regex__on__title,mbti_here_regex__on__content,fellow_mbti_regex__on__title,fellow_mbti_regex__on__content,i_mbti_regex__on__title,i_mbti_regex__on__content,looking_for_regex__on__title,looking_for_regex__on__content
52,,,,,ENTP,ENTP,,,,,,
6395,,INTP,,,,,,INTP,,,,
6710,,INFP,,,,INFP,,,,,,
9056,,ENFP,,,,,,,,,ENFP,
9563,ENTJ,ENTJ,,,,,,,,,,
10512,,ENFP,,,,ENFP,,,,,,
10977,,ENFJ,,,,,,ENFJ,,,,
14123,,ENFP,,,,,,,,,ENFP,
14225,,INFP,,,,INFP,,,,,,
14531,,ENFP,,,,,,,,,ENFP,


In [153]:
import numpy as np


def get_extracted_mbti_if_exists(row):
    if row.last_valid_index() is None:
        return np.nan
    else:
        return row[row.last_valid_index()]

comments_df["extracted_mbti"] = extract_mbti_from_message_df.apply(get_extracted_mbti_if_exists, axis=1)

In [154]:
comments_df

Unnamed: 0,id,user_id,parent_comment_id,title,content,created_datetime,upvotes,subreddit,extracted_mbti
0,ltmgwm,6i0rnp1p,,Always felt different and misunderstood by family,"I am an INFJ female, with no INFJ’s in my fami...",2021-02-27 12:45:56,271,infj,INFJ
1,gozeenz,1s8dnq6p,ltmgwm,,Yupppp. No one understands me in my family. I ...,2021-02-27 13:02:48,85,infj,
2,gp0l91l,b708k,gozeenz,,Same. Doing my own thing whether it's outside ...,2021-02-27 16:04:27,30,infj,
3,gp0ffo6,mkfu3,ltmgwm,,I moved across the country straight out of col...,2021-02-27 15:40:01,46,infj,
4,gp0g9eh,vh9kmmx,gp0ffo6,,Are you me? I thought I was the only weirdo w...,2021-02-27 15:43:01,18,infj,
...,...,...,...,...,...,...,...,...,...
15172,gp7jyjl,3c5ent8f,lunkpn,,I do this as well. It is definitely not becaus...,2021-02-28 20:11:41,10,mbti,
15173,gp7n1dp,681obxgg,lunkpn,,I do this too. I think it's just an Fi thing t...,2021-02-28 20:27:52,6,mbti,
15174,gp7tp8p,sh56t,lunkpn,,Sometimes? Not sure how simply using yourself ...,2021-02-28 21:02:51,2,mbti,
15175,gp8q0wi,1jceigvp,lunkpn,,It can appear selfish since it might look like...,2021-03-01 00:24:43,1,mbti,


In [160]:
users_df

Unnamed: 0,id,name,flair_text,mbti_type,unbolded_flair_text
0,6i0rnp1p,igid221,,,
1,1s8dnq6p,Sheilaahmad,,,
2,b708k,lzkbloodmage,INFJ-T 4w5 HSP HSS Empath | 26M SG,INFJ,INFJ-T 4w5 HSP HSS Empath | 26M SG
3,mkfu3,ShannyPantsxo,,,
4,vh9kmmx,lala2love,,,
...,...,...,...,...,...
4806,2k93qy2z,krystalzhhf,INFP,INFP,INFP
4807,pktgyjt,Squamply,,,
4808,76fgxdug,LoviEnthusiast,,,
4809,sh56t,sotaponi,,,


In [163]:
merged_comment_df = comments_df.merge(users_df[["id", "mbti_type"]], left_on="user_id", right_on="id")

In [164]:
((merged_comment_df["mbti_type"].isna()) & (~merged_comment_df["extracted_mbti"].isna())).sum()

227

In [165]:
((~merged_comment_df["mbti_type"].isna()) & (merged_comment_df["extracted_mbti"].isna())).sum()

7740

In [166]:
comments_df.groupby("user_id")["extracted_mbti"].unique()

user_id
100fu2          [nan]
100sih          [nan]
100wx2          [nan]
101tuq    [INTJ, nan]
104bxl          [nan]
             ...     
zuefe           [nan]
zvaew           [nan]
zvb3x           [nan]
zwq9h           [nan]
zyxf8           [nan]
Name: extracted_mbti, Length: 4811, dtype: object

In [167]:
comments_df.groupby("user_id")["extracted_mbti"].unique().str[0]

user_id
100fu2     NaN
100sih     NaN
100wx2     NaN
101tuq    INTJ
104bxl     NaN
          ... 
zuefe      NaN
zvaew      NaN
zvb3x      NaN
zwq9h      NaN
zyxf8      NaN
Name: extracted_mbti, Length: 4811, dtype: object

In [176]:
extracted_mbti_by_user_df = comments_df[~comments_df["extracted_mbti"].isna()][["user_id", "extracted_mbti"]].groupby("user_id")["extracted_mbti"].unique()

In [177]:
extracted_mbti_by_user_df[extracted_mbti_by_user_df.str.len() > 1]

user_id
64ga0       [ENFJ, ENTP]
6ddnv9ui    [ENFJ, ESTP]
870mk3j8    [ENFJ, INFP]
9p9p2axq    [INTP, ESTP]
9piy9onx    [ISTP, ESTP]
Name: extracted_mbti, dtype: object

In [178]:
comments_df[comments_df["user_id"] == "64ga0"]

Unnamed: 0,id,user_id,parent_comment_id,title,content,created_datetime,upvotes,subreddit,extracted_mbti
10548,gp76591,64ga0,lu5zq9,,ENTP pursuing an ENFJ here. I felt she is the ...,2021-02-28 19:00:03,1,enfj,ENFJ
10549,ltp0x2,64ga0,,ENFJ game,Do you tend to unconsciously play a game of pu...,2021-02-27 15:17:16,58,enfj,ENTP
10551,gp0p2x2,64ga0,gp0oehu,,She already responds quite well and it seems n...,2021-02-27 16:26:07,3,enfj,
10554,gp12ttu,64ga0,gp0peg0,,I never expected a monogamous relationship fro...,2021-02-27 17:16:03,2,enfj,
10558,gp6n9vi,64ga0,gp5z54d,,"We started out super intense, talked for month...",2021-02-28 16:57:59,1,enfj,
12496,gpa3q6s,64ga0,luwh79,,I do have feelings but I dont stay too long in...,2021-03-01 09:19:53,2,entp,
12786,gp7etjg,64ga0,lu1b7y,,"Pursuing a ENFJ girl here, but we are long dis...",2021-02-28 19:44:51,2,entp,
12788,gp7nc1c,64ga0,gp7l50e,,"For now yes, but Im more distanced. Still very...",2021-02-28 20:29:25,2,entp,
12790,gp7pcj4,64ga0,gp7o63w,,I still have feelings for her but nowhere near...,2021-02-28 20:39:58,1,entp,
12792,gp7r283,64ga0,gp7qksb,,Damn thats amazing. Hopefully she will love th...,2021-02-28 20:49:01,2,entp,


In [180]:
comments_df.iloc[10549]["content"]

'Do you tend to unconsciously play a game of push and pull with your romantic partners? Things like pulling away emotionally and being insecure about your feelings? If you have ever done that, do you like it more when the person reaches back and shows you they dont expect anything  or when they quit and act indifferent. I am asking because this enfj girl pulled away from me a bit and it felt bad but since I am an ENTP I got over it quick and I told her Im not attached and I don\'t expect anything from her, just want to see her do well, and she said she is not used to such behavior and treatment... I think I did the right thing, even though some people may think I\'m "simping" or whatever, I just  like to be honest with people I really like.'

In [181]:
comments_df[comments_df["user_id"] == "6ddnv9ui"]

Unnamed: 0,id,user_id,parent_comment_id,title,content,created_datetime,upvotes,subreddit,extracted_mbti
10799,lq53jt,6ddnv9ui,,Struggling to accept I am ENFJ,I (18M) keep reading about ENFJs and how femin...,2021-02-23 01:59:49,50,enfj,ENFJ
10804,gog40vs,6ddnv9ui,goeplq2,,Thank you so much for this :) great to read.,2021-02-23 11:38:36,2,enfj,
10806,gog47pi,6ddnv9ui,gofbl52,,This was very informative . Thank you.,2021-02-23 11:41:32,1,enfj,
10810,gog4bpq,6ddnv9ui,gofci2i,,I don’t think I am ESTP. I literally absorb my...,2021-02-23 11:43:14,1,enfj,ESTP
10812,goh535n,6ddnv9ui,gog4k4f,,Well like if I am having a convo with them and...,2021-02-23 17:34:08,1,enfj,
10815,gog4gut,6ddnv9ui,gofwzwg,,"Yes , I do like to use my Ni a lot. I base pre...",2021-02-23 11:45:25,3,enfj,
10818,goh4w2z,6ddnv9ui,gog682x,,What industry is this in ?,2021-02-23 17:32:58,2,enfj,
10821,goh5l9c,6ddnv9ui,gog27qh,,Damn man.. you are so right. I only want to be...,2021-02-23 17:37:08,2,enfj,
10826,goh64yx,6ddnv9ui,gog74ta,,Thanks so much for this brother . I appreciate...,2021-02-23 17:40:27,2,enfj,
10828,goj0msx,6ddnv9ui,gohiih7,,It was great. Thank you very much :),2021-02-24 01:35:00,2,enfj,


In [184]:
# It is better to remove these users with ambiguous types
extracted_mbti_by_user_df[extracted_mbti_by_user_df.str.len() == 1].str[0].reset_index()

Unnamed: 0,user_id,extracted_mbti
0,101tuq,INTJ
1,115zer,ENTP
2,11bj6s,INFP
3,11nzmg,INTJ
4,123994is,ESTJ
...,...,...
307,x6twfh2,ENTP
308,xrxhv,INFJ
309,yelqs,ENFJ
310,zf59s,ESTJ


In [192]:
users_df = users_df.merge(extracted_mbti_by_user_df[extracted_mbti_by_user_df.str.len() == 1].str[0], left_on="id", right_on="user_id", how="left")

In [193]:
users_df = users_df.rename(columns={"mbti_type": "mbti_type_from_flair_text", "extracted_mbti": "mbti_type_from_comments"})

In [194]:
users_df

Unnamed: 0,id,name,flair_text,mbti_type_from_flair_text,unbolded_flair_text,mbti_type_from_comments
0,6i0rnp1p,igid221,,,,INFJ
1,1s8dnq6p,Sheilaahmad,,,,
2,b708k,lzkbloodmage,INFJ-T 4w5 HSP HSS Empath | 26M SG,INFJ,INFJ-T 4w5 HSP HSS Empath | 26M SG,
3,mkfu3,ShannyPantsxo,,,,
4,vh9kmmx,lala2love,,,,
...,...,...,...,...,...,...
4806,2k93qy2z,krystalzhhf,INFP,INFP,INFP,
4807,pktgyjt,Squamply,,,,
4808,76fgxdug,LoviEnthusiast,,,,ENFP
4809,sh56t,sotaponi,,,,


In [195]:
users_df["mbti_type"] = users_df["mbti_type_from_flair_text"]

In [196]:
no_mbti_type_from_flair_text = users_df["mbti_type_from_flair_text"].isna()
users_df.loc[no_mbti_type_from_flair_text, "mbti_type"] = users_df[no_mbti_type_from_flair_text]["mbti_type_from_comments"]

In [198]:
users_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4811 entries, 0 to 4810
Data columns (total 7 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   id                         4811 non-null   object
 1   name                       4811 non-null   object
 2   flair_text                 4811 non-null   object
 3   mbti_type_from_flair_text  1978 non-null   object
 4   unbolded_flair_text        4811 non-null   object
 5   mbti_type_from_comments    312 non-null    object
 6   mbti_type                  2173 non-null   object
dtypes: object(7)
memory usage: 300.7+ KB


In [199]:
px.bar(users_df.groupby("mbti_type")["id"].count().sort_values(ascending=False))