In [1]:
import pickle

In [2]:
filename = "data/MVMR_BERT_tok.pkl"
# filename = "BERT.tok.pkl"
with open(filename, "rb") as f:
    data = pickle.load(f)

In [3]:
import warnings
warnings.filterwarnings("ignore")

In [4]:
import numpy as np
import pandas as pd

In [5]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [6]:
import re

In [7]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/malcolmzhao/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     /home/malcolmzhao/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [8]:
# to handle the pattern such as "he ##llo" to "hello
def decontract_text(string):
    pattern = r'((?:\w+ ##\w+\s)*)'
    matches = re.findall(pattern, string)
    for match in matches:
        if len(match) > 0:
            modified_string = re.sub(r"[ #]", "", match)
            modified_string += " "
            string = string.replace(match, modified_string)
    return string

In [9]:
def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    # Tokenize the input text
    text = decontract_text(text)
    word_tokens = word_tokenize(text) # word_tokens will be a list of words in the input text.
    # Remove stopwords and punctuations
    filtered_text = [word for word in word_tokens if word.lower() not in stop_words and word.isalnum()]
    # Reconstruct the string
    filtered_text_str = ' '.join(filtered_text)
    return filtered_text_str

In [10]:
mbti_dict = {
    "i": "e",
    "n": "s",
    "f": "t",
    "p": "j"
}

In [11]:
def get_score(idx, mbti_attr):
    score = data[idx][mbti_attr.upper()]
    if not (0 <= score <= 100):
        score = 100 - data[idx][mbti_dict[mbti_attr].upper()]
    return score

In [12]:
# create raw data with the below structure
raw_data = pd.DataFrame({
    "id": None,
    "person": None,
    "category": None,
    "i_score": None,
    "n_score": None,
    "f_score": None,
    "p_score": None,
    "dialog": None
    }, index = [0])
for i in range(len(data)):
    raw_data = raw_data.append({
        "id": data[i]["id"],
        "person": data[i]["mbti_profile"],
        "category": data[i]["subcategory"],
        "i_score": get_score(i, "i"),
        "n_score": get_score(i, "n"),
        "f_score": get_score(i, "f"),
        "p_score": get_score(i, "p"),
        "dialog": remove_stopwords("".join(data[i]["dialog_text"]))
    }, ignore_index = True)
    # if i >= 2:
    #     break
    if i % 200 == 0:
        print("\r%d"%i, end = " ")

3400 

In [13]:
raw_data = raw_data.dropna(subset = "id")

In [14]:
raw_data = raw_data.reset_index(drop=True)
drop_pool = (raw_data[["i_score", "n_score", "f_score", "p_score"]].isna().sum(axis = 1) >= 3) # remove character with more than 3 channels are null
raw_data = raw_data.loc[~drop_pool, :]

In [15]:
# remove characters with less than 80 words
raw_data["information_length"] = raw_data["dialog"].apply(lambda x:len(x.split()))
raw_data = raw_data.sort_values(by = "information_length", ascending = True)
raw_data = raw_data.loc[raw_data["information_length"] >= 80, :]

In [18]:
raw_data.shape

(2360, 9)

In [22]:
# remove charaters that do not have unique dialog
raw_data = raw_data.drop_duplicates(subset = ['dialog'], keep = False)

In [25]:
for col in ["i_score", "n_score", "f_score", "p_score"]:
    raw_data[col] = raw_data[col].fillna(raw_data[col].median()) # median fill na

In [26]:
raw_data.shape

(2144, 9)

In [29]:
raw_data.to_excel("data/MVMR_BERT_tok.xlsx", index = False)