# Text to MBTI

In [None]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import re
import nltk

nltk.download('wordnet')

In [None]:
# Matplotlib configuration
# Global plotting defaults for every figure produced by this notebook.
mpl.rcParams['figure.figsize'] = 15, 15  # default figure size (width, height)
mpl.rcParams['figure.dpi'] = 300    # tells matplotlib to display inline plots at 300 DPI
mpl.rc("savefig", dpi = 300)        # tells matplotlib to save plots at 300 DPI
# NOTE(review): the style sheet is applied after the rcParams above; confirm it
# does not override any of them.
plt.style.use('fivethirtyeight')
# Makes Jupyter show the output of all lines, not just the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
# Read the dataset once; `raw_df` stays untouched while `df` is the working frame
raw_df = pd.read_csv("../data/mbti.csv")
df = raw_df.copy()

In [None]:
# Display the freshly loaded dataframe
df

In [None]:
# Summary of the columns (dtypes, non-null counts)
df.info()

In [None]:
# Number of rows (individuals) for each value of the "type" column
df.type.value_counts()

The number of people per type is very imbalanced.

In [None]:
# A person's posts arrive as a single "|||"-separated string: turn it into a
# list of posts and remember how many posts each person has
df["posts"] = df["posts"].map(lambda raw_posts: raw_posts.split("|||"))
df["posts_count"] = df["posts"].map(len)

# http(s) URLs, with or without a leading "www."
url_regex = r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,4}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)"
# Characters counted as punctuation: . ! ? \ -
punctuation_regex = r"[.!?\\-]"
# Emoticons and :shortcode: emojis, only when followed by whitespace, one of
# ! . ? or the end of the string.
# This must be a raw string: in a normal string literal "\\" collapses to a
# single backslash, which turned "[\/\\]" into a character class that never
# closed and silently merged the "<3" and "):" alternatives into one.
emoji_regex = r"(:\w+:|<[/\\]?3|[()\\D|*$][-^]?[:;=]|[:;=B8][-^]?[3DOPp@$*\\)(/|])(?=\s|[!.?]|$)"
# matches
#
# :( :) :P :p :O :3 :| :/ :\ :$ :* :@
# :-( :-) :-P :-p :-O :-3 :-| :-/ :-\ :-$ :-* :-@
# :^( :^) :^P :^p :^O :^3 :^| :^/ :^\ :^$ :^* :^@
# ): (: $: *:          (also the mirrored D: |: \:)
# )-: (-: $-: *-:
# )^: (^: $^: *^:
# <3 </3 <\3
# :smile: :hug: :pencil:

# Copy of every person's posts with the URLs stripped out; punctuation and
# emojis are counted on this copy rather than on the raw posts
df["posts_without_urls"] = df["posts"].apply(
    lambda person_posts: [re.sub(url_regex, '', text) for text in person_posts]
)

# Counting occurrences of things across all posts of a person:
# (new column, column to scan, pattern to count)
for count_column, source_column, pattern in [
    ("urls_count", "posts", url_regex),
    ("punctuations_count", "posts_without_urls", punctuation_regex),
    ("emojis_count", "posts_without_urls", emoji_regex),
]:
    df[count_column] = df[source_column].apply(
        # `pattern=pattern` binds the current pattern to this lambda
        lambda person_posts, pattern=pattern: sum(
            len(re.findall(pattern, text)) for text in person_posts
        )
    )

In [None]:
def add_count_means_per_person(df):
    """
    Adds a mean per person column for each count column

    For every column named "<terms>_count" (except "posts_count") a column
    "mean_<term>_count" is added, holding that count divided by the person's
    "posts_count". Columns that already start with "mean_" are ignored, so
    calling the function a second time does not create means of means.

    Requires the "posts_count" column.
    The dataframe is modified in place and is also returned.
    """
    count_columns = [
        c for c in df.columns
        if c.endswith("_count") and c != "posts_count" and not c.startswith("mean_")
    ]
    for count_column in count_columns:
        term = count_column.split("_")[0]
        # "urls" -> "url"; a term that is not plural is kept untouched
        term_singular = term[:-1] if term.endswith("s") else term
        mean_col_name = "mean_{term_singular}_count".format(term_singular=term_singular)
        df[mean_col_name] = df[count_column] / df.posts_count
    return df
        
# Adds one mean_*_count column per *_count column to the working dataframe
df = add_count_means_per_person(df)

In [None]:
# Adding the 4 dichotomies
# One 0/1 flag per letter position of the type: is_E looks at the 1st letter,
# is_S at the 2nd, is_T at the 3rd and is_J at the 4th
for position, letter in enumerate("ESTJ"):
    df["is_" + letter] = df["type"].apply(
        lambda mbti_type, position=position, letter=letter:
            1 if mbti_type[position] == letter else 0
    )
dichotomies = ["is_" + letter for letter in "ESTJ"]
# Both sides of every dichotomy, as (flag set, flag unset) pairs
all_dichotomies = ["is_" + letter for letter in "EISNTFJP"]

In [None]:
# Preview of the first rows, including the columns added above
df.head()

In [None]:
# Posts of the first individual in the dataframe
df.iloc[0].posts

In [None]:
def aggregate_from_individuals(by):
    """
    Groups individuals by the specified column(s)

    Reads the notebook-level `df`, which holds one row per individual.

    by: a column name or a list of column names to group on.

    Returns a dataframe indexed by group with, in this order:
      - "individuals": number of individuals in the group
      - "posts": total number of posts of the group
      - "content": every post of the group joined into one huge string
      - one column per "mean_*" column of `df`, averaged over the group
    """
    mean_columns = [c for c in df.columns if c.startswith("mean_")]
    # Named aggregation: output column -> (input column, aggregation).
    # String aggregation names are used instead of the builtin sum / np.mean
    # callables, which recent pandas versions deprecate inside agg().
    aggregation = {
        "individuals": ("posts_count", "size"),
        "posts": ("posts_count", "sum"),
        # flattens the list of post_lists then joins the result to have 1 huge string
        "content": ("posts", lambda post_lists: ' '.join(
            post for posts in post_lists for post in posts
        )),
    }
    aggregation.update({mean_c: (mean_c, "mean") for mean_c in mean_columns})
    return df.groupby(by).agg(**aggregation)

df_per_type = aggregate_from_individuals(by=["type"])
df_per_type

In [None]:
# adding dichotomies agg
# groupby returns the group whose flag is 0 before the group whose flag is 1,
# so labelling rows by position as (is_E, is_I, ...) swaps every pair.
# Rows are therefore labelled from the flag value itself.
opposite_of = dict(zip(all_dichotomies[::2], all_dichotomies[1::2]))  # "is_E" -> "is_I", ...
per_dichotomy_frames = []
for dicho in dichotomies:
    # flag == 1 first, so the final row order is is_E, is_I, is_S, is_N, ...
    per_flag = aggregate_from_individuals(by=dicho).sort_index(ascending=False)
    per_flag.index = [dicho if flag == 1 else opposite_of[dicho] for flag in per_flag.index]
    per_dichotomy_frames.append(per_flag)
df_per_dicho = pd.concat(per_dichotomy_frames)
df_per_dicho

In [None]:
# One summary dataframe: the per-type rows followed by the per-dichotomy rows
agg_df = pd.concat((df_per_type, df_per_dicho))
# The CSV export leaves out the "content" column
stats_only = agg_df.drop(columns=["content"])
stats_only.to_csv("some_stats_per_group.csv")
agg_df

In [None]:
# One horizontal bar chart per mean_* column, saved as "<column>.png" and shown inline
for col_to_plot in [c for c in agg_df.columns if c.startswith("mean_")]:
    agg_df[col_to_plot].sort_values().plot.barh(title=col_to_plot.replace('_', ' ').title());
    # `bbox_inches` (not `bbox`) is the savefig keyword that crops the figure tightly
    plt.savefig("{}.png".format(col_to_plot), bbox_inches="tight");
    plt.show()

## Cleaning the Posts

In [None]:
from nltk.tokenize import word_tokenize

def clean_text(text):
    """
    Turns a raw text into a list of cleaned, lemmatized tokens.

    Steps, in order: URLs, emojis and punctuation are removed; the text is
    lowercased; every remaining non-word character and underscore becomes a
    space; digits are removed; whitespace is collapsed and trimmed; the text
    is tokenized; English stopwords are dropped; each token is lemmatized.

    NOTE(review): only 'wordnet' is downloaded at the top of the notebook; the
    NLTK stopwords corpus and the tokenizer data used here must be available too.
    """
    text = re.sub(url_regex, '', text)
    text = re.sub(emoji_regex, '', text)
    text = re.sub(punctuation_regex, '', text)
    text = text.lower()
    text = re.sub(r'\W|_', ' ', text)  # removes special chars
    text = re.sub(r'[0-9]', '', text)  # removes digits
    text = re.sub(r'\s+', ' ', text)   # removes multiple spaces
    text = text.strip()                # removes space at the start or end of the string

    tokens = word_tokenize(text)  # tokenizes

    # A set gives constant-time membership tests; the texts cleaned here can be
    # the concatenation of every post of a group, so this matters
    stopwords = set(nltk.corpus.stopwords.words("english"))
    tokens = [token for token in tokens if token not in stopwords]  # removes stopwords

    wn = nltk.WordNetLemmatizer()
    return [wn.lemmatize(token) for token in tokens]  # lemmatizes = root words

# Before / after on the first 174 characters of the first group's content
example_post = agg_df.iloc[0].content[:174]
print("{} \n     |\n     v\n{}".format(
    example_post,
    clean_text(example_post)
))

In [None]:
# Replace each group's raw text with its list of cleaned tokens
agg_df["content"] = agg_df["content"].map(clean_text)

## Selecting Words
Not all words should be kept, as some tend to make a model worse rather than better.

In [None]:
def get_vocabulary_and_occurences(series):
    """
    Builds the vocabulary of a collection of tokenized texts.

    series: an iterable of token lists (one list of words per text).

    Returns a tuple (vocab, occurences_per_word):
      - vocab: the set of distinct words found across all texts
      - occurences_per_word: dict mapping each word to its total number of
        occurrences across all texts
    """
    from collections import Counter

    # Single pass over every token, instead of one list.count() per vocabulary word
    occurences_per_word = Counter(word for text in series for word in text)
    return set(occurences_per_word), dict(occurences_per_word)

# Distinct words and their total number of occurrences over all groups' content
vocabulary, occurencer_per_word = get_vocabulary_and_occurences(agg_df.content)

In [None]:
import json

# Persist the word -> occurrences mapping as JSON
with open("word_occurrences.json", "w") as json_file:
    json_file.write(json.dumps(occurencer_per_word))

In [None]:
# Series indexed by word, holding each word's number of occurrences,
# then the histogram of those values
occurencer_per_word_series = pd.Series(
    data=list(occurencer_per_word.values()),
    index=list(occurencer_per_word.keys()),
)
occurencer_per_word_series.plot.hist();

In [None]:
# Words seen fewer than MIN_OCCURENCES times are considered too rare to be kept
MIN_OCCURENCES = 2
rare_words = [w for w in vocabulary if occurencer_per_word[w] < MIN_OCCURENCES]
print("{} words ({}%) will not be kept because of their rarity".format(
    len(rare_words),
    # guard against an empty vocabulary
    round(len(rare_words) / len(vocabulary) * 100) if vocabulary else 0
))

# Constant-time lookup table for remove_rare_words (snapshot of rare_words)
_rare_words_lookup = frozenset(rare_words)

def remove_rare_words(tokens):
    """
    Returns the tokens that are not rare words, in their original order.

    Membership is tested against a frozenset built from `rare_words` when
    this cell runs, instead of scanning the `rare_words` list for every token.
    """
    return [t for t in tokens if t not in _rare_words_lookup]

## Vectorization

Sklearn machine learning models are meant to process numerical data, not text.

We will therefore vectorize our texts: each text is converted into a row of a table that holds, for every word, a float value reflecting the relative importance of that word in the text (0 when the word is absent).

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

def get_vectors_df(text_col):
    """
    Vectorizes texts with TF-IDF.

    text_col: an iterable of strings, one per text.

    Returns a dataframe with one row per text and one column per word of the
    fitted vocabulary, holding the TF-IDF weight of the word in the text.
    """
    tfidf_vect = TfidfVectorizer()
    X = tfidf_vect.fit_transform(text_col)
    # get_feature_names() no longer exists in recent scikit-learn releases;
    # fall back to it only when get_feature_names_out() is unavailable
    if hasattr(tfidf_vect, "get_feature_names_out"):
        feature_names = tfidf_vect.get_feature_names_out()
    else:
        feature_names = tfidf_vect.get_feature_names()
    return pd.DataFrame(X.toarray(), columns=feature_names)

# The cleaned token lists are joined back into strings for the vectorizer
vectors = get_vectors_df(agg_df.content.apply(lambda c: ' '.join(c)))
# NOTE(review): if "group" is itself a word of the vocabulary, its TF-IDF
# column is overwritten by the group labels here.
vectors["group"] = agg_df.index
# Move "group" to the front while keeping every other column exactly once.
# Slicing with columns[1:] dropped the first word and duplicated "group".
vectors = vectors[["group"] + [c for c in vectors.columns if c != "group"]]
vectors

## Removing non-correlated words

In [None]:
# Pairwise correlations between the numeric (word) columns only: `vectors`
# also holds the string column "group", which recent pandas versions refuse
# to correlate instead of silently skipping it
corr_df = vectors.select_dtypes(include="number").corr()

In [None]:
# NOTE(review): `source_dummies` is not defined anywhere in the visible part of
# this notebook; this cell fails with a NameError unless it is created
# elsewhere. Confirm where it is supposed to come from.
char_indexes = source_dummies.columns
# Keep only the columns listed in char_indexes, sorted by the column labelled 0 (descending)
corr_df = corr_df[char_indexes].sort_values(0, ascending=False)
# Drop the rows whose label is one of char_indexes
corr_df = corr_df.drop(char_indexes)
corr_df