In [1]:
# import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer # <-- I went with a TF-IDF approach for now, but we could switch to this
                                                            #      later on if we want to try a simpler "bag-of-words" approach.

import nltk
from textblob import TextBlob

---------------------
## **Parameters:**

In [2]:
# Document-frequency bounds handed to the TF-IDF vectorizer as min_df / max_df.
# Given as floats, they are interpreted as proportions of documents.
MIN_DOC_FREQ = 0.02
MAX_DOC_FREQ = 0.8

#_______________________________________
# For testing & diagnostics ONLY:
# (a value > 0 switches on the optional data-limiting cell further down)
DATA_LIMIT = 3

---------------------
### Data intake:

In [3]:
# Load the raw dataset; swap in the commented-out line below to use the Twitter MBTI data instead.
data = pd.read_csv('raw_data/Types_500/Types_500.csv')
# data = pd.read_csv('raw_data/Twitter_MBTI/twitter_MBTI.csv')

In [4]:
# Keep only the label and text columns, under the names used by the rest of the notebook.
# Renaming first (labels that are absent are ignored by default) and selecting second
# makes this cell safe to re-run; the non-inplace chain also avoids renaming a
# column-subset frame in place.
data = data.rename(columns={'type': 'target', 'posts': 'text'})[['target', 'text']]

print(f'\nDataset contains {data.shape[0]} rows\n')
data.sample(10)


Dataset contains 106067 rows



Unnamed: 0,target,text
0,INTJ,know intj tool use interaction people excuse a...
1,INTJ,rap music ehh opp yeah know valid well know fa...
2,INTJ,preferably p hd low except wew lad video p min...
3,INTJ,drink like wish could drink red wine give head...
4,INTJ,space program ah bad deal meing freelance max ...


In [5]:
### OPTIONAL: DATA LIMITING
# Keep only the first DATA_LIMIT rows (set DATA_LIMIT to 0 to work on the full dataset).
if DATA_LIMIT > 0:
    # .copy() detaches the subset from the full frame, so later column
    # assignments operate on an independent frame rather than on a slice.
    data = data.iloc[:DATA_LIMIT].copy()

Unnamed: 0,target,text
0,INTJ,know intj tool use interaction people excuse a...
1,INTJ,rap music ehh opp yeah know valid well know fa...
2,INTJ,preferably p hd low except wew lad video p min...


In [6]:
# Display the working frame as it stands after the optional row limiting.
data

Unnamed: 0,target,text
0,INTJ,know intj tool use interaction people excuse a...
1,INTJ,rap music ehh opp yeah know valid well know fa...
2,INTJ,preferably p hd low except wew lad video p min...


---------------
### Spelling correction:

**Use this version if spelling correction runs BEFORE tokenizing (note: this ordering will interfere with the MBTI type removal step...)**

In [7]:
def correct_spelling(dataframe):
    """Spell-correct the raw (untokenized) strings in the 'text' column.

    Each entry is wrapped in a TextBlob, corrected, and stored back as a
    plain ``str``. The column is replaced on the frame that was passed in,
    and that same frame is returned.
    """
    # Whole-column assignment replaces the former `dataframe.text[i] = ...`:
    # that chained form paired a label lookup (`text[i]`) with a positional
    # lookup (`iloc[i]`), so it only worked on a 0..n-1 index and could end
    # up writing to a temporary copy.
    dataframe['text'] = dataframe['text'].apply(
        lambda raw: str(TextBlob(raw).correct())
    )
    return dataframe

In [8]:
# TODO (for later): count the number of corrections made.


------------
### Tokenizing:

In [9]:
# tokenizing:
# data['text'] = data['text'].apply(lambda row: nltk.word_tokenize(row))

In [10]:
def tokenize(dataframe):
    """Replace each 'text' entry with the result of `nltk.word_tokenize`.

    The column is overwritten on the frame passed in, which is also returned.
    """
    token_lists = dataframe['text'].apply(nltk.word_tokenize)
    dataframe['text'] = token_lists
    return dataframe

-----------------
### Removing key personality terms:

(This step must run after tokenizing, because it filters each row's list of tokens.)

In [11]:
def remove_MBTI_types(dataframe):
    """Drop the sixteen MBTI type labels from every token list in 'text'.

    Matching is case-insensitive, so tokens such as 'INTJ' or 'Enfp' are
    removed as well as their lower-case forms. The column is overwritten on
    the frame passed in, which is also returned.
    """
    # Terms to remove, held in a set for constant-time membership checks:
    masking = frozenset([
        'intj', 'intp', 'infj', 'infp',
        'istj', 'istp', 'isfj', 'isfp',
        'entj', 'entp', 'enfj', 'enfp',
        'estj', 'estp', 'esfj', 'esfp',
    ])
    # Update dataframe column with masked text:
    dataframe['text'] = dataframe['text'].apply(
        lambda tokens: [word for word in tokens if word.lower() not in masking]
    )
    return dataframe

---------------
### Spelling correction:

**Use this version if done AFTER tokenizing (which I think should be fine...)**

In [12]:
def correct_spelling(dataframe):
    """Spell-correct the 'text' column when it holds lists of tokens.

    Each row's tokens are joined with single spaces, run through TextBlob's
    corrector, and stored back as one corrected ``str`` (not a token list),
    so the column has to be tokenized again afterwards if token lists are
    needed. The column is overwritten on the frame passed in, which is also
    returned.
    """
    def _correct(entry):
        # An entry that is already a string is used as is; joining it would
        # insert a space between every character.
        sentence = entry if isinstance(entry, str) else ' '.join(entry)
        return str(TextBlob(sentence).correct())

    # Reads from the `dataframe` argument (the earlier code read the global
    # `data`) and assigns the whole column at once instead of using chained
    # `dataframe.text[i] = ...` writes.
    dataframe['text'] = dataframe['text'].apply(_correct)
    return dataframe

In [13]:
# TODO (for later): count the number of corrections made.


-------------
### Vectorization:

Uses TF-IDF; the `min_df` / `max_df` thresholds also drop very rare and very common words:

In [14]:
def vectorize(dataframe):
    """Append TF-IDF features computed from the 'text' column.

    Each 'text' entry is expected to be a list of tokens; the tokens are
    joined with spaces before being handed to the vectorizer. Vocabulary
    pruning is controlled by MIN_DOC_FREQ and MAX_DOC_FREQ.

    Returns a new DataFrame: the columns of `dataframe` followed by one
    column per retained vocabulary term.
    """
    # Instantiate vectorizer:
    vectorizer = TfidfVectorizer(min_df=MIN_DOC_FREQ, max_df=MAX_DOC_FREQ)

    # Fit & transform training data:
    X = vectorizer.fit_transform(dataframe['text'].apply(' '.join))

    # Re-cast vectorized data into DataFrame format. Reusing the input index
    # keeps rows aligned in the concat below even when the index is not 0..n-1.
    # NOTE: .toarray() densifies the sparse matrix, which can be large on the full dataset.
    X_df = pd.DataFrame(
        X.toarray(),
        columns=vectorizer.get_feature_names_out(),
        index=dataframe.index,
    )

    # Append vectorized output onto the input dataframe (the argument, not the global `data`).
    # NOTE: a vocabulary term named 'target' or 'text' would produce a duplicate column label.
    return pd.concat([dataframe, X_df], axis=1)


------

In [15]:
# Run the preprocessing pipeline on a copy: the helper functions overwrite the
# 'text' column of whatever frame they receive, so passing `data` itself would
# leave it modified and make this cell fail or drift when re-run.
test = tokenize(data.copy())      # raw strings -> token lists
test = remove_MBTI_types(test)    # drop MBTI type labels from the tokens
test = correct_spelling(test)     # token lists -> corrected strings
test = tokenize(test)             # corrected strings -> token lists
test = vectorize(test)            # append TF-IDF feature columns
test


Unnamed: 0,target,text,able,accept,accord,achieve,acknowledgment,act,active,activity,...,wolf,woman,wooden,work,world,write,yeah,yet,york,zone
0,INTJ,"[know, tool, use, interaction, people, excuse,...",0.0,0.0,0.041373,0.0,0.041373,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.041373,0.0,0.031465,0.0,0.0
1,INTJ,"[ran, music, eh, pp, yeah, know, valid, well, ...",0.092274,0.070177,0.0,0.0,0.0,0.070177,0.0,0.046137,...,0.046137,0.035088,0.046137,0.105265,0.046137,0.0,0.046137,0.0,0.046137,0.0
2,INTJ,"[preferably, p, he, low, except, we, lad, vide...",0.0,0.029941,0.0,0.039369,0.0,0.029941,0.039369,0.0,...,0.0,0.029941,0.0,0.179648,0.0,0.0,0.0,0.029941,0.0,0.039369
