# Natural Language Processing

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
import nltk 
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize, TreebankWordTokenizer
from nltk import SnowballStemmer, PorterStemmer, LancasterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [5]:
pip install emoji


The following command must be run outside of the IPython shell:

    $ pip install emoji

The Python package manager (pip) can only be used from outside of IPython.
Please reissue the `pip` command in a separate terminal or command prompt.

See the Python documentation for more information on how to install packages:

    https://docs.python.org/3/installing/


In [6]:
import os
os.getcwd()

'C:\\Users\\marangrang\\Downloads\\Personality Classification'

In [7]:
os.chdir("C:\\Users\\marangrang\\Downloads\\mbti-myers-briggs-personality-type-dataset")

We got our data from Kaggle InClass Prediction Competition, a Personality Profile Prediction.

In [8]:
# use Pandas to read in the csv files. The pd.read_csv() method creates a DataFrame from a csv file
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [9]:
#Look at the first 3 rows of our training data
train.head(3)

Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...


In [10]:
#Look at the first 3 rows of our testing data
test.head(3)

Unnamed: 0,id,posts
0,1,Musical Linguistic Logic & Naturalist (tied)|...
1,2,'You: hello :) Stranger: hii You: how are you ...
2,3,'What worked for me was knowing that limerence...


In [11]:
test_id = test['id']
test_id.head()

0    1
1    2
2    3
3    4
4    5
Name: id, dtype: int64

#  Data Cleaning

# 1.1 Introduction
<br>

We'll be walking through:
- Where we got our data - in this case, we'll get our data from kaggle competition MBTI
- Cleaning the data - we will walk through popular text pre-processing techniques
- Organizing the data - we will organize the cleaned data into a way that is easy to input into other algorithms
<br>

The output of this notebook will be clean, organized data in two standard text formats:
<br>

1. Corpus - a collection of text
2. Document-Term Matrix - word counts in matrix format

# Problem Statement
<br>
As a reminder, our goal is to look at posts from different sites and be able to identify personality type for each post.

# Getting The Data
<br>
Luckily, there are sites like kaggle that runs exciting competitions.

# Cleaning The Data
<br>
When dealing with numerical data, data cleaning often involves removing null values and duplicate data, dealing with outliers, etc. With text data, there some common data cleaning techniques which are also known as text-processing techniques. 
With text data, this cleaning process can go on forever. There's always an exception to every cleaning step. So, we're going to follow the MVP (minimium viable product)approuch - start simple and iterate. Here are a bunchs of things you can do to clean your data. We're going to excute the common cleaning steps here and the rest can be done at a later point to improve our results. 
<br>
#### Basic Text Pre-processing of text data:
- Make text all lower case
- Remove punctuation
- Stopwords removal
- Remove numerical values
- Frequent words removal
- Remove common non-sensical text(/n)
- Rare words removal
- Tokenize text
- Stemming
- Lemmatication
- Spelling correction
<br>
#### Advance Text Processing:
- N-grams
- Term frequency
- Inverse Document Frequency
- Term Frequency-Inverse Document Frequency(TF-IDF)
- Bag of words
- Sentiment Analysis
- Word Embedding

In [12]:
# Let's take a look at our data
next(iter(train.keys()))

'type'

In [13]:
datasets = train
datasets.set_index('type',inplace=True)
datasets.head()

Unnamed: 0_level_0,posts
type,Unnamed: 1_level_1
INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
ENTP,'I'm finding the lack of me in these posts ver...
INTP,'Good one _____ https://www.youtube.com/wat...
INTJ,"'Dear INTP, I enjoyed our conversation the o..."
ENTJ,'You're fired.|||That's another silly misconce...


In [14]:
# Let's take a look at a post for INFJ
datasets.posts.loc['INFJ']

type
INFJ    'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
INFJ    'No, I can't draw on my own nails (haha). Thos...
INFJ    I'm not sure, that's a good question. The dist...
INFJ    'One time my parents were fighting over my dad...
INFJ    'Joe santagato - ENTP|||ENFJ or  ENTP?   I'm n...
INFJ    'some of these both excite and calm me:  BUTTS...
INFJ    'I fully believe in the power of being a prote...
INFJ    'It is very annoying to be misinterpreted. Esp...
INFJ    'I think that that can absolutely be true of i...
INFJ    it could be pyroluria.. you know.. it is an on...
INFJ    'Sometimes I wonder that too.. the reason bein...
INFJ    http://www.youtube.com/watch?v=ipUdoUcNmKI  ht...
INFJ    'Trying not to feel totally worthless...  Why ...
INFJ    'Me: INFJ Mom: ISTJ Dad: ENFJ Sister: ISTJ|||I...
INFJ    'I would strongly recommend not taking shortcu...
INFJ    'than you may be an ambivert, somewhere in the...
INFJ    'Yeah i'm an a-hole too depending on who you a...
INFJ    '

In [15]:
# Apply a first round of text cleaning techniques
import re
import string

def cleaning_data(text):
    '''Remove web url'''
    text = re.sub(r'^https?:\/\/.*[\r\n]*', '', str(text), flags=re.MULTILINE)
    '''Make text lowercase'''
    text = text.lower()
    '''remove text in square brackets'''
    text = re.sub('\[.*?\]', '', text)
    '''remove punctuations'''
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    '''remove digits'''
    text = re.sub('\w*\d\w*', '', text)
    '''remove stop words'''
    STOPWORDS = set(stopwords.words('english'))
    text = ' '.join(word for word in text.split() if word not in STOPWORDS)
    return text

data_round1 = lambda x: cleaning_data(x)

In [16]:
# Lets take a look at the updated text
data_cleaning = pd.DataFrame(datasets.posts.apply(data_round1))
data_cleaning

Unnamed: 0_level_0,posts
type,Unnamed: 1_level_1
INFJ,intj moments sportscenter top ten plays pranks...
ENTP,im finding lack posts alarmingsex boring posit...
INTP,good one httpswwwyoutubecomwatchvfhigbolffgwof...
INTJ,dear intp enjoyed conversation day esoteric ga...
ENTJ,youre firedthats another silly misconception a...
INTJ,science perfect scientist claims scientific in...
INFJ,cant draw nails haha done professionals nails ...
INTJ,tend build collection things desktop use frequ...
INFJ,im sure thats good question distinction two de...
INTP,position actually let go person due various re...


In [17]:
def cleaning_data2(text):
    '''Get rid of some additional punctuations '''
    text = re.sub('\[''""...]', '', text)
    '''Get rid of non-sensical'''
    text = re.sub('\n', '', text)
    '''Remove single characters from the start'''
    text = re.sub(r'\^[a-zA-Z]\s+', ' ', text)
    '''Removing prefixed 'b'''
    text = re.sub(r'^b\s+', '', text)
    '''Correcting typos'''
    text = text.correct()
    '''Remove rare words'''
    freq = pd.Series(' '.join(data_cleaning['posts']).split()).value_counts()[-500:]
    # let's remove these words as their presence will be of any use
    freq = list(freq.index)
    text = data_cleaning['posts'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
    return text

data_round2 = lambda x: cleaning_data(x)

In [18]:
# Lets take a look at the updated text
data_cleaning = pd.DataFrame(data_cleaning.posts.apply(data_round2))
data_cleaning

Unnamed: 0_level_0,posts
type,Unnamed: 1_level_1
INFJ,intj moments sportscenter top ten plays pranks...
ENTP,im finding lack posts alarmingsex boring posit...
INTP,good one httpswwwyoutubecomwatchvfhigbolffgwof...
INTJ,dear intp enjoyed conversation day esoteric ga...
ENTJ,youre firedthats another silly misconception a...
INTJ,science perfect scientist claims scientific in...
INFJ,cant draw nails haha done professionals nails ...
INTJ,tend build collection things desktop use frequ...
INFJ,im sure thats good question distinction two de...
INTP,position actually let go person due various re...


In [19]:
def cleaning_data3(text):
    '''Get rid of all single characters'''
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', str(text))
    '''Substituting multiple spaces with single space'''
    text = re.sub(r'\s+', ' ', text, flags=re.I)
    '''Remove all the special characters'''
    text = re.sub(r'\W', ' ', str(text))
    text = re.sub(r'[^a-zA-Z0-9\s]', ' ', 'user','@[a-z0-9]+', str(text))
    '''Remove Frequent words'''
    freq = pd.Series(' '.join(data_cleaning['posts']).split()).value_counts()[:500]
    # let's remove these words as their presence will be of any use
    freq = list(freq.index)
    text = data_cleaning['posts'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
    return text
                  
data_round3 = lambda x: cleaning_data(x) 

In [20]:
# Lets take a look at the updated text
data_cleaning = pd.DataFrame(data_cleaning.posts.apply(data_round3))
data_cleaning

Unnamed: 0_level_0,posts
type,Unnamed: 1_level_1
INFJ,intj moments sportscenter top ten plays pranks...
ENTP,im finding lack posts alarmingsex boring posit...
INTP,good one httpswwwyoutubecomwatchvfhigbolffgwof...
INTJ,dear intp enjoyed conversation day esoteric ga...
ENTJ,youre firedthats another silly misconception a...
INTJ,science perfect scientist claims scientific in...
INFJ,cant draw nails haha done professionals nails ...
INTJ,tend build collection things desktop use frequ...
INFJ,im sure thats good question distinction two de...
INTP,position actually let go person due various re...


In [21]:
def token_maker(token):
    tokeniser = TreebankWordTokenizer()
    data_cleaning['tokens'] = pd.DataFrame(data_cleaning.posts.apply(tokeniser.tokenize))
    return token
data_round4 = lambda x: cleaning_data(x)

In [22]:
data_cleaning = token_maker(data_cleaning)
data_cleaning

Unnamed: 0_level_0,posts,tokens
type,Unnamed: 1_level_1,Unnamed: 2_level_1
INFJ,intj moments sportscenter top ten plays pranks...,"[intj, moments, sportscenter, top, ten, plays,..."
ENTP,im finding lack posts alarmingsex boring posit...,"[im, finding, lack, posts, alarmingsex, boring..."
INTP,good one httpswwwyoutubecomwatchvfhigbolffgwof...,"[good, one, httpswwwyoutubecomwatchvfhigbolffg..."
INTJ,dear intp enjoyed conversation day esoteric ga...,"[dear, intp, enjoyed, conversation, day, esote..."
ENTJ,youre firedthats another silly misconception a...,"[youre, firedthats, another, silly, misconcept..."
INTJ,science perfect scientist claims scientific in...,"[science, perfect, scientist, claims, scientif..."
INFJ,cant draw nails haha done professionals nails ...,"[cant, draw, nails, haha, done, professionals,..."
INTJ,tend build collection things desktop use frequ...,"[tend, build, collection, things, desktop, use..."
INFJ,im sure thats good question distinction two de...,"[im, sure, thats, good, question, distinction,..."
INTP,position actually let go person due various re...,"[position, actually, let, go, person, due, var..."


In [23]:
# find the stem of each word in words
def stemm_maker(words):
    stemm = SnowballStemmer('english')
    return [stemm.stem(word) for word in words] 

In [24]:
data_cleaning['stem'] = pd.DataFrame(data_cleaning.tokens.apply(stemm_maker))
data_cleaning

Unnamed: 0_level_0,posts,tokens,stem
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
INFJ,intj moments sportscenter top ten plays pranks...,"[intj, moments, sportscenter, top, ten, plays,...","[intj, moment, sportscent, top, ten, play, pra..."
ENTP,im finding lack posts alarmingsex boring posit...,"[im, finding, lack, posts, alarmingsex, boring...","[im, find, lack, post, alarmingsex, bore, posi..."
INTP,good one httpswwwyoutubecomwatchvfhigbolffgwof...,"[good, one, httpswwwyoutubecomwatchvfhigbolffg...","[good, one, httpswwwyoutubecomwatchvfhigbolffg..."
INTJ,dear intp enjoyed conversation day esoteric ga...,"[dear, intp, enjoyed, conversation, day, esote...","[dear, intp, enjoy, convers, day, esoter, gab,..."
ENTJ,youre firedthats another silly misconception a...,"[youre, firedthats, another, silly, misconcept...","[your, firedthat, anoth, silli, misconcept, ap..."
INTJ,science perfect scientist claims scientific in...,"[science, perfect, scientist, claims, scientif...","[scienc, perfect, scientist, claim, scientif, ..."
INFJ,cant draw nails haha done professionals nails ...,"[cant, draw, nails, haha, done, professionals,...","[cant, draw, nail, haha, done, profession, nai..."
INTJ,tend build collection things desktop use frequ...,"[tend, build, collection, things, desktop, use...","[tend, build, collect, thing, desktop, use, fr..."
INFJ,im sure thats good question distinction two de...,"[im, sure, thats, good, question, distinction,...","[im, sure, that, good, question, distinct, two..."
INTP,position actually let go person due various re...,"[position, actually, let, go, person, due, var...","[posit, actual, let, go, person, due, various,..."


In [25]:
def lemma_maker(words):
    ''' Lemmatization - returns the dictionary form of a word '''
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word) for word in words]

In [26]:
data_cleaning['lemma'] = pd.DataFrame(data_cleaning.tokens.apply(lemma_maker))
data_cleaning

Unnamed: 0_level_0,posts,tokens,stem,lemma
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
INFJ,intj moments sportscenter top ten plays pranks...,"[intj, moments, sportscenter, top, ten, plays,...","[intj, moment, sportscent, top, ten, play, pra...","[intj, moment, sportscenter, top, ten, play, p..."
ENTP,im finding lack posts alarmingsex boring posit...,"[im, finding, lack, posts, alarmingsex, boring...","[im, find, lack, post, alarmingsex, bore, posi...","[im, finding, lack, post, alarmingsex, boring,..."
INTP,good one httpswwwyoutubecomwatchvfhigbolffgwof...,"[good, one, httpswwwyoutubecomwatchvfhigbolffg...","[good, one, httpswwwyoutubecomwatchvfhigbolffg...","[good, one, httpswwwyoutubecomwatchvfhigbolffg..."
INTJ,dear intp enjoyed conversation day esoteric ga...,"[dear, intp, enjoyed, conversation, day, esote...","[dear, intp, enjoy, convers, day, esoter, gab,...","[dear, intp, enjoyed, conversation, day, esote..."
ENTJ,youre firedthats another silly misconception a...,"[youre, firedthats, another, silly, misconcept...","[your, firedthat, anoth, silli, misconcept, ap...","[youre, firedthats, another, silly, misconcept..."
INTJ,science perfect scientist claims scientific in...,"[science, perfect, scientist, claims, scientif...","[scienc, perfect, scientist, claim, scientif, ...","[science, perfect, scientist, claim, scientifi..."
INFJ,cant draw nails haha done professionals nails ...,"[cant, draw, nails, haha, done, professionals,...","[cant, draw, nail, haha, done, profession, nai...","[cant, draw, nail, haha, done, professional, n..."
INTJ,tend build collection things desktop use frequ...,"[tend, build, collection, things, desktop, use...","[tend, build, collect, thing, desktop, use, fr...","[tend, build, collection, thing, desktop, use,..."
INFJ,im sure thats good question distinction two de...,"[im, sure, thats, good, question, distinction,...","[im, sure, that, good, question, distinct, two...","[im, sure, thats, good, question, distinction,..."
INTP,position actually let go person due various re...,"[position, actually, let, go, person, due, var...","[posit, actual, let, go, person, due, various,...","[position, actually, let, go, person, due, var..."


In [27]:
data_cleaning['cleaned_lemma'] = data_cleaning['lemma'].apply(lambda x: ' '.join(x))

In [29]:
data_cleaning['type'] = data_cleaning.index
X_train = data_cleaning.groupby('type')['cleaned_lemma'].apply(list).reset_index()

data_cleaning['clean_post'] = X_train['cleaned_lemma'].apply(lambda x: ' '.join(x))

Defaulting to column, but this will raise an ambiguity error in a future version
  


In [30]:
def mbti_classes(df):
    mind = {"I": 0, "E": 1}
    energy = {"S": 0, "N": 1}
    nature = {"F": 0, "T": 1}
    tactics = {"P": 0, "J": 1}
    mbti = [mind, energy, nature, tactics]
    mbti_list = ['mind', 'energy', 'nature', 'tactics']
    for i in range(len(mbti)):
        data_cleaning[str(mbti_list[i])] = data_cleaning['type'].astype(str).str[i].map(mbti[i])
    return df

In [31]:
data_cleaning = mbti_classes(data_cleaning)
data_cleaning

Unnamed: 0_level_0,posts,tokens,stem,lemma,cleaned_lemma,type,clean_post,mind,energy,nature,tactics
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
INFJ,intj moments sportscenter top ten plays pranks...,"[intj, moments, sportscenter, top, ten, plays,...","[intj, moment, sportscent, top, ten, play, pra...","[intj, moment, sportscenter, top, ten, play, p...",intj moment sportscenter top ten play prankswh...,INFJ,,0,1,0,1
ENTP,im finding lack posts alarmingsex boring posit...,"[im, finding, lack, posts, alarmingsex, boring...","[im, find, lack, post, alarmingsex, bore, posi...","[im, finding, lack, post, alarmingsex, boring,...",im finding lack post alarmingsex boring positi...,ENTP,,1,1,1,0
INTP,good one httpswwwyoutubecomwatchvfhigbolffgwof...,"[good, one, httpswwwyoutubecomwatchvfhigbolffg...","[good, one, httpswwwyoutubecomwatchvfhigbolffg...","[good, one, httpswwwyoutubecomwatchvfhigbolffg...",good one httpswwwyoutubecomwatchvfhigbolffgwof...,INTP,,0,1,1,0
INTJ,dear intp enjoyed conversation day esoteric ga...,"[dear, intp, enjoyed, conversation, day, esote...","[dear, intp, enjoy, convers, day, esoter, gab,...","[dear, intp, enjoyed, conversation, day, esote...",dear intp enjoyed conversation day esoteric ga...,INTJ,,0,1,1,1
ENTJ,youre firedthats another silly misconception a...,"[youre, firedthats, another, silly, misconcept...","[your, firedthat, anoth, silli, misconcept, ap...","[youre, firedthats, another, silly, misconcept...",youre firedthats another silly misconception a...,ENTJ,,1,1,1,1
INTJ,science perfect scientist claims scientific in...,"[science, perfect, scientist, claims, scientif...","[scienc, perfect, scientist, claim, scientif, ...","[science, perfect, scientist, claim, scientifi...",science perfect scientist claim scientific inf...,INTJ,,0,1,1,1
INFJ,cant draw nails haha done professionals nails ...,"[cant, draw, nails, haha, done, professionals,...","[cant, draw, nail, haha, done, profession, nai...","[cant, draw, nail, haha, done, professional, n...",cant draw nail haha done professional nail yes...,INFJ,,0,1,0,1
INTJ,tend build collection things desktop use frequ...,"[tend, build, collection, things, desktop, use...","[tend, build, collect, thing, desktop, use, fr...","[tend, build, collection, thing, desktop, use,...",tend build collection thing desktop use freque...,INTJ,,0,1,1,1
INFJ,im sure thats good question distinction two de...,"[im, sure, thats, good, question, distinction,...","[im, sure, that, good, question, distinct, two...","[im, sure, thats, good, question, distinction,...",im sure thats good question distinction two de...,INFJ,,0,1,0,1
INTP,position actually let go person due various re...,"[position, actually, let, go, person, due, var...","[posit, actual, let, go, person, due, various,...","[position, actually, let, go, person, due, var...",position actually let go person due various re...,INTP,,0,1,1,0


# 1.5 Organizing The Data
<br>
The output of this notebook will be clean, organized data in two standard text formats:

1. Corpus - a collection of text
<br>
2. Document-Term Matrix - words counts in matrix format

### 1.5.1 Corpus

In [32]:
datasets.head()

Unnamed: 0_level_0,posts
type,Unnamed: 1_level_1
INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
ENTP,'I'm finding the lack of me in these posts ver...
INTP,'Good one _____ https://www.youtube.com/wat...
INTJ,"'Dear INTP, I enjoyed our conversation the o..."
ENTJ,'You're fired.|||That's another silly misconce...


In [33]:
# Let's pickle it for later use
datasets.to_pickle("corpus.pkl")

### 1.5.2 Document-Term Matrix
<br>
For many of the techniques we'll be using in future notebooks, the text must be tokenized, meaning broken down into smaller pieces. The most common tokenization technique is to break down text into words. We can do this using scikit-learn's CountVectorizer, where every row will represent a different document and every column will represent a different word.
<br>

In addition, with CountVectorizer, we can remove stop words. Stop words are common words that add no additional meaning to text such as 'a','the',etc.

In [34]:
filtered_sent = ['being', "haven't", 'they', 'but', 'my', 'through', 'up', 'once', "wasn't", 'over', 'his', 'all', 'the', 
'am', 'd', 'until', 'when', 'it', 'shan', 'on', 'him', 'she', 'yourselves', 'themselves', 'theirs', 'as', 'while', 'more', 
'have', 'been', 'just', "doesn't", 'aren', "hasn't", 'will', 'were', 'your', 'ain', 'doesn', 'this', 'these', 'with', 'o',
'here', 're', 'same', 'isn', 'had', 'above', 'whom', 'nor', 'by', 'herself', 'such', 'ourselves', 'where', 'any', 'mightn',
'are', 'you', 'its', 'won', 'yourself', 'needn', 'why', "didn't", 'ma', 'no', 'against', 'don', "she's", 'has', 'be', 'ours',
'only', 'yours', 'm', 'hadn', 'those', 'during', 'into', 'and', "that'll", 'is', "should've", "mustn't", 'under', 'mustn', 
'some','a', 'was', 'off', 'me', 'wasn', 'after', 'i', 'who', 'than', 'both', "you're", 'to', 'not', 'himself', 'he', 'again', 
'how', 'so', 'if', 'that', "hadn't", 'which', 'too', "you'll", "aren't", "it's", 'below', 'y', 'or', 'then', 'their', 'wouldn', 
've', 'can', "you've", "couldn't", 'there', 'hasn', 'having', 'most', "won't", 'each', 'hers', 'did', "shouldn't", 'an', 't', 
"weren't", 'between', 'out', 'down', 'own', 'do', 'itself', 'from', "don't", 'll', 'haven', 'her', "needn't", 'couldn', "you'd",
"mightn't", 'about','didn','for', 'few', 'other', 'does', 'before', "wouldn't", 'we', "isn't", 'shouldn', "shan't", 'of', 'at',
'im','like', 'think', 'people', 'dont', 'know', 'really', 'would', 'one', 'get','feel', 'love', 'time', 'ive', 'much', 'say', 
'things', 'want', 'see', 'way', 'someone', 'also', 'well', 'friends','further','doing', 'them', 'in', 'our','weren','something', 
'always', 'type', 'lot', 'could', 'make', 'go', 'thing', 'even', 'person', 'need','find', 'right', 'never', 'youre', 'thats',  
'pretty', 'though', 'sure', 'said', 'cant', 'first', 'actually', 'still','best', 'many', 'take', 'others', 'work', 'read', 's',
'around', 'thought', 'try', 'back', 'makes', 'better', 'trying', 'didnt', 'because','what','life', 'friend', 'every', 'got',
'agree', 'kind', 'mean', 'tell', 'post', 'two', 'probably', 'talk','anything', 'since', 'maybe', 'understand', 'seems', 'ill',  
'doesnt', 'thread', 'new', 'long', 'ever', 'years', 'hard', 'might','types', 'us', 'everyone','different', 'look', 'usually',  
'come', 'personality', 'guess', 'mind', 'relationship', 'bit', 'quite','great', 'made', 'thinking', 'everything', 'school', 
'help', 'yes', 'definitely', 'believe', 'point', 'used', 'infp', 'guys', 'tend','hes', 'use', 'intj', 'little','should','very',
'often', 'getting', 'interesting', 'last', 'talking', 'infj', 'times','another', 'mbti', 'enfp', 'world','question','part',
'feeling', 'fun', 'intp', 'enough', 'isnt', 'else', 'hate', 'lol', 'keep', 'myself','give','good','going','sometimes','id',
'anyone', 'nice', 'idea', 'sense','least','enfj', 'entj', 'entp', 'esfj', 'esfp', 'estj', 'estp','may', 'day','seem', 'bad',
'isfj', 'isfp', 'istj', 'istp','sound','thank', 'theres','now', 'enfjs','entjs', 'entps', 'esfjs', 'esfps', 'estjs', 'estps',
                'isfjs', 'isfps', 'istjs', 'istps']

In [35]:
# We are going to create a document-term matrix using CountVectorizer

vectorizer = CountVectorizer(max_features=1500, min_df=1, max_df=1.0, stop_words=filtered_sent)  
X = vectorizer.fit_transform(data_cleaning.cleaned_lemma)
data_x = pd.DataFrame(X.toarray(), columns = vectorizer.get_feature_names()) 
data_x.index = data_cleaning.index
del data_x.index.name
data_x

Unnamed: 0,ability,able,absolute,absolutely,abstract,accept,according,account,accurate,achieve,...,year,yesterday,yet,youd,youi,youll,young,younger,youtube,youve
INFJ,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
ENTP,0,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
INTP,2,1,0,2,0,0,0,0,1,0,...,4,0,0,0,0,0,0,0,0,0
INTJ,0,2,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,2
ENTJ,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
INTJ,0,1,0,0,0,0,1,0,0,0,...,2,0,0,0,0,0,0,0,0,0
INFJ,0,1,0,0,0,0,0,0,0,0,...,2,0,0,0,0,0,1,0,0,0
INTJ,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
INFJ,0,0,0,0,0,0,0,0,0,0,...,3,0,0,0,0,0,0,0,0,0
INTP,0,0,0,2,0,0,0,0,0,0,...,4,0,1,0,0,0,0,0,0,0


In [36]:
# Let's pickle it for later use
data_x.to_pickle("xdata.pkl")

In [40]:
# Let's also pickle the cleaned data (before we put it in documnet-term matrix)
import pickle

data_cleaning.to_pickle('data_cleaning.pkl')
pickle.dump(X, open("vectorizer.pkl", "wb"))