In [1]:
import numpy as np
import re 
import string
import pandas as pd
pd.options.display.max_colwidth = 100
import warnings
warnings.filterwarnings('ignore')

from tqdm import tqdm_notebook
from tqdm import tqdm
tqdm.pandas()

tqdm_notebook().pandas()

from textblob import TextBlob, Word
from nltk.tokenize import word_tokenize
from nltk.stem import wordnet
from nltk.stem import WordNetLemmatizer


from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [2]:
# !pip install vaderSentiment

In [3]:
df = pd.read_csv('mbti_1.csv')

In [6]:
df.head()

Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||http://41.media.tumblr.com/tumblr_lfouy03PMA1qa1ro...
1,ENTP,'I'm finding the lack of me in these posts very alarming.|||Sex can be boring if it's in the sam...
2,INTP,"'Good one _____ https://www.youtube.com/watch?v=fHiGbolFFGw|||Of course, to which I say I kno..."
3,INTJ,"'Dear INTP, I enjoyed our conversation the other day. Esoteric gabbing about the nature of th..."
4,ENTJ,'You're fired.|||That's another silly misconception. That approaching is logically is going to b...


In [5]:
df.shape

(8675, 2)

In [6]:
df.type.value_counts()

INFP    1832
INFJ    1470
INTP    1304
INTJ    1091
ENTP     685
ENFP     675
ISTP     337
ISFP     271
ENTJ     231
ISTJ     205
ENFJ     190
ISFJ     166
ESTP      89
ESFP      48
ESFJ      42
ESTJ      39
Name: type, dtype: int64

##### Data cleaning 

In [7]:
def clean_r1(text):
    text = text.lower()
    text = re.sub(r'http[^\s]*','',text)  # remove urls only
    return text

In [8]:
df.posts = df.posts.apply(clean_r1)

In [9]:
df.head()

Unnamed: 0,type,posts
0,INFJ,' and intj moments sportscenter not top ten plays pranks|||what has been the most life-cha...
1,ENTP,'i'm finding the lack of me in these posts very alarming.|||sex can be boring if it's in the sam...
2,INTP,"'good one _____ course, to which i say i know; that's my blessing and my curse.|||does being..."
3,INTJ,"'dear intp, i enjoyed our conversation the other day. esoteric gabbing about the nature of th..."
4,ENTJ,'you're fired.|||that's another silly misconception. that approaching is logically is going to b...


In [10]:
# check if urls are removed

for post in df.posts:
    if 'http' in post:
        print(post)        # return empty if none was found

##### Data cleaning - Word contraction

words that are contracted are expanded to its full form. eg 
- I'm : I am, 
- you're : you are

In [11]:
word_contraction = pd.read_csv('list of contraction.csv')      # list of contracted form to expanded form
word_contraction.head()

Unnamed: 0,contract,full
0,ain't,are not
1,aren't,are not
2,can't,can not
3,could've,could have
4,couldn't,could not


In [12]:
word_contraction_diction = dict(zip(word_contraction.contract,word_contraction.full))

In [13]:
def expand_word(text):
    for k,v in word_contraction_diction.items():
        if k in text:                                # 
            text = text.replace(k,v)
    return text

In [14]:
df.posts = df.posts.apply(expand_word)

In [15]:
df.head()

Unnamed: 0,type,posts
0,INFJ,' and intj moments sportscenter not top ten plays pranks|||what has been the most life-cha...
1,ENTP,'i am finding the lack of me in these posts very alarming.|||sex can be boring if it is in the s...
2,INTP,"'good one _____ course, to which i say i know; that is my blessing and my curse.|||does bein..."
3,INTJ,"'dear intp, i enjoyed our conversation the other day. esoteric gabbing about the nature of th..."
4,ENTJ,'you are fired.|||that is another silly misconception. that approaching is logically is going to...


Each entries of user's post is separated with a |||. Here we'll have a column to separate the posts in entries

In [16]:
df['separate_posts'] = df.posts.apply(lambda x : x.split('|||'))  # Develop a column for seperated posts
df.head()

Unnamed: 0,type,posts,separate_posts
0,INFJ,' and intj moments sportscenter not top ten plays pranks|||what has been the most life-cha...,"[' and intj moments sportscenter not top ten plays pranks, what has been the most life-cha..."
1,ENTP,'i am finding the lack of me in these posts very alarming.|||sex can be boring if it is in the s...,"['i am finding the lack of me in these posts very alarming., sex can be boring if it is in the s..."
2,INTP,"'good one _____ course, to which i say i know; that is my blessing and my curse.|||does bein...","['good one _____ course, to which i say i know; that is my blessing and my curse., does bein..."
3,INTJ,"'dear intp, i enjoyed our conversation the other day. esoteric gabbing about the nature of th...","['dear intp, i enjoyed our conversation the other day. esoteric gabbing about the nature of t..."
4,ENTJ,'you are fired.|||that is another silly misconception. that approaching is logically is going to...,"['you are fired., that is another silly misconception. that approaching is logically is going to..."


##### Expand the data for each user and entries
A user ID is assigned to their respective posts

In [17]:
# Generate a user ID from the index
df['user'] = df.index       

In [18]:
df.head()

Unnamed: 0,type,posts,separate_posts,user
0,INFJ,' and intj moments sportscenter not top ten plays pranks|||what has been the most life-cha...,"[' and intj moments sportscenter not top ten plays pranks, what has been the most life-cha...",0
1,ENTP,'i am finding the lack of me in these posts very alarming.|||sex can be boring if it is in the s...,"['i am finding the lack of me in these posts very alarming., sex can be boring if it is in the s...",1
2,INTP,"'good one _____ course, to which i say i know; that is my blessing and my curse.|||does bein...","['good one _____ course, to which i say i know; that is my blessing and my curse., does bein...",2
3,INTJ,"'dear intp, i enjoyed our conversation the other day. esoteric gabbing about the nature of th...","['dear intp, i enjoyed our conversation the other day. esoteric gabbing about the nature of t...",3
4,ENTJ,'you are fired.|||that is another silly misconception. that approaching is logically is going to...,"['you are fired., that is another silly misconception. that approaching is logically is going to...",4


##### Create a new dataframe to expand the entries of  user

In [19]:
# Entries are expanded first into columns 
df_expand = pd.DataFrame(df[['type','user','separate_posts']])
df_expand.head(3)

Unnamed: 0,type,user,separate_posts
0,INFJ,0,"[' and intj moments sportscenter not top ten plays pranks, what has been the most life-cha..."
1,ENTP,1,"['i am finding the lack of me in these posts very alarming., sex can be boring if it is in the s..."
2,INTP,2,"['good one _____ course, to which i say i know; that is my blessing and my curse., does bein..."


In [20]:
df_expand = pd.DataFrame(df.separate_posts.tolist(), index = df[['user','type']])
df_expand.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,79,80,81,82,83,84,85,86,87,88
"(0, INFJ)",' and intj moments sportscenter not top ten plays pranks,what has been the most life-changing experience in your life?,on repeat for most of today.,may the perc experience immerse you.,the last thing my infj friend posted on his facebook before committing suicide the next day. res...,84389 84390 ...,welcome and stuff.,game. set. match.,"prozac, wellbrutin, at least thirty minutes of moving your legs (and i do not mean moving them w...",basically come up with three items you have determined that each type (or whichever types you wa...,...,,,,,,,,,,
"(1, ENTP)",'i am finding the lack of me in these posts very alarming.,sex can be boring if it is in the same position often. for example me and my girlfriend are curr...,giving new meaning to 'game' theory.,hello *entp grin* that is all it takes. than we converse and they do most of the flirting while...,this + lack of balance and hand eye coordination.,"real iq test i score 127. internet iq tests are funny. i score 140s or higher. now, like the fo...","you know you are an entp when you vanish from a site for a year and a half, return, and find peo...","over think things sometimes. i go by the old sherlock holmes quote. perhaps, when a man has sp...",cheshirewolf.tumblr.com so is i :d,"400,000+ post",...,,,,,,,,,,
"(2, INTP)","'good one _____ course, to which i say i know; that is my blessing and my curse.",does being absolutely positive that you and your best friend could be an amazing couple count? i...,"no, i did not; thank you for a link!",so-called ti-si loop (and it can stem from any current topic/obsession) can be deadly. it is lik...,have you noticed how peculiar vegetation can be? all you have to do is look down at the grass: d...,the smiths – never had no one ever,i often find myself spotting faces on marble tiles/wood.,this 5 year-old sentence is an incredibly accurate and beautiful description.,i have not visited this website in the last 3 years. so whoever reads this (and maybe even remem...,"when you sit in your garden until 10:30 pm writing songs, and sing them (together with dozens of...",...,,,,,,,,,,


In [21]:
# transpose columns to rows

df_expand = df_expand.stack().reset_index()              
df_expand.drop('level_1',axis=1,inplace=True)
df_expand.rename(columns={0:'clean_posts'},inplace=True)
df_expand['user'] = [k for k,_ in df_expand['level_0']]
df_expand['type'] = [v for _,v in df_expand['level_0']]
df_expand.drop('level_0',axis=1,inplace=True)

In [22]:
df_expand.head(10)

Unnamed: 0,clean_posts,user,type
0,' and intj moments sportscenter not top ten plays pranks,0,INFJ
1,what has been the most life-changing experience in your life?,0,INFJ
2,on repeat for most of today.,0,INFJ
3,may the perc experience immerse you.,0,INFJ
4,the last thing my infj friend posted on his facebook before committing suicide the next day. res...,0,INFJ
5,84389 84390 ...,0,INFJ
6,welcome and stuff.,0,INFJ
7,game. set. match.,0,INFJ
8,"prozac, wellbrutin, at least thirty minutes of moving your legs (and i do not mean moving them w...",0,INFJ
9,basically come up with three items you have determined that each type (or whichever types you wa...,0,INFJ


In [23]:
df_expand.shape

(405263, 3)

##### Entries reduction
Entries that are too short are removed. Here we chose an arbitrary figure. Entries below or euqals to 5 words are removed

In [24]:
# Tokenize the entries for word count
tqdm_notebook().pandas()
df_expand['tokenized_words'] = df_expand.clean_posts.progress_apply(lambda x: word_tokenize(x))

# Lemmantize word
df_expand['tokenized_words'] = df_expand['tokenized_words'].progress_apply(lambda x : [WordNetLemmatizer().lemmatize(word) for word in x])

df_expand['tokenized'] = df_expand['tokenized_words'].progress_apply(lambda x: len(x))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




HBox(children=(IntProgress(value=0, max=405263), HTML(value='')))




HBox(children=(IntProgress(value=0, max=405263), HTML(value='')))




HBox(children=(IntProgress(value=0, max=405263), HTML(value='')))




In [25]:
# Lemmantized the words

temp = df_expand['tokenized_words'][0:10]
temp.apply(lambda x : [WordNetLemmatizer().lemmatize(i,'v') for i in x])


0                                       [', and, intj, moment, sportscenter, not, top, ten, play, prank]
1                                [what, ha, be, the, most, life-changing, experience, in, your, life, ?]
2                                                                  [on, repeat, for, most, of, today, .]
3                                                          [may, the, perc, experience, immerse, you, .]
4    [the, last, thing, my, infj, friend, post, on, his, facebook, before, commit, suicide, the, next...
5                                                                                    [84389, 84390, ...]
6                                                                               [welcome, and, stuff, .]
7                                                                            [game, ., set, ., match, .]
8    [prozac, ,, wellbrutin, ,, at, least, thirty, minute, of, move, your, leg, (, and, i, do, not, m...
9    [basically, come, up, with, three, item, you, have

In [26]:
# Keep entries that have more than 5 words. 

print('Total number of entries: {}'.format(df_expand.clean_posts.count())) # total number of entries before reduction
df_expand = df_expand[df_expand.tokenized > 5]
print('Total number of entries: {}'.format(df_expand.clean_posts.count())) # total number of entries after reduction
df_expand.reset_index()
df_expand.head(3)

Total number of entries: 405263
Total number of entries: 381421


Unnamed: 0,clean_posts,user,type,tokenized_words,tokenized
0,' and intj moments sportscenter not top ten plays pranks,0,INFJ,"[', and, intj, moment, sportscenter, not, top, ten, play, prank]",10
1,what has been the most life-changing experience in your life?,0,INFJ,"[what, ha, been, the, most, life-changing, experience, in, your, life, ?]",11
2,on repeat for most of today.,0,INFJ,"[on, repeat, for, most, of, today, .]",7


##### Data Cleaning - second round
We setup a dataframe of clean posts

In [27]:
# rejoin as a list for count vectorizing
df_clean_posts = df_expand.groupby(['user','type'])['clean_posts'].apply(lambda x: ' '.join(x))

In [28]:
df_clean_posts = df_clean_posts.reset_index()
df_clean_posts.head(3)

Unnamed: 0,user,type,clean_posts
0,0,INFJ,' and intj moments sportscenter not top ten plays pranks what has been the most life-chang...
1,1,ENTP,'i am finding the lack of me in these posts very alarming. sex can be boring if it is in the sam...
2,2,INTP,"'good one _____ course, to which i say i know; that is my blessing and my curse. does being ..."


In [29]:
# stop words. Remove reference to the personality types in posts
ptype_in_posts = ['infj', 'entp', 'intp', 'intj', 'entj', 'enfj', 'infp', 'enfp','isfp', 'istp', 'isfj', 'istj', 'estp', 
              'esfp', 'estj', 'esfj']

In [30]:
def clean_r2(text):
    text = re.sub(r'\d',' ',text)                                    # remove digits 
    text = re.sub('[%s]' % re.escape(string.punctuation),'',text)    # removel punctuation 
    for word in ptype_in_posts :                                     # remove persnoality type words in post
        if word in text:                          
            text = text.replace(word,'')
    text = re.sub('i','I',text)                                      # Textblob recognizes 'i' as NN(noun), hence, this is set to uppercase 'I' to be recognized as PRP
    return text

##### cleaned posts

In [31]:
df_clean_posts.clean_posts = df_clean_posts.clean_posts.apply(clean_r2)

##### Sentiment analysis with TextBlob

In [32]:
tqdm_notebook().pandas() # track duration

df_sentiment_textblob = pd.DataFrame({'type': df_clean_posts.type})
df_sentiment_textblob['polarity'] = df_clean_posts.clean_posts.progress_apply(lambda x : round(TextBlob(str(x)).sentiment.polarity,3))
df_sentiment_textblob['subjectivity'] = df_clean_posts.clean_posts.progress_apply(lambda x : round(TextBlob(str(x)).sentiment.subjectivity,3))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




HBox(children=(IntProgress(value=0, max=8675), HTML(value='')))




HBox(children=(IntProgress(value=0, max=8675), HTML(value='')))




In [33]:
df_sentiment_textblob.head()

Unnamed: 0,type,polarity,subjectivity
0,INFJ,0.164,0.496
1,ENTP,0.103,0.479
2,INTP,0.146,0.58
3,INTJ,0.111,0.542
4,ENTJ,0.069,0.503


##### pronouns count
The number of different pronouns used is considered. This is inspired by James W. Pennebaker's Secret Life of Pronoun
 https://www.secretlifeofpronouns.com/
- conjunction
- determiner
- verb auxillary
- personal_pronoun
- possessive_pronoun
- verb

Part of speech abbrevation https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html

•	CC	conjunction, coordinating and, or, but      
•	CD	cardinal number	five, three, 13%     
•	DT	determiner	the, a, these     
•	EX	existential there	there were six boys     
•	FW	foreign word	mais     
•	IN	conjunction, subordinating or preposition	of, on, before, unless    
•	JJ	adjective	nice, easy    
•	JJR	adjective, comparative	nicer, easier    
•	JJS	adjective, superlative	nicest, easiest     
•	LS	list item marker	     
•	MD	verb, modal auxillary	may, should     
•	NN	noun, singular or mass	tiger, chair, laughter         
•	NNS	noun, plural	tigers, chairs, insects     
•	NNP	noun, proper singular	Germany, God, Alice     
•	NNPS	noun, proper plural	we met two Christmases ago   

•	PDT	predeterminer	both his children     
•	POS	possessive ending	's    
•	PRP	pronoun, personal	me, you, it     
•	PRP$	pronoun, possessive	my, your, our     
•	RB	adverb	extremely, loudly, hard      
•	RBR	adverb, comparative	better     
•	RBS	adverb, superlative	best     
•	RP	adverb, particle	about, off, up     
•	SYM	symbol	%       

•	TO	infinitival to	what to do?         
•	UH	interjection	oh, oops, gosh         
•	VB	verb, base form	think     
•	VBZ	verb, 3rd person singular present	she thinks          
•	VBP	verb, non-3rd person singular present	I think      
•	VBD	verb, past tense	they thought      
•	VBN	verb, past participle	a sunken ship      
•	VBG	verb, gerund or present participle	thinking is fun      
•	WDT	wh-determiner	which, whatever, whichever      
•	WP	wh-pronoun, personal	what, who, whom      
•	WP$	wh-pronoun, possessive	whose, whosever      
•	WRB	wh-adverb	where, when     



In [34]:
df_pronoun = df_clean_posts.copy()

In [35]:
tqdm_notebook().pandas() # track duration

df_pronoun['conjunction'] = df_pronoun.clean_posts.progress_apply(lambda x : len([(v) for k,v in TextBlob(x).tags if v == 'CC']))
df_pronoun['determiner'] = df_pronoun.clean_posts.progress_apply(lambda x : len([(v) for k,v in TextBlob(x).tags if v == 'DT']))
df_pronoun['verb_aux'] = df_pronoun.clean_posts.progress_apply(lambda x : len([(v) for k,v in TextBlob(x).tags if v == 'MD']))
df_pronoun['personal_pron'] = df_pronoun.clean_posts.progress_apply(lambda x : len([(v) for k,v in TextBlob(x).tags if v == 'PRP']))
df_pronoun['possessive_pron'] = df_pronoun.clean_posts.progress_apply(lambda x : len([(v) for k,v in TextBlob(x).tags if v == 'UH']))
df_pronoun['verb'] = df_pronoun.clean_posts.progress_apply(lambda x : len([(v) for k,v in TextBlob(x).tags if v == 'VB']))


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




HBox(children=(IntProgress(value=0, max=8675), HTML(value='')))




HBox(children=(IntProgress(value=0, max=8675), HTML(value='')))




HBox(children=(IntProgress(value=0, max=8675), HTML(value='')))




HBox(children=(IntProgress(value=0, max=8675), HTML(value='')))




HBox(children=(IntProgress(value=0, max=8675), HTML(value='')))




HBox(children=(IntProgress(value=0, max=8675), HTML(value='')))




In [36]:
df_pronoun.drop('clean_posts',axis=1,inplace=True)

In [37]:
# sum all except verbs which is not a functional noun
df_pronoun['function_word_count'] = df_pronoun[['conjunction', 'determiner', 'verb_aux', 'personal_pron','possessive_pron',]].sum(axis=1)

In [38]:
df_pronoun.head()

Unnamed: 0,user,type,conjunction,determiner,verb_aux,personal_pron,possessive_pron,verb,function_word_count
0,0,INFJ,12,51,7,33,0,24,103
1,1,ENTP,41,88,10,164,1,63,304
2,2,INTP,26,42,19,83,0,56,170
3,3,INTJ,31,87,16,125,0,57,259
4,4,ENTJ,33,80,13,86,0,57,212


##### Count Vectorize

In [39]:
corpus = df_clean_posts.clean_posts.tolist()

In [40]:
cvn = CountVectorizer(max_features=1500,      # set max num of vocabs
                      stop_words = 'english',  # we remove the common words
                      analyzer="word",
                      min_df=0.1)           # ignore 10% of lowest vocab usage

model = cvn.fit_transform(corpus)

In [41]:
df_cvn = pd.DataFrame(model.todense(),columns=cvn.get_feature_names(), )
df_cvn.insert(0,'ptype',df_clean_posts.type.values)
df_cvn.head()


Unnamed: 0,ptype,ability,able,absolutely,accept,accurate,act,actual,actually,add,...,wrong,wrote,xd,yeah,year,years,yes,yesterday,young,younger
0,INFJ,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,ENTP,0,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,INTP,2,1,2,0,1,0,0,2,0,...,0,0,0,0,0,4,1,0,0,0
3,INTJ,0,2,0,0,0,0,1,2,0,...,0,0,0,1,0,0,0,0,0,0
4,ENTJ,0,0,0,0,0,0,0,1,0,...,1,0,1,0,1,0,1,0,0,0


##### TfIDF vectorize

In [42]:
tfidf = TfidfVectorizer(max_features=1500,       # set max num of vocabs
                        stop_words = 'english',  # we remove the common words
                        analyzer="word",           
                        min_df=0.05)             # Consider only words used 95% ot the time

model = tfidf.fit_transform(corpus)

df_tfidf = pd.DataFrame(model.todense(),columns=tfidf.get_feature_names())
df_tfidf.insert(0,'ptype',df_clean_posts.type.values)
df_tfidf.head()

Unnamed: 0,ptype,ability,able,absolute,absolutely,abstract,accept,according,account,accurate,...,yeah,year,years,yep,yes,yesterday,young,younger,youtube,yup
0,INFJ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.045766,0.098302,0.0,0.0,0.0,0.0,0.0,0.0
1,ENTP,0.0,0.038888,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.036724,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,INTP,0.1365,0.047305,0.0,0.114377,0.0,0.0,0.0,0.0,0.068651,...,0.0,0.0,0.144245,0.0,0.032915,0.0,0.0,0.0,0.0,0.089335
3,INTJ,0.0,0.078533,0.073945,0.0,0.0,0.0,0.0,0.0,0.0,...,0.032796,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ENTJ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.036683,0.0,0.0,0.027028,0.0,0.0,0.0,0.0,0.0


##### Create target 

In [43]:
df_target = pd.DataFrame(df_clean_posts.type.copy())

In [44]:
df_target['world_E_I'] = df_target.type.apply(lambda x: 1 if x[0] == 'E' else 0)
df_target['information_S_N'] = df_target.type.apply(lambda x: 1 if x[1] == 'S' else 0)
df_target['decision_T_F'] = df_target.type.apply(lambda x: 1 if x[2] == 'T' else 0)
df_target['structure_J_P'] = df_target.type.apply(lambda x: 1 if x[3] == 'J' else 0)

##### Word counts

In [45]:
df_expand.head()

Unnamed: 0,clean_posts,user,type,tokenized_words,tokenized
0,' and intj moments sportscenter not top ten plays pranks,0,INFJ,"[', and, intj, moment, sportscenter, not, top, ten, play, prank]",10
1,what has been the most life-changing experience in your life?,0,INFJ,"[what, ha, been, the, most, life-changing, experience, in, your, life, ?]",11
2,on repeat for most of today.,0,INFJ,"[on, repeat, for, most, of, today, .]",7
3,may the perc experience immerse you.,0,INFJ,"[may, the, perc, experience, immerse, you, .]",7
4,the last thing my infj friend posted on his facebook before committing suicide the next day. res...,0,INFJ,"[the, last, thing, my, infj, friend, posted, on, his, facebook, before, committing, suicide, the...",62


In [46]:
df_expand[df_expand.user == 0].tokenized.sum()

668

In [47]:
total_words_count_per_user = df_expand.groupby('user')['tokenized'].sum()
df_wordcounts = pd.DataFrame(total_words_count_per_user)
df_wordcounts.rename(columns={'tokenized':'word_count'},inplace=True)
df_wordcounts.head()

Unnamed: 0_level_0,word_count
user,Unnamed: 1_level_1
0,668
1,1419
2,1025
3,1326
4,1153


In [48]:
df_wordcounts['num_of_entries'] = df_expand.groupby('user')['clean_posts'].count()
df_wordcounts.head()

Unnamed: 0_level_0,word_count,num_of_entries
user,Unnamed: 1_level_1,Unnamed: 2_level_1
0,668,31
1,1419,43
2,1025,37
3,1326,44
4,1153,38


In [49]:
df_wordcounts['avg_words_post'] = np.floor(df_wordcounts.word_count/df_wordcounts.num_of_entries)

In [50]:
df_wordcounts.head()

Unnamed: 0_level_0,word_count,num_of_entries,avg_words_post
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,668,31,21.0
1,1419,43,33.0
2,1025,37,27.0
3,1326,44,30.0
4,1153,38,30.0


##### final cleaning to columns name

In [51]:
df_clean_posts.rename(columns={'type':'ptype'},inplace=True)
df_sentiment_textblob.rename(columns={'type':'ptype'},inplace=True)
df_pronoun.rename(columns={'type':'ptype'},inplace=True)
df_target.rename(columns={'type':'ptype'},inplace=True)
df_wordcounts.rename(columns={'type':'ptype'},inplace=True)

##### Putting dataframes together

In [52]:
df_merge_clean_posts = pd.concat([df_target,
                                  df_clean_posts['clean_posts'],
                                  df_sentiment_textblob[['polarity', 'subjectivity']],
                                  df_pronoun[['conjunction', 'determiner', 'verb_aux','personal_pron', 'possessive_pron', 'verb', 'function_word_count']],
                                  df_wordcounts],axis=1)

In [53]:
df_merge_clean_posts.head(3)

Unnamed: 0,ptype,world_E_I,information_S_N,decision_T_F,structure_J_P,clean_posts,polarity,subjectivity,conjunction,determiner,verb_aux,personal_pron,possessive_pron,verb,function_word_count,word_count,num_of_entries,avg_words_post
0,INFJ,0,0,0,1,and moments sportscenter not top ten plays pranks what has been the most lIfechangIng ex...,0.164,0.496,12,51,7,33,0,24,103,668,31,21.0
1,ENTP,1,0,1,0,I am fIndIng the lack of me In these posts very alarmIng sex can be borIng If It Is In the same ...,0.103,0.479,41,88,10,164,1,63,304,1419,43,33.0
2,INTP,0,0,1,0,good one course to whIch I say I know that Is my blessIng and my curse does beIng absolutel...,0.146,0.58,26,42,19,83,0,56,170,1025,37,27.0


In [54]:
df_merge_tfidf = pd.concat([df_target,df_tfidf.drop('ptype',axis=1)],axis=1)

In [60]:
df_merge_cvn = pd.concat([df_target,df_cvn.drop('ptype',axis=1)],axis=1)

##### Export to CSV

In [61]:
df_merge_clean_posts.to_csv('df_merge_clean_posts.csv')
df_merge_tfidf.to_csv('df_merge_tfidf.csv')
df_merge_cvn.to_csv('df_merge_cvn.csv')

##### Sentiment analysis with Vader

In [56]:
# analyser = SentimentIntensityAnalyzer()
# df_sentiment_vader = df_clean_posts.copy()

In [57]:
# tqdm_notebook().pandas()
# df_sentiment_vader['sentiment'] = df_sentiment_vader.clean_posts.progress_apply(lambda x : analyser.polarity_scores(str(x)))

# df_sentiment_vader['neg'] = df_sentiment_vader.sentiment.progress_apply(lambda x : x['neg'])
# df_sentiment_vader['neu'] = df_sentiment_vader.sentiment.progress_apply(lambda x : x['neu'])
# df_sentiment_vader['pos'] = df_sentiment_vader.sentiment.progress_apply(lambda x : x['pos'])
# df_sentiment_vader['compound'] = df_sentiment_vader.sentiment.progress_apply(lambda x : x['compound'])

# df_sentiment_vader.drop(['sentiment','clean_posts'],axis=1,inplace=True)