In [54]:
import pandas as pd
import spacy
import re
import tldextract
import numpy as np
import unicodedata
import os
os.environ["TOKENIZERS_PARALLELISM"] = "true"

nlp = spacy.load('en_core_web_md')

STOP_WORDS = nlp.Defaults.stop_words

nRowsRead = None  # specify 'None' if want to read whole file
postStrings = []
typeStrings = []


In [55]:
df = pd.read_csv('./input/mbti_1.csv', delimiter=',', nrows=nRowsRead)
df.dataframeName = 'mbti_1.csv'

In [56]:
# making array of array of posts
for person,type in zip(df['posts'],df['type']):
    postStrings.append([type,person.split('|||')])

# removing excess single parentheses

for i in range(0,len(postStrings)):
    postStrings[i][1][0] = postStrings[i][1][0][1:]
    postStrings[i][1][-1] = postStrings[i][1][-1][:-1]

In [57]:
#splitting posts into list of posts
df['posts'] = df['posts'].apply(lambda x: x.split('|||'))

# removing excess single parentheses
def removeParentheses(posts):
    posts[0] = posts[0][1:]
    posts[-1] = posts[-1][:-1]
    return posts
df['posts'] = df['posts'].apply(lambda x: removeParentheses(x))

df.head()

Unnamed: 0,type,posts
0,INFJ,"[http://www.youtube.com/watch?v=qsXHcwe3krw, h..."
1,ENTP,[I'm finding the lack of me in these posts ver...
2,INTP,[Good one _____ https://www.youtube.com/wat...
3,INTJ,"[Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,"[You're fired., That's another silly misconcep..."


In [58]:
#label encoding type

def labelEncodeType(type):
  if type == 'ISTJ' or type == 'ISFJ' or type == 'ESTJ' or type == 'ESFJ':
    type = 0 #GUARDIAN
  elif type == 'ISTP' or type == 'ISFP' or type == 'ESTP' or type == 'ESFP':
    type = 1 #ARTISAN
  elif type == 'INFJ' or type == 'INFP' or type == 'ENFP' or type == 'ENFJ':
    type = 2  # IDEALIST
  elif type == 'INTJ' or type == 'INTP' or type == 'ENTP' or type == 'ENTJ':
    type = 3  # RATIONALIST
  return type

df['type'] = df['type'].apply(lambda x: labelEncodeType(x))
df.head()

Unnamed: 0,type,posts
0,2,"[http://www.youtube.com/watch?v=qsXHcwe3krw, h..."
1,3,[I'm finding the lack of me in these posts ver...
2,3,[Good one _____ https://www.youtube.com/wat...
3,3,"[Dear INTP, I enjoyed our conversation the o..."
4,3,"[You're fired., That's another silly misconcep..."


In [59]:
df['type'].value_counts()

2    4167
3    3311
1     745
0     452
Name: type, dtype: int64

**PREPROCESSING PIPELINE**

In [60]:
URL_REGEX = r"""(?i)\b((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))"""
DOMAIN_REGEX = "^(?:https?:\/\/)?(?:[^@\/\n]+@)?(?:www\.)?([^:\/?\n]+)"
COMMENT_REGEX = r"\[.*?\]"
ELLIPSIS_REGEX = r"^(\ *\.{3}\ *)|(\ *\.{3}\ *)$"

def urlReplace(stringToReplace):
	domain = tldextract.extract(stringToReplace.group())
	return domain.domain + '.' + domain.suffix


In [61]:
def containsOnlyUrlsOrNumbers(post):
	splits = post.split()
	for str in splits:
		if(not re.search(URL_REGEX,str) and not re.search('^\d+$',str) and not re.search('^\.\.\.+$',str)):
			return 1
	return 0

def containsComments(post):
	splits = post.split('] ')
	for str in splits:
		if(not re.search(COMMENT_REGEX,str)):
			return 1
		return 0

def removeEllipsis(post):
	return re.sub(ELLIPSIS_REGEX,'',post)

def removeWhitespace(post):
	post = " ".join(post.split())
	return post

def removeAccentedChars(post):
	post = unicodedata.normalize('NFKD',post).encode('ascii','ignore').decode('utf-8','ignore')
	return post

def formatPerson(person):
	person = list(map(lambda x: removeWhitespace(x), person))
	person = [post for post in person if (containsOnlyUrlsOrNumbers(post) and containsComments(post))]
	person = list(map(lambda x: removeEllipsis(x), person))
	person = list(map(lambda x: removeAccentedChars(x), person))
	return person

df['posts']= df['posts'].apply(lambda x:formatPerson(x))

df.head()

Unnamed: 0,type,posts
0,2,[enfp and intj moments youtube.com sportscente...
1,3,[I'm finding the lack of me in these posts ver...
2,3,[Good one _____ https://www.youtube.com/watch?...
3,3,"[Dear INTP, I enjoyed our conversation the oth..."
4,3,"[You're fired., That's another silly misconcep..."


**ADDITIONAL FEATURES AND FINAL FORMATTING**

In [62]:
def toLower(post):
	post = post.lower()
	return post

def countUpper(post):
  upperCount = sum(map(str.isupper, post.split()))
  return upperCount

def countWords(post):
  count = len(post.split())
  return count

def avgWordLen(post):
  words = post.split()
  wordLen = 0
  for word in words:
    wordLen = wordLen + len(word)
  if len(words)>0:
    return wordLen/len(words)
  else: 
    return 0

def countUrls(post):
  return len(re.findall(URL_REGEX,post))

def removeStopWords(post):
	post = " ".join([t for t in post.split() if t not in STOP_WORDS])
	return post

def removeSpecialChars(post): #ALSO REMOVES PUNCTUATION
	post = re.sub('[^A-Z a-z 0-9-]+','',post)
	return post

def removeAlphaNumeric(post):
  alphaNumRegex = r"\b([A-z]+[0-9]+[A-z0-9]*|[0-9]+[A-z]+[A-z0-9]*)\b"
  post = re.sub(alphaNumRegex, '', post)
  return post


def removeNumbers(post):
  numberRegex = r"[0-9]+"
  floatRegex = r"([1-9][0-9]*[eE][1-9][0-9]*|(([1-9][0-9]*\.)|(\.[0-9]+))([0-9]*)?([eE][\-\+]?[1-9][0-9]*)?)"
  post = re.sub(floatRegex, '', post)
  post = re.sub(numberRegex, '', post)
  return post

def shortenRepeatingChars(post):
  repeatingCharRegex = r"(.)\1{3,}" #finds character groups of more than 3 consecutive chars
  post = re.sub(repeatingCharRegex,r"\1\1\1",post)
  return post

def removeSingleChars(post): #removes free single chars EXCEPT I or i
  singleCharRegex = r"(^| )[^Ii](( )[^Ii])*( |$)"
  post = re.sub(singleCharRegex,'',post)
  return post
  

df['upperCount'] = df['posts'].apply(lambda x:sum(map(lambda y:countUpper(str(y)),x)))  #COUNT UPPERCASE WORDS
df['stopWordCount'] = df['posts'].apply(lambda x:sum(map(lambda y: len([t for t in y.split() if t in STOP_WORDS]),x)))  #COUNT STOPWORDS
df['urlCount'] = df['posts'].apply(lambda x:sum(map(lambda y:countUrls(str(y)),x)))   #COUNT URLS

df['posts'] = df['posts'].apply(lambda x:list(map(lambda y: re.sub(URL_REGEX, urlReplace, y), x))) #remove urls

df['wordCount'] = df['posts'].apply(lambda x:sum(map(lambda y:countWords(str(y)),x)))         #COUNT WORDS
df['avgWordLen'] = df['posts'].apply(lambda x:sum(map(lambda y:avgWordLen(str(y)),x))/len(x)) #AVERAGE WORD LEN

df['posts'] = df['posts'].apply(lambda x:list(map(lambda y: removeSpecialChars(str(y)), x))) #remove special chars (including punctuation)
df['posts'] = df['posts'].apply(lambda x:list(map(lambda y:removeStopWords(str(y)),x))) #remove stopwords
df['posts'] = df['posts'].apply(lambda x:list(map(lambda y: removeAlphaNumeric(str(y)), x)))  #remove alphanumeric words
df['posts'] = df['posts'].apply(lambda x:list(map(lambda y: removeNumbers(str(y)), x))) #remove numbers (int and float)
df['posts'] = df['posts'].apply(lambda x:list(map(lambda y:toLower(str(y)),x))) #lowercase all
df['posts'] = df['posts'].apply(lambda x:list(map(lambda y: shortenRepeatingChars(str(y)), x))) #shorten all character repeats to 3 characters
df['posts'] = df['posts'].apply(lambda x:list(map(lambda y: removeSingleChars(str(y)), x))) #remove all single character words except I and i
df['posts'] = df['posts'].apply(lambda x:list(map(lambda y: removeWhitespace(str(y)), x))) #remove excess whitespaces again to clean up
df.head()

Unnamed: 0,type,posts,upperCount,stopWordCount,urlCount,wordCount,avgWordLen
0,2,[enfp intj moments youtubecom sportscenter pla...,17,255,8,585,4.935312
1,3,"[im finding lack posts alarming, sex boring po...",91,525,4,1214,4.356184
2,3,"[good youtubecom, of course i i know thats ble...",37,354,3,873,4.789478
3,3,[dear intp i enjoyed conversation day esoteric...,69,490,2,1108,4.481933
4,3,"[youre fired, thats silly misconception that a...",51,447,3,1013,4.512231


In [63]:
#do the nlp tokenization with spaCy

def convertToLemmas(posts):
  docs = nlp.pipe(posts, n_process=-1)
  lemmaList = []
  postList = []
  for doc in docs:
    for token in doc:
      lemma = str(token.lemma_)
      if lemma == '-PRON-' or lemma == 'be':
        lemma = token.text
      lemmaList.append(lemma)
    postList.append(" ".join(lemmaList))
    lemmaList= []
  return postList
  # return " ".join(postList)

#for the dataframe:
df['posts']= df['posts'].apply(lambda x:convertToLemmas(x))
df.head()

Unnamed: 0,type,posts,upperCount,stopWordCount,urlCount,wordCount,avgWordLen
0,2,[enfp intj moment youtubecom sportscenter play...,17,255,8,585,4.935312
1,3,"[I m find lack post alarming, sex boring posit...",91,525,4,1214,4.356184
2,3,"[good youtubecom, of course I I know that s bl...",37,354,3,873,4.789478
3,3,[dear intp I enjoy conversation day esoteric g...,69,490,2,1108,4.481933
4,3,"[you re fire, that s silly misconception that ...",51,447,3,1013,4.512231


In [64]:
df = df.explode("posts")

In [65]:
df.to_pickle("./pickle_files/df_odvojeni.pkl")
# df.to_pickle("./pickle_files/df_spojeni.pkl")

In [66]:
df['type'].value_counts()

2    197249
3    155773
1     34619
0     21308
Name: type, dtype: int64

**common words removal**

In [67]:
freq_comm = pd.Series(' '.join(df['posts'].tolist()).split()).value_counts()
f20= freq_comm[:1]
f20

I    583690
dtype: int64

In [68]:
df['posts'] = df['posts'].apply(lambda x: ' '.join([t for t in x.split() if t not in f20]))

**rare words removal**

In [69]:
rare= freq_comm[freq_comm.values==1]
rare

yourselfbe              1
outbe                   1
leviintj                1
sensinginstantly        1
ymiristp                1
                       ..
spiritus                1
therandomsciencegirl    1
etcpresent              1
disagreeingi            1
marsall                 1
Length: 107343, dtype: int64

In [70]:
df['posts'] = df['posts'].apply(lambda x: ' '.join([t for t in x.split() if t not in rare]))

In [71]:
df.to_pickle("./pickle_files/df_odvojeni_wordsRemoved.pkl")
# df.to_pickle("./pickle_files/df_spojeni_wordsRemoved.pkl")

In [72]:
df['posts']

0       enfp intj moment youtubecom play youtubecom prank
0                      what life - change experience life
0                   youtubecom youtubecom on repeat today
0                             may perc experience immerse
0       the thing infj friend post facebook commit sui...
                              ...                        
8674    go close facebook month want able message fami...
8674              second collection it fitting mood right
8674    see agree actually think time watch movie begi...
8674    ok watch underworld awakening good film compar...
8674                    want turn emotion hide world need
Name: posts, Length: 408949, dtype: object

In [76]:
df.reset_index().drop('index',axis=1)

Unnamed: 0,type,posts,upperCount,stopWordCount,urlCount,wordCount,avgWordLen
0,2,enfp intj moment youtubecom play youtubecom prank,17,255,8,585,4.935312
1,2,what life - change experience life,17,255,8,585,4.935312
2,2,youtubecom youtubecom on repeat today,17,255,8,585,4.935312
3,2,may perc experience immerse,17,255,8,585,4.935312
4,2,the thing infj friend post facebook commit sui...,17,255,8,585,4.935312
...,...,...,...,...,...,...,...
408944,2,go close facebook month want able message fami...,104,659,2,1409,4.137663
408945,2,second collection it fitting mood right,104,659,2,1409,4.137663
408946,2,see agree actually think time watch movie begi...,104,659,2,1409,4.137663
408947,2,ok watch underworld awakening good film compar...,104,659,2,1409,4.137663
