In [None]:
import pandas as pd
import spacy
import re
import tldextract
import tensorflow as tf
import numpy as np

# Load the small English spaCy pipeline; later cells reuse it through `nlp`.
nlp = spacy.load('en_core_web_sm')

# Stop-word collection taken from the loaded pipeline's language defaults.
STOP_WORDS = nlp.Defaults.stop_words

nRowsRead = 500  # specify 'None' if want to read whole file
# Notebook-level accumulators; in the cells visible here only postStrings is filled.
postStrings = []
typeStrings = []


In [None]:
# Read at most nRowsRead rows of the MBTI dataset (nRowsRead = None reads the whole file).
df = pd.read_csv('./input/mbti_1.csv', delimiter=',', nrows=nRowsRead)
# Tag the frame with its source file name as an ad-hoc Python attribute.
# NOTE(review): such attributes are presumably not carried over to derived frames — confirm before relying on it.
df.dataframeName = 'mbti_1.csv'

In [None]:
# Build one [type, posts] pair per person, where posts is the raw text split
# on the '|||' separator. The list is rebuilt from scratch (not appended to),
# so re-running this cell does not duplicate entries, and no loop variable
# named `type` is left behind to shadow the builtin in later cells.
postStrings = [
    [personType, rawPosts.split('|||')]
    for rawPosts, personType in zip(df['posts'], df['type'])
]

# Drop the first character of each person's first post and the last character
# of their last post (presumably the quote characters wrapping the raw text).
for entry in postStrings:
    personPosts = entry[1]
    personPosts[0] = personPosts[0][1:]
    personPosts[-1] = personPosts[-1][:-1]

In [None]:
#splitting posts into list of posts
# Each raw string is cut on the '|||' separator, so the column holds lists afterwards.
# NOTE(review): re-running this cell on the already-split column would fail, since lists have no .split.
df['posts'] = df['posts'].apply(lambda x: x.split('|||'))

# removing the excess wrapping character at each end of a person's posts
def removeParentheses(posts):
    """Strip the wrapping characters from a person's list of posts, in place.

    Removes the first character of the first post and then the last character
    of the last post (for a one-element list both trims hit the same string).

    Args:
        posts: list of post strings; it is modified in place.

    Returns:
        The same list object, so the function can be used with
        ``Series.apply``. An empty list is returned unchanged instead of
        raising ``IndexError``.
    """
    if len(posts) == 0:
        # Nothing to trim; indexing posts[0] would raise on an empty list.
        return posts
    posts[0] = posts[0][1:]
    posts[-1] = posts[-1][:-1]
    return posts
# Trim the wrapping characters from every person's post list, then preview.
df['posts'] = df['posts'].apply(removeParentheses)

df.head()

In [None]:
#label encoding type

def labelEncodeType(type):
    """Map a four-letter MBTI code to its temperament group number.

    0 = GUARDIAN, 1 = ARTISAN, 2 = IDEALIST, 3 = RATIONALIST.

    Args:
        type: the MBTI code, compared case-sensitively.

    Returns:
        The group number, or ``type`` itself unchanged when it is not one of
        the sixteen listed codes.
    """
    temperaments = (
        ('ISTJ', 'ISFJ', 'ESTJ', 'ESFJ'),  # 0: GUARDIAN
        ('ISTP', 'ISFP', 'ESTP', 'ESFP'),  # 1: ARTISAN
        ('INFJ', 'INFP', 'ENFP', 'ENFJ'),  # 2: IDEALIST
        ('INTJ', 'INTP', 'ENTP', 'ENTJ'),  # 3: RATIONALIST
    )
    for code, members in enumerate(temperaments):
        if type in members:
            return code
    return type

# Replace each MBTI code in the 'type' column with its temperament number.
df['type'] = df['type'].apply(labelEncodeType)
df.head()

In [None]:
# Class balance check: number of rows per encoded temperament label.
df['type'].value_counts()

In [None]:
# Matches URLs embedded in free text (scheme-prefixed, www-prefixed, or bare domain followed by '/').
URL_REGEX = r"""(?i)\b((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))"""
# Captures the host part of a URL in group 1, skipping scheme, userinfo and 'www.'.
# Written as a raw string: '\/' and '\.' are invalid escapes in a plain literal.
DOMAIN_REGEX = r"^(?:https?:\/\/)?(?:[^@\/\n]+@)?(?:www\.)?([^:\/?\n]+)"
# Matches a bracketed span such as '[quote]' (shortest match).
COMMENT_REGEX = r"\[.*?\]"
# Matches '...' with optional surrounding spaces at the very start or very end of a string.
ELLIPSIS_REGEX = r"^(\ *\.{3}\ *)|(\ *\.{3}\ *)$"

def urlReplace(stringToReplace):
    """Collapse a matched URL to '<domain>.<suffix>'.

    Used as the replacement callback for ``re.sub(URL_REGEX, ...)``.

    Args:
        stringToReplace: regex match object whose ``group()`` is the URL text.

    Returns:
        The domain and suffix reported by ``tldextract`` joined with a dot.
        When no suffix is reported (presumably IP addresses or bare host
        names — verify against tldextract), only the domain part is returned
        so the result does not end in a dangling dot.
    """
    parts = tldextract.extract(stringToReplace.group())
    if not parts.suffix:
        return parts.domain
    return parts.domain + '.' + parts.suffix


**CONSIDER TRYING WITHOUT STOPWORD REMOVAL**

In [None]:
def containsOnlyUrlsOrNumbers(post):
    """Tell whether a post has any token besides URLs, numbers and dot runs.

    Note the inverted naming: the result is truthy when the post does NOT
    consist solely of URLs, digit-only tokens or runs of three or more dots,
    which lets ``formatPerson`` use it directly as a keep-filter.

    Args:
        post: a single post string.

    Returns:
        1 if at least one whitespace-separated token is something else,
        0 otherwise (including for an empty or whitespace-only post).
    """
    for token in post.split():
        if re.search(URL_REGEX, token):
            continue
        # Raw strings: '\d' and '\.' are invalid escapes in plain literals.
        if re.search(r'^\d+$', token) or re.search(r'^\.\.\.+$', token):
            continue
        return 1
    return 0

def containsComments(post):
    """Tell whether a post has any text besides bracketed ``[...]`` comments.

    Mirrors ``containsOnlyUrlsOrNumbers``: the result is truthy when the post
    should be kept. The whole post is examined, not just its first
    ``'] '``-separated piece, and bracketed spans are matched intact.

    Args:
        post: a single post string.

    Returns:
        0 if the post is made up solely of ``[...]`` spans (plus whitespace),
        1 otherwise, including when it has no bracketed span at all.
    """
    if not re.search(COMMENT_REGEX, post):
        return 1
    # Remove every bracketed span and check whether any real text is left.
    leftover = re.sub(COMMENT_REGEX, '', post)
    return 1 if leftover.strip() else 0

def removeEllipsis(post):
    """Return ``post`` with every ELLIPSIS_REGEX match replaced by an empty string."""
    ellipsis_pattern = re.compile(ELLIPSIS_REGEX)
    return ellipsis_pattern.sub('', post)

def removeWhitespace(post):
    """Collapse every run of whitespace in ``post`` to one space and trim both ends."""
    words = post.split()
    return " ".join(words)

def removeStopWords(post):
    """Drop stop words from a whitespace-tokenised post.

    Tokens are lower-cased before the lookup, so capitalised stop words such
    as "The" or "I" are removed as well as their lower-case forms.

    Args:
        post: a single post string.

    Returns:
        The remaining tokens joined by single spaces (possibly an empty string).
    """
    kept = [token for token in post.split() if token.lower() not in STOP_WORDS]
    return " ".join(kept)

def formatPerson(person):
    """Clean one person's posts and return them as a new list.

    Each post is whitespace-normalised, kept only if both
    ``containsOnlyUrlsOrNumbers`` and ``containsComments`` are truthy for it,
    then has ellipses removed, URLs replaced through ``urlReplace`` and stop
    words removed, in that order.

    Args:
        person: iterable of raw post strings.

    Returns:
        List of cleaned post strings, in the original order.
    """
    cleaned = []
    for raw_post in person:
        post = removeWhitespace(raw_post)
        if not (containsOnlyUrlsOrNumbers(post) and containsComments(post)):
            continue
        post = removeEllipsis(post)
        post = re.sub(URL_REGEX, urlReplace, post)
        cleaned.append(removeStopWords(post))
    return cleaned

# Run the cleaning pipeline over every person's list of posts, then preview.
df['posts'] = df['posts'].apply(formatPerson)

df.head()

In [None]:
#do the nlp tokenization with spaCy

def convertToLemmas(posts):
    """Lemmatise each post with the shared spaCy pipeline.

    Args:
        posts: iterable of post strings, passed to ``nlp.pipe``.

    Returns:
        One string per post: its tokens' lemmas joined by single spaces.
        Tokens whose lemma is '-PRON-' or 'be' keep their original text.
    """
    def _render(token):
        lemma = str(token.lemma_)
        # Keep the surface form where the lemma is '-PRON-' or 'be'.
        return token.text if lemma in ('-PRON-', 'be') else lemma

    return [
        " ".join(_render(token) for token in doc)
        for doc in nlp.pipe(posts, n_process=-1)
    ]

#for the dataframe: lemmatise every person's list of posts
df['posts'] = df['posts'].apply(convertToLemmas)
df.head()

In [None]:
#SPLITTING POSTS INTO SEPARATE ROWS
# A person whose posts were all filtered out has an empty list, which
# explode() turns into a NaN row; those rows are dropped so that the later
# string operations (' '.join, .split) never meet a float.
# The index is rebuilt so every post row has its own label.
df = df.explode('posts').dropna(subset=['posts']).reset_index(drop=True)
df

In [None]:
# Rows per temperament label after the posts were exploded into separate rows.
df['type'].value_counts()

In [None]:
#common words removal
# Count how often each whitespace-separated token occurs across all posts
# (most frequent first) and keep the 20 most frequent ones.
freq_comm = pd.Series(
    [token for post in df['posts'] for token in post.split()]
).value_counts()
f20 = freq_comm.iloc[:20]

In [None]:
# Strip the 20 most frequent tokens (the index labels of f20) from every post.
common_tokens = set(f20.index)
df['posts'] = df['posts'].apply(
    lambda post: ' '.join(token for token in post.split() if token not in common_tokens)
)

In [None]:
#rare words removal
# Tokens whose corpus-wide count in freq_comm is exactly one.
rare = freq_comm[freq_comm == 1]

In [None]:
# Strip every token that occurs only once (the index labels of rare) from every post.
rare_tokens = set(rare.index)
df['posts'] = df['posts'].apply(
    lambda post: ' '.join(token for token in post.split() if token not in rare_tokens)
)

In [None]:
# Balanced sample: the same number of posts from each temperament class.
SAMPLES_PER_CLASS = 10000  # upper bound per class
SAMPLE_SEED = 42           # fixed seed so the sample is reproducible

# sample(n) raises ValueError when a class has fewer than n rows (likely when
# only nRowsRead rows are loaded), so n is capped at the smallest class size.
classSizes = df['type'].value_counts().reindex([0, 1, 2, 3], fill_value=0)
nPerClass = min(SAMPLES_PER_CLASS, int(classSizes.min()))
if nPerClass == 0:
    raise ValueError("at least one temperament class has no posts to sample from")

df0 = df[df['type'] == 0].sample(nPerClass, random_state=SAMPLE_SEED)
df1 = df[df['type'] == 1].sample(nPerClass, random_state=SAMPLE_SEED)
df2 = df[df['type'] == 2].sample(nPerClass, random_state=SAMPLE_SEED)
df3 = df[df['type'] == 3].sample(nPerClass, random_state=SAMPLE_SEED)

dfr = pd.concat([df0,df1,df2,df3])
dfr

In [None]:
# Persist both frames as pickles: the targets are .pkl files and to_pickle
# accepts the `path` keyword (DataFrame.to_csv does not, and raises TypeError).
df.to_pickle(path="./picke_files/df.pkl")
dfr.to_pickle(path="./picke_files/dfr.pkl")

In [None]:
# Target vector: the encoded temperament label of each sampled post.
y = dfr['type']

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Bag-of-words features: fit the vectorizer on the sampled posts and transform them in one step.
cv = CountVectorizer()
text_counts = cv.fit_transform(dfr['posts'])

In [None]:
# (n_posts, n_vocabulary_terms); read straight from the matrix, without
# building a dense copy just to inspect its shape.
text_counts.shape

In [None]:
# Densify the count matrix into a DataFrame with one column per feature name from the vectorizer.
# NOTE(review): toarray() materialises the full dense matrix — presumably large on the whole dataset; confirm memory budget.
dfr_bow = pd.DataFrame(text_counts.toarray(),columns=cv.get_feature_names_out())

In [None]:
# Preview the first two feature rows.
dfr_bow.head(2)

In [None]:
# Persist the feature frame as a pickle: the target is a .pkl file and
# to_pickle accepts the `path` keyword (DataFrame.to_csv does not).
dfr_bow.to_pickle(path="./picke_files/dfr_bow.pkl")


## ML ALGORITHMS

In [None]:
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Random forest used by classify(): 200 trees, n_jobs=-1, fixed seed.
rfc = RandomForestClassifier(
    n_estimators=200,
    n_jobs=-1,
    random_state=42,
)

In [None]:
%%time

def classify(X,y):
    """Fit the notebook-level random forest on a stratified 80/20 split and print test accuracy.

    Args:
        X: feature matrix, one row per sample.
        y: class labels aligned with the rows of ``X``; also used for stratification.

    Returns:
        None. The test-set accuracy is printed.

    Side effects:
        Refits the notebook-level ``rfc`` estimator.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y)

    # Fit the scaler on the training rows only, then apply it to the test
    # rows, so the held-out data does not leak into preprocessing.
    scaler = MinMaxScaler(feature_range=(0, 1))
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    rfc.fit(X_train, y_train)
    y_pred = rfc.predict(X_test)
    ac = accuracy_score(y_test, y_pred)
    print("accuracy: ", ac)


# Train and score the random forest on the bag-of-words features against the temperament labels.
classify(dfr_bow,y)