In [None]:
import pandas as pd
import spacy
import re
import tldextract
# import tensorflow as tf
import numpy as np
import unicodedata
import os
# Environment flag set before any NLP work starts. Presumably consumed by the HuggingFace
# `tokenizers` package; nothing visible in this notebook reads it — TODO confirm it is still needed.
os.environ["TOKENIZERS_PARALLELISM"] = "true"

# spaCy English pipeline, reused below for the stop-word list and for lemmatization.
nlp = spacy.load('en_core_web_sm')

# Stop-word collection taken from the loaded pipeline's language defaults.
STOP_WORDS = nlp.Defaults.stop_words

nRowsRead = None  # row limit passed to pd.read_csv; None reads the whole file
postStrings = []  # filled later with [type, list_of_posts] pairs
typeStrings = []  # not referenced anywhere else in the visible notebook


In [None]:
# Load the raw dataset; the cells below use its 'type' and 'posts' columns.
df = pd.read_csv('./input/mbti_1.csv', delimiter=',', nrows=nRowsRead)
# Ad-hoc label stored as a plain Python attribute (not a column).
# NOTE(review): attributes set this way are generally not carried over to derived frames.
df.dataframeName = 'mbti_1.csv'

In [None]:
# Build one [type, list_of_posts] pair per row; the raw posts field is '|||'-delimited.
# The list is rebuilt from scratch (rather than appended to) so re-running this cell does not
# duplicate rows, and the loop variables avoid the name `type` so the builtin is not shadowed
# for the rest of the notebook.
postStrings = [
    [mbtiType, rawPosts.split('|||')]
    for rawPosts, mbtiType in zip(df['posts'], df['type'])
]

# Drop the first character of the first post and the last character of the last post
# (presumably the quote characters wrapping the raw field — TODO confirm against the CSV).
for entry in postStrings:
    personPosts = entry[1]
    personPosts[0] = personPosts[0][1:]
    personPosts[-1] = personPosts[-1][:-1]

In [None]:
# Split each person's raw '|||'-delimited posts string into a list of individual posts.
# Not idempotent: after one run the column holds lists, so running this cell a second time
# fails because lists have no .split().
df['posts'] = df['posts'].apply(lambda x: x.split('|||'))

# removing the quote characters that wrap the raw posts field
def removeParentheses(posts):
    """Strip the wrapping quote from the first and the last post of one person.

    The leading character of the first post and the trailing character of the
    last post are removed only when they are quote characters. Dropping them
    unconditionally truncated real text for rows that are not wrapped.
    NOTE(review): assumes the wrapper is a single or double quote — confirm
    against the raw CSV.

    Parameters:
        posts: list of post strings for one person; edited in place.

    Returns:
        The same list object. An empty list is returned unchanged.
    """
    wrappers = ("'", '"')
    if not posts:
        return posts
    if posts[0].startswith(wrappers):
        posts[0] = posts[0][1:]
    # Checked after the first edit so a one-element list is handled correctly.
    if posts[-1].endswith(wrappers):
        posts[-1] = posts[-1][:-1]
    return posts
# Run removeParentheses on each person's list of posts (the function edits the list in place
# and returns it), then preview the frame.
df['posts'] = df['posts'].apply(lambda x: removeParentheses(x))

df.head()

In [None]:
#label encoding type

def labelEncodeType(type):
  """Map a four-letter MBTI code to its temperament group number.

  0 = GUARDIAN, 1 = ARTISAN, 2 = IDEALIST, 3 = RATIONALIST.
  Any value that is not one of the sixteen listed codes is returned unchanged.
  """
  temperamentGroups = (
    (('ISTJ', 'ISFJ', 'ESTJ', 'ESFJ'), 0),  # GUARDIAN
    (('ISTP', 'ISFP', 'ESTP', 'ESFP'), 1),  # ARTISAN
    (('INFJ', 'INFP', 'ENFP', 'ENFJ'), 2),  # IDEALIST
    (('INTJ', 'INTP', 'ENTP', 'ENTJ'), 3),  # RATIONALIST
  )
  for codes, groupId in temperamentGroups:
    if type in codes:
      return groupId
  return type

# Replace the MBTI code in 'type' by its group number (0-3). Values that labelEncodeType
# does not recognise pass through unchanged, so re-running this cell leaves the column as is.
df['type'] = df['type'].apply(lambda x: labelEncodeType(x))
df.head()

In [None]:
# Number of rows per encoded group, to see how balanced the classes are.
df['type'].value_counts()

**PREPROCESSING PIPELINE**

In [None]:
# Matches web URLs (scheme://..., www.... or bare domain.tld/...), case-insensitively.
URL_REGEX = r"""(?i)\b((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))"""
# Captures the host part of a URL/e-mail-like string in group 1 (scheme, user@ and www. skipped).
# Written as a raw string: the previous non-raw literal relied on invalid escape sequences
# (`\/`, `\.`), which newer Python versions warn about. The pattern matches the same strings.
DOMAIN_REGEX = r"^(?:https?:\/\/)?(?:[^@\/\n]+@)?(?:www\.)?([^:\/?\n]+)"
# Shortest possible [...] group.
COMMENT_REGEX = r"\[.*?\]"
# A '...' with optional surrounding spaces at the very start or the very end of a string.
ELLIPSIS_REGEX = r"^(\ *\.{3}\ *)|(\ *\.{3}\ *)$"

def urlReplace(stringToReplace):
	"""Replacement callback for re.sub: turn a matched URL into 'domain.suffix'.

	Parameters:
		stringToReplace: the match object handed over by re.sub (its .group()
			is the matched URL text).

	Returns:
		The registrable domain and public suffix joined by a dot. Empty parts
		are skipped, so a host without a recognised suffix yields just the
		domain instead of a name with a dangling '.'.
	"""
	parts = tldextract.extract(stringToReplace.group())
	return '.'.join(piece for piece in (parts.domain, parts.suffix) if piece)


In [None]:
def containsOnlyUrlsOrNumbers(post):
	"""Tell whether a post holds anything besides URLs, integers and dot runs.

	NOTE: the return value is inverted relative to the name. It is 1 when at
	least one whitespace-separated token is neither a URL match, nor all
	digits, nor a run of three or more dots; it is 0 otherwise (including for
	an empty post). Callers use the result as a "keep this post" flag.

	The regex literals are raw strings so that `\\d` and `\\.` are not treated
	as (invalid) string escape sequences.
	"""
	for token in post.split():
		if re.search(URL_REGEX, token):
			continue
		if re.search(r'^\d+$', token):
			continue
		if re.search(r'^\.\.\.+$', token):
			continue
		# First token with real content decides the answer.
		return 1
	return 0

def containsComments(post):
	"""Tell whether a post holds anything besides bracketed [...] comments.

	NOTE: the return value is inverted relative to the name. The post is split
	on '] ' and the result is 1 as soon as one piece contains no [...] group,
	0 when every piece contains one. Callers use it as a "keep this post" flag.

	The final `return 0` now sits after the loop. Previously it was inside the
	loop body, so only the first piece was ever inspected.

	NOTE(review): the split consumes the '] ' separator, so a piece other than
	the last one only matches when it still contains a complete [...] group —
	confirm this is the intended notion of a "comment-only" post.
	"""
	for piece in post.split('] '):
		if not re.search(COMMENT_REGEX, piece):
			return 1
	return 0

def removeEllipsis(post):
	"""Delete whatever ELLIPSIS_REGEX matches in the post and return the rest."""
	ellipsisPattern = re.compile(ELLIPSIS_REGEX)
	return ellipsisPattern.sub('', post)

def removeWhitespace(post):
	"""Collapse every whitespace run to one space and trim both ends."""
	words = post.split()
	return " ".join(words)

def removeAccentedChars(post):
	"""Reduce the post to ASCII: decompose (NFKD), then drop every non-ASCII code point."""
	decomposed = unicodedata.normalize('NFKD', post)
	asciiBytes = decomposed.encode('ascii', 'ignore')
	return asciiBytes.decode('utf-8', 'ignore')

def formatPerson(person):
	"""Clean one person's posts and return them as a new list.

	Steps, in order: normalise whitespace, keep only posts for which both
	containsOnlyUrlsOrNumbers and containsComments are truthy, strip
	leading/trailing ellipses, strip accents / non-ASCII characters.
	"""
	normalized = [removeWhitespace(rawPost) for rawPost in person]
	kept = [
		post for post in normalized
		if containsOnlyUrlsOrNumbers(post) and containsComments(post)
	]
	return [removeAccentedChars(removeEllipsis(post)) for post in kept]

# Clean every person's list of posts with formatPerson; posts rejected by its filters are
# dropped, so a person's list can become shorter (possibly empty).
df['posts']= df['posts'].apply(lambda x:formatPerson(x))

df.head()

**ADDITIONAL FEATURES AND FINAL FORMATTING**

In [None]:
def toLower(post):
	"""Return the post converted to lower case."""
	return post.lower()

def countUpper(post):
  """Count the whitespace-separated words for which str.isupper() is true."""
  return sum(1 for word in post.split() if word.isupper())

def countWords(post):
  """Return the number of whitespace-separated words in the post."""
  words = post.split()
  return len(words)

def avgWordLen(post):
  """Return the mean length of the whitespace-separated words, or 0 for a post without words."""
  tokens = post.split()
  if not tokens:
    return 0
  totalChars = sum(len(token) for token in tokens)
  return totalChars / len(tokens)

def countUrls(post):
  """Return how many non-overlapping URL_REGEX matches the post contains."""
  return sum(1 for _match in re.finditer(URL_REGEX, post))

def removeStopWords(post):
	"""Drop every word that appears in STOP_WORDS (exact, case-sensitive match) and re-join with single spaces."""
	survivors = []
	for word in post.split():
		if word not in STOP_WORDS:
			survivors.append(word)
	return " ".join(survivors)

def removeSpecialChars(post): #ALSO REMOVES PUNCTUATION
	"""Keep only ASCII letters, digits, spaces and hyphens; everything else is deleted."""
	return re.sub('[^A-Z a-z 0-9-]+','',post)

def removeAlphaNumeric(post):
  """Delete words that mix ASCII letters and digits (e.g. 'abc123', '4you').

  A word is removed when it starts with letters followed by digits, or with
  digits followed by letters; pure words and pure numbers are left alone.
  The surrounding whitespace is kept.

  The letter class is spelled `[A-Za-z]`. The earlier `[A-z]` range also
  covered the characters between 'Z' and 'a' (`[`, `\\`, `]`, `^`, `_` and the
  backtick), so tokens such as 'a^1' were deleted by accident.
  """
  alphaNumRegex = r"\b([A-Za-z]+[0-9]+[A-Za-z0-9]*|[0-9]+[A-Za-z]+[A-Za-z0-9]*)\b"
  return re.sub(alphaNumRegex, '', post)


def removeNumbers(post):
  """Delete numbers from the post: float-like literals first, then every remaining digit run."""
  floatRegex = r"([1-9][0-9]*[eE][1-9][0-9]*|(([1-9][0-9]*\.)|(\.[0-9]+))([0-9]*)?([eE][\-\+]?[1-9][0-9]*)?)"
  numberRegex = r"[0-9]+"
  # Floats go first so their '.', exponent letter and sign disappear together with the digits.
  withoutFloats = re.sub(floatRegex, '', post)
  return re.sub(numberRegex, '', withoutFloats)

def shortenRepeatingChars(post):
  """Cut every run of four or more identical characters down to exactly three."""
  repeatingCharRegex = r"(.)\1{3,}" #a character followed by at least three copies of itself
  tripled = r"\1\1\1"
  return re.sub(repeatingCharRegex, tripled, post)

def removeSingleChars(post): #removes free single chars EXCEPT I or i
  """Drop every space-separated word that is one character long, except 'I' and 'i'.

  The remaining words are re-joined with the single spaces that separated
  them. The earlier regex replaced the match together with BOTH surrounding
  spaces by an empty string, which glued the neighbouring words together
  ("hello a world" -> "helloworld").

  Only the space character counts as a separator, as before. Empty pieces
  produced by consecutive spaces are kept, so such runs are preserved for a
  later whitespace clean-up.
  """
  keptWords = [
    word for word in post.split(' ')
    if len(word) != 1 or word in ('I', 'i')
  ]
  return ' '.join(keptWords)
  

def _sumOverPosts(metric):
  """Return, per person, the sum of metric(post) over that person's posts."""
  return df['posts'].apply(lambda posts: sum(metric(str(post)) for post in posts))

def _mapPosts(transform):
  """Replace every post of every person by transform(post)."""
  df['posts'] = df['posts'].apply(lambda posts: [transform(str(post)) for post in posts])

# --- features measured on the text as written (before URLs are rewritten) ---
df['upperCount'] = _sumOverPosts(countUpper)  #COUNT UPPERCASE WORDS
# Case-sensitive membership test, so capitalised stop words are not counted here.
df['stopWordCount'] = _sumOverPosts(lambda post: len([t for t in post.split() if t in STOP_WORDS]))  #COUNT STOPWORDS
df['urlCount'] = _sumOverPosts(countUrls)   #COUNT URLS

# Replace each URL by the short form returned by urlReplace.
_mapPosts(lambda post: re.sub(URL_REGEX, urlReplace, post))

# --- features measured after URL rewriting ---
df['wordCount'] = _sumOverPosts(countWords)         #COUNT WORDS
# Mean over posts of the per-post average word length. A person whose posts were all filtered
# out earlier gets 0 instead of raising ZeroDivisionError.
df['avgWordLen'] = df['posts'].apply(
  lambda posts: sum(avgWordLen(str(post)) for post in posts)/len(posts) if posts else 0
) #AVERAGE WORD LEN

# --- text cleaning; the order matters and is the same as before ---
cleaningSteps = (
  removeSpecialChars,     #remove special chars (including punctuation)
  removeStopWords,        #remove stopwords (runs before lowercasing, so the match is case-sensitive)
  removeAlphaNumeric,     #remove alphanumeric words
  removeNumbers,          #remove numbers (int and float)
  toLower,                #lowercase all
  shortenRepeatingChars,  #shorten all character repeats to 3 characters
  removeSingleChars,      #remove all single character words except I and i
  removeWhitespace,       #remove excess whitespaces again to clean up
)
for cleaningStep in cleaningSteps:
  _mapPosts(cleaningStep)
df.head()

In [None]:
# Checkpoint the cleaned, not yet lemmatized frame. 'posts' holds Python lists at this point,
# so the CSV stores them in their text representation.
# NOTE(review): presumably ./csv_files must already exist — confirm before a fresh run.
df.to_csv(path_or_buf="./csv_files/df_spojeni_unlemmatized.csv")


In [None]:
#do the nlp tokenization with spaCy

def convertToLemmas(posts, n_process=-1):
  """Lemmatize one person's posts with spaCy and merge them into a single string.

  Parameters:
    posts: iterable of post strings for one person.
    n_process: forwarded to nlp.pipe. The default -1 keeps the previous
      behaviour. Because this function is called once per DataFrame row,
      passing 1 avoids setting up parallel workers for every single row.

  Returns:
    One string: the lemmas of each post joined by spaces, and the posts
    joined by spaces as well.

  Tokens whose lemma is '-PRON-' or 'be' keep their original text instead of
  the lemma ('-PRON-' is presumably the pronoun placeholder of older spaCy
  versions — TODO confirm it is still produced by the installed version).
  """
  keepOriginalText = ('-PRON-', 'be')
  docs = nlp.pipe(posts, n_process=n_process, disable=["parser"])
  postList = []
  for doc in docs:
    lemmaList = []
    for token in doc:
      lemma = str(token.lemma_)
      lemmaList.append(token.text if lemma in keepOriginalText else lemma)
    postList.append(" ".join(lemmaList))
  return " ".join(postList)

# Lemmatize every person's posts. After this cell 'posts' holds ONE string per person
# (all posts joined by spaces) instead of a list of posts.
df['posts']= df['posts'].apply(lambda x:convertToLemmas(x))
df.head()

In [27]:
# Lemmatization is slow, so its result is cached on disk and reloaded from there.
# The cache is written only when it does not exist yet. Previously the write was commented
# out, so a fresh run without the file failed on the read below.
LEMMATIZED_CSV = "./csv_files/df_spojeni.csv"
if not os.path.exists(LEMMATIZED_CSV):
  df.to_csv(path_or_buf=LEMMATIZED_CSV)
df = pd.read_csv(filepath_or_buffer=LEMMATIZED_CSV)
# NOTE(review): row label 3559 is hard-coded; presumably a row whose 'posts' is unusable after
# the CSV round-trip — TODO confirm and replace by an explicit condition.
# errors="ignore" keeps the cell working when that label is not present (e.g. partial loads).
df=df.drop([3559], errors="ignore")

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

**Wordcloud before word removal**

In [None]:
# wc = WordCloud(width=800, height=400).generate(' '.join(df['posts'].tolist()))
# plt.imshow(wc)
# plt.axis('off')
# plt.show()

**common words removal**

In [None]:
# Token frequencies over the whole corpus (all persons' posts joined), most frequent first.
freq_comm = pd.Series(' '.join(df['posts'].tolist()).split()).value_counts()
# NOTE(review): despite its name, f20 holds only the single most frequent token ([:1]).
f20= freq_comm[:1]
f20

In [None]:
# Drop the most frequent token(s) from every person's text; `t not in f20` tests membership
# among the Series labels, i.e. the tokens themselves.
df['posts'] = df['posts'].apply(lambda x: ' '.join([t for t in x.split() if t not in f20]))

**rare words removal**

In [None]:
# Tokens that occur exactly once in the whole corpus, based on the frequencies computed
# before the common-word removal.
rare= freq_comm[freq_comm.values==1]
rare

In [None]:
# Drop every token that occurs only once in the corpus (membership is tested against the
# labels of `rare`, i.e. the tokens).
df['posts'] = df['posts'].apply(lambda x: ' '.join([t for t in x.split() if t not in rare]))

**Wordcloud after word removal**

In [None]:
# wc = WordCloud(width=800,height=400).generate(' '.join(df['posts'].apply(lambda x: str(x)).tolist()))
# plt.imshow(wc)
# plt.axis('off')
# plt.show()

In [28]:
# Balance the data set: draw the same number of persons from each of the four groups.
# The draw is seeded so that the sample, and with it every later result, is reproducible.
# The classifier and the train/test split are already seeded with 42.
SAMPLES_PER_CLASS = 450
SAMPLE_SEED = 42

# Raises ValueError if a group has fewer than SAMPLES_PER_CLASS rows (sampling without replacement).
df0 = df[df['type'] == 0].sample(SAMPLES_PER_CLASS, random_state=SAMPLE_SEED)
df1 = df[df['type'] == 1].sample(SAMPLES_PER_CLASS, random_state=SAMPLE_SEED)
df2 = df[df['type'] == 2].sample(SAMPLES_PER_CLASS, random_state=SAMPLE_SEED)
df3 = df[df['type'] == 3].sample(SAMPLES_PER_CLASS, random_state=SAMPLE_SEED)

# Rows keep their original index labels; the groups are stacked in order 0, 1, 2, 3.
dfr = pd.concat([df0,df1,df2,df3],)
dfr

Unnamed: 0.1,Unnamed: 0,type,posts,upperCount,stopWordCount,urlCount,wordCount,avgWordLen
3179,3179,0,aptain phillip happy nice I m currently read e...,10,178,0,391,4.534834
914,914,0,lol that s I figure mean I like way I do not s...,91,617,0,1382,4.214347
1361,1361,0,believe balance important you want able enjoyg...,67,688,2,1473,4.566697
1539,1539,0,do not know come I great love christmas grow p...,88,554,0,1279,4.978543
7559,7559,0,to honest I do not know I truly capable want m...,119,860,1,1783,4.223365
...,...,...,...,...,...,...,...,...
7200,7200,3,can not draw there s youtubecom is not obvious...,40,467,2,1061,4.666140
1848,1848,3,in time I leadership role I think I great lead...,117,689,0,1485,4.252287
5417,5417,3,yes immediately say just attention detail pape...,68,723,0,1656,4.706761
1136,1136,3,anti - anxiety medication day able function so...,29,168,0,438,4.569014


In [None]:
# dfr.to_csv(path_or_buf="./csv_files/dfr_spojeni.csv")


In [None]:
# Target labels (group number per person); keeps dfr's original, non-contiguous index.
y = dfr['type']

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Bag-of-words counts, one row per person in dfr, stored as unsigned 8-bit integers.
# NOTE(review): uint8 holds at most 255 — confirm no token occurs more often than that within
# one person's text, otherwise the stored count may be wrong. TODO verify.
cv = CountVectorizer(dtype=np.uint8)
text_counts = cv.fit_transform(dfr['posts'])

In [None]:
# (number of persons, vocabulary size). Note that this materialises the full dense array.
text_counts.toarray().shape

In [None]:
# Dense bag-of-words frame, one column per vocabulary token; it gets a fresh default index,
# not dfr's index.
dfr_bow = pd.DataFrame(text_counts.toarray(),columns=cv.get_feature_names_out())

In [None]:
# Preview of the first two bag-of-words rows.
dfr_bow.head(2)

In [None]:
# Vocabulary size = number of bag-of-words columns.
print("Num. of words: " + str(len(list(dfr_bow.columns))))

In [None]:
# Hand-crafted numeric features: everything except the text and the label.
featureFrame = dfr.drop(labels=['posts','type'],axis=1)
# Columns named "Unnamed: ..." are row indices that were saved into the CSV and read back;
# they carry no information about a person and must not be fed to the model as features.
indexArtifacts = [col for col in featureFrame.columns if str(col).startswith('Unnamed')]
# The index is reset to 0..n-1 so the frame lines up row-for-row with dfr_bow, which is
# joined to it by index later on.
dfr_features = featureFrame.drop(columns=indexArtifacts).reset_index(drop=True)

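**Note: `reset_index(drop=True)` gives `dfr_features` the same 0..n-1 index as `dfr_bow`, which the index-based `join` further down relies on. This alignment should still be double-checked.**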

In [None]:
# dfr_bow.to_csv(path_or_buf="./csv_files/dfr_bow.csv")
# dfr.to_csv(path_or_buf="./csv_files/dfr.csv")

## ML ALGORITHMS

In [None]:
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score,classification_report
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest
import seaborn as sns

In [None]:
# Shared random-forest model: 500 trees, fixed seed, n_jobs=-1. It is refitted in place by
# every call to classify().
rfc = RandomForestClassifier(random_state=42,n_estimators=500,n_jobs=-1)

In [None]:
%%time

def classify(X,y):
  """Train the shared random forest on a stratified 80/20 split and report its quality.

  Parameters:
    X: pandas DataFrame of features; its column names label the words in the
      feature-importance word cloud.
    y: class labels, aligned row-for-row (by position) with X.

  Side effects:
    Refits the global `rfc`, shows a row-normalised confusion-matrix heatmap
    and a feature-importance word cloud, and prints the accuracy and the
    classification report. Returns None.

  Changes compared to the earlier version:
    * The MinMaxScaler is fitted on the training rows only and then applied
      to the test rows, so the held-out data no longer influences preprocessing.
    * Both axes of the heatmap use the cell-centre tick positions (i + 0.5).
      Previously only the y axis did, which left the x labels on the cell edges.
  """
  # Split first so the held-out rows stay untouched by any fitting step.
  X_train, X_test, y_train, y_test = train_test_split(X,y,test_size= 0.2,random_state=42,stratify=y)

  scaler = MinMaxScaler(feature_range=(0,1))
  X_train = scaler.fit_transform(X_train)
  X_test = scaler.transform(X_test)

  rfc.fit(X_train,y_train)
  y_pred = rfc.predict(X_test)
  ac = accuracy_score(y_test,y_pred)

  ################ CONFUSION MATRIX ####################
  # Each row is divided by its total, so cells show the share of a true class
  # that was predicted as each class.
  matrix = confusion_matrix(y_test, y_pred)
  matrix = matrix.astype('float') / matrix.sum(axis=1)[:, np.newaxis]

  # Build the plot
  plt.figure(figsize=(16, 7))
  sns.set(font_scale=1.4)
  sns.heatmap(matrix, annot=True, annot_kws={'size': 10},
              cmap=plt.cm.Greens, linewidths=0.2)

  # Add labels to the plot; ticks are placed at the centre of each cell.
  class_names = ['GUARDIAN', 'ARTISAN', 'IDEALIST', 'RATIONALIST']
  tick_centres = np.arange(len(class_names)) + 0.5
  plt.xticks(tick_centres, class_names, rotation=25)
  plt.yticks(tick_centres, class_names, rotation=0)
  plt.xlabel('Predicted label')
  plt.ylabel('True label')
  plt.title('Confusion Matrix for Random Forest Model')
  plt.show()
  ################ CONFUSION MATRIX ####################

  print("accuracy: ",ac)
  print(classification_report(y_test, y_pred))

  ############### FEATURE IMPORTANCE ###################
  # Word cloud in which each feature name is sized by its importance in the fitted forest.
  features = scaler.get_feature_names_out()
  importance = dict(zip(features, rfc.feature_importances_))
  wc = WordCloud(width=800, height=400).generate_from_frequencies(importance)
  plt.imshow(wc)
  plt.axis('off')
  plt.show()
  ############### FEATURE IMPORTANCE ###################

# Train and evaluate on the hand-crafted features joined (by index) with the bag-of-words counts.
classify(dfr_features.join(dfr_bow),y)

### average accuracy on 100 forests

In [None]:
# def classify(X,y):
#   scaler = MinMaxScaler(feature_range=(0,1))
#   X=scaler.fit_transform(X)

#   X_train, X_test, y_train, y_test = train_test_split(X,y,test_size= 0.2,random_state=42,stratify=y)

#   rfc.fit(X_train,y_train)
#   y_pred = rfc.predict(X_test)
#   ac = accuracy_score(y_test,y_pred)

#   return ac

# test = []

# for i in range(0,100):
#   df0 = df[df['type'] == 0].sample(450)
#   df1 = df[df['type'] == 1].sample(450)
#   df2 = df[df['type'] == 2].sample(450)
#   df3 = df[df['type'] == 3].sample(450)
#   dfr = pd.concat([df0, df1, df2, df3],)
#   y = dfr['type']
#   cv = CountVectorizer(dtype='b')
#   text_counts = cv.fit_transform(dfr['posts'])
#   dfr_bow = pd.DataFrame(text_counts.toarray(),columns=cv.get_feature_names_out())
#   dfr_features = dfr.drop(labels=['posts','type'],axis=1).reset_index(drop=True)  

#   test.append(classify(dfr_features.join(dfr_bow), y))

# print(sum(test)/len(test))