In [1]:
import numpy as np
import plotly.graph_objects as go
import plotly
import pandas as pd
import plotly.offline as py
from scipy.stats import multivariate_normal as g
py.init_notebook_mode(connected=True)

def plot_data(classes, count):
    """Draw an inline bar chart of per-class frequencies.

    Args:
        classes: labels placed on the (categorical) x axis.
        count: bar heights, one value per entry of `classes`.
    """
    figure_layout = {
        "title": "Barchart with frequencies",
        "xaxis": {"title": "Classes", "type": 'category'},
        "xaxis_tickangle": 45,
    }
    frequency_bars = go.Bar(x=classes, y=count)

    # `py` is the module-level alias of plotly.offline
    py.iplot(go.Figure(data=[frequency_bars], layout=figure_layout))

def plot_traintest(classes, train, test):
    """Draw an inline bar chart comparing train and test class frequencies.

    Args:
        classes: labels placed on the (categorical) x axis.
        train: per-class counts of the training set, aligned with `classes`.
        test: per-class counts of the test set, aligned with `classes`.
    """
    figure_layout = {
        "title": "Barchart with frequencies",
        "xaxis": {"title": "Classes", "type": 'category'},
        "xaxis_tickangle": 45,
    }
    # one trace per split, in the same order as before: train first, then test
    traces = [
        go.Bar(name=split_name, x=classes, y=split_counts)
        for split_name, split_counts in (("train", train), ("test", test))
    ]

    # `py` is the module-level alias of plotly.offline
    py.iplot(go.Figure(data=traces, layout=figure_layout))

# OLD
# def separate_classes(csv_obj,classno):
#     # Separate features for each class in a dictionary, i.e star type
#     classes = {}    #classes contain every feature for each star type

#     for i in range(classno):
#         classes["".join(str(i))] = []
    
#     for row in csv_obj:
#         featdict = {}
#         for feature in row:
#             if feature != "Star type":
#                 featdict[feature] = row[feature]
#         classes[row['Star type']].append(featdict)
    
#     return classes

# def frequencies(clss):
#     # Find frequencies of feature vectors for each class
#     freqs = {}
#     for c in classes:
#        freqs[c] = len(classes[c])
    
#     return freqs

# def pdf(feature, type):
#     # Prepare plots for pdf of each class for selected feature
#     plots = []
#     color = ['(255, 0, 0','(0, 255, 0','(0, 0, 255','(231, 255, 92','(188, 0, 255','(0, 211, 255']

#     for cl in classes:
#         featlist = None
#         if type == 'int': featlist = [int(x[feature]) for x in classes[cl]]
#         elif type == 'float': featlist = [float(x[feature]) for x in classes[cl]]
#         else: featlist = [x[feature] for x in classes[cl]]

#         x1 = np.array(featlist)
#         xspace = np.linspace(x1.min(), x1.max())
#         yspace1 = g.pdf(xspace, mean=x1.mean(),cov=x1.std())

#         cr = color.pop()
#         plots.append(go.Scatter(x=xspace, y=yspace1, name="p(x|y=)".join(cl), marker_color='rgba'+cr+', .9)'))
    
#     return plots

# with open("../6_class_csv.csv","r") as csvf:
#     csvreader = csv.DictReader(csvf)    #read csv as a dictionary
    
#     classes = separate_classes(csvreader,6)
#     freqs = frequencies(classes)
    
#     # Perform GDA
#     # plots = pdf('Temperature (K)', 'int')
#     # layout = go.Layout(title='Star samples GE for p(x|y)', xaxis=dict(title='x'), yaxis=dict(title='p(x|y)'))
#     # fig = go.Figure(data=plots, layout=layout)
#     # plotly.offline.iplot(fig)

#     x1 = np.array([int(x['Temperature (K)']) for x in classes['0']])
#     xspace = np.linspace(x1.min(), x1.max())
#     yspace1 = g.pdf(xspace, mean=x1.mean(),cov=x1.std())

#     fig = go.Figure(data=[
#         go.Scatter(x=xspace, y=yspace1, name="p(x|y=)".join('0'), marker_color='rgba(44, 193, 93, .9)')
#     ])

#     plotly.offline.iplot(fig)

## Data inspection

In [2]:
print('Reading csv file...\n')
df = pd.read_csv('../mbti_full_pull.csv') # read data from csv
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() only appended a stray "None" line to the output.
df.info()

Reading csv file...

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1794016 entries, 0 to 1794015
Data columns (total 3 columns):
 #   Column             Dtype 
---  ------             ----- 
 0   author_flair_text  object
 1   body               object
 2   subreddit          object
dtypes: object(3)
memory usage: 41.1+ MB
None


In [3]:
# take a look at the content; a bare last expression uses the notebook's rich
# table display instead of the truncated plain-text repr that print() gives
df.head()

  author_flair_text                                               body  \
0              INTJ  Knowing you're in INTJ is a tool for you to us...   
1              INTJ           You are truly an enlightened mastermind.   
2         INFJ, 26F  You should :) it will help if you have a down ...   
3              INTP  I watch a bit of everything (including hentai)...   
4              INTJ  I don't know if I would count this as a pet pe...   

  subreddit  
0      intj  
1      intj  
2      infj  
3      INTP  
4      intj  


We can see that there are 3 columns: the author's description (`author_flair_text`), the main post text (`body`)  
and the subreddit (`subreddit`), which can be used to extract the labels.

## Preprocess data

Check if null values exist

In [4]:
# number of missing values per column (isna is the same check as isnull)
print(df.isna().sum())

author_flair_text     0
body                 55
subreddit             0
dtype: int64


Drop null values

In [5]:
# drop every row that has a missing value in any column
df = df.dropna(axis=0, how='any')
print('Size after removing nulls: {:d}'.format(df.shape[0]))

Size after removing nulls: 1793961


Clean the data so that every row has a single, unambiguous class and its text.  
Classes are extracted from the `subreddit` column, which in some cases contains free-form text rather than a bare type code. Only rows whose subreddit, once upper-cased, is exactly one of the 16 class values (e.g. "ISTJ") are kept, so rows with ambiguous classes are removed.

In [6]:
classes = ['ISTJ','ISFJ','INFJ','INTJ','ISTP','ISFP','INFP','INTP','ESTP','ESFP','ENFP','ENTP','ESTJ','ESFJ','ENFJ','ENTJ'] # all the mentioned classes

print("Size before cleaning: {:d}".format(len(df)))

# Keep only the rows whose subreddit, upper-cased, is exactly one of the 16
# type codes. This is done with vectorised string ops: the previous version
# called df.iloc[i] for every row, which builds a new Series per access and is
# extremely slow on a frame of this size.
print('Cleaning labels...')
labels_upper = df['subreddit'].str.upper()
is_known_class = labels_upper.isin(classes)

# Build the cleaned frame from plain lists so it gets a fresh 0..n-1 index,
# exactly as the list-based construction did before.
data = pd.DataFrame({
    'text': df.loc[is_known_class, 'body'].tolist(),
    'class': labels_upper[is_known_class].tolist(),
})

# the raw frame and the intermediates are large; release them
del df
del labels_upper
del is_known_class
print('Labels cleaned!')
print("Size after cleaning: {:d}".format(len(data)))

Size before cleaning: 1793961
Cleaning labels...
Labels cleaned!
Size after cleaning: 1350826


## Observe distribution

In [7]:
# count each class once and reuse the result for both labels and heights
class_counts = data['class'].value_counts()
data_classes = class_counts.index.tolist()
freqs = class_counts.tolist()

plot_data(data_classes, freqs)

We can see that there are many classes and that their distribution is highly imbalanced.  
According to [this](https://www.myersbriggs.org/my-mbti-personality-type/mbti-basics/type-tables.htm) table, personalities can be classified as introvert or extrovert.  
Since these are two opposing classes (i.e. an extrovert is not an introvert), the problem can be converted into a binary classification problem.  
So we can simplify the problem by merging all the classes into one binary class (Introvert, for example) that takes the value 1 if true and 0 otherwise (Extrovert).  
Note that the cell below only keeps the first letter of each type (`'I'` or `'E'`) as the label; the integer encoding is assigned later, in order of first appearance, so Introvert is not guaranteed to map to 1.

In [8]:
print('Merging labels...')
# Collapse the 16 types to their first letter ('I' or 'E').
# `.str[0]` does this in one vectorised pass; the previous per-row
# `.iloc[i]` loop was needlessly slow on a frame of this size.
data = pd.DataFrame({
    'text': data['text'].tolist(),
    'class': data['class'].str[0].tolist(),
})

# count once, reuse for both the labels and the bar heights
class_counts = data['class'].value_counts()
freqs = class_counts.tolist()
data_classes = class_counts.index.tolist()

plot_data(data_classes, freqs)

Merging labels...


## Shrink dataset proportionally

In [9]:
# Shrink the dataset so that the smallest class keeps `remn` rows and every
# other class keeps proportionally more, preserving the class ratio.
remn = 1000

# class sizes, listed in order of first appearance (ties for the minimum are
# therefore resolved the same way the previous explicit loop resolved them)
class_sizes = data['class'].value_counts().reindex(data['class'].unique())

min_length = np.inf
min_class = ''
if not class_sizes.empty:
    min_class = class_sizes.idxmin()
    min_length = int(class_sizes.min())

    # Target size per class, keyed by label. The previous version kept the
    # ratios in a list (one entry per class, minority included) but consumed it
    # with pop(0) only for the non-minority classes, so the ratios were applied
    # to the wrong classes whenever the minority class was not the last one.
    target_size = {}
    for label, size in class_sizes.items():
        size = int(size)
        if label == min_class:
            wanted = remn
        else:
            rt = min_length / size
            wanted = round(float(remn) / rt)
        # never ask for more rows than the class has (the old negative slice
        # stop silently dropped rows in that case)
        target_size[label] = min(size, wanted)

    # Keep the LAST target_size rows of every class, as before: from_end is 0
    # for the last row of a class, 1 for the one before it, and so on.
    from_end = data.groupby('class').cumcount(ascending=False)
    # .copy() so later column assignments on `data` do not warn about a view
    data = data[from_end < data['class'].map(target_size)].copy()

print("Size after removal: {:d}".format(len(data)))

Size after removal: 4555


In [10]:
# plot the class distribution of the shrunk dataset
class_counts = data['class'].value_counts()
data_classes = class_counts.index.tolist()
freqs = class_counts.tolist()

plot_data(data_classes, freqs)

## Further text preprocessing
1. make text lower-case
2. remove symbols and punctuation
3. lemmatize verbs

In [11]:
import re
import string
import nltk
from nltk.corpus import wordnet
from textblob import Word

nltk.download("wordnet")

# normalise the case of every post
data['text'] = data['text'].str.lower()

# emoticon -> word table for a future "convert emoticons to text" step
# (not yet implemented: nothing below reads this table)
emots = dict(
    happy=[':-)', ':)', '(-:', '(:', ':D', ':-D'],
    sad=[':-(', ':(', ')-:', '):'],
    laugh=['xD', 'XD'],
    funny=[':-P', ':P', ':p'],
)

# remove symbols
def remove_punctuation(text):
    """Return `text` with every character of `string.punctuation` deleted.

    Characters are removed, not replaced, so the words on either side of a
    punctuation mark are joined together.
    """
    # translation table mapping each punctuation code point to None (= delete)
    deletions = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(deletions)

data['text'] = data['text'].apply(remove_punctuation)

# word lemmatizing
def lem(text):
    """Lemmatize every whitespace-separated token of `text` as a verb.

    The tokens are re-joined with single spaces.
    """
    verb_lemmas = (Word(token).lemmatize("v") for token in text.split())
    return " ".join(verb_lemmas)

data['text'] = data['text'].apply(lem)

#remove links (not complete)
# def remove_links(text):
#     x = re.sub("http|https|www", "", text)
#     return x

# data['text'] = data['text'].apply(lambda text: remove_links(text))
# print(data.head())

[nltk_data] Downloading package wordnet to /home/matthew/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [12]:
# from gensim.models import FastText
# import multiprocessing

# cores = multiprocessing.cpu_count()

# # create a list of words of the corpus
# words = [row.split() for row in data['text']]

# model = FastText(size=4, window=2, min_count=1, workers=cores-1)
# model.build_vocab(sentences=words)
# model.train(sentences=words, total_examples=len(words), epochs=20)

In [13]:
# print(model)

## Feature extraction
Inspect text and find possible features and relations with Tf-idf  
The vectorizer parameters are tuned in order to find words with max information

In [14]:
# NEEDS INSPECTION, INACTIVE FOR NOW
from sklearn.feature_extraction import text as fet
import itertools
# find frequency of each word in each text fragment of each row (tfidf: term frequency * inverse document frequency)
# Suggestion: text embeddings with Fasttext, Glove, Word2vec etc.

# custom stop words; only referenced by the commented-out alternatives below
# (the active vectorizer uses scikit-learn's built-in 'english' list)
stopword_list = ['the','a','an','be','am','are','for','of','or','that','this','which','is','then','than','www','youtube','com','reddit','wiki','wikipedia','org']

vectorizer = fet.TfidfVectorizer(stop_words='english', ngram_range=(1,2))
# corpus = data['text'].tolist()
tfidf = vectorizer.fit_transform(data['text'])
# vectorizer = fet.CountVectorizer(max_features=1000, min_df=0.01, max_df=0.95, stop_words=stopword_list, ngram_range=(1,1))
# vectorizer = fet.HashingVectorizer(stop_words=stopword_list, ngram_range=(1,1))
# counts = vectorizer.fit_transform(corpus)   # count each word in each document

# tfidf = transformer.fit_transform(counts) # calculate tfidf by multiplying counts from above with computed idf

# features are every counted term (unigrams and bigrams) of the corpus; their values are the tf-idf weights.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back only on old versions.
if hasattr(vectorizer, 'get_feature_names_out'):
    features = vectorizer.get_feature_names_out().tolist()
else:
    features = vectorizer.get_feature_names()
print('TfidfTransformer')
print('----------------')
print(len(features))
print(tfidf.shape)

TfidfTransformer
----------------
102704
(4555, 102704)


In [15]:
# Alternative method, using gensim models for word embeddings, such as Word2Vec, FastText etc.
# from gensim.models import FastText

## Feature selection
Bring features and class labels to X, y form, where X is the feature array and y is the class array  
Find correlations among words by selecting the most informative features

In [16]:
from sklearn.feature_selection import SelectKBest, chi2

# map every class label to an integer, numbered in order of first appearance
class_indx = {label: position for position, label in enumerate(data['class'].unique())}

# feature matrix and the matching integer target vector
X = tfidf
y = np.array([class_indx[label] for label in data['class'].to_numpy()])

# X = SelectKBest(chi2, k=160000).fit_transform(X, y) # selecting the k most informative features
# print('Done!')

## Split dataset into train and test
Since the dataset is imbalanced, a stratified split should be used.

In [17]:
from sklearn.model_selection import train_test_split
# Stratified 75/25 split. The seed is fixed so that the split, and every score
# reported further down, is reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.25, random_state=42)

# plot train and test frequencies
traindict = {
    'features': X_train,
    'class': y_train.tolist()
}
trainpd = pd.DataFrame(traindict)
testdict = {
    'features': X_test,
    'class': y_test.tolist()
}
testpd = pd.DataFrame(testdict)

# value_counts() orders by frequency, not by label, so the labels passed to the
# plot must be derived from the counts. The previous hard-coded ['1','0'] drew
# the majority class 0 under the label '1'. Sort by label and align the test
# counts to the same label order.
train_counts = trainpd['class'].value_counts().sort_index()
test_counts = testpd['class'].value_counts().reindex(train_counts.index, fill_value=0)
freqstrain = train_counts.tolist()
freqstest = test_counts.tolist()

plot_traintest([str(label) for label in train_counts.index], freqstrain, freqstest)
del freqstrain
del freqstest

## Undersampling
The dataset is highly imbalanced and the smallest class has only a few samples.  
Since there is a sufficient amount of training data, the majority 'I' class can be undersampled.

In [18]:
# size of the smallest class in the training frame
class_sizes = trainpd['class'].value_counts()
min_length = int(class_sizes.min()) if not class_sizes.empty else np.inf

# Trim every class down to min_length rows by dropping its leading surplus
# rows (from_end is 0 for the last row of a class, 1 for the one before, ...).
# The previous version hard-coded class 0 as the majority class, so nothing
# was balanced if the majority happened to be encoded differently.
from_end = trainpd.groupby('class').cumcount(ascending=False)
# .copy() so later column assignments on `trainpd` do not warn about a view
trainpd = trainpd[from_end < min_length].copy()

# NOTE(review): the undersampling above only changes `trainpd`. The model cell
# trains on X_train / y_train, which are NOT affected by it. trainpd was built
# with a default 0..n-1 index, so its remaining index labels are row positions
# in X_train / y_train; the balanced arrays are exposed here so the training
# cell can opt in to them.
kept_rows = trainpd.index.to_numpy()
X_train_bal = X_train[kept_rows]
y_train_bal = y_train[kept_rows]

# value_counts() orders by frequency, so derive the bar labels from the counts
# instead of hard-coding ['1','0'] (which mislabelled the bars).
train_counts = trainpd['class'].value_counts().sort_index()
test_counts = testpd['class'].value_counts().reindex(train_counts.index, fill_value=0)
freqstrain = train_counts.tolist()
freqstest = test_counts.tolist()

plot_traintest([str(label) for label in train_counts.index], freqstrain, freqstest)

## Oversampling
The dataset is highly imbalanced and the smallest class has too few samples.  
SMOTE can be used in that case to oversample the training dataset and balance all classes.

In [19]:
# from imblearn.over_sampling import SMOTE

# oversample = SMOTE()
# X_train, y_train = oversample.fit_resample(X_train, y_train)

# # plot train frequencies
# traindict = {
#     'features': X_train,
#     'class': y_train.tolist()
# }
# trainpd = pd.DataFrame(traindict)

# freqstrain = trainpd['class'].value_counts().tolist()
# data_classes = trainpd['class'].value_counts().index.tolist()

# plot_data(data_classes, freqstrain)

In [20]:
# save the cleaned dataset to a new csv file
# data.to_csv("./mbti_classes_cleaned.csv")

## Model
Train different binary classifiers starting from the fastest (e.g Naive Bayes, Linear, Decision Tree)  
I also used classifiers such as gradient boosting and neural networks which support high feature dimensionality  
Trying to "wrapperify" the method of feature suitability by evaluating those different classifiers' performance

In [27]:
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RepeatedKFold
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import make_pipeline
from sklearn import preprocessing
from sklearn.metrics import roc_curve
from sklearn.metrics import confusion_matrix
from sklearn import metrics

# Split training dataset in 10 folds and keep 1/10 for validation and 9/10 for training
kf = KFold(n_splits=10, shuffle=False, random_state=None)

avg_acc = 0.0              # only referenced by the commented-out accuracy lines
scaler = StandardScaler()  # only referenced by the commented-out scaling lines
f1_scores_train = []
f1_scores_validation = []
best_clf = None
best_acc = 0.0
for fold, (train_indx, test_indx) in enumerate(kf.split(X_train), start=1):
    yvalid_train, yvalid_test = y_train[train_indx], y_train[test_indx]

    # GaussianNB is fed dense input. Densify each part of the fold exactly once:
    # every .toarray() call allocates a full (rows x features) array, and the
    # training part was previously converted twice (fit and predict).
    Xvalid_train = X_train[train_indx].toarray()
    Xvalid_test = X_train[test_indx].toarray()

    # fit/predict
    # clf = SVC(C=0.1, kernel='rbf', random_state=None)
    # clf = GradientBoostingClassifier(max_depth=1)
    clf = GaussianNB()
    # scaler.fit(Xvalid_train)
    # Xvalid_train = scaler.transform(Xvalid_train)
    clf.fit(Xvalid_train, yvalid_train)
    ypred_eval = clf.predict(Xvalid_train)
    ypred_valid = clf.predict(Xvalid_test)
    # acc_train = metrics.accurracy(y_train, ypred_eval)

    # free the dense copies before the next fold allocates its own
    del Xvalid_train, Xvalid_test

    # NOTE: for single-label classification the micro-averaged F1 is identical
    # to plain accuracy, so these scores say little about the minority class.
    f1_scores_train.append(metrics.f1_score(yvalid_train, ypred_eval, average='micro'))
    acc_test = metrics.f1_score(yvalid_test, ypred_valid, average='micro')
    f1_scores_validation.append(acc_test)

    # keep the clf with the best performance
    # (it has been fitted on 9/10 of the training data only)
    if acc_test > best_acc:
        best_acc = acc_test
        best_clf = clf

    # confusion matrix structure as returned by sklearn (rows = true label,
    # columns = predicted label, labels in sorted order), taking class 1 as
    # the positive class:
    #            pred 0   pred 1
    #          -------------------
    # true 0   |   TN   |   FP   |
    #          -------------------
    # true 1   |   FN   |   TP   |
    #          -------------------

    print('For k = {:d}\n'.format(fold))
    print('Kfold train confusion matrix')
    print('----------------------------\n')
    print(confusion_matrix(yvalid_train, ypred_eval))
    print('\nKfold validation confusion matrix')
    print('-----------------------------------\n')
    print(confusion_matrix(yvalid_test, ypred_valid))
    print('\n')

    # avg_acc += acc_train / 10.0

f1_scores_train = np.array(f1_scores_train)
print('Kfold train f1')
print('--------------\n')
print(f"mean f1: {f1_scores_train.mean():.3f}, sigma f1: {f1_scores_train.std():.3f}, 95% conf: {f1_scores_train.mean()-2*f1_scores_train.std():.3f} - {f1_scores_train.mean()+2*f1_scores_train.std():.3f}")

f1_scores_validation = np.array(f1_scores_validation)
print('\nKfold validation f1')
print('--------------\n')
print(f"mean f1: {f1_scores_validation.mean():.3f}, sigma f1: {f1_scores_validation.std():.3f}, 95% conf: {f1_scores_validation.mean()-2*f1_scores_validation.std():.3f} - {f1_scores_validation.mean()+2*f1_scores_validation.std():.3f}")

# print(avg_acc)

For k = 1

Kfold train confusion matrix
----------------------------

[[2301   99]
 [   0  674]]

Kfold validation confusion matrix
-----------------------------------

[[187  79]
 [ 51  25]]


For k = 2

Kfold train confusion matrix
----------------------------

[[2276  114]
 [   0  684]]

Kfold validation confusion matrix
-----------------------------------

[[233  43]
 [ 41  25]]


For k = 3

Kfold train confusion matrix
----------------------------

[[2296  104]
 [   0  674]]

Kfold validation confusion matrix
-----------------------------------

[[204  62]
 [ 61  15]]


For k = 4

Kfold train confusion matrix
----------------------------

[[2281  110]
 [   0  683]]

Kfold validation confusion matrix
-----------------------------------

[[198  77]
 [ 47  20]]


For k = 5

Kfold train confusion matrix
----------------------------

[[2294  105]
 [   0  675]]

Kfold validation confusion matrix
-----------------------------------

[[209  58]
 [ 57  18]]


For k = 6

Kfold train confusi

## Production test

In [33]:
from sklearn.metrics import classification_report

# evaluate the best cross-validation classifier on the held-out test split
# (the test matrix is densified before it is passed to predict)
y_pred = best_clf.predict(X_test.toarray())

# micro-averaged F1 over the test predictions
f1_score = metrics.f1_score(y_true=y_test, y_pred=y_pred, average='micro')
print('f1 score: {:.2f}'.format(f1_score))

# per-class precision / recall / f1 table
print(classification_report(y_true=y_test, y_pred=y_pred))

f1 score: 0.65
              precision    recall  f1-score   support

           0       0.80      0.75      0.77       889
           1       0.27      0.33      0.30       250

    accuracy                           0.65      1139
   macro avg       0.53      0.54      0.53      1139
weighted avg       0.68      0.65      0.67      1139

