In [24]:
import numpy as np
import plotly.graph_objects as go
import plotly
import pandas as pd
import plotly.offline as py
from scipy.stats import multivariate_normal as g
py.init_notebook_mode(connected=True)

def plot_data(classes, count):
    bar = go.Bar(
        x = classes,
        y = count
    )

    layout = dict(
        title = "Barchart with frequencies",
        xaxis = dict(title = "Classes", type='category'),
        xaxis_tickangle = 45
    )

    fig = go.Figure(data = [bar], layout = layout)
    plotly.offline.iplot(fig)

def plot_traintest(classes, train, test):
    bar1 = go.Bar(
        name = "train",
        x = classes,
        y = train
    )

    bar2 = go.Bar(
        name = "test",
        x = classes,
        y = test
    )

    layout = dict(
        title = "Barchart with frequencies",
        xaxis = dict(title = "Classes", type='category'),
        xaxis_tickangle = 45
    )

    fig = go.Figure(data = [bar1, bar2], layout = layout)
    plotly.offline.iplot(fig)

# OLD
# def separate_classes(csv_obj,classno):
#     # Separate features for each class in a dictionary, i.e star type
#     classes = {}    #classes contain every feature for each star type

#     for i in range(classno):
#         classes["".join(str(i))] = []
    
#     for row in csv_obj:
#         featdict = {}
#         for feature in row:
#             if feature != "Star type":
#                 featdict[feature] = row[feature]
#         classes[row['Star type']].append(featdict)
    
#     return classes

# def frequencies(clss):
#     # Find frequencies of feature vectors for each class
#     freqs = {}
#     for c in classes:
#        freqs[c] = len(classes[c])
    
#     return freqs

# def pdf(feature, type):
#     # Prepare plots for pdf of each class for selected feature
#     plots = []
#     color = ['(255, 0, 0','(0, 255, 0','(0, 0, 255','(231, 255, 92','(188, 0, 255','(0, 211, 255']

#     for cl in classes:
#         featlist = None
#         if type == 'int': featlist = [int(x[feature]) for x in classes[cl]]
#         elif type == 'float': featlist = [float(x[feature]) for x in classes[cl]]
#         else: featlist = [x[feature] for x in classes[cl]]

#         x1 = np.array(featlist)
#         xspace = np.linspace(x1.min(), x1.max())
#         yspace1 = g.pdf(xspace, mean=x1.mean(),cov=x1.std())

#         cr = color.pop()
#         plots.append(go.Scatter(x=xspace, y=yspace1, name="p(x|y=)".join(cl), marker_color='rgba'+cr+', .9)'))
    
#     return plots

# with open("../6_class_csv.csv","r") as csvf:
#     csvreader = csv.DictReader(csvf)    #read csv as a dictionary
    
#     classes = separate_classes(csvreader,6)
#     freqs = frequencies(classes)
    
#     # Perform GDA
#     # plots = pdf('Temperature (K)', 'int')
#     # layout = go.Layout(title='Star samples GE for p(x|y)', xaxis=dict(title='x'), yaxis=dict(title='p(x|y)'))
#     # fig = go.Figure(data=plots, layout=layout)
#     # plotly.offline.iplot(fig)

#     x1 = np.array([int(x['Temperature (K)']) for x in classes['0']])
#     xspace = np.linspace(x1.min(), x1.max())
#     yspace1 = g.pdf(xspace, mean=x1.mean(),cov=x1.std())

#     fig = go.Figure(data=[
#         go.Scatter(x=xspace, y=yspace1, name="p(x|y=)".join('0'), marker_color='rgba(44, 193, 93, .9)')
#     ])

#     plotly.offline.iplot(fig)

## Preprocess data
Check data structure and clean data by allocating unique and clear classes and text.  
Classes are decribed with natural language, so the class value (e.g "ISTJ") should be extracted from plain text, so ambiguous classes are removed.  
Cases such as "not ISTJ" are not not taken into account for the sake of simplification.

In [3]:
# read csv into dataframe
df = pd.read_csv('../mbti_full_pull.csv')
new_class = []
new_txt = []
classes = ['ISTJ','ISFJ','INFJ','INTJ','ISTP','ISFP','INFP','INTP','ESTP','ESFP','ENFP','ENTP','ESTJ','ESFJ','ENFJ','ENTJ'] # all the classes

print("Size before cleaning: {:d}".format(len(df)))

# extract only the cleaned classes that consist of specific and unique value
for i in range(len(df)):
    txt = df.iloc[i]['author_flair_text']
    uppert = txt.upper()

    count = 0
    clss = ''
    for cl in classes:
        if uppert.find(cl) != -1:
            count += 1
            clss = cl
    if count == 1:
        new_class.append(clss)
        new_txt.append(df.iloc[i]['body'])

mbti = {
    'text': new_txt,
    'class': new_class
}

data = pd.DataFrame(mbti)

data.isnull().sum() # check if null values exist
data = data.dropna() # drop null values (55)

print('Classes cleaned')

print("Size after cleaning: {:d}".format(len(data)))

Size before cleaning: 1794016
Classes cleaned
Size after cleaning: 1782056


## Shrink dataset proportionally

In [4]:
# find class with minimum length
min_length = np.inf
min_class = ''
for v in data['class'].unique():
    lngth = len(data[data['class'] == v])
    if lngth < min_length:
        min_length = lngth
        min_class = v

ratio = []
for v in data['class'].unique():
    ratio.append(len(data[data['class'] == min_class]) / len(data[data['class'] == v]))

# remove (length-remn) items from min class
remn = 200
delindex = data[data['class'] == min_class].index[0:(len(data[data['class'] == min_class]) - remn)]
data.drop(delindex, inplace=True)

#remove items from every other class, retaining the length distribution
for v in data['class'].unique():
    if len(data[data['class'] == v]) == remn: continue
    rt = ratio.pop(0)
    new_size = round(float(remn) / rt)

    delindex = data[data['class'] == v].index[0:(len(data[data['class'] == v]) - new_size)]
    data.drop(delindex, inplace=True)

print("Size after removal: {:d}".format(len(data)))

Size after removal: 129714


## Find imbalanced classes

In [5]:
# observe each class' distribution
freqs = data['class'].value_counts().tolist()
data_classes = data['class'].value_counts().index.tolist()

plot_data(data_classes, freqs)

## Feature extraction
Inspect text and find possible features and relations with Tf-idf  
The vectorizer parameters are tuned in order to find words with max information

In [100]:
# NEEDS INSPECTION, INACTIVE FOR NOW
from sklearn.feature_extraction import text as fet
import itertools
# find frequency of each word in each text fragment of each row (tfidf: term frequency * inverse document frequency)
# Suggestion: text embeddings with Fasttext, Glove, Word2vec etc.

stopword_list = ['the','a','an','be','am','are','for','of','or','that','this','which','is','then','than','www','youtube','com','reddit','wiki','wikipedia','org']

transformer = fet.TfidfTransformer()
corpus = data['text'].tolist()
vectorizer = fet.CountVectorizer(max_features=1000, max_df=0.6, stop_words=stopword_list, ngram_range=(1,1))
# vectorizer = fet.HashingVectorizer(stop_words=stopword_list, ngram_range=(3,3))
counts = vectorizer.fit_transform(corpus)   # count each word in each document

tfidf = transformer.fit_transform(counts) # calculate tfidf by multiplying counts from above with computed idf
# features = vectorizer.get_feature_names()   # features are every counted word in the entire corpus and their values are tfidf's
# print('TfidfTransformer')
# print('----------------')
# print(features)
# print(tfidf.toarray())

In [None]:
# Alternative method, using gensim models for word embeddings, such as Word2Vec, FastText etc.
from gensim.models import FastText

## Feature selection
Bring features and class labels to X, y form, where X is the feature array and y is the class array  
Find correlations among words by selecting the most informative features

In [113]:
from sklearn.feature_selection import SelectKBest, chi2

# create class index dict
class_indx = {}
i = 0
for v in data['class'].unique():
    class_indx[v] = i
    i +=1

X = tfidf
# y = data['class'].to_numpy()
y = np.array([class_indx[v] for v in data['class'].values])
print(y)

X = SelectKBest(chi2, k=90).fit_transform(X, y) # selecting the k most informative features
print(X)

[ 0  0  0 ... 11  8  9]
  (0, 88)	0.44723100741680344
  (0, 83)	0.16899373629738218
  (1, 88)	0.3147091660788233
  (1, 76)	0.19033338499721744
  (1, 42)	0.3478840108808384
  (2, 83)	0.09722443525222352
  (2, 67)	0.23097349635546915
  (2, 21)	0.20778915043233465
  (2, 1)	0.10301847029967626
  (3, 89)	0.24345929942683855
  (3, 83)	0.06643185492164605
  (3, 77)	0.1754654474033711
  (3, 62)	0.19069842304482781
  (3, 59)	0.20703470307726643
  (3, 53)	0.07402990376450488
  (3, 27)	0.18456648379059573
  (3, 0)	0.40707400485309253
  (4, 75)	0.6446849765547926
  (4, 51)	0.6840278267961689
  (5, 69)	0.12970093356743181
  (5, 62)	0.2606790929020376
  (5, 59)	0.07075257589405679
  (5, 55)	0.1285278181307809
  (5, 53)	0.15179502682426363
  (5, 52)	0.15629257290417628
  :	:
  (129703, 1)	0.2106319363301372
  (129704, 56)	0.5982178250538785
  (129704, 53)	0.27582960558750125
  (129705, 42)	0.5286766489785585
  (129706, 84)	0.21375295460242666
  (129706, 83)	0.10042829450273134
  (129706, 2)	0.1742577

## Split dataset into train and test
Since we have multi-class classification and imbalanced dataset, stratified split should be chosen

In [114]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.25)

# plot train and test frequencies
traindict = {
    'features': X_train,
    'class': y_train.tolist()
}
trainpd = pd.DataFrame(traindict)
testdict = {
    'features': X_test,
    'class': y_test.tolist()
}
testpd = pd.DataFrame(testdict)

freqstrain = trainpd['class'].value_counts().tolist()
freqstest = testpd['class'].value_counts().tolist()
data_classes = trainpd['class'].value_counts().index.tolist()

plot_traintest(data_classes, freqstrain, freqstest)

## Oversampling
The dataset is highly imbalanced and the smallest class has too few data.  
Smote can be used for that case, in order to oversample the training dataset and to balance all classes

In [115]:
from imblearn.over_sampling import SMOTE

oversample = SMOTE()
X_train, y_train = oversample.fit_resample(X_train, y_train)

# plot train frequencies
traindict = {
    'features': X_train,
    'class': y_train.tolist()
}
trainpd = pd.DataFrame(traindict)

freqstrain = trainpd['class'].value_counts().tolist()
data_classes = trainpd['class'].value_counts().index.tolist()

plot_data(data_classes, freqstrain)

In [116]:
# save the cleaned dataset to a new csv file
# data.to_csv("./mbti_classes_cleaned.csv")

## Model
Train different binary classifiers using OneVsRest method because we have multi-class classification

In [120]:
from sklearn.naive_bayes import GaussianNB
from sklearn import tree
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import KFold
from sklearn import metrics

# Split training dataset in 20 folds and keep 1/20 for validation and 19/20 for training
kf = KFold(n_splits=20, shuffle=False, random_state=None)

avg_acc = 0.0
for train_indx, test_indx in kf.split(X_train):
    Xvalid_train, Xvalid_test = X_train[train_indx], X_train[test_indx]
    yvalid_train, yvalid_test = y_train[train_indx], y_train[test_indx]

    # fit/predict
    clf = tree.DecisionTreeClassifier(max_depth=6)
    ovr = OneVsRestClassifier(clf)
    ovr.fit(Xvalid_train.toarray(), yvalid_train)
    ypred = ovr.predict(Xvalid_train.toarray())

    # since we have multi-class problem, accuracy metric is probably useless
    # precision and recall metrics shoulb be used, since they are class specific
    acc_train = metrics.f1_score(yvalid_train, ypred, average='macro')
    avg_acc += acc_train / 20.0

print(avg_acc)

# clf = GaussianNB()
# ovr = OneVsRestClassifier(clf)
# ovr = ovr.fit(X_train, y_train)
# ypred = ovr.predict(X_train)

# print(metrics.classification_report(y_train, ypred))