In [31]:
import numpy as np
import plotly.graph_objects as go
import plotly
import pandas as pd
import plotly.offline as py
from scipy.stats import multivariate_normal as g
py.init_notebook_mode(connected=True)

def plot_data(classes, count):
    bar = go.Bar(
        x = classes,
        y = count
    )

    layout = dict(
        title = "Barchart with frequencies",
        xaxis = dict(title = "Classes"),
        xaxis_tickangle = 45
    )

    fig = go.Figure(data = [bar], layout = layout)
    plotly.offline.iplot(fig)

def x_y_plot(X, y):
    meanX = [np.mean(xlist) for xlist in X.tolist()]
    scatter = go.Scatter(
        x = meanX,
        y = y,
        mode = "lines+markers",
        name = "mean X",
        marker = dict(color = 'rgba(80, 26, 80, 0.8)')
    )

    layout = dict(
        title = "Tf-idf plot",
        xaxis = dict(title = "Classes"),
        xaxis_tickangle = 45
    )

    fig = go.Figure(data = [scatter], layout = layout)
    plotly.offline.iplot(fig)

# OLD
# def separate_classes(csv_obj,classno):
#     # Separate features for each class in a dictionary, i.e star type
#     classes = {}    #classes contain every feature for each star type

#     for i in range(classno):
#         classes["".join(str(i))] = []
    
#     for row in csv_obj:
#         featdict = {}
#         for feature in row:
#             if feature != "Star type":
#                 featdict[feature] = row[feature]
#         classes[row['Star type']].append(featdict)
    
#     return classes

# def frequencies(clss):
#     # Find frequencies of feature vectors for each class
#     freqs = {}
#     for c in classes:
#        freqs[c] = len(classes[c])
    
#     return freqs

# def pdf(feature, type):
#     # Prepare plots for pdf of each class for selected feature
#     plots = []
#     color = ['(255, 0, 0','(0, 255, 0','(0, 0, 255','(231, 255, 92','(188, 0, 255','(0, 211, 255']

#     for cl in classes:
#         featlist = None
#         if type == 'int': featlist = [int(x[feature]) for x in classes[cl]]
#         elif type == 'float': featlist = [float(x[feature]) for x in classes[cl]]
#         else: featlist = [x[feature] for x in classes[cl]]

#         x1 = np.array(featlist)
#         xspace = np.linspace(x1.min(), x1.max())
#         yspace1 = g.pdf(xspace, mean=x1.mean(),cov=x1.std())

#         cr = color.pop()
#         plots.append(go.Scatter(x=xspace, y=yspace1, name="p(x|y=)".join(cl), marker_color='rgba'+cr+', .9)'))
    
#     return plots

# with open("../6_class_csv.csv","r") as csvf:
#     csvreader = csv.DictReader(csvf)    #read csv as a dictionary
    
#     classes = separate_classes(csvreader,6)
#     freqs = frequencies(classes)
    
#     # Perform GDA
#     # plots = pdf('Temperature (K)', 'int')
#     # layout = go.Layout(title='Star samples GE for p(x|y)', xaxis=dict(title='x'), yaxis=dict(title='p(x|y)'))
#     # fig = go.Figure(data=plots, layout=layout)
#     # plotly.offline.iplot(fig)

#     x1 = np.array([int(x['Temperature (K)']) for x in classes['0']])
#     xspace = np.linspace(x1.min(), x1.max())
#     yspace1 = g.pdf(xspace, mean=x1.mean(),cov=x1.std())

#     fig = go.Figure(data=[
#         go.Scatter(x=xspace, y=yspace1, name="p(x|y=)".join('0'), marker_color='rgba(44, 193, 93, .9)')
#     ])

#     plotly.offline.iplot(fig)

## Preprocess data
Check data structure and clean data by allocating unique and clear classes and text.  
Classes are decribed with natural language, so the class value (e.g "ISTJ") should be extracted from plain text, so ambiguous classes are removed.  
Cases such as "not ISTJ" are not not taken into account for the sake of simplification.

In [32]:
# read csv into dataframe
df = pd.read_csv('../mbti_full_pull.csv')
new_class = []
new_txt = []
classes = ['ISTJ','ISFJ','INFJ','INTJ','ISTP','ISFP','INFP','INTP','ESTP','ESFP','ENFP','ENTP','ESTJ','ESFJ','ENFJ','ENTJ'] # all the classes

print("Size before cleaning: {:d}".format(len(df)))

# extract only the cleaned classes that consist of specific and unique value
for i in range(len(df)):
    txt = df.iloc[i]['author_flair_text']
    uppert = txt.upper()

    count = 0
    clss = ''
    for cl in classes:
        if uppert.find(cl) != -1:
            count += 1
            clss = cl
    if count == 1:
        new_class.append(clss)
        new_txt.append(df.iloc[i]['body'])

mbti = {
    'text': new_txt,
    'class': new_class
}

data = pd.DataFrame(mbti)

data.isnull().sum() # check if null values exist
data = data.dropna() # drop null values (55)

print('Classes cleaned')

print("Size after cleaning: {:d}".format(len(data)))

Size before cleaning: 1794016
Classes cleaned
Size after cleaning: 1782056


## Shrink dataset proportionally

In [33]:
# find class with minimum length
min_length = np.inf
min_class = ''
for v in data['class'].unique():
    lngth = len(data[data['class'] == v])
    if lngth < min_length:
        min_length = lngth
        min_class = v

ratio = []
for v in data['class'].unique():
    ratio.append(len(data[data['class'] == min_class]) / len(data[data['class'] == v]))

# remove (length-remn) items from min class
remn = 100
delindex = data[data['class'] == min_class].index[0:(len(data[data['class'] == min_class]) - remn)]
data.drop(delindex, inplace=True)

#remove items from every other class, retaining the length distribution
for v in data['class'].unique():
    if len(data[data['class'] == v]) == remn: continue
    rt = ratio.pop(0)
    new_size = round(float(remn) / rt)

    delindex = data[data['class'] == v].index[0:(len(data[data['class'] == v]) - new_size)]
    data.drop(delindex, inplace=True)

print("Size after removal: {:d}".format(len(data)))

Size after removal: 64855


## Find imbalanced classes

In [34]:
# observe each class' distribution
freqs = data['class'].value_counts().tolist()
data_classes = data['class'].value_counts().index.tolist()

plot_data(data_classes, freqs)

## Feature extraction/selection
Inspect text and find possible features and relations with Tf-idf  
The vectorizer parameters are tuned in order to find words with max information

In [64]:
from sklearn.feature_extraction import text as fet
# find frequency of each word in each text fragment of each row (tfidf: term frequency * inverse document frequency)
# Suggestion: text embeddings with Fasttext, Glove, Word2vec etc.

stopword_list = ['the','a','an','be','am','are','for','of','or','that','this','which','is','then','than','www','youtube','com','reddit','wiki','wikipedia','org']

transformer = fet.TfidfTransformer()
corpus = data['text'].tolist()
vectorizer = fet.CountVectorizer(max_features=1000, max_df=0.5, stop_words=stopword_list, ngram_range=(3,3))
counts = vectorizer.fit_transform(corpus)   # count each word in each document

tfidf = transformer.fit_transform(counts.toarray()) # calculate tfidf by multiplying counts from above with computed idf
features = vectorizer.get_feature_names()   # features are every counted word in the entire corpus and their values are tfidf's
print('TfidfTransformer')
print('----------------')
print(features)
print(tfidf.toarray())

TfidfTransformer
----------------
['able to do', 'able to get', 'about how you', 'about it and', 'about it but', 'about it it', 'about what you', 'agree with you', 'agree with your', 'all time and', 'allows me to', 'allows you to', 'amp nbsp amp', 'and able to', 'and don have', 'and don know', 'and don think', 'and don want', 'and end up', 'and even if', 'and feel like', 'and focus on', 'and had to', 'and have been', 'and have no', 'and have to', 'and how it', 'and how they', 'and how to', 'and how you', 'and if it', 'and if they', 'and if you', 'and it doesn', 'and it has', 'and it just', 'and it makes', 'and it not', 'and it really', 'and it was', 'and it will', 'and like to', 'and move on', 'and need to', 'and no one', 'and see how', 'and see if', 'and see what', 'and so on', 'and tend to', 'and there no', 'and they re', 'and think it', 'and try to', 'and trying to', 'and vice versa', 'and want to', 'and we can', 'and we have', 'and we re', 'and what they', 'and what you', 'and when

## X, y form
Bring features and class labels to X, y form, where X is the feature array and y is the class array  
Find correlations among words by selecting the most informative features

In [65]:
from sklearn.feature_selection import SelectKBest, chi2

X = tfidf.toarray()
y = data['class'].to_numpy()

X = SelectKBest(chi2, k=20).fit_transform(X, y) # selecting the k most informative features
print(X)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


## Split dataset into train and test
Since we have multi-class classification, stratified split should be chosen

In [67]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.25)

trainarr = np.column_stack((X_train, y_train))
trainpd = pd.DataFrame(trainarr)

# perform class balancing in training data

        0    1    2    3    4    5    6    7    8    9   ...   11   12   13  \
0      0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
1      0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
2      0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
3      0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
4      0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
...    ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...   
48636  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
48637  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
48638  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
48639  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
48640  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   

        14   15   16   17   18   19    20  
0      

## Remove data for class balancing
The undersampling method can be used for example reduction, because the dataset is huge and the majority classes' data can be removed after shuffling, resulting in 2740 examples for each class, as seen below

In [4]:
# there are many samples, so all classes can have the frequency of the class with the lowest frequency
min_class_freq = min(freqs)

# shuffle data
data = data.sample(frac=1).reset_index(drop=True)

for v in data['class'].unique():
    delindex = data[data['class'] == v].index[min_class_freq:]
    data.drop(delindex, inplace=True)

new_freqs = data['class'].value_counts().tolist()
new_data_classes = data['class'].value_counts().index.tolist()

plot_data(new_data_classes, new_freqs)

In [5]:
# save the cleaned dataset to a new csv file
# data.to_csv("./mbti_classes_cleaned.csv")

## Model

In [93]:
# from sklearn.model_selection import train_test_split
# from sklearn.naive_bayes import GaussianNB
# from sklearn.metrics import accuracy_score
# from sklearn import tree

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)
# clf = tree.DecisionTreeClassifier()
# clf.fit(X_train, y_train)
# y_pred = clf.predict(X_test)
# acc = accuracy_score(y_test, y_pred)
# print(acc)

0.9378649635036497
