In [14]:
import numpy as np
import plotly.graph_objects as go
import plotly
import pandas as pd
import plotly.offline as py
from scipy.stats import multivariate_normal as g
py.init_notebook_mode(connected=True)

def plot_data(classes, count, title="Barchart with frequencies"):
    """Show an interactive bar chart of per-class counts inline in the notebook.

    Args:
        classes: labels placed on the x axis, one per bar.
        count: bar heights, aligned positionally with ``classes``.
        title: figure title; the default is the caption this helper always used.

    Returns:
        None. The figure is rendered through ``plotly.offline.iplot``.
    """
    bar = go.Bar(x=classes, y=count)

    layout = dict(
        title=title,
        # tick labels are rotated 45 degrees, as before
        xaxis=dict(title="Classes", tickangle=45),
        # label the y axis so the figure can be read on its own
        yaxis=dict(title="Frequency"),
    )

    fig = go.Figure(data=[bar], layout=layout)
    plotly.offline.iplot(fig)

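# OLD: commented-out star-type experiment (reads ../6_class_csv.csv); none of the commented code below is executed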
# def separate_classes(csv_obj,classno):
#     # Separate features for each class in a dictionary, i.e star type
#     classes = {}    #classes contain every feature for each star type

#     for i in range(classno):
#         classes["".join(str(i))] = []
    
#     for row in csv_obj:
#         featdict = {}
#         for feature in row:
#             if feature != "Star type":
#                 featdict[feature] = row[feature]
#         classes[row['Star type']].append(featdict)
    
#     return classes

# def frequencies(clss):
#     # Find frequencies of feature vectors for each class
#     freqs = {}
#     for c in classes:
#        freqs[c] = len(classes[c])
    
#     return freqs

# def pdf(feature, type):
#     # Prepare plots for pdf of each class for selected feature
#     plots = []
#     color = ['(255, 0, 0','(0, 255, 0','(0, 0, 255','(231, 255, 92','(188, 0, 255','(0, 211, 255']

#     for cl in classes:
#         featlist = None
#         if type == 'int': featlist = [int(x[feature]) for x in classes[cl]]
#         elif type == 'float': featlist = [float(x[feature]) for x in classes[cl]]
#         else: featlist = [x[feature] for x in classes[cl]]

#         x1 = np.array(featlist)
#         xspace = np.linspace(x1.min(), x1.max())
#         yspace1 = g.pdf(xspace, mean=x1.mean(),cov=x1.std())

#         cr = color.pop()
#         plots.append(go.Scatter(x=xspace, y=yspace1, name="p(x|y=)".join(cl), marker_color='rgba'+cr+', .9)'))
    
#     return plots

# with open("../6_class_csv.csv","r") as csvf:
#     csvreader = csv.DictReader(csvf)    #read csv as a dictionary
    
#     classes = separate_classes(csvreader,6)
#     freqs = frequencies(classes)
    
#     # Perform GDA
#     # plots = pdf('Temperature (K)', 'int')
#     # layout = go.Layout(title='Star samples GE for p(x|y)', xaxis=dict(title='x'), yaxis=dict(title='p(x|y)'))
#     # fig = go.Figure(data=plots, layout=layout)
#     # plotly.offline.iplot(fig)

#     x1 = np.array([int(x['Temperature (K)']) for x in classes['0']])
#     xspace = np.linspace(x1.min(), x1.max())
#     yspace1 = g.pdf(xspace, mean=x1.mean(),cov=x1.std())

#     fig = go.Figure(data=[
#         go.Scatter(x=xspace, y=yspace1, name="p(x|y=)".join('0'), marker_color='rgba(44, 193, 93, .9)')
#     ])

#     plotly.offline.iplot(fig)

## Preprocess data

In [15]:
# read csv into dataframe
df = pd.read_csv('../mbti_full_pull.csv')
classes = ['ISTJ','ISFJ','INFJ','INTJ','ISTP','ISFP','INFP','INTP','ESTP','ESFP','ENFP','ENTP','ESTJ','ESFJ','ENFJ','ENTJ']

print("Size before cleaning: {:d}".format(len(df)))

# Upper-case every flair in one vectorised pass. A missing flair becomes ''
# so it matches no class, rather than raising when .upper() hits a NaN.
flair_upper = df['author_flair_text'].fillna('').astype(str).str.upper()

# hits[i, j] is True when classes[j] occurs as a plain substring of row i's flair.
hits = np.column_stack([
    flair_upper.str.contains(cl, regex=False).to_numpy(dtype=bool)
    for cl in classes
])

# extract only the cleaned classes that consist of specific and unique value:
# a row is kept only when exactly one of the class names appears in its flair
is_unambiguous = hits.sum(axis=1) == 1

# each kept row has a single True, so argmax gives the column of that match
new_class = [classes[j] for j in hits[is_unambiguous].argmax(axis=1)]
new_txt = df.loc[is_unambiguous, 'body'].tolist()

mbti = {
    'text': new_txt,
    'class': new_class
}

data = pd.DataFrame(mbti)
print('Classes cleaned')

print("Size after cleaning: {:d}".format(len(data)))

Size before cleaning: 1794016
Classes cleaned
Size after cleaning: 1782111


## Find imbalanced classes

In [46]:
# check if null values exist; printed explicitly because a bare expression
# that is not the last line of a cell displays nothing
print(data.isnull().sum())
data = data.dropna() # drop null values (55)

# observe each class' distribution; value_counts() is computed once and reused
# so the labels and the frequencies are guaranteed to be in the same order
class_counts = data['class'].value_counts()
freqs = class_counts.tolist()
data_classes = class_counts.index.tolist()

plot_data(data_classes, freqs)

## Remove data for class balancing

In [58]:
# there are many samples, so all classes can have the frequency of the class with the lowest frequency
min_class_freq = min(freqs)

# keep the first min_class_freq rows of every class, previewing each slice as it is taken
class_samples = []
for v in data['class'].unique():
    class_sample = data[data['class'] == v].iloc[0:min_class_freq, :]
    print(class_sample)
    class_samples.append(class_sample)

# join the dataframes of the unique classes into one balanced frame
# (the per-class slices were previously printed but never stored)
data_balanced = pd.concat(class_samples)

                                                    text class
0      Knowing you're in INTJ is a tool for you to us...  INTJ
1               You are truly an enlightened mastermind.  INTJ
4      I don't know if I would count this as a pet pe...  INTJ
6      I think that the military is a job people volu...  INTJ
8      Based on two politicians who in my eyes, some ...  INTJ
...                                                  ...   ...
13489  Next time try putting your hand behind you, th...  INTJ
13494  I know that thats a comparatively easy to inte...  INTJ
13500  That's me. Complaining away. \n\nSorry, dude. ...  INTJ
13503  At that stage I would just jump ship and let i...  INTJ
13508  Definitely. And most of the time if I am inter...  INTJ

[2740 rows x 2 columns]
                                                    text class
2      You should :) it will help if you have a down ...  INFJ
7      Mostly I try not to get too caught up in anyth...  INFJ
14     Wow, i might be using a