In [74]:
import numpy as np
import plotly.graph_objects as go
import plotly
import pandas as pd
import plotly.offline as py
from scipy.stats import multivariate_normal as g
from sklearn.feature_extraction import text as fet
py.init_notebook_mode(connected=True)

def plot_data(classes, count):
    bar = go.Bar(
        x = classes,
        y = count
    )

    layout = dict(
        title = "Barchart with frequencies",
        xaxis = dict(title = "Classes"),
        xaxis_tickangle = 45
    )

    fig = go.Figure(data = [bar], layout = layout)
    plotly.offline.iplot(fig)

def x_y_plot(X, y):
    meanX = [np.mean(xlist) for xlist in X.tolist()]
    scatter = go.Scatter(
        x = meanX,
        y = y,
        mode = "lines+markers",
        name = "mean X",
        marker = dict(color = 'rgba(80, 26, 80, 0.8)')
    )

    layout = dict(
        title = "Tf-idf plot",
        xaxis = dict(title = "Classes"),
        xaxis_tickangle = 45
    )

    fig = go.Figure(data = [scatter], layout = layout)
    plotly.offline.iplot(fig)

# OLD
# def separate_classes(csv_obj,classno):
#     # Separate features for each class in a dictionary, i.e star type
#     classes = {}    #classes contain every feature for each star type

#     for i in range(classno):
#         classes["".join(str(i))] = []
    
#     for row in csv_obj:
#         featdict = {}
#         for feature in row:
#             if feature != "Star type":
#                 featdict[feature] = row[feature]
#         classes[row['Star type']].append(featdict)
    
#     return classes

# def frequencies(clss):
#     # Find frequencies of feature vectors for each class
#     freqs = {}
#     for c in classes:
#        freqs[c] = len(classes[c])
    
#     return freqs

# def pdf(feature, type):
#     # Prepare plots for pdf of each class for selected feature
#     plots = []
#     color = ['(255, 0, 0','(0, 255, 0','(0, 0, 255','(231, 255, 92','(188, 0, 255','(0, 211, 255']

#     for cl in classes:
#         featlist = None
#         if type == 'int': featlist = [int(x[feature]) for x in classes[cl]]
#         elif type == 'float': featlist = [float(x[feature]) for x in classes[cl]]
#         else: featlist = [x[feature] for x in classes[cl]]

#         x1 = np.array(featlist)
#         xspace = np.linspace(x1.min(), x1.max())
#         yspace1 = g.pdf(xspace, mean=x1.mean(),cov=x1.std())

#         cr = color.pop()
#         plots.append(go.Scatter(x=xspace, y=yspace1, name="p(x|y=)".join(cl), marker_color='rgba'+cr+', .9)'))
    
#     return plots

# with open("../6_class_csv.csv","r") as csvf:
#     csvreader = csv.DictReader(csvf)    #read csv as a dictionary
    
#     classes = separate_classes(csvreader,6)
#     freqs = frequencies(classes)
    
#     # Perform GDA
#     # plots = pdf('Temperature (K)', 'int')
#     # layout = go.Layout(title='Star samples GE for p(x|y)', xaxis=dict(title='x'), yaxis=dict(title='p(x|y)'))
#     # fig = go.Figure(data=plots, layout=layout)
#     # plotly.offline.iplot(fig)

#     x1 = np.array([int(x['Temperature (K)']) for x in classes['0']])
#     xspace = np.linspace(x1.min(), x1.max())
#     yspace1 = g.pdf(xspace, mean=x1.mean(),cov=x1.std())

#     fig = go.Figure(data=[
#         go.Scatter(x=xspace, y=yspace1, name="p(x|y=)".join('0'), marker_color='rgba(44, 193, 93, .9)')
#     ])

#     plotly.offline.iplot(fig)

## Preprocess data
Check data structure and clean data by allocating unique and clear classes and text. Classes are decribed with natural language, so the class value (e.g "ISTJ") should be extracted from plain text

In [33]:
# read csv into dataframe
df = pd.read_csv('../mbti_full_pull.csv')
new_class = []
new_txt = []
classes = ['ISTJ','ISFJ','INFJ','INTJ','ISTP','ISFP','INFP','INTP','ESTP','ESFP','ENFP','ENTP','ESTJ','ESFJ','ENFJ','ENTJ'] # all the classes

print("Size before cleaning: {:d}".format(len(df)))

# extract only the cleaned classes that consist of specific and unique value
for i in range(len(df)):
    txt = df.iloc[i]['author_flair_text']
    uppert = txt.upper()

    count = 0
    clss = ''
    for cl in classes:
        if uppert.find(cl) != -1:
            count += 1
            clss = cl
    if count == 1:
        new_class.append(clss)
        new_txt.append(df.iloc[i]['body'])

mbti = {
    'text': new_txt,
    'class': new_class
}

data = pd.DataFrame(mbti)
print('Classes cleaned')

print("Size after cleaning: {:d}".format(len(data)))

Size before cleaning: 1794016
Classes cleaned
Size after cleaning: 1782111


## Find imbalanced classes

In [34]:
data.isnull().sum() # check if null values exist
data = data.dropna() # drop null values (55)

# observe each class' distribution
freqs = data['class'].value_counts().tolist()
data_classes = data['class'].value_counts().index.tolist()

plot_data(data_classes, freqs)

## Remove data for class balancing
The undersampling method can be used for example reduction, because the dataset is huge and the majority classes' data can be removed after shuffling, resulting in 2740 examples for each class, as seen below

In [35]:
# there are many samples, so all classes can have the frequency of the class with the lowest frequency
min_class_freq = min(freqs)

# shuffle data
data = data.sample(frac=1).reset_index(drop=True)

for v in data['class'].unique():
    delindex = data[data['class'] == v].index[min_class_freq:]
    data.drop(delindex, inplace=True)

new_freqs = data['class'].value_counts().tolist()
new_data_classes = data['class'].value_counts().index.tolist()

plot_data(new_data_classes, new_freqs)

In [63]:
# save the cleaned dataset to a new csv file
# data.to_csv("./mbti_classes_cleaned.csv")

## Feature extraction
Inspect text and find possible features and relations with Tf-idf

In [89]:
# find frequency of each word in each text fragment of each row (tfidf: term frequency * inverse document frequency)
# Suggestion: text embeddings with Fasttext, Glove, Word2vec
#import fasttext as ft

# newdf = pd.read_csv("../mbti_classes_cleaned.csv")

# tokenized = ft.tokenize(newdf.iloc[0]['text'])
# print(tokenized)

transformer = fet.TfidfTransformer()
corpus = data['text'].tolist()
vectorizer = fet.CountVectorizer()
counts = vectorizer.fit_transform(corpus)   # count each word in each document

tfidf = transformer.fit_transform(counts.toarray()) # calculate tfidf by multiplying counts from above with computed idf
features = vectorizer.get_feature_names()   # features are every counted word in the entire corpus and their values are tfidf's
print('TfidfTransformer')
print('----------------')
print(features)
print(tfidf.toarray())

KeyboardInterrupt: 

Bring features and class labels to X, y form, where X is the feature array and y is the class array

In [84]:
X = tfidf.toarray()
y = data['class'].tolist()

# plot tfidf
print(len(corpus))

43840
