# Example of Using preprocess.py and embedding.py

In [1]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import re

import preprocess as pre
import embedding as em

In [2]:
pre.train.head()

Unnamed: 0,type,posts,Average Words Per Comment,Variance of Word Counts,Cleaned Posts,IE,NS,TF,JP,Average Words Per Comment Scaled,Variance of Word Counts Scaled
1228,INFP,'We are mandarin speakers. He receive educati...,16.78,187.3024,'we are mandarin speakers. he receive educatio...,I,N,F,P,-1.245319,1.039153
1290,ISTP,"'Nope. Not now, not ever. I'm too busy with ...",24.38,145.0304,"'nope. not now, not ever. i'm too busy with wo...",I,S,T,P,-0.021923,0.161601
6756,ENFJ,'That's the only one I haven't gotten to read ...,23.38,182.9104,'that's the only one i haven't gotten to read ...,E,N,F,J,-0.182897,0.947976
1662,INFP,'I used to think that maturity was burning bri...,27.38,148.0304,'i used to think that maturity was burning bri...,I,N,F,P,0.460996,0.22388
3338,INFP,'I get typed as both a 4w5 and 5w6 as well but...,20.94,157.8736,'i get typed as both a <NUM> w <NUM> and <NUM>...,I,N,F,P,-0.575671,0.428221


In [3]:
cleaned_posts = pre.train['Cleaned Posts'].values
cleaned_posts.shape

(6940,)

# Words to Vectors

In [4]:
maxlen = 20 # maximum number of words, the rest of the comment would be cut off
# 20 for now, but we actually do not want a maximum length
max_words = 10000
embedding_dim = 100
embeddings_index = em.get_GloVe()
word_input, word_index = em.map_words_to_int(cleaned_posts, max_words, maxlen)

embedding_matrix = em.create_embedding_matrix (
    word_index, 
    embeddings_index, 
    max_words, 
    embedding_dim
)

Found 400000 word vectors.


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


Found 97151 unique tokens.


In [5]:
labels = pre.train['type']

print('Shape of data tensor:', word_input.shape)
print('Shape of label tensor:', labels.shape)

from sklearn.preprocessing import LabelEncoder

lab_encoder = LabelEncoder()
label_encoded = lab_encoder.fit_transform(labels)
label_encoded

Shape of data tensor: (6940, 20)
Shape of label tensor: (6940,)


array([ 9, 15,  0, ...,  2, 11,  3])

In [6]:
lab_encoder.classes_

array(['ENFJ', 'ENFP', 'ENTJ', 'ENTP', 'ESFJ', 'ESFP', 'ESTJ', 'ESTP',
       'INFJ', 'INFP', 'INTJ', 'INTP', 'ISFJ', 'ISFP', 'ISTJ', 'ISTP'],
      dtype=object)

# Sample Model Training (Results are not meaningful here)

In [17]:
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense

def get_model():
    model = Sequential()
    model.add(Embedding(max_words, embedding_dim, input_length=maxlen))
    model.add(Flatten())
    model.add(Dense(32, activation='relu'))
    model.add(Dense(16, activation='sigmoid'))
    model.compile(optimizer='rmsprop',
              loss='sparse_categorical_crossentropy',
              metrics=['acc'])
    model.layers[0].set_weights([embedding_matrix])
    model.layers[0].trainable = False
    return model

In [18]:
get_model().summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 20, 100)           1000000   
_________________________________________________________________
flatten_5 (Flatten)          (None, 2000)              0         
_________________________________________________________________
dense_9 (Dense)              (None, 32)                64032     
_________________________________________________________________
dense_10 (Dense)             (None, 16)                528       
Total params: 2,064,560
Trainable params: 1,064,560
Non-trainable params: 1,000,000
_________________________________________________________________


  'Discrepancy between trainable weights and collected trainable'


In [19]:
k = 3
num_validation_samples = len(word_input) // k

np.random.shuffle(word_input)

validation_scores = []
for fold in range(k):
    validation_data = word_input[num_validation_samples * fold:
                                 num_validation_samples * (fold + 1)]
    validation_label = label_encoded[num_validation_samples * fold:
                                 num_validation_samples * (fold + 1)]
#     training_data = word_input[:num_validation_samples * fold] + \
#         word_input[num_validation_samples * (fold + 1):]
    training_data = np.vstack((
        word_input[:num_validation_samples * fold],
        word_input[num_validation_samples * (fold + 1):]
    ))

    model = get_model()
    model.fit(
        word_input,
        label_encoded,
        epochs=3,
        batch_size=32,
    )
    validation_score = model.evaluate (validation_data, validation_label)[1] # get the accuracy
    validation_scores.append(validation_score)
validation_score = np.average(validation_scores)

  'Discrepancy between trainable weights and collected trainable'


Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [9]:
pre.test.head()

Unnamed: 0,type,posts,Average Words Per Comment,Variance of Word Counts,Cleaned Posts,IE,NS,TF,JP,Average Words Per Comment Scaled,Variance of Word Counts Scaled
7814,INFP,"'Macona , it depends if the big family has ext...",33.16,70.6004,"'macona , it depends if the big family has ext...",I,N,F,P,1.371698,-1.364
4635,ENFJ,'My Brother is an ISTP and oddly enough I get ...,29.76,169.2324,'my brother is an istp and oddly enough i get ...,E,N,F,J,0.829438,0.660141
3509,INFJ,'I do this but my violent reaction is to give ...,25.36,163.5044,'i do this but my violent reaction is to give ...,I,N,F,J,0.12769,0.54259
1882,INFJ,"'I do this all the time in relation to people,...",19.86,68.322581,"'i do this all the time in relation to people,...",I,N,F,J,-0.749495,-1.410746
2950,INTJ,'The title of this thread is misleading; there...,25.58,107.344045,'the title of this thread is misleading; there...,I,N,T,J,0.162777,-0.609941


In [10]:
test = pre.test
test.head()

Unnamed: 0,type,posts,Average Words Per Comment,Variance of Word Counts,Cleaned Posts,IE,NS,TF,JP,Average Words Per Comment Scaled,Variance of Word Counts Scaled
7814,INFP,"'Macona , it depends if the big family has ext...",33.16,70.6004,"'macona , it depends if the big family has ext...",I,N,F,P,1.371698,-1.364
4635,ENFJ,'My Brother is an ISTP and oddly enough I get ...,29.76,169.2324,'my brother is an istp and oddly enough i get ...,E,N,F,J,0.829438,0.660141
3509,INFJ,'I do this but my violent reaction is to give ...,25.36,163.5044,'i do this but my violent reaction is to give ...,I,N,F,J,0.12769,0.54259
1882,INFJ,"'I do this all the time in relation to people,...",19.86,68.322581,"'i do this all the time in relation to people,...",I,N,F,J,-0.749495,-1.410746
2950,INTJ,'The title of this thread is misleading; there...,25.58,107.344045,'the title of this thread is misleading; there...,I,N,T,J,0.162777,-0.609941


In [11]:
test_cleaned_posts = test['Cleaned Posts'].values
test_labels = test['type']

test_word_input, word_index_test = em.map_words_to_int(test_cleaned_posts, max_words, maxlen)

Found 47539 unique tokens.


In [12]:
test_label_encoded = lab_encoder.fit_transform(test_labels)
model.evaluate (test_word_input, test_label_encoded)



[2.364882653316091, 0.21152737754737952]