In [1]:
import numpy as np
import pandas as pd
import nltk
import re
import string
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from tensorflow.keras.datasets import imdb
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Conv1D,MaxPooling1D
from tensorflow.keras.layers import LSTM,Dropout
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
np.random.seed(7)
import warnings
warnings.filterwarnings('ignore')

In [2]:
ps = PorterStemmer()
wnl = WordNetLemmatizer()
str_punc = string.punctuation

engstopwords = stopwords.words("english")
engstopwordsV2 = re.sub('[' + re.escape(string.punctuation) + ']', '',
                        ' '.join(engstopwords)).split()

engstopwords = set(engstopwords).union(set(engstopwordsV2))

def lemmatize(word):
    word = wnl.lemmatize(word, 'a')
    word = wnl.lemmatize(word, 'v')
    word = wnl.lemmatize(word, 'n')
    return word

def clean(text):
    # Remove URLs from text
    text = re.sub("http.*?([ ]|\|\|\||$)", "", text).lower()
    url_regex = r"""(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)\b/?(?!@)))"""
    text = re.sub(url_regex, "", text)

    # Remove specific punctuation (usually associated with a word)
    text = re.sub(r'(:|;).', " ", text)
    
    # Remove punctuations
    text = re.sub('['+re.escape(str_punc)+']'," ",  text)
    
    # Remove parantheses, brackets
    text = re.sub('(\[|\()*\d+(\]|\))*', ' ', text)
    
    # Remove string marks
    text = re.sub('[’‘“\.”…–]', '', text)
    text = re.sub('[^(\w|\s)]', '', text)
    text = re.sub('(gt|lt)', '', text)
    
    #Check that each word is not stopword, and lemmatize it
    text = list(map(lemmatize, text.split()))
    text = [word for word in text if (word not in engstopwords)]
    text = " ".join(text)
    return text

In [3]:
data=pd.read_csv("mbti_1.csv")
data.describe(include=object)

Unnamed: 0,type,posts
count,8675,8675
unique,16,8675
top,INFP,"'Yeah, I suppose so. I built my current comput..."
freq,1832,1


In [4]:
data['posts']=data['posts'].apply(lambda x : clean(x))

In [5]:
processed_data = data.copy()
processed_data = processed_data.reindex(columns=["posts",'type','Extrovert','Sensitive','Thinking','Judging'])

for index,value in enumerate(processed_data['type']):
    if 'E' in value:
        processed_data.iloc[index,2]=1
    else:
        processed_data.iloc[index,2]=0
    if 'S' in value:
        processed_data.iloc[index,3]=1
    else:
        processed_data.iloc[index,3]=0
    if 'T' in value:
        processed_data.iloc[index,4]=1
    else:
        processed_data.iloc[index,4]=0
    if 'J' in value:
        processed_data.iloc[index,5]=1
    else:
        processed_data.iloc[index,5]=0

processed_data = processed_data.astype({"Extrovert": int,'Sensitive':int,'Thinking':int,'Judging':int}, errors='ignore')
processed_data.head()

Unnamed: 0,posts,type,Extrovert,Sensitive,Thinking,Judging
0,enfp intj moment sportscenter top ten play pra...,INFJ,0,0,0,1
1,find lack post alarm sex bore position often e...,ENTP,1,0,1,0
2,good one course say know bless curse absolutel...,INTP,0,0,1,0
3,dear intp enjoy conversation day esoteric gabb...,INTJ,0,0,1,1
4,fire another silly misconception approach logi...,ENTJ,1,0,1,1


In [6]:
data_X = processed_data['posts'].copy()
data_Y = processed_data['type'].copy()
data_1 = processed_data['Extrovert'].copy()
data_2 = processed_data['Sensitive'].copy()
data_3 = processed_data['Thinking'].copy()
data_4 = processed_data['Judging'].copy()

In [7]:
# for binary classification
X_train_1,X_test_1,Y_train_1,Y_test_1 = train_test_split(data_X,data_1,test_size=.2,random_state=1)
X_train_2,X_test_2,Y_train_2,Y_test_2 = train_test_split(data_X,data_2,test_size=.2,random_state=1)
X_train_3,X_test_3,Y_train_3,Y_test_3 = train_test_split(data_X,data_3,test_size=.2,random_state=1)
X_train_4,X_test_4,Y_train_4,Y_test_4 = train_test_split(data_X,data_4,test_size=.2,random_state=1)

In [8]:
tokenizer = Tokenizer(num_words=100000)
tokenizer.fit_on_texts(data_X)

In [9]:
word_index = tokenizer.word_index
# Encode training data sentences into sequences
train_sequences = tokenizer.texts_to_sequences(X_train_1)
# Get max training sequence length
maxlen = max([len(x) for x in train_sequences])
# Pad the training sequences
train_padded = pad_sequences(train_sequences, maxlen=maxlen)

In [10]:
# Encode test data sentences into sequences
test_sequences = tokenizer.texts_to_sequences(X_test_1)
# Pad the training sequences
test_padded = pad_sequences(test_sequences, maxlen=maxlen)

embedding_vector_length=32

In [14]:
type(np.asarray(Y_train_1))

numpy.ndarray

In [15]:
model_1 = Sequential()
model_1.add(Embedding(100000, embedding_vector_length, input_length=maxlen))
model_1.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model_1.add(MaxPooling1D(pool_size=2))
model_1.add(LSTM(100))
model_1.add(Dense(1, activation='sigmoid'))
model_1.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model_1.summary())
filepath="/Users/moni/Code/PE-Product/weights_best_cnn_1.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max',save_weights_only=True)
callbacks_list = [checkpoint]
model_1.fit(train_padded, np.asarray(Y_train_1), epochs=6, batch_size=256,verbose = 1,callbacks = callbacks_list)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 922, 32)           3200000   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 922, 32)           3104      
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 461, 32)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 101       
Total params: 3,256,405
Trainable params: 3,256,405
Non-trainable params: 0
_________________________________________________________________
None
Train on 6940 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<tensorflow.python.keras.callbacks.History at 0x7fa0753e80b8>

In [16]:
model_2 = Sequential()
model_2.add(Embedding(100000, embedding_vector_length, input_length=maxlen))
model_2.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model_2.add(MaxPooling1D(pool_size=2))
model_2.add(LSTM(100))
model_2.add(Dense(1, activation='sigmoid'))
model_2.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model_2.summary())
filepath="/Users/moni/Code/PE-Product/weights_best_cnn_2.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max',save_weights_only=True)
callbacks_list = [checkpoint]
model_2.fit(train_padded, np.asarray(Y_train_2), epochs=6, batch_size=256,verbose = 1,callbacks = callbacks_list)

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 922, 32)           3200000   
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 922, 32)           3104      
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 461, 32)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 101       
Total params: 3,256,405
Trainable params: 3,256,405
Non-trainable params: 0
_________________________________________________________________
None
Train on 6940 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<tensorflow.python.keras.callbacks.History at 0x7f9fb4744ba8>

In [17]:
model_3 = Sequential()
model_3.add(Embedding(100000, embedding_vector_length, input_length=maxlen))
model_3.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model_3.add(MaxPooling1D(pool_size=2))
model_3.add(LSTM(100))
model_3.add(Dense(1, activation='sigmoid'))
model_3.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model_3.summary())
filepath="/Users/moni/Code/PE-Product/weights_best_cnn_3.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max',save_weights_only=True)
callbacks_list = [checkpoint]
model_3.fit(train_padded, np.asarray(Y_train_3), epochs=6, batch_size=256,verbose = 1,callbacks = callbacks_list)

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 922, 32)           3200000   
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 922, 32)           3104      
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 461, 32)           0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 101       
Total params: 3,256,405
Trainable params: 3,256,405
Non-trainable params: 0
_________________________________________________________________
None
Train on 6940 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<tensorflow.python.keras.callbacks.History at 0x7f9f76e64668>

In [18]:
model_4 = Sequential()
model_4.add(Embedding(100000, embedding_vector_length, input_length=maxlen))
model_4.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model_4.add(MaxPooling1D(pool_size=2))
model_4.add(LSTM(100))
model_4.add(Dense(1, activation='sigmoid'))
model_4.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model_4.summary())
filepath="/Users/moni/Code/PE-Product/weights_best_cnn_4.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max',save_weights_only=True)
callbacks_list = [checkpoint]
model_4.fit(train_padded, np.asarray(Y_train_4), epochs=6, batch_size=256,verbose = 1,callbacks = callbacks_list)

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 922, 32)           3200000   
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 922, 32)           3104      
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 461, 32)           0         
_________________________________________________________________
lstm_4 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 101       
Total params: 3,256,405
Trainable params: 3,256,405
Non-trainable params: 0
_________________________________________________________________
None
Train on 6940 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<tensorflow.python.keras.callbacks.History at 0x7f9f34489ac8>

In [20]:
model_1.save("/Users/moni/code/PE-Product/model_1.h5")
model_2.save("/Users/moni/code/PE-Product/model_2.h5")
model_3.save("/Users/moni/code/PE-Product/model_3.h5")
model_4.save("/Users/moni/code/PE-Product/model_4.h5")

import pickle

with open("tokenizer.pickle","wb") as token_handle:
    pickle.dump(tokenizer,token_handle)

In [None]:
text = "I don't like to be social, I solve problems using highly optimized solutions, and I use my intuition to predict possible scenarios in the future, some people consider me as bossy, but I'm not, recognized me?"
text = clean(text)
test = tokenizer.texts_to_sequences([text])

## Pad the training sequences
#test_padded = pad_sequences(test, maxlen=maxlen)
#print(test_padded.shape)
#model_1.predict(test_padded)

In [21]:
from tensorflow.keras import models
model = models.load_model("model_1.h5")

In [27]:
text = "find lack post alarm sex bore position often!"
text = clean(text)

with open("tokenizer.pickle","rb") as token_handle:
    token = pickle.load(token_handle)

test = token.texts_to_sequences([text])
test_padded = pad_sequences(test, maxlen=maxlen)
#print(test_padded.shape)
model.predict(test_padded)

array([[0.43012494]], dtype=float32)

In [31]:
data_1.value_counts()

0    6676
1    1999
Name: Extrovert, dtype: int64

In [32]:
data_2.value_counts()

0    7478
1    1197
Name: Sensitive, dtype: int64

In [33]:
data_3.value_counts()

0    4694
1    3981
Name: Thinking, dtype: int64

In [34]:
data_4.value_counts()

0    5241
1    3434
Name: Judging, dtype: int64