In [1]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
from matplotlib.pyplot import *
import seaborn as sns
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

%matplotlib inline
from nltk import tokenize

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.python.keras.layers import Layer, InputSpec
from tensorflow.keras import initializers as initializers, regularizers, constraints
from tensorflow.python.keras.utils.np_utils import to_categorical
from tensorflow.keras import optimizers
from tensorflow.keras.models import Model

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\65873\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\65873\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\65873\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
data=pd.read_csv('mbti_1.csv')
data.head()

Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...


## Data Cleaning

the preprocessing include lemmatization and removes stop words, do not do stemming 

In [3]:
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords 
from nltk import word_tokenize

stemmer = PorterStemmer()
lemmatiser = WordNetLemmatizer() 
cachedStopWords = stopwords.words("english")

def cleaning_data(data, remove_stop_words=True):
    list_posts = []
    i=0   
    for row in data.iterrows():
        posts = row[1].posts
        temp = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', posts) #remove urls
        temp = re.sub("[^a-zA-Z.]", " ", temp) #remove all punctuations except fullstops.
        temp = re.sub(' +', ' ', temp).lower() 
        temp=re.sub(r'\.+', ".", temp) #remove multiple fullstops.
        if remove_stop_words:
            temp = " ".join([lemmatiser.lemmatize(w) for w in temp.split(' ') if w not in cachedStopWords])
        else:
            temp = " ".join([lemmatiser.lemmatize(w) for w in temp.split(' ')])
        list_posts.append(temp)

    text = np.array(list_posts)
    return text

In [4]:
clean_text = cleaning_data(data, remove_stop_words=True)
data['clean_text']=clean_text
data = data[['clean_text', 'type']]
data.head()

Unnamed: 0,clean_text,type
0,enfp intj moment sportscenter top ten play pr...,INFJ
1,finding lack post alarming. sex boring positi...,ENTP
2,good one course say know blessing curse. abso...,INTP
3,dear intp enjoyed conversation day. esoteric ...,INTJ
4,fired. another silly misconception. approachi...,ENTJ


In [5]:
types=data['type']
text=data['clean_text']
tps=data.groupby('type')
print("total types:",tps.ngroups)
print(tps.size())

total types: 16
type
ENFJ     190
ENFP     675
ENTJ     231
ENTP     685
ESFJ      42
ESFP      48
ESTJ      39
ESTP      89
INFJ    1470
INFP    1832
INTJ    1091
INTP    1304
ISFJ     166
ISFP     271
ISTJ     205
ISTP     337
dtype: int64


## word embedding

In [18]:
sentences=[]

In [19]:
for post in data['clean_text']:
    post=post.strip(' ').split('.')
    for s in post:
        if s.strip(' ').split(' ')!=['']:
            sentences.append(s.strip(' ').split(' '))

In [20]:
# Creating the model and setting values for the various parameters
num_features = 250  # Word vector dimensionality
min_word_count = 1 # Minimum word count
num_workers = 4     # Number of parallel threads
context = 7      # Context window size
downsampling = 1e-3 # (0.001) Downsample setting for frequent words

In [21]:
# Initializing the train model
from gensim.models import word2vec
print("Training model....")
model = word2vec.Word2Vec(sentences,\
                          workers=num_workers,\
                          vector_size=num_features,\
                          min_count=min_word_count,\
                          window=context,
                          sample=downsampling)

Training model....


In [22]:
# To make the model memory efficient
model.init_sims(replace=True)

# Saving the model for later use. Can be loaded using Word2Vec.load()
model_name = "100features_10context"
model_path = F"C:/Users/65873/Downloads/NLP project/classification{model_name}"
model.save(model_path)
print("model saved")


  model.init_sims(replace=True)


model saved


## import the saved model

In [6]:
from gensim.models import word2vec
model_name = "100features_10context"
model_path = F"C:/Users/65873/Downloads/NLP project/classification{model_name}"
embed_model = word2vec.Word2Vec.load(model_path)
embed_model.wv.most_similar('horror')

[('comedy', 0.8270348906517029),
 ('movies', 0.8157392144203186),
 ('disney', 0.8114413022994995),
 ('film', 0.8023772835731506),
 ('animated', 0.8007003664970398),
 ('thriller', 0.7653870582580566),
 ('pixar', 0.7639893293380737),
 ('films', 0.7565824389457703),
 ('cartoon', 0.7564561367034912),
 ('television', 0.7562302947044373)]

## Input 

In [7]:
sequences=[]
for post in data['clean_text']:
    p=[]
    post=post.strip(' ').split('.')
    for s in post:
        if s.strip(' ').split(' ')!=['']:
            p+=s.strip(' ').split(' ')
    sequences.append(p)

In [8]:
embed_model.wv['shake']
padvec=np.zeros(250)
input_sequences=[]
for i in range(len(sequences)):
    post=[]
    for j in range(len(sequences[i])):
        if sequences[i][j] in embed_model.wv:
            post.append(embed_model.wv[sequences[i][j]])
    if len(post)<300:
        for k in range(300-len(post)):
            post.append(padvec)
    post=np.array(post[:300])
    input_sequences.append(post)

In [62]:
len(input_sequences)

8675

In [9]:
from numpy import asarray
from sklearn.preprocessing import OneHotEncoder
oh=[[i] for i in data['type']]
# define one hot encoding
encoder = OneHotEncoder(sparse=False)
# transform data
onehot = encoder.fit_transform(oh)
print(onehot)


[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [10]:
typecode = {"type":{"INFJ": 1, "INFP": 2, "INTJ":3, "INTP":4, "ISFJ":5, "ISFP":6, 
                   "ISTJ":7, "ISTP":8, "ENFJ":9, "ENFP":10, "ENTJ":11, "ENTP":12, "ESFJ":13, "ESFP":14, "ESTJ":15, "ESTP":16}}

In [11]:
data = data.replace(typecode)


In [12]:
data['type']

0        1
1       12
2        4
3        3
4       11
        ..
8670     6
8671    10
8672     4
8673     2
8674     2
Name: type, Length: 8675, dtype: int64

In [13]:
ys=np.array(onehot)
xs=np.array(input_sequences)

## train test split

In [14]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(xs, ys, test_size=0.20, random_state=1, stratify=ys)

## build model and train, oveall

In [16]:
model = Sequential()
model.add(tf.keras.Input(shape=(300,250)))
model.add(tf.keras.layers.Conv1D(32, (30), padding='same', activation='relu'))
model.add(tf.keras.layers.Conv1D(32, (20), activation='relu'))
model.add(tf.keras.layers.MaxPooling1D(pool_size =4 , strides = 2))
model.add(tf.keras.layers.Dropout(0.3)) 

model.add(tf.keras.layers.Conv1D(64, (10), padding='same', activation='relu'))
model.add(tf.keras.layers.Conv1D(64, (5), activation='relu'))
model.add(tf.keras.layers.MaxPooling1D(pool_size =4 , strides = 2))
model.add(tf.keras.layers.Dropout(0.3)) 

model.add(LSTM(64,return_sequences=True))
model.add(tf.keras.layers.Dropout(0.2)) 
model.add(LSTM(128))
#model.add(Bidirectional(LSTM(64)))

model.add(Dense(16, activation = 'softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['acc'])
history = model.fit(x_train, y_train, epochs = 3, validation_data=(x_test, y_test), verbose = 1)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [17]:
score, acc = model.evaluate(x_test, y_test)

