In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
import matplotlib.pyplot as plt
import io
import os
import re
import time
import string
import torch
import torchtext
import pickle as pk
import torch.nn as nn
import torch.nn.functional as F

from torch.utils.data import DataLoader, Dataset
from torch.utils.data.dataset import random_split

from torchtext.vocab import build_vocab_from_iterator
from torchtext.utils import unicode_csv_reader
from torchtext.data.utils import get_tokenizer, ngrams_iterator


# n-gram order for the text features and mini-batch size for the DataLoaders.
NGRAMS = 2
BATCH_SIZE = 16

# Prefer the GPU when one is visible to PyTorch, otherwise fall back to CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# The sixteen MBTI type codes; a type's position in this list is its class id.
_types = [
    'INFJ', 'ENTP', 'INTP', 'INTJ',
    'ENTJ', 'ENFJ', 'INFP', 'ENFP',
    'ISFP', 'ISTP', 'ISFJ', 'ISTJ',
    'ESTP', 'ESFP', 'ESTJ', 'ESFJ',
]
# Reverse lookup: type code -> class id.
type_dict = dict(zip(_types, range(len(_types))))

In [2]:
# Bare expression: shows the selected torch device as the cell output.
device

device(type='cuda')

In [3]:
def csv_reader(file_path, ngrams, yield_label=False):
    """Lazily read a CSV file and yield n-gram iterators, one per data row.

    The first row is treated as a header and skipped. For every remaining
    row, column 0 is the label and all further columns are joined with a
    space, tokenized with the "basic_english" tokenizer and expanded with
    ``ngrams_iterator``.

    Args:
        file_path: path of the UTF-8 encoded CSV file.
        ngrams: n-gram order passed to ``ngrams_iterator``.
        yield_label: when True, yield ``(label_id, ngram_iterator)`` pairs,
            where ``label_id`` is looked up in ``type_dict``; otherwise
            yield only the n-gram iterator.

    Raises:
        KeyError: if ``yield_label`` is True and a row's first column is not
            a key of ``type_dict``.
    """
    tokenizer = get_tokenizer("basic_english")
    with io.open(file_path, encoding="utf8") as file:
        reader = unicode_csv_reader(file)
        # Skip the header row. The default keeps an empty file from raising
        # StopIteration inside this generator, which Python 3.7+ turns into
        # a RuntimeError (PEP 479); an empty file now simply yields nothing.
        next(reader, None)
        for line in reader:
            # Blank rows have no label column to index; ignore them.
            if not line:
                continue
            tokens = tokenizer(' '.join(line[1:]))
            if yield_label:
                yield type_dict[line[0]], ngrams_iterator(tokens, ngrams)
            else:
                yield ngrams_iterator(tokens, ngrams)
                

def build_data(vocab, iterator, include_unk):
    """Convert ``(label, tokens)`` pairs into ``(label, id_tensor)`` examples.

    Args:
        vocab: mapping used as ``vocab[token]`` to turn a token into an id.
            When ``include_unk`` is False it is also queried as
            ``vocab['<unk>']`` to find the id to drop (assumes the vocabulary
            defines an ``'<unk>'`` entry, as legacy torchtext vocabularies
            do — TODO confirm for other vocab types).
        iterator: iterable of ``(label, tokens)`` pairs.
        include_unk: keep ids equal to the unknown-token id when True,
            filter them out when False.

    Returns:
        ``(data, labels)`` where ``data`` is a list of
        ``(label, torch.LongTensor)`` tuples in input order and ``labels`` is
        the set of distinct labels seen.
    """
    # Looked up once; only needed when unknown tokens must be dropped.
    unk_id = None if include_unk else vocab['<unk>']

    data = []
    labels = []
    # Use tqdm to show building speed
    with tqdm(unit_scale=0, unit='lines') as t:
        for label, tokens in iterator:
            token_ids = [vocab[token] for token in tokens]
            if not include_unk:
                # Compare ids with ids. The previous code compared ids to the
                # string '<unk>' and had a misplaced parenthesis, so this
                # branch always raised TypeError.
                token_ids = [idx for idx in token_ids if idx != unk_id]
            # Pin the dtype: torch.tensor([]) would otherwise be float32 and
            # not match the integer tensors of the other rows.
            tokens = torch.tensor(token_ids, dtype=torch.long)

            # Update data and labels
            data.append((label, tokens))
            labels.append(label)
            t.update(1)
    return data, set(labels)

In [4]:
class postsDataset(Dataset):
    """In-memory dataset that wraps a list of examples.

    It stores the vocabulary, the example list and the label collection
    exactly as passed in, and exposes the examples through the usual
    ``Dataset`` protocol (indexing, length, iteration).
    """

    def __init__(self, vocab, data, labels):
        super().__init__()
        self._vocab, self._data, self._labels = vocab, data, labels

    def __len__(self):
        """Number of stored examples."""
        return len(self._data)

    def __getitem__(self, i):
        """Return the example stored at position ``i``."""
        return self._data[i]

    def __iter__(self):
        """Yield the stored examples in order."""
        yield from self._data

    def get_vocab(self):
        """Return the vocabulary object given at construction."""
        return self._vocab

    def get_labels(self):
        """Return the label collection given at construction."""
        return self._labels
    

def build_dataset(train_name, ngrams, include_unk=True):
    """Build a ``postsDataset`` from a CSV file.

    The file is read twice: a first pass over the unlabelled n-gram stream
    builds the vocabulary, a second pass over the labelled stream converts
    every row into a ``(label, tensor)`` example.

    Args:
        train_name: path of the CSV file.
        ngrams: n-gram order forwarded to ``csv_reader``.
        include_unk: forwarded to ``build_data``.

    Returns:
        A ``postsDataset`` holding the vocabulary, the examples and the set
        of labels returned by ``build_data``.
    """
    # Pass 1: vocabulary from the token stream alone.
    token_stream = csv_reader(train_name, ngrams)
    vocab = build_vocab_from_iterator(token_stream)

    # Pass 2: numericalise the labelled rows with that vocabulary.
    labelled_stream = csv_reader(train_name, ngrams, yield_label=True)
    examples, label_set = build_data(vocab, labelled_stream, include_unk)

    return postsDataset(vocab, examples, label_set)

In [5]:
# Use the shared NGRAMS constant rather than a literal so the n-gram order is
# defined in exactly one place.
train_data = build_dataset('pickles/sep_mbti.csv', NGRAMS)

8675lines [00:10, 789.03lines/s]
8675lines [00:17, 482.18lines/s]


In [7]:
class MbtiClf(nn.Module):
    """Bag-of-embeddings text classifier.

    An ``nn.EmbeddingBag`` (sparse gradients) reduces each bag of token ids
    to a single ``embed_dim`` vector, which a linear layer maps to
    ``num_class`` scores.
    """

    def __init__(self, vocab_size, embed_dim, num_class):
        super().__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        """Re-initialise both weight matrices uniformly in [-0.5, 0.5] and zero the bias."""
        bound = 0.5
        for weight in (self.embedding.weight, self.fc.weight):
            weight.data.uniform_(-bound, bound)
        self.fc.bias.data.zero_()

    def forward(self, text, offsets):
        """Return class scores for the bags in ``text`` delimited by ``offsets``."""
        return self.fc(self.embedding(text, offsets))

In [8]:
VOCAB_SIZE = len(train_data.get_vocab())
EMBED_DIM = 32
# Class ids are positions in `_types` (see `type_dict`), so the output layer
# must have one unit per type. Counting only the labels that occur in the
# data would make the layer too small whenever a type is missing from the
# file, and the loss would then see out-of-range targets.
NUM_CLASS = len(_types)
NUN_CLASS = NUM_CLASS  # original (misspelled) name, kept as an alias
model = MbtiClf(VOCAB_SIZE, EMBED_DIM, NUM_CLASS).to(device)

In [9]:
# Bare expression: shows the module structure as the cell output.
model

MbtiClf(
  (embedding): EmbeddingBag(2113142, 32, mode=mean)
  (fc): Linear(in_features=32, out_features=16, bias=True)
)

In [10]:
def generate_batch(batch):
    """Collate ``(label, token_tensor)`` entries into flat EmbeddingBag inputs.

    Args:
        batch: sequence of entries where ``entry[0]`` is the label and
            ``entry[1]`` is a 1-D tensor of token ids.

    Returns:
        ``(text, offsets, label)``: all token tensors concatenated, the start
        index of each entry inside that concatenation, and the labels as a
        tensor.
    """
    labels, texts = [], []
    for entry in batch:
        labels.append(entry[0])
        texts.append(entry[1])
    label = torch.tensor(labels)

    # Start positions are the running sum of the preceding lengths:
    # lengths [3, 1, 2] -> [0, 3, 1] -> cumsum -> [0, 3, 4].
    lengths = [0]
    lengths.extend(len(tokens) for tokens in texts)
    offsets = torch.tensor(lengths[:-1]).cumsum(dim=0)

    return torch.cat(texts), offsets, label

In [11]:

def train_func(sub_train_):
    """Run one training epoch over ``sub_train_`` and step the LR scheduler.

    Uses the notebook-level ``model``, ``optimizer``, ``criterion``,
    ``scheduler`` and ``device``.

    Returns:
        ``(loss, accuracy)``: the sum of the per-batch criterion values
        divided by the number of samples, and the fraction of samples whose
        arg-max prediction equals the label.
    """
    loader = DataLoader(sub_train_, batch_size=BATCH_SIZE, shuffle=True,
                        collate_fn=generate_batch)

    running_loss = 0
    n_correct = 0
    for text, offsets, cls in loader:
        optimizer.zero_grad()
        text = text.to(device)
        offsets = offsets.to(device)
        cls = cls.to(device)

        output = model(text, offsets)
        loss = criterion(output, cls)
        running_loss += loss.item()

        loss.backward()
        optimizer.step()

        n_correct += (output.argmax(1) == cls).sum().item()

    # One scheduler step per call, i.e. per epoch.
    scheduler.step()

    n_samples = len(sub_train_)
    return running_loss / n_samples, n_correct / n_samples

def test(data_):
    loss = 0
    acc = 0
    data = DataLoader(data_, batch_size=BATCH_SIZE, collate_fn=generate_batch)
    for text, offsets, cls in data:
        text, offsets, cls = text.to(device), offsets.to(device), cls.to(device)
        with torch.no_grad():
            output = model(text, offsets)
            loss = criterion(output, cls)
            loss += loss.item()
            acc += (output.argmax(1) == cls).sum().item()

    return loss / len(data_), acc / len(data_)

In [12]:

N_EPOCHS = 100
# NOTE(review): assigned here but never read or updated in this cell.
min_valid_loss = float('inf')

criterion = torch.nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=4.0)
# step_size=1, gamma=0.9; train_func() calls scheduler.step() once per epoch.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.9)

# Hold out 5% of the examples for validation.
train_len = int(len(train_data) * 0.95)
valid_len = len(train_data) - train_len
sub_train_, sub_valid_ = random_split(train_data, [train_len, valid_len])

for epoch in range(N_EPOCHS):

    start_time = time.time()
    train_loss, train_acc = train_func(sub_train_)
    valid_loss, valid_acc = test(sub_valid_)

    # Wall-clock time of this epoch, split into minutes and seconds.
    elapsed = int(time.time() - start_time)
    mins, secs = elapsed / 60, elapsed % 60

    # Report only every fifth epoch to keep the output short.
    if epoch % 5 != 0:
        continue
    print('Epoch: %d' %(epoch), " | time in %d minutes, %d seconds" %(mins, secs))
    print(f'\tLoss: {train_loss:.4f}(train)\t|\tAcc: {train_acc * 100:.1f}%(train)')
    print(f'\tLoss: {valid_loss:.4f}(valid)\t|\tAcc: {valid_acc * 100:.1f}%(valid)')

Epoch: 0  | time in 0 minutes, 1 seconds
	Loss: 0.1473(train)	|	Acc: 19.0%(train)
	Loss: 0.0101(valid)	|	Acc: 0.9%(valid)
Epoch: 5  | time in 0 minutes, 1 seconds
	Loss: 0.1267(train)	|	Acc: 34.4%(train)
	Loss: 0.0056(valid)	|	Acc: 23.0%(valid)
Epoch: 10  | time in 0 minutes, 1 seconds
	Loss: 0.1064(train)	|	Acc: 50.3%(train)
	Loss: 0.0045(valid)	|	Acc: 38.9%(valid)
Epoch: 15  | time in 0 minutes, 1 seconds
	Loss: 0.0954(train)	|	Acc: 58.0%(train)
	Loss: 0.0054(valid)	|	Acc: 38.2%(valid)
Epoch: 20  | time in 0 minutes, 1 seconds
	Loss: 0.0898(train)	|	Acc: 61.3%(train)
	Loss: 0.0042(valid)	|	Acc: 52.8%(valid)
Epoch: 25  | time in 0 minutes, 1 seconds
	Loss: 0.0864(train)	|	Acc: 63.2%(train)
	Loss: 0.0032(valid)	|	Acc: 52.1%(valid)
Epoch: 30  | time in 0 minutes, 1 seconds
	Loss: 0.0844(train)	|	Acc: 64.0%(train)
	Loss: 0.0041(valid)	|	Acc: 52.3%(valid)
Epoch: 35  | time in 0 minutes, 1 seconds
	Loss: 0.0834(train)	|	Acc: 64.8%(train)
	Loss: 0.0044(valid)	|	Acc: 53.9%(valid)
Epoch: 40  

In [13]:
# print('Checking the results of test dataset...')
# test_loss, test_acc = test(test_dataset)
# print(f'\tLoss: {test_loss:.4f}(test)\t|\tAcc: {test_acc * 100:.1f}%(test)')

In [32]:
# Load the per-type explanation table; it is indexed by the predicted type
# code when the prediction is printed.
# NOTE(review): unpickling can execute arbitrary code — only load this file
# from a trusted source.
with open('pickles/type_explanation.pk', 'rb') as pkl:
    type_explanation = pk.load(pkl)
    
def predict(text, model, vocab, ngrams):
    """Return the arg-max class index ``model`` assigns to ``text``.

    Args:
        text: raw input string.
        model: callable taking ``(token_ids, offsets)`` and returning a
            ``(1, num_class)`` score tensor.
        vocab: mapping used as ``vocab[token]`` to turn tokens into ids.
        ngrams: n-gram order passed to ``ngrams_iterator``.

    Returns:
        The predicted class index as a Python int.
    """
    tokenizer = get_tokenizer("basic_english")
    token_ids = [vocab[token]
                 for token in ngrams_iterator(tokenizer(text), ngrams)]

    # Build the inputs on the device that holds the model's parameters, so
    # the call works whether or not the model was moved back to the CPU.
    # Models without parameters fall back to the CPU.
    params = model.parameters() if hasattr(model, "parameters") else ()
    first_param = next(iter(params), None)
    target = first_param.device if first_param is not None else torch.device("cpu")

    with torch.no_grad():
        # Pin the dtype: with no tokens, torch.tensor([]) would default to
        # float32, which is not a valid index tensor.
        indices = torch.tensor(token_ids, dtype=torch.long, device=target)
        offsets = torch.tensor([0], dtype=torch.long, device=target)
        output = model(indices, offsets)
        return output.argmax(1).item()

test_post = "Can’t believe how rude this guy is busting shapes during Boris’ speech smh"

vocab = train_data.get_vocab()
# Move the model to the CPU for single-example inference.
model = model.to("cpu")
# Use the shared NGRAMS constant rather than a literal so the n-gram order is
# defined in exactly one place.
predict_type = _types[predict(test_post, model, vocab, NGRAMS)]

print("Type: %s" % predict_type)
# Print each explanation entry for the predicted type on its own line.
for i in type_explanation[predict_type]:
    print(i)

Type: INTJ
1.具强大动力与本意来达成目的与创意—固执顽固者
2.有宏大的愿景且能快速在众多外界事件中找出有意义的模范
3.对所承负职务，具良好能力于策划工作并完成
4.具怀疑心、挑剔性、独立性、果决，对专业水准及绩效要求高
