In [20]:
import torch
import os
import gc
import pickle
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
import torch.optim as optim
from sklearn.model_selection import train_test_split
import torchtext.data
import spacy
from torchtext.data import Field, BucketIterator, Example, Dataset
import matplotlib.pyplot as plt

In [3]:
torch.manual_seed(42)

<torch._C.Generator at 0x107cc9af0>

In [4]:
#filepaths
clean_img_path = 'clean_img'
clean_txt_path = 'clean_txt'
clean_entity_path = 'clean_entity'

In [31]:
#collect all records into a list
records = []
clean_imgs = torch.load(os.path.join(clean_img_path, 'clean.pt'))

for i, name in enumerate(os.listdir(clean_txt_path)):
    name, __ = os.path.splitext(name)
    img = clean_imgs[i]
    txt_filepath = os.path.join(clean_txt_path, name + '.txt')
    txt_file = open(txt_filepath, 'rb')
    text = txt_file.read()
    txt_file.close()
    ent_filepath = os.path.join(clean_entity_path, name + '.pkl')
    #print(ent_filepath)
    #ent_file = open(ent_filepath, 'rb')
    try:
        with open(ent_filepath, 'rb') as my_file:
            unpickler = pickle.Unpickler(my_file)
            entities = unpickler.load()
    except EOFError:
        print(ent_filepath)
        print('An EOFError exception occurred. The file is empty')
    #entities = pickle.load(ent_file)
    #record = [img, text, entities]
    record = [text, entities]
    records.append(record)

In [32]:
TEXT = Field(sequential=True, tokenize=lambda x: x)
ENTITIES = Field(sequential=True, tokenize=lambda x: x.split())

# Create examples from dataset
data_examples = [Example.fromlist([text, entities], fields=[('text', TEXT), ('entities', ENTITIES)]) for text, entities in records]
data_set = Dataset(data_examples, fields=[('text', TEXT), ('entities', ENTITIES)])

# Build vocab (if necessary)
TEXT.build_vocab(data_set)

train_set, valid_set, test_set = data_set.split(split_ratio=[0.7, 0.15, 0.15])

In [33]:
test_set[0].entities #name, date, address, total

['MR. D.I.Y. (M) SDN BHD',
 '19-04-18',
 'LOT 1851-A & 1851-B, JALAN KPB 6, KAWASAN PERINDUSTRIAN BALAKONG, 43300 SERI KEMBANGAN, SELANGOR',
 'RM 34.80']

In [34]:
train_iterator, val_iterator, test_iterator = BucketIterator.splits(
    (train_set, valid_set, test_set),
    batch_size=32,
    sort_key=lambda x: len(x.text),
    sort_within_batch=True
)
print(len(train_iterator))
print(len(val_iterator))
print(len(test_iterator))

22
5
5


In [35]:
class EXT(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EXT, self).__init__()
        self.emb = nn.Embedding(input_size, input_size)
        self.hidden_size = hidden_size
        self.rnn = nn.GRU(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, 8)

    def forward(self, x):
        x = self.emb(x)
        h0 = torch.zeros(1, len(x), self.hidden_size)  # Initial hidden state
        out, _ = self.rnn(x, h0) #[batch index, char index, vocab index]
        out = self.fc(out) #[batch index, char index, 8]
        return out #The output_probs tensor contains probabilities for each class at each position in the sequence

In [36]:
#probabilities of [start name, end name, start date, end date, start address, end address, start total, end total]
#return tensor of ground truth probabilities
def get_ground_probs(text, ents):
    name_start = torch.zeros(1, len(text))
    name_end = torch.zeros(1, len(text))
    date_start = torch.zeros(1, len(text))
    date_end = torch.zeros(1, len(text))
    address_start = torch.zeros(1, len(text))
    address_end = torch.zeros(1, len(text))
    total_start = torch.zeros(1, len(text))
    total_end = torch.zeros(1, len(text))
    name_start_index = text.index(ents[0])
    name_end_index = name_start_index + len(ents[0])
    date_start_index = text.index(ents[1])
    date_end_index = date_start_index + len(ents[1])
    address_start_index = text.index(ents[2])
    address_end_index = date_start_index + len(ents[2])
    total_start_index = text.index(ents[3])
    total_end_index = total_start_index + len(ents[3])
    name_start[name_start_index] = 1
    name_end[name_end_index] = 1
    date_start[date_start_index] = 1
    date_end[date_end_index] = 1
    address_start[address_start_index] = 1
    address_end[address_end_index] = 1
    total_start[total_start_index] = 1
    total_end[total_end_index] = 1
    return torch.stack([name_start, name_end, date_start, date_end, address_start, address_end, total_start, total_end])

def get_batch_ground_probs(batch_text, batch_ents):
    ans = []
    for text, ents in zip(batch_text, batch_ents):
        ans.append(get_ground_probs(text, ents))
    return torch.stack(ans)

In [37]:
def cer(reference, hypothesis):
    # Convert the sentences into character lists
    ref = list(reference)
    hyp = list(hypothesis)

    # Create a matrix of size (len(ref)+1) x (len(hyp)+1)
    d = np.zeros((len(ref) + 1) * (len(hyp) + 1), dtype=np.uint8)
    d = d.reshape((len(ref) + 1, len(hyp) + 1))

    # Initialize the first row and column to be the distance from the empty string
    for i in range(len(ref) + 1):
        d[i][0] = i
    for j in range(len(hyp) + 1):
        d[0][j] = j

    # Populate the rest of the matrix
    for i in range(1, len(ref) + 1):
        for j in range(1, len(hyp) + 1):
            if ref[i - 1] == hyp[j - 1]:
                cost = 0
            else:
                cost = 1
            d[i][j] = min(d[i - 1][j] + 1,      # deletion
                          d[i][j - 1] + 1,      # insertion
                          d[i - 1][j - 1] + cost)  # substitution

    # The CER is the cost of transforming hypothesis into reference divided by the number of characters in the reference
    cer_value = float(d[len(ref)][len(hyp)]) / len(ref)

    return cer_value

In [None]:
def get_model_name(name, batch_size, learning_rate, epoch):
  return "model_{}_bs{}_lr{}_epoch{}".format(name, batch_size, learning_rate, epoch)

#based on CER
def get_accuracy(model, data_itr):
    correct = 0
    total = 0
    for batch_text, batch_ents in data_itr:
        output = model(batch_text)

        #select index with maximum prediction score
        pred = output.max(1, keepdim=True)[1]
        correct += pred.eq(labels.view_as(pred)).sum().item()
        total += imgs.shape[0]
    return correct / total

In [115]:
def train_model(model, train_set, val_set, batch_size=32, num_epochs=5, learning_rate=1e-5):
    # DataLoaders for train and validation sets
    # Create BucketIterator
    train_itr = BucketIterator(train_set, batch_size=2, sort_key=lambda x: len(x.text),
                                    sort_within_batch=True, device=torch.device('cpu'), shuffle=True,
                                    train=True, sort=True)
    val_itr = BucketIterator(val_set, batch_size=2, sort_key=lambda x: len(x.text),
                                    sort_within_batch=True, device=torch.device('cpu'), shuffle=True,
                                    train=True, sort=True)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-4)
    train_loss, val_loss, train_acc, valid_acc = [], [], [], []
    epochs = []

    for epoch in range(num_epochs):
        for i, (texts, ents) in enumerate(train_itr):
            optimizer.zero_grad()
            #print(batch.sms[0])
            assert(torch.is_tensor(batch.sms[0]))
            pred = model(batch.sms[0])
            loss = criterion(pred, batch.label)
            loss.backward()
            optimizer.step()

        train_loss.append(get_loss(model, train_iter, criterion))
        val_loss.append(get_loss(model, valid_iter, criterion))

        epochs.append(epoch)
        train_acc.append(get_accuracy(model, train_iter, batch_size))
        valid_acc.append(get_accuracy(model, valid_iter, batch_size))
        print("Epoch %d; Train Loss %f; Val Loss %f Train Acc %f; Val Acc %f" % (
            epoch+1, train_loss[-1], val_loss[-1], train_acc[-1], valid_acc[-1]))

    # plotting
    plt.title("Training Curve")
    plt.plot(epochs, train_loss, label="Train")
    plt.plot(epochs, val_loss, label="Validation")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.show()

    plt.title("Training Curve")
    plt.plot(epochs, train_acc, label="Train")
    plt.plot(epochs, valid_acc, label="Validation")
    plt.xlabel("Epoch")
    plt.ylabel("Accuracy")
    plt.legend(loc='best')
    plt.show()

In [117]:
george = EXT(len(char_vocab), 144, len(char_vocab))

train_model(george, train_set, val_set)

AttributeError: 'tuple' object has no attribute 'dim'