In [1]:
import time
import numpy as np
from sklearn.utils import shuffle

import torch
from torch import nn
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from torch.optim.lr_scheduler import StepLR
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

cuda


# Part 1

In [2]:
train_indx, train_word, train_gold = [], [], []
with open('./data/train') as f:
    for line in f.readlines():
        split = line.split()
        if not split: continue
        train_indx.append(int(split[0])), train_word.append(split[1]), train_gold.append(split[2])
        
dev_indx, dev_word, dev_gold = [], [], []
with open('./data/dev') as f:
    for line in f.readlines():
        split = line.split()
        if not split: continue
        dev_indx.append(int(split[0])), dev_word.append(split[1]), dev_gold.append(split[2])

test_indx, test_word, test_gold = [], [], []
with open('./data/test') as f:
    for line in f.readlines():
        split = line.split()
        if not split: continue
        test_indx.append(int(split[0])), test_word.append(split[1]), test_gold.append('O')

In [132]:
train_counts = np.unique(train_word, return_counts=True)
for indx, cnts in enumerate(train_counts[1]):
    if cnts <= 3:
        train_word[indx] = 'unk'

In [133]:
# creating encoders
x_encoder = dict((val, indx) for indx, val in enumerate(sorted(set(train_word))))
x_encoder_reverse = dict((val, key) for key,val in x_encoder.items())
unk_value = x_encoder['unk']

y_encoder = dict((val, indx) for indx, val in enumerate(sorted(set(train_gold))))
y_encoder_reverse = dict((val, key) for key,val in y_encoder.items())

# encoding train
train_x = [x_encoder.get(token, unk_value) for token in train_word]
train_y = [y_encoder.get(token) for token in train_gold]

# encoding dev
dev_x = [x_encoder.get(token, unk_value) for token in dev_word]
dev_y = [y_encoder.get(token) for token in dev_gold]

# encoding test
test_x = [x_encoder.get(token, unk_value) for token in test_word]
test_y = [y_encoder.get(token) for token in test_gold]

In [31]:
# making sentences
train_x_sentences, train_y_sentences, temp_x, temp_y = [], [], [], []
for i,indx in enumerate(train_indx):
    if indx==1 and i>0:
        train_x_sentences.append(torch.tensor(temp_x, device=device, dtype=torch.long))
        train_y_sentences.append(torch.tensor(temp_y, device=device, dtype=torch.long))
        temp_x, temp_y = [], []
    temp_x.append(train_x[i])
    temp_y.append(train_y[i])
    
train_x_sentences.append(torch.tensor(temp_x, device=device, dtype=torch.long))
train_y_sentences.append(torch.tensor(temp_y, device=device, dtype=torch.long))


dev_x_sentences, dev_y_sentences, temp_x, temp_y = [], [], [], []
for i,indx in enumerate(dev_indx):
    if indx==1 and i>0:
        dev_x_sentences.append(torch.tensor(temp_x, device=device, dtype=torch.long))
        dev_y_sentences.append(torch.tensor(temp_y, device=device, dtype=torch.long))
        temp_x, temp_y = [], []
    temp_x.append(dev_x[i])
    temp_y.append(dev_y[i])

dev_x_sentences.append(torch.tensor(temp_x, device=device, dtype=torch.long))
dev_y_sentences.append(torch.tensor(temp_y, device=device, dtype=torch.long))
    
test_x_sentences, test_y_sentences, temp_x, temp_y = [], [], [], []
for i,indx in enumerate(test_indx):
    if indx==1 and i>0:
        test_x_sentences.append(torch.tensor(temp_x, device=device, dtype=torch.long))
        test_y_sentences.append(torch.tensor(temp_y, device=device, dtype=torch.long))
        temp_x, temp_y = [], []
    temp_x.append(test_x[i])
    temp_y.append(test_y[i])

test_x_sentences.append(torch.tensor(temp_x, device=device, dtype=torch.long))
test_y_sentences.append(torch.tensor(temp_y, device=device, dtype=torch.long))

In [32]:
def shuffle_pad_batch(x_sentences, y_sentences, batch_size, padding_value, test_set):
    num_batches = len(x_sentences) // batch_size
    if not test_set:
        x_sentences, y_sentences = shuffle(train_x_sentences, train_y_sentences)

    data = []
    for i in range(1, num_batches+1):
        start, stop = (i-1)*batch_size, i*batch_size
        x = pad_sequence(x_sentences[start:stop],batch_first=True, padding_value=padding_value)
        y = pad_sequence(y_sentences[start:stop],batch_first=True, padding_value=padding_value)
        lengths = [len(j) for j in x_sentences[start:stop]]
        data.append(((x,lengths),y))
    x = pad_sequence(x_sentences[stop:],batch_first=True, padding_value=padding_value)
    y = pad_sequence(y_sentences[stop:],batch_first=True, padding_value=padding_value)
    lengths = [len(j) for j in x_sentences[stop:]]

    data.append(((x,lengths),y))
    
    return data

In [33]:
def train(dataloader, model, loss_fn, optimizer, scheduler):
    size = len(dataloader)
    epoch_loss = 0
    
    model.train()
    for batch, data in enumerate(dataloader):
        x, lengths, y = data[0][0], data[0][1], data[1]
        pred = model(x, lengths)
        loss = loss_fn(pred.float(), y.flatten())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

#         if batch % 100 == 0:
#             print('Loss: {:.4f}, Sample: {}/{}'.format(loss.item(), batch*len(x), size))
        
        epoch_loss+=loss.item()
    
    scheduler.step()
    
    return (epoch_loss / batch+1)

def test(dataloader, model, loss_fn, padding_value):
    num_batches = len(dataloader)
    epoch_loss = 0
    correct = 0
    size=0
    y_pred = []
    
    model.eval()
    with torch.no_grad():
        for batch, data in enumerate(dataloader):
            x, lengths, y = data[0][0], data[0][1], data[1]
            pred = model(x, lengths)
            loss = loss_fn(pred.float(), y.flatten())
            epoch_loss += loss.item()
            
            pred = pred.argmax(1)
            y = y.flatten()
            
            # accuracy calculation
            for indx, num in enumerate(y):
                if num==padding_value: 
                    continue
                if pred[indx] == num:
                    correct+=1
                size+=1
                y_pred.append(pred[indx].item())
                
    print('Loss: {:.3f} Accuracy: {:.2f}%'.format((epoch_loss/num_batches),(100*correct/size)))
    
    return (epoch_loss/num_batches), y_pred

In [34]:
class LSTM(nn.Module):
    def __init__(self, input_size, output_size, hidden_size, num_layers, batch_first, num_embeddings, embedding_dim, padding_value, dropout, bidirectional):
        super(LSTM, self).__init__()
        
        self.embedding = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_value)
    
        self.hidden_size = hidden_size
        self.device = device
        
        if bidirectional: 
            fc_size = hidden_size*2
        else:
            fc_size = hidden_size

        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers,bidirectional=bidirectional, batch_first=batch_first)
        self.dropout = nn.Dropout(dropout)
        self.fc1 = nn.Linear(fc_size, 128)
        self.fc2 = nn.Linear(128, output_size)
        self.ELU = nn.ELU()    

    def forward(self, x, lengths):
        x = self.embedding(x)
        x = self.dropout(x)
        x = pack_padded_sequence(x, lengths=lengths, batch_first=True, enforce_sorted=False)
        x, _ = self.lstm(x)
        x, _ = pad_packed_sequence(x, batch_first=True)
        x = self.dropout(x)
        x = self.fc1(x)
        x = self.ELU(x)
        x = self.fc2(x)
        x = x.view(x.shape[0] * x.shape[1], -1)
        return x

In [60]:
batch_size = 16
padding_value = len(x_encoder)+1

num_embeddings = len(x_encoder)+2
embedding_dim = 100
input_size = embedding_dim
output_size = 9
hidden_size = 256
num_layers = 1
batch_first=True
bidirectional = True
dropout = .33


model = LSTM(input_size=input_size, output_size=output_size, hidden_size=hidden_size, num_layers=num_layers, batch_first=batch_first, num_embeddings=num_embeddings, embedding_dim=embedding_dim, padding_value=padding_value, dropout=dropout, bidirectional=bidirectional)
model.to(torch.float64)
model.to(device=device)


learning_rate = .45



optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=.9)
scheduler = StepLR(optimizer, step_size=10, gamma=0.90)
loss_fn = nn.CrossEntropyLoss(ignore_index=padding_value)

epochs = 105

loss_over_time_train = []
loss_over_time_test = []

start = time.time()

dev_dataloader = shuffle_pad_batch(dev_x_sentences, dev_y_sentences, batch_size, padding_value, test_set=True)
test_dataloader = shuffle_pad_batch(test_x_sentences, test_y_sentences, batch_size, padding_value, test_set=True)

for e in range(epochs):
    train_dataloader = shuffle_pad_batch(train_x_sentences, train_y_sentences, batch_size, padding_value, test_set=False)

    print('Epoch:{}\n----------------------------------'.format(e))
    epoch_avg_train_loss = train(train_dataloader[:], model, loss_fn, optimizer, scheduler)
    loss_over_time_train.append(epoch_avg_train_loss)
    
    if e < (epochs - 3): continue
    epoch_avg_test_loss, dev_y_pred = test(dev_dataloader[:], model, loss_fn, padding_value)
    loss_over_time_test.append(epoch_avg_test_loss)
    
    print('Time Elapsed: {} min'.format(round((time.time() - start) / 60, 2)))
        
print('\nUsing Testing Dataset .....')
epoch_avg_test_loss, test_y_pred = test(test_dataloader[:], model, loss_fn, padding_value)

    
print("Done!")

In [61]:
saving the model
EPOCH = e
PATH = "blstm1.pt"

torch.save({
            'epoch': e,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict': scheduler.state_dict()
            }, PATH)

In [64]:
checkpoint = torch.load('blstm1.pt')
model.load_state_dict(checkpoint['model_state_dict'])

dev_dataloader = shuffle_pad_batch(dev_x_sentences, dev_y_sentences, batch_size, padding_value, test_set=True)
test_dataloader = shuffle_pad_batch(test_x_sentences, test_y_sentences, batch_size, padding_value, test_set=True)

epoch_avg_test_loss, dev_y_pred = test(dev_dataloader[:], model, loss_fn, padding_value)
epoch_avg_test_loss, test_y_pred = test(test_dataloader[:], model, loss_fn, padding_value)

<All keys matched successfully>

In [None]:
# processed 51578 tokens with 5942 phrases; found: 5514 phrases; correct: 4748.
# accuracy:  96.52%; precision:  86.11%; recall:  79.91%; FB1:  82.89