In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, IterableDataset, DataLoader
from itertools import cycle, islice
from string import punctuation
import random
import linecache
import mmap
import numpy as np
from nltk.stem import PorterStemmer

from lstm import LSTM
from utils import encode, load_vocab

In [2]:
# Data set definition

class MyIterableDataset(IterableDataset):
    """
    inputs:
      filepath - tweets dataset filepath
      vocab - vocabulary
      wordsNb - number of words used in one sequence
    """

    def __init__(self, filepath, vocab, wordsNb, shuffled = True):
      self.filepath = filepath
      self.examplesNum = self.linesNb(self.filepath)
      self.order = np.random.permutation(np.arange(self.examplesNum))
      self.vocab = vocab
      self.shuffled = shuffled
      self.ps = PorterStemmer()
      self.length = wordsNb
      

    def linesNb(self, filepath):
      
      f = open(filepath, "r+")
      buf = mmap.mmap(f.fileno(), 0)
      lines = 0
      readline = buf.readline

      while readline():
        lines += 1

      return lines


    def __iter__(self):
      if self.shuffled:
        return self.get_shuffled_Stream(self.filepath, self.order, self.vocab, self.ps, self.length)
      else:
        return self.get_stream(self.filepath, self.vocab, self.ps, self.length)  

    
    def get_stream(self, filepath, vocab, ps, length):
      return self.parse_file(filepath, vocab, ps, length)


    def get_shuffled_Stream(self, filepath, order, vocab, ps, length):
      return self.parse_file_ord(filepath, order, vocab, ps, length)


    # parsing the file with examples order inherited from the orginal file
    def parse_file(self, filepath, vocab, ps, length):
      
      with open(filepath, 'r', encoding="latin-1") as file_obj:
        for line in file_obj:
          try:
            yield self.process_line(line, vocab, ps, length)
          except IndexError:
            pass


    # parsing the file with new examples order
    def parse_file_ord(self, filepath, order, vocab, ps, length):
      
      for indx in order:
        line = linecache.getline(filepath, indx)
        try:
          yield self.process_line(line, vocab, ps, length)
        except IndexError:
          pass

    
    # processing the data into the pytorch tensors
    def process_line(self, line, vocab, ps, length):
      token = (line.strip("\n").split('","')[0].translate(str.maketrans("", "", punctuation)), line.strip("\n").split('","')[5].translate(str.maketrans("", "", punctuation)).lower())
      tensor = encode(token[1], vocab, ps, length)
      label = float(token[0])
      if label == 4:
        label = 1
      label = torch.unsqueeze(torch.tensor(label), dim = 0)
      return (label, tensor)

In [3]:
# Initializing the parameters of the network, loading the vocabulary and constructing the data set

batch_size = 256
wordsNb = 30
hidden_size = 128
classes = 1
num_layers = 1
learning_rate = 0.0001

vocabulary = load_vocab("./data/vocab.txt")
input_size = len(vocabulary) + 2

shuffled = MyIterableDataset('./data/tweets.csv', vocabulary, wordsNb, shuffled = True)
examplesNum = shuffled.examplesNum
testNum = round(0.01*examplesNum)
trainNum = examplesNum - testNum
trainBatches = round(trainNum/batch_size)
testBatches = round(testNum/batch_size) - 2

In [4]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [5]:
# Initializing the model and defining the loss

lstm = LSTM(input_size, hidden_size, num_layers, wordsNb, classes, batch_size)
lstm.to(device)

criterion = nn.BCELoss()
optimizer = torch.optim.Adam(lstm.parameters(), lr = learning_rate)
loader = DataLoader(shuffled, batch_size = batch_size)

In [6]:
# Training

for epoch in range(5):
    lstm.train()
    batch = 0
    epoch_loss = 0
    ave_loss = 0
    acc = 0

    for y_batch, x_batch in loader:

        y_batch1 = y_batch.to(device)
        x_batch1 = x_batch.to(device)

        batch += 1
        lstm.zero_grad()        
        output = lstm(x_batch1) 
        loss = criterion(output, y_batch1)
        loss.backward()
        optimizer.step()

        for i in range(len(y_batch)):
          
          
          if round(output[i].item()) == (y_batch1[i].item()):
            acc += 1

        ave_loss += loss.item()
        epoch_loss += loss.item() 

        if batch % 100 == 0:
            print("Epoch", epoch + 1, "Batch:", batch, "TR:", ave_loss/100, "Acc:", acc/(100*batch_size))
            ave_loss = 0
            acc = 0

        if batch >= trainBatches:
          break

    print("Epoch: ", epoch + 1, "loss:", epoch_loss/batch)

Epoch 1 Batch: 100 TR: 0.6935465967655182 Acc: 0.4991015625
Epoch 1 Batch: 200 TR: 0.6927824187278747 Acc: 0.5103515625
Epoch 1 Batch: 300 TR: 0.687918428182602 Acc: 0.5533203125
Epoch 1 Batch: 400 TR: 0.6148731642961502 Acc: 0.6703515625
Epoch 1 Batch: 500 TR: 0.5393537142872811 Acc: 0.73328125
Epoch 1 Batch: 600 TR: 0.517388376891613 Acc: 0.7511328125
Epoch 1 Batch: 700 TR: 0.5072230377793312 Acc: 0.756171875
Epoch 1 Batch: 800 TR: 0.49535665482282637 Acc: 0.766015625
Epoch 1 Batch: 900 TR: 0.49432676047086715 Acc: 0.766484375
Epoch 1 Batch: 1000 TR: 0.4845884299278259 Acc: 0.773828125
Epoch 1 Batch: 1100 TR: 0.47900554895401 Acc: 0.7770703125
Epoch 1 Batch: 1200 TR: 0.47515646934509276 Acc: 0.7786328125
Epoch 1 Batch: 1300 TR: 0.4719967800378799 Acc: 0.781796875
Epoch 1 Batch: 1400 TR: 0.4712123861908913 Acc: 0.779921875
Epoch 1 Batch: 1500 TR: 0.4695120421051979 Acc: 0.7821484375
Epoch 1 Batch: 1600 TR: 0.46804407984018326 Acc: 0.7828515625
Epoch 1 Batch: 1700 TR: 0.463068149089813

In [7]:
path = "biLSTM.pt"

torch.save(lstm.state_dict(), "./models/biLSTM.pt")

In [8]:
# Evaluating

lstm.eval()
eval_loss = 0
acc = 0
for y_batch, x_batch in islice(loader, trainBatches, trainBatches + testBatches):
      
        x_batch = x_batch.to(device)
        y_batch = y_batch.to(device)
        output = lstm(x_batch)

        loss = criterion(output, y_batch1)

        for i in range(len(y_batch)):
          
          if round(output[i].item()) == (y_batch[i].item()):
            acc += 1


        eval_loss += loss.item()
        
eval_loss /= testBatches
acc /= batch_size*testBatches

print("Test loss:", eval_loss)
print("Test acc:", acc)

Test loss: 1.1944562196731567
Test acc: 0.81484375
