In [1]:
import numpy as np
import torch
import torch.nn as tnn
import torch.nn.functional as F
import torch.optim as topti
from torchtext import data
from torchtext.vocab import GloVe
from imdb_dataloader import IMDB

import random

In [2]:
class PreProcessing:
    """Namespace bundling the notebook's text field with its two pipeline hooks.

    ``pre`` and ``post`` are written without ``self`` on purpose: they are
    referenced as plain functions while the class body executes, when they are
    handed to ``data.Field`` below.  Both hooks currently pass their input
    through untouched.
    """

    def pre(x):
        """Hook called after tokenization; returns ``x`` unchanged."""
        return x

    def post(batch, vocab):
        """Hook called after numericalization but prior to vectorization; returns ``batch`` unchanged."""
        return batch

    # Lower-cased, batch-first text field that also reports sequence lengths.
    text_field = data.Field(
        lower=True,
        include_lengths=True,
        batch_first=True,
        preprocessing=pre,
        postprocessing=post,
    )

In [3]:
def lossFunc():
    """
    Build the training criterion used for the networks in this notebook.

    Returns a fresh ``BCEWithLogitsLoss`` instance, i.e. a loss that applies
    the sigmoid to the raw network output itself and then computes the binary
    cross-entropy against the targets.
    """
    criterion = tnn.BCEWithLogitsLoss()
    return criterion

In [4]:
# Use a GPU if available, as it should be faster.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Using device: " + str(device))

# Set up the two fields: the shared text field defined on PreProcessing and a
# non-sequential field for the labels.
textField = PreProcessing.text_field
print('before labelField')
labelField = data.Field(sequential=False)

# Load the "train" and "dev" splits, then print a few facts about the first
# training example as a sanity check (dataset type/size, token list, its
# type and its length).
print("Train and dev")
train, dev = IMDB.splits(textField, labelField, train="train", validation="dev")
print(type(train))
print(len(train))
print(train[0].text)
print(type(train[0].text))
print(len(train[0].text))

# Build both vocabularies over train + dev; the text vocabulary is also given
# the 50-dimensional GloVe "6B" vectors.
print("Build Vocab")
textField.build_vocab(train, dev, vectors=GloVe(name="6B", dim=50))
labelField.build_vocab(train, dev)

# Create the batch iterators: shuffled batches of 64 that use the token count
# of an example as sort key.
# NOTE(review): sort_within_batch=True is presumably what keeps the lengths in
# the order pack_padded_sequence expects - confirm against the torchtext docs.
print("Loaders")
trainLoader, testLoader = data.BucketIterator.splits((train, dev), shuffle=True, batch_size=64,
                                                     sort_key=lambda x: len(x.text), sort_within_batch=True)
# From here on the two loaders yield ready-made batches.

Using device: cpu
before labelField
Train and dev
<class 'imdb_dataloader.IMDB'>
25000
['for', 'a', 'movie', 'that', 'gets', 'no', 'respect', 'there', 'sure', 'are', 'a', 'lot', 'of', 'memorable', 'quotes', 'listed', 'for', 'this', 'gem.', 'imagine', 'a', 'movie', 'where', 'joe', 'piscopo', 'is', 'actually', 'funny!', 'maureen', 'stapleton', 'is', 'a', 'scene', 'stealer.', 'the', 'moroni', 'character', 'is', 'an', 'absolute', 'scream.', 'watch', 'for', 'alan', '"the', 'skipper"', 'hale', 'jr.', 'as', 'a', 'police', 'sgt.']
<class 'list'>
52
Build Vocab
Loaders


In [5]:
def _unpack_batch(batch):
    """Convert one torchtext batch into ``(inputs, length, labels)`` on ``device``.

    * ``inputs`` - rows of ``textField.vocab.vectors`` selected by the token
      ids in ``batch.text[0]``.
    * ``length`` - ``batch.text[1]`` (the text field is built with
      ``include_lengths=True``).
    * ``labels`` - ``batch.label`` converted to float and shifted down by one.
    """
    inputs = textField.vocab.vectors[batch.text[0]].to(device)
    length = batch.text[1].to(device)
    labels = batch.label.type(torch.FloatTensor).to(device)
    # NOTE(review): presumably the label vocabulary reserves index 0, so the
    # two classes arrive as 1/2 and are mapped to 0/1 here - confirm against
    # labelField.vocab.
    labels -= 1
    return inputs, length, labels


def main(num_epochs=10, learning_rate=0.001, model_path="./model_notebook.pth"):
    """Train ``Network`` on ``trainLoader``, save it, and report accuracy on ``testLoader``.

    Args:
        num_epochs: number of passes over the training loader (default 10).
        learning_rate: learning rate handed to Adam (default 0.001).
        model_path: file the trained ``state_dict`` is written to.

    The defaults reproduce the previously hard-coded values, so ``main()``
    behaves as before apart from the evaluation-mode fix described below.
    """
    print("Going to train")
    net = Network().to(device)
    criterion = lossFunc()
    optimiser = topti.Adam(net.parameters(), lr=learning_rate)  # Minimise the loss using the Adam algorithm.

    # Make the mode explicit: dropout layers are only active in training mode.
    net.train()
    for epoch in range(num_epochs):
        running_loss = 0

        for i, batch in enumerate(trainLoader):
            # Get a batch and potentially send it to GPU memory.
            inputs, length, labels = _unpack_batch(batch)

            # PyTorch calculates gradients by accumulating contributions to them (useful for
            # RNNs).  Hence we must manually set them to zero before calculating them.
            optimiser.zero_grad()

            # Forward pass through the network.
            output = net(inputs, length)

            loss = criterion(output, labels)

            # Calculate gradients.
            loss.backward()

            # Minimise the loss according to the gradient.
            optimiser.step()

            running_loss += loss.item()

            # Report the mean loss of the last 32 batches, then start a new window.
            if i % 32 == 31:
                print("Epoch: %2d, Batch: %4d, Loss: %.3f" % (epoch + 1, i + 1, running_loss / 32))
                running_loss = 0

    # Save model
    torch.save(net.state_dict(), model_path)
    print("Saved model")

    # Switch to evaluation mode so that dropout (and any other train-only
    # behaviour) is disabled while measuring accuracy.  Without this call the
    # reported accuracy would be computed with training-time behaviour.
    net.eval()
    num_correct = 0

    # Evaluate network on the test dataset.  We aren't calculating gradients, so disable autograd to speed up
    # computations and reduce memory usage.
    with torch.no_grad():
        for batch in testLoader:
            # Get a batch and potentially send it to GPU memory.
            inputs, length, labels = _unpack_batch(batch)

            # Get predictions: sigmoid turns logits into probabilities, rounding thresholds at 0.5.
            outputs = torch.sigmoid(net(inputs, length))
            predicted = torch.round(outputs)

            num_correct += torch.sum(labels == predicted).item()

    accuracy = 100 * num_correct / len(dev)

    print(f"Classification accuracy: {accuracy}")

In [6]:
# Use a GPU if available, as it should be faster.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Using device: " + str(device))

# Re-create the fields exactly as in the earlier setup cell: the shared text
# field from PreProcessing and a non-sequential label field.
textField = PreProcessing.text_field
print('before labelField')
labelField = data.Field(sequential=False)

# Reload the "train" and "dev" splits.  No vocabulary or loader is built in
# this cell.
# NOTE(review): presumably this reload exists so the augmentation cells below
# start from freshly loaded splits - confirm.
print("Train and dev")
train, dev = IMDB.splits(textField, labelField, train="train", validation="dev")

Using device: cpu
before labelField
Train and dev


In [7]:
def dropsies_d(sentence, min_drops, max_drops, keep_half=False):
    """Return a copy of ``sentence`` with a random number of words deleted.

    The number of deletions is drawn with ``random.randint`` from
    ``[min_drops, max_drops]`` after that range has been capped so that

    * at least one word always survives (``keep_half=False``), or
    * at most half of the words (rounded down) are removed (``keep_half=True``).

    Positions to delete are chosen at random one at a time; the surviving
    words keep their relative order.  ``sentence`` itself is never modified.

    Args:
        sentence: list of tokens.
        min_drops: smallest number of deletions requested.
        max_drops: largest number of deletions requested.
        keep_half: if true, never delete more than half of the words.

    Returns:
        A new list.  Sentences with fewer than two words are returned as an
        unchanged copy.
    """
    new_sentence = sentence.copy()
    if len(new_sentence) <= 1:
        return new_sentence

    # Integer arithmetic throughout: random.randint rejects non-integral
    # bounds, which the previous "/"-based cap produced for odd lengths.
    drop_limit = len(new_sentence) // 2 if keep_half else len(new_sentence) - 1
    safe_max = min(max_drops, drop_limit)
    # Short sentences may push the cap below min_drops; shrink the lower bound
    # too instead of letting randint fail on an empty range.
    safe_min = min(min_drops, safe_max)

    num_drops = random.randint(safe_min, safe_max)
    for _ in range(num_drops):
        del new_sentence[random.randint(0, len(new_sentence) - 1)]

    return new_sentence

def dropsies_nd(sentence, prob, keep_half=False):
    """Return a copy of ``sentence`` in which each word was dropped with probability ``prob``.

    Every word is kept when a fresh ``random.uniform(0, 1)`` draw exceeds
    ``prob``.  If that leaves no words at all - or, with ``keep_half=True``,
    fewer than half of the original words - the result is discarded and a
    fallback is used instead: starting from the full sentence, a random number
    of randomly chosen words (at least one) is deleted, capped so that at
    least one word survives (``keep_half=False``) or at most half of the words
    are removed (``keep_half=True``).

    Args:
        sentence: list of tokens.
        prob: per-word drop probability.
        keep_half: if true, the result keeps at least half of the words.

    Returns:
        A new list; ``sentence`` is never modified.  Sentences with fewer than
        two words are returned as an unchanged copy.
    """
    if len(sentence) <= 1:
        return sentence.copy()

    new_sentence = [word for word in sentence if random.uniform(0, 1) > prob]

    too_short = keep_half and len(new_sentence) < len(sentence) / 2
    if not new_sentence or too_short:
        new_sentence = sentence.copy()
        # Integer cap: random.randint rejects the float bound that the previous
        # "/"-based expression produced.
        max_drops = len(new_sentence) // 2 if keep_half else len(new_sentence) - 1
        for _ in range(random.randint(1, max_drops)):
            del new_sentence[random.randint(0, len(new_sentence) - 1)]

    return new_sentence

In [8]:
def swapsie(sentence, to_swap_idx, max_dist):
    '''
    Swap one word with a random partner, modifying ``sentence`` in place.

    The partner sits a random 1..max_dist positions away, to the left or to
    the right with equal chance.  A partner position outside the sentence is
    replaced by the first position (leftward move) or the last position
    (rightward move).  Nothing is returned.
    '''
    hop = random.randint(1, max_dist)
    sign = 1 if random.randint(0, 1) else -1
    partner = to_swap_idx + hop * sign

    last = len(sentence) - 1
    if not 0 <= partner <= last:
        partner = last if sign > 0 else 0

    sentence[to_swap_idx], sentence[partner] = sentence[partner], sentence[to_swap_idx]
    
def swapsies_nd(sentence, prob, max_dist):
    """Return a copy of ``sentence`` with words randomly swapped.

    Each position is visited once, left to right, and with probability
    ``prob`` the word currently there is swapped via ``swapsie`` (partner at
    most ``max_dist`` away).  If no position was selected, one swap at a
    random position is performed, so at least one ``swapsie`` call always
    happens.  ``sentence`` itself is left untouched.
    """
    shuffled = sentence.copy()
    swapped_any = False

    for position in range(len(shuffled)):
        if random.uniform(0, 1) < prob:
            swapped_any = True
            swapsie(shuffled, position, max_dist)

    if not swapped_any:
        forced_position = random.randint(0, len(sentence) - 1)
        swapsie(shuffled, forced_position, max_dist)

    return shuffled

def swapsies_d(sentence, min_swaps, max_swaps, max_dist):
    """Return a copy of ``sentence`` after a random number of random swaps.

    The number of swaps is ``round(random.uniform(min_swaps, max_swaps))``;
    each swap picks a random position and exchanges it via ``swapsie`` with a
    partner at most ``max_dist`` positions away.  ``sentence`` itself is never
    modified.

    Args:
        sentence: list of tokens.
        min_swaps: lower bound for the number of swaps.
        max_swaps: upper bound for the number of swaps.
        max_dist: maximum distance between the two swapped positions.

    Returns:
        A new list.  An empty sentence yields an empty list (there is no
        position to swap).
    """
    new_sentence = sentence.copy()
    if not new_sentence:
        # randint(0, -1) would raise; nothing can be swapped anyway.
        return new_sentence

    # Rounding a uniform draw gives the two end values half the weight of the
    # values in between; kept as is so float bounds continue to work.
    num_swaps = round(random.uniform(min_swaps, max_swaps))
    for _ in range(num_swaps):
        to_swap_idx = random.randint(0, len(new_sentence) - 1)
        swapsie(new_sentence, to_swap_idx, max_dist)
    return new_sentence

In [9]:
def augmentie_nd(sentence, drop_prob, swap_prob, max_dist, keep_half=False):
    """Augment ``sentence`` by random swaps followed by random drops.

    Args:
        sentence: list of tokens; not modified.
        drop_prob: per-word drop probability passed to ``dropsies_nd``.
        swap_prob: per-position swap probability passed to ``swapsies_nd``.
        max_dist: maximum swap distance passed to ``swapsies_nd``.
        keep_half: forwarded to ``dropsies_nd``.

    Returns:
        The new, augmented token list.
    """
    new_sentence = swapsies_nd(sentence, swap_prob, max_dist)
    # Forward the caller's keep_half; it used to be hard-coded to False here,
    # which silently ignored the argument.
    new_sentence = dropsies_nd(new_sentence, drop_prob, keep_half=keep_half)
    return new_sentence

In [10]:
def more(train_set, drop_prob, swap_prob, max_dist, keep_half=False):
    """Create one augmented twin for every example in ``train_set``.

    Each twin is a new ``Example`` whose ``text`` is the result of
    ``augmentie_nd`` applied to the original example's text and whose
    ``label`` is copied from the original.  The twins are returned as a list
    in the iteration order of ``train_set``; ``train_set`` is not modified
    here.
    """
    def _augmented_twin(example):
        twin = data.example.Example()
        twin.text = augmentie_nd(example.text, drop_prob, swap_prob, max_dist, keep_half=keep_half)
        twin.label = example.label
        return twin

    return [_augmented_twin(example) for example in train_set]

In [11]:
# Build one augmented copy of every training example
# (drop_prob=0.2, swap_prob=0.2, max_dist=5).
m = more(train, 0.2, 0.2, 5)

In [12]:
# Append the augmented examples to the training set in place.
# NOTE(review): not idempotent - every re-run of this cell appends `m` again.
train.examples += m

In [13]:
# Number of training examples after the augmented copies were appended.
len(train.examples)

50000

In [14]:
# Rebuild both vocabularies over the (now augmented) train set plus dev; the
# text vocabulary is also given the 50-dimensional GloVe "6B" vectors.
print("Build Vocab")
textField.build_vocab(train, dev, vectors=GloVe(name="6B", dim=50))
labelField.build_vocab(train, dev)

# Re-create the batch iterators: shuffled batches of 64 that use the token
# count of an example as sort key.
print("Loaders")
trainLoader, testLoader = data.BucketIterator.splits((train, dev), shuffle=True, batch_size=64,
                                                     sort_key=lambda x: len(x.text), sort_within_batch=True)
# From here on the two loaders yield ready-made batches.

Build Vocab
Loaders


In [58]:
def conv_len_fn(padding, kernel_size, dilation, stride):
    """Build a function mapping an input length to a conv/pool output length.

    The returned function evaluates

        floor((length + 2*padding - dilation*(kernel_size - 1) - 1) / stride) + 1

    which is the output-length formula documented for ``Conv1d`` and
    ``MaxPool1d``.  Floor division is used so that integer lengths stay
    integers; with true division a stride greater than one produced
    fractional lengths (e.g. 2.5 for a length-5 input pooled with kernel 2,
    stride 2, where the layer really yields 2 steps).

    Args:
        padding: padding added to both ends of the sequence.
        kernel_size: size of the kernel / pooling window.
        dilation: spacing between kernel elements.
        stride: step of the kernel / pooling window.

    Returns:
        ``conv_len(length)``; ``length`` may be anything supporting ``+``,
        ``-`` and ``//`` with integers (a Python int, or presumably an
        integer tensor of lengths as passed by the networks in this notebook).
    """
    def conv_len(length):
        return (length + 2 * padding - dilation * (kernel_size - 1) - 1) // stride + 1
    return conv_len

In [39]:
# Class for creating the neural network.
class Network(tnn.Module):
    def __init__(self):
        super(Network, self).__init__()
        # config
        self.hiddenSize = 100
        self.bidirectional = True
        
        self.padding = 5
        self.kernel_size = 8
        self.conv_formula = lambda l: ((l + 2 * self.padding - 1 * (self.kernel_size - 1) - 1) / 1) + 1
        self.conv = tnn.Conv1d(50, 50, kernel_size=self.kernel_size, padding=self.padding)
        self.mp_kernel_size = 4
        self.mp = tnn.MaxPool1d(self.mp_kernel_size)
        self.mp_formula = lambda l: ((l + 2 * 0 - 1 * (self.mp_kernel_size - 1) - 1) / 1) + 1
        self.lstm = tnn.LSTM(50,
                            self.hiddenSize,
                            batch_first=True,
                            bidirectional=self.bidirectional,
                            num_layers=1,
                            dropout=0)
        self.fc1 = tnn.Linear(self.hiddenSize*(self.bidirectional+1), 100)
        self.drop = tnn.Dropout(0.5)
        self.relu = tnn.ReLU()
        self.fc2 = tnn.Linear(100, 1)

    def forward(self, input, length):
        """
        DO NOT MODIFY FUNCTION SIGNATURE
        Create the forward pass through the network.
        """
        print(input.shape)
        X = input.permute((0,2,1))
        X = self.mp(self.conv(X))
        X = X.permute((0,2,1))
        print(X.shape)
        new_lengths = self.mp_formula(self.conv_formula(length))
        print(length)
        print(new_lengths)
        return
        packed = tnn.utils.rnn.pack_padded_sequence(X, conv_lengths, batch_first=True)
        y, (hn, cn) = self.lstm(packed)
        y = torch.cat((hn[-2,:,:], hn[-1,:,:]), dim = 1)
        y = self.relu(self.fc1(y))
        y = self.fc2(y)
        return y.squeeze()

In [59]:
import functools

def compose(*functions):
    """Chain single-argument functions, applying the rightmost one first.

    ``compose(f, g, h)(x)`` evaluates ``f(g(h(x)))``; with no functions at all
    the result is the identity.
    """
    def composed(x):
        value = x
        for fn in reversed(functions):
            value = fn(value)
        return value
    return composed

In [175]:
# Class for creating the neural network.
class Network(tnn.Module):
    def __init__(self):
        super(Network, self).__init__()
        # config
        self.conv_hidden = 100
        self.conv1 = tnn.Conv1d(50, self.conv_hidden, kernel_size=1, padding=5)
        self.conv1_len = conv_len_fn(5, 1, 1, 1)
        self.conv2 = tnn.Conv1d(self.conv_hidden, self.conv_hidden, kernel_size=2, padding=5)
        self.conv2_len = conv_len_fn(5, 2, 1, 1)
        self.conv3 = tnn.Conv1d(self.conv_hidden, self.conv_hidden, kernel_size=3, padding=5)
        self.conv3_len = conv_len_fn(5, 3, 1, 1)
        self.conv4 = tnn.Conv1d(self.conv_hidden, self.conv_hidden, kernel_size=4, padding=5)
        self.conv4_len = conv_len_fn(5, 4, 1, 1)
        
        self.mp = tnn.MaxPool1d(2)
        self.mp_len = conv_len_fn(0, 2, 1, 2)
        
        self.new_len = compose(self.mp_len, self.conv4_len,
                               self.mp_len, self.conv3_len,
                               self.mp_len, self.conv2_len,
                               self.mp_len, self.conv1_len)
        
        self.mpot = tnn.AdaptiveMaxPool1d(1)
        
        self.conv_fc1 = tnn.Linear(self.conv_hidden, 1)
        
        self.lstm = tnn.LSTM(100, 200, batch_first=True)
        self.fc1 = tnn.Linear(200, 100)
        self.fc2 = tnn.Linear(100, 1)
        
        self.do = tnn.Dropout(0.5)
        self.relu = tnn.ReLU()
        
        self.ffc = tnn.Linear(2, 1)
        
    def forward(self, input, length):
        """
        DO NOT MODIFY FUNCTION SIGNATURE
        Create the forward pass through the network.
        """
        print(input.shape)
        C = input.permute((0,2,1)) #1,2,0?
        print(C.shape)
        C = C.permute((0,2,1))
        print(torch.cat((C, C), dim=1).shape)
        C = self.mp(self.relu(self.conv1(C)))
        C = C.permute((0,2,1))
        return
        nl = self.mp_len(self.conv1_len(length))
        packed = tnn.utils.rnn.pack_padded_sequence(C, nl, batch_first=True)
        _, (L, _) = self.lstm(packed)
        L = self.relu(self.fc1(L))
        L = self.fc2(L)
        return L.squeeze()
        L = L.reshape((L.shape[1],1))
        
        X = torch.cat((C, L), dim=1)
        X = self.ffc(X)
        return X.squeeze()

In [176]:
# Run the full train / save / evaluate cycle defined in main().
main()

Going to train
torch.Size([64, 165, 50])
torch.Size([64, 50, 165])
torch.Size([64, 330, 50])


RuntimeError: Given groups=1, weight of size 100 50 1, expected input[64, 165, 50] to have 50 channels, but got 165 channels instead

In [180]:
# Number of examples in the dev split (the denominator of main()'s accuracy).
len(dev.examples)

6248

In [157]:
# Show the token list of training example 10.
print(train.examples[10].text)

['aileen', 'gonsalves,', 'my', 'girlfriend,', 'is', 'in', 'this', 'film', 'playing', 'a', 'secretary', 'at', 'the', 'main', "character's", 'bank.', 'she', 'has', 'a', 'lovely', 'scene', 'with', 'roshan', 'seth', 'in', 'a', 'restaurant.', "there's", 'more', 'information', 'on', 'her', 'website', 'at', '>having', 'stated', 'my', 'personal', 'interest', 'in', 'the', 'film,', 'i', 'have', 'to', 'say', 'that', 'i', 'think', 'it', 'is', 'a', 'beautiful', 'movie', '-', 'moving,', 'funny', 'and', 'beautifully', 'filmed.']


In [158]:
import re

In [164]:
def only_alphabet(w):
    """Remove every character of ``w`` that is not a lowercase letter a-z."""
    return re.sub('[^a-z]', '', w)


def not_empty(w):
    """Tell whether ``w`` contains at least one character."""
    return len(w) != 0


# Clean the tokens of training example 10 and print those that are not empty.
print([w for w in map(only_alphabet, train.examples[10].text) if not_empty(w)])

['aileen', 'gonsalves', 'my', 'girlfriend', 'is', 'in', 'this', 'film', 'playing', 'a', 'secretary', 'at', 'the', 'main', 'characters', 'bank', 'she', 'has', 'a', 'lovely', 'scene', 'with', 'roshan', 'seth', 'in', 'a', 'restaurant', 'theres', 'more', 'information', 'on', 'her', 'website', 'at', 'having', 'stated', 'my', 'personal', 'interest', 'in', 'the', 'film', 'i', 'have', 'to', 'say', 'that', 'i', 'think', 'it', 'is', 'a', 'beautiful', 'movie', 'moving', 'funny', 'and', 'beautifully', 'filmed']
