In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.datasets import TranslationDataset, Multi30k
from torchtext.data import Field, BucketIterator
import spacy
import random
import math
import time
import pickle
import os
import copy
import pandas as pd

# Define the seq2seq RNN structure

In [2]:
# build the encoder that generates the context vector of the sentences
class Encoder(nn.Module):
    """LSTM encoder that turns a batch of token-index sequences into its final LSTM states.

    Args:
        input_dim: size of the source vocabulary (number of rows in the embedding table).
        emb_dim: dimensionality of each token embedding.
        hid_dim: number of hidden units in every LSTM layer.
        n_layers: number of stacked LSTM layers.
        dropout: dropout probability applied to the embeddings and, when there is
            more than one layer, between the LSTM layers.
    """

    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()

        self.input_dim = input_dim
        self.emb_dim = emb_dim
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(input_dim, emb_dim)
        # nn.LSTM only applies dropout between stacked layers; passing a non-zero
        # value with a single layer has no effect and merely raises a warning
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers,
                           dropout=dropout if n_layers > 1 else 0.0)
        # self.dropout is the dropout *module* (not the float probability)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        """Encode a batch of sentences.

        Args:
            src: token indices, shape [longest src sent len, batch size].

        Returns:
            (hidden, cell), each of shape [n layers * n directions, batch size, hid dim].
        """
        # each token index is looked up in the embedding table, then dropout is applied
        # embedded shape: [longest src sent len, batch size, emb dim]
        embedded = self.dropout(self.embedding(src))

        # the per-step outputs of the top layer are discarded: only the final
        # hidden and cell states are handed on
        _, (hidden, cell) = self.rnn(embedded)

        return hidden, cell

In [3]:
# build the decoder that deciphers the context vectors back into the source sentences
class Decoder(nn.Module):
    """LSTM decoder that predicts one token per call from the previous token and LSTM states.

    Args:
        output_dim: size of the target vocabulary; also the width of the prediction.
        emb_dim: dimensionality of each token embedding.
        hid_dim: number of hidden units in every LSTM layer.
        n_layers: number of stacked LSTM layers.
        dropout: dropout probability applied to the embeddings and, when there is
            more than one layer, between the LSTM layers.
    """

    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()

        self.emb_dim = emb_dim
        self.hid_dim = hid_dim
        self.output_dim = output_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(output_dim, emb_dim)
        # nn.LSTM only applies dropout between stacked layers; passing a non-zero
        # value with a single layer has no effect and merely raises a warning
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers,
                           dropout=dropout if n_layers > 1 else 0.0)
        # linear projection from the hidden state to one raw score per vocabulary entry
        self.out = nn.Linear(hid_dim, output_dim)
        # self.dropout is the dropout *module* (not the float probability)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, cell):
        """Run a single decoding step.

        Args:
            input: previous token indices, shape [batch size].
            hidden: LSTM hidden state, shape [n layers, batch size, hid dim].
            cell: LSTM cell state, shape [n layers, batch size, hid dim].

        Returns:
            (prediction, hidden, cell) where prediction holds raw (un-normalised)
            scores of shape [batch size, output dim] and hidden/cell are the
            updated states with the same shapes as the inputs.
        """
        # add a sequence dimension of length 1 -> [1, batch size]
        input = input.unsqueeze(0)

        # embedded shape: [1, batch size, emb dim]
        embedded = self.dropout(self.embedding(input))

        # output shape: [1, batch size, hid dim] (one step, one direction)
        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))

        # drop the length-1 sequence dimension before projecting
        # prediction shape: [batch size, output dim]
        prediction = self.out(output.squeeze(0))

        return prediction, hidden, cell

In [4]:
# build a sequence-to-sequence network that trains the encoder to produce context vectors that represent the sentences well
class Seq2Seq(nn.Module):
    """Couples an encoder and a decoder and decodes the target one time step at a time.

    Args:
        encoder: module returning (hidden, cell) for a source batch; must expose
            `hid_dim` and `n_layers`.
        decoder: module mapping (input, hidden, cell) to (scores, hidden, cell);
            must expose `hid_dim`, `n_layers` and `output_dim`.
        device: device on which the output buffer is placed.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.device = device

        # the encoder's final states seed the decoder, so their layouts must match
        assert encoder.hid_dim == decoder.hid_dim, \
            "Hidden dimensions of encoder and decoder must be equal!"
        assert encoder.n_layers == decoder.n_layers, \
            "Encoder and decoder must have equal number of layers!"

    def forward(self, src, trg, teacher_forcing_ratio = 0.5):
        """Encode `src`, then decode step by step against `trg`.

        Args:
            src: source token indices, shape [longest src sent len, batch size].
            trg: target token indices, shape [longest trg sent len, batch size].
            teacher_forcing_ratio: probability of feeding the ground-truth token
                (instead of the decoder's own best guess) into the next step.

        Returns:
            (outputs, context_vectors): outputs has shape
            [trg sent len, batch size, output dim] with row 0 left at zero;
            context_vectors is a detached copy of the encoder's final hidden state.
        """
        max_len, batch_size = trg.shape[0], trg.shape[1]
        vocab_size = self.decoder.output_dim

        # zero-initialised buffer that collects the decoder scores of every step
        outputs = torch.zeros(max_len, batch_size, vocab_size).to(self.device)

        # the encoder's final states are the decoder's initial states
        hidden, cell = self.encoder(src)
        # snapshot taken before decoding overwrites `hidden`
        context_vectors = hidden.detach().clone()

        # decoding runs over time steps, handling all sentences of the batch at once;
        # the first decoder input is row 0 of trg (the <sos> tokens)
        step_input = trg[0, :]

        for t in range(1, max_len):
            step_scores, hidden, cell = self.decoder(step_input, hidden, cell)
            outputs[t] = step_scores

            # one random draw per time step decides for the whole batch
            use_truth = random.random() < teacher_forcing_ratio
            _, best_guess = step_scores.max(1)

            if use_truth:
                step_input = trg[t]
            else:
                step_input = best_guess

        return outputs, context_vectors

In [22]:
# define the helper functions that interact with the seq2seq network

# initialize the weights with a uniform distribution on [-0.08, 0.08]
def init_weights(m):
    """Overwrite every parameter of module `m` (including those of its sub-modules)
    in place with samples drawn uniformly from [-0.08, 0.08].

    Intended to be passed to `nn.Module.apply`; returns None.
    """
    for param in m.parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)
        

# count the number of trainable parameters in the model
def count_parameters(model):
    """Return the total number of scalar entries over all parameters of `model`
    that have `requires_grad` set."""
    total = 0
    for tensor in model.parameters():
        if tensor.requires_grad:
            total += tensor.numel()
    return total


# train
def train(model, iterator, optimizer, criterion, clip):
    """Run one optimisation pass over `iterator` and return the mean batch loss.

    Args:
        model: called as model(src, trg); must return (output, context_vectors)
            with output of shape [trg sent len, batch size, output dim].
        iterator: sized iterable of batches exposing `.src` and `.trg`.
        optimizer: optimizer over the model's parameters.
        criterion: loss taking (flattened scores, flattened targets).
        clip: maximum gradient norm handed to `clip_grad_norm_`.

    Returns:
        Sum of the per-batch losses divided by `len(iterator)`.
    """
    model.train()
    running_loss = 0

    # the iterator yields the sentences in mini batches
    for batch in iterator:
        source, target = batch.src, batch.trg

        optimizer.zero_grad()

        # forward pass through the seq2seq model; the context vectors it also
        # returns are not needed for training
        scores, _ = model(source, target)

        # row 0 of the output buffer is never written by the model, so it is
        # skipped together with row 0 of the targets; the rest is flattened:
        # flat_scores: [(trg sent len - 1) * batch size, output dim]
        # flat_target: [(trg sent len - 1) * batch size]
        vocab_size = scores.shape[-1]
        flat_scores = scores[1:].view(-1, vocab_size)
        flat_target = target[1:].view(-1)

        batch_loss = criterion(flat_scores, flat_target)
        batch_loss.backward()

        # cap the gradient norm before the update
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        running_loss += batch_loss.item()

    return running_loss / len(iterator)

# evaluation
def evaluate(model, iterator, criterion):
    """Compute the mean loss over `iterator` without updating the model and
    collect the context vectors of every batch.

    Args:
        model: called as model(src, trg, 0) (teacher forcing off); must return
            (output, context_vectors) with output of shape
            [trg sent len, batch size, output dim].
        iterator: sized iterable of batches exposing `.src` and `.trg`.
        criterion: loss taking (flattened scores, flattened targets).

    Returns:
        (mean_loss, context_vectors). context_vectors concatenates the per-batch
        context tensors along dim 1, in iteration order. This assumes the batch
        dimension is dim 1, as it is for the [n layers, batch size, hid dim]
        tensor returned by Seq2Seq. With a single batch the result equals that
        batch's tensor.
    """
    model.eval()
    epoch_loss = 0
    # previously each batch overwrote the last one, so only the final batch's
    # context vectors survived the loop
    batch_contexts = []

    with torch.no_grad():

        for batch in iterator:

            src = batch.src
            trg = batch.trg

            output, context = model(src, trg, 0) #turn off teacher forcing
            batch_contexts.append(context)

            # trg shape: [trg sent len, batch size]
            # output shape: [trg sent len, batch size, output dim]

            output = output[1:].view(-1, output.shape[-1])
            trg = trg[1:].view(-1)

            # trg shape: [(trg sent len - 1) * batch size]
            # output shape: [(trg sent len - 1) * batch size, output dim]

            loss = criterion(output, trg)

            epoch_loss += loss.item()

    # evaluated first so that an empty iterator still fails with ZeroDivisionError
    mean_loss = epoch_loss / len(iterator)
    context_vectors = torch.cat(batch_contexts, dim=1)

    return mean_loss, context_vectors


# epoch timing
def epoch_time(start_time, end_time):
    """Split the time elapsed between two timestamps (in seconds) into whole
    minutes and the remaining whole seconds.

    Returns:
        (minutes, seconds) as ints, both truncated toward zero.
    """
    elapsed = end_time - start_time
    minutes = int(elapsed / 60)
    seconds = int(elapsed - 60 * minutes)
    return minutes, seconds

# Prepare the data

In [6]:
# Load spaCy's English pipeline; tokenize_en below uses only its tokenizer.
# NOTE(review): the 'en' shortcut name is not available in spaCy v3+, where the
# package name (e.g. 'en_core_web_sm') has to be used — confirm the installed version.
spacy_en = spacy.load('en')

In [31]:
# define the path variables
# BASE_DIR is the absolute form of '' (i.e. the current working directory);
# everything else lives under <BASE_DIR>/data
BASE_DIR = os.path.abspath('')
DATA_DIR = os.path.join(BASE_DIR, 'data')
# REQUEST_DATA_PATH is only referenced from commented-out code in the visible cells
REQUEST_DATA_PATH = os.path.join(DATA_DIR, 'clean_requests.pickle')
# CSV file that is loaded into the `data` frame
TEXT_LABEL_PATH = os.path.join(DATA_DIR, 'twitter-airline-sentiment/Tweets.csv')
# plain-text split files, one comment per line, written by the splitting cell
TRAIN_EN_PATH = os.path.join(DATA_DIR, 'multi30k/train.en')
VAL_EN_PATH = os.path.join(DATA_DIR, 'multi30k/val.en')
TEST_EN_PATH = os.path.join(DATA_DIR, 'multi30k/test2016.en')
# TRAIN_SRC_PATH = os.path.join(DATA_DIR, 'multi30k/train.src')
# VAL_SRC_PATH = os.path.join(DATA_DIR, 'multi30k/val.src')
# TEST_SRC_PATH = os.path.join(DATA_DIR, 'multi30k/test2016.src')
# TRAIN_TOKEN_PATH is only referenced from commented-out code in the visible cells
TRAIN_TOKEN_PATH = os.path.join(DATA_DIR, 'tokens.pickle')
# pickle file that receives the context vectors of the test run
CONTEXT_VECTORS_PATH = os.path.join(DATA_DIR, 'context_vectors.pickle')

In [8]:
# set the random seeds for deterministic results
SEED = 1234
# Python's RNG: Seq2Seq.forward draws from it for the teacher-forcing decision
random.seed(SEED)
# PyTorch's RNG
torch.manual_seed(SEED)
# request deterministic cuDNN behaviour
torch.backends.cudnn.deterministic = True

In [9]:
def tokenize_en(text):
    """
    Tokenizes English text from a string into a list of strings (tokens)

    Uses the tokenizer of the module-level `spacy_en` pipeline.
    """
    return [tok.text for tok in spacy_en.tokenizer(text)]

In [10]:
# define the source text and target text fields of the seq2seq network
# both fields are configured identically:
#   tokenize_en (defined above) is the tokenizer
#   <sos> marks the start of the sentence, <eos> the end of the sentence
#   text is lower-cased
# two separate Field objects are created, SRC first and TRG second
SRC, TRG = (
    Field(tokenize = tokenize_en,
          init_token = '<sos>',
          eos_token = '<eos>',
          lower = True)
    for _ in range(2)
)

In [11]:
# read the data from the CSV file. use only two columns
data = pd.read_csv(TEXT_LABEL_PATH, usecols=['airline_sentiment', 'text'])

# remove every literal '@VirginAmerica ' (handle plus trailing space) from the text.
# This is done on the whole column at once: assigning to the row objects yielded by
# DataFrame.iterrows() is not guaranteed to write back into the frame.
data['text'] = data['text'].str.replace('@VirginAmerica ', '', regex=False)

# view the stats of the dataframe
print("There are", data.shape[0], "rows and", data.shape[1], "coloumns in the data set.")
print("The columns are:", list(data.columns))
data.head()

There are 14640 rows and 2 coloumns in the data set.
The columns are: ['airline_sentiment', 'text']


Unnamed: 0,airline_sentiment,text
0,neutral,What @dhepburn said.
1,positive,plus you've added commercials to the experienc...
2,neutral,I didn't today... Must mean I need to take ano...
3,negative,"it's really aggressive to blast obnoxious ""ent..."
4,negative,and it's a really big bad thing about it


In [12]:
# number of comments in the 'text' column
len(data['text'])

14640

In [13]:
# load the text data (a list of strings)
comments = list(data['text'])[:50]  # change it to normal size later!

# divide the text data into test (first 3%), validation (next 3%) and training (rest) sets
# random.shuffle(comments)
comment_number = len(comments)
test_end = int(comment_number*0.03)
validation_end = int(comment_number*0.06)
test_text = comments[:test_end]
validation_text = comments[test_end:validation_end]
train_text = comments[validation_end:]


def _write_lines(path, lines):
    """Write every string in `lines` to `path`, one per line, encoded as UTF-8.

    The parent directory is created when it does not exist yet.
    NOTE(review): a comment that itself contains a newline will occupy several
    lines in the file — confirm whether such comments should be flattened first.
    """
    directory = os.path.dirname(path)
    if directory:
        os.makedirs(directory, exist_ok=True)
    # explicit encoding so the result does not depend on the platform default;
    # NOTE(review): torchtext is expected to read these files back as UTF-8 —
    # confirm for the installed version
    with open(path, 'w', encoding='utf-8') as target:
        for text in lines:
            target.write(text + '\n')


# save the data for Multi30k
_write_lines(TEST_EN_PATH, test_text)
_write_lines(VAL_EN_PATH, validation_text)
_write_lines(TRAIN_EN_PATH, train_text)

In [14]:
# load and tokenize the three splits
# exts = ('.en', '.en'): source and target are read with the same '.en' extension,
#   i.e. both sides of every example are the same English text
# fields = (SRC, TRG): SRC processes the source side, TRG the target side
# root=DATA_DIR: presumably resolves to the files under DATA_DIR/multi30k written
#   by the previous cell — confirm against torchtext's Multi30k.splits defaults
train_data, valid_data, test_data = Multi30k.splits(exts = ('.en', '.en'), 
                                                    fields = (SRC, TRG), root=DATA_DIR)
print(f"Number of training examples: {len(train_data.examples)}")
print(f"Number of validation examples: {len(valid_data.examples)}")
print(f"Number of testing examples: {len(test_data.examples)}")

# with open(TRAIN_TOKEN_PATH, 'wb') as f:
#     pickle.dump(train_data, f)
# f.close()
# vars(test_data[0])

Number of training examples: 48
Number of validation examples: 2
Number of testing examples: 1


In [15]:
# build the vocabulary from the training set (both fields use the same data)
# each token is mapped to an integer index (looked up later through TRG.vocab.stoi);
# NOTE(review): the index order is decided by torchtext's Vocab, not by this notebook
# min_freq = 2: a token must appear at least twice in the training set to get its
# own index; rarer tokens presumably fall back to the unknown token — confirm
SRC.build_vocab(train_data, min_freq = 2)
TRG.build_vocab(train_data, min_freq = 2)
print(f"Unique tokens in request vocabulary: {len(TRG.vocab)}")
# dump the internals of the vocabulary object (includes the raw frequency counter)
vars(TRG.vocab)

Unique tokens in request vocabulary: 112


{'freqs': Counter({'it': 7,
          "'s": 4,
          'really': 4,
          'aggressive': 1,
          'to': 18,
          'blast': 1,
          'obnoxious': 1,
          '"': 2,
          'entertainment': 1,
          'in': 4,
          'your': 6,
          'guests': 1,
          "'": 1,
          'faces': 1,
          '&': 3,
          'amp': 2,
          ';': 3,
          'they': 1,
          'have': 4,
          'little': 1,
          'recourse': 1,
          'and': 9,
          'a': 11,
          'big': 1,
          'bad': 2,
          'thing': 2,
          'about': 4,
          'seriously': 1,
          'would': 3,
          'pay': 1,
          '$': 1,
          '30': 1,
          'flight': 6,
          'for': 8,
          'seats': 2,
          'that': 5,
          'did': 2,
          "n't": 8,
          'this': 6,
          'playing': 1,
          '.': 27,
          'the': 11,
          'only': 3,
          'flying': 3,
          'va': 1,
          'yes': 1,
          ',': 1

In [18]:
# split the data into mini batches
BATCH_SIZE = 128

# use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# create one batch iterator per split; batches are placed on `device`
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE, 
    device = device)

# Sentence Vectorization

In [20]:
print("I'm using", device)

# create a model
INPUT_DIM = len(SRC.vocab)  # source vocabulary size: one embedding row per token index in the encoder
OUTPUT_DIM = len(TRG.vocab)  # target vocabulary size: width of the decoder's prediction
ENC_EMB_DIM = 256  # each token index is mapped to a dense vector of this size by the embedding layer
DEC_EMB_DIM = 256
HID_DIM = 512  # sentences of different lengths are converted into vectors of the fixed length HID_DIM
N_LAYERS = 2  # use a multilayer RNN
ENC_DROPOUT = 0.5  # dropout probability used inside the encoder
DEC_DROPOUT = 0.5  # dropout probability used inside the decoder

# instantiate encoder, decoder, and seq2seq networks
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)
model = Seq2Seq(enc, dec, device).to(device)
# re-initialize all parameters uniformly in [-0.08, 0.08] (see init_weights)
model.apply(init_weights)
print(f'The model has {count_parameters(model):,} trainable parameters')

# define the optimizer as Adam (default hyper-parameters)
optimizer = optim.Adam(model.parameters())

# loss function: cross entropy that ignores target positions holding the <pad> index
PAD_IDX = TRG.vocab.stoi['<pad>']
criterion = nn.CrossEntropyLoss(ignore_index = PAD_IDX)

I'm using cpu
The model has 7,471,216 trainable parameters


In [23]:
# train
N_EPOCHS = 1  # number of passes the seq2seq network makes over the entire training set
CLIP = 1  # maximum gradient norm passed to train()
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):  
    start_time = time.time()
   
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    # the context vectors returned by evaluate() are not needed during training
    valid_loss, _ = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # save the weights whenever the validation loss improves on the best seen so far
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save (model.state_dict(), 'tut1-model.pt')
    
    # report the status (PPL = exp(loss))
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

Epoch: 01 | Time: 0m 0s
	Train Loss: 4.727 | Train PPL: 112.981
	 Val. Loss: 4.453 |  Val. PPL:  85.922


In [32]:
# restore the best checkpoint, evaluate it on the test set and keep the context
# vectors that evaluate() returns
# map_location lets a checkpoint written on another device be loaded onto `device`
model.load_state_dict(torch.load('tut1-model.pt', map_location=device))
test_loss, context_vectors = evaluate(model, test_iterator, criterion)

print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')
print('The shape of the context vector set is', context_vectors.shape)

# save the context vectors to disk; the tensor is moved to the CPU first so the
# pickle can also be loaded on a machine without CUDA
# (the with-block closes the file, no explicit close() is needed)
with open(CONTEXT_VECTORS_PATH, 'wb') as f:
    pickle.dump(context_vectors.cpu(), f)

# # load the context vectors from disk
# context_vectors = pickle.load(open(CONTEXT_VECTORS_PATH, "rb"))

| Test Loss: 4.616 | Test PPL: 101.055 |
The shape of the context vector set is torch.Size([2, 1, 512])


# Some highlights in our experiment

In [25]:
# print the source token indices of the first training batch;
# batch.src is [sent len, batch size], so it is transposed to show one sentence per row
# (the loop still walks through all remaining batches without printing them)
for i, batch in enumerate(train_iterator):
    if i == 0:
        print(batch.src.transpose(0,1))

tensor([[ 2, 50, 49,  ...,  1,  1,  1],
        [ 2,  0,  0,  ...,  1,  1,  1],
        [ 2,  0,  4,  ...,  0,  0,  3],
        ...,
        [ 2,  6,  0,  ...,  1,  1,  1],
        [ 2,  0, 24,  ...,  1,  1,  1],
        [ 2,  0, 11,  ...,  1,  1,  1]])


In [26]:
# demo: plain assignment aliases a tensor, whereas .data.clone() makes an
# independent copy (the call used for context_vectors in Seq2Seq.forward)
a = torch.tensor([[1,2],[3,4]])
# b is another name for the same tensor object
b = a
# c is a separate copy of the values
c = a.data.clone()
# in-place write: visible through b, not through c
a[0][1] = 3
# rebinding the name a leaves b and c untouched
a = torch.tensor([[0,0],[0,0]])
# prints [[1, 3], [3, 4]]
print(b)
# prints [[1, 2], [3, 4]]
print(c)

tensor([[1, 3],
        [3, 4]])
tensor([[1, 2],
        [3, 4]])


In [None]:
# length of the 'src' attribute of the first training example
# (presumably its number of tokens — confirm)
len(vars(train_data.examples[0])['src'])

In [None]:
# attribute dictionary of the first example in the dataset behind the training iterator
vars(train_iterator.dataset[0])

In [None]:
# NOTE(review): Seq2Seq as defined in this notebook never assigns a
# `context_vectors` attribute (forward() only returns the tensor), so this
# lookup is expected to raise AttributeError — confirm what was meant to be inspected
model.context_vectors[0][1][0].shape

In [None]:
# NOTE(review): Seq2Seq as defined in this notebook never assigns a
# `context_vectors` attribute (forward() only returns the tensor), so this
# lookup is expected to raise AttributeError — confirm what was meant to be inspected
model.context_vectors[0][0]