In [37]:
from google.colab import drive
import sys, os
# Mount Google Drive under /content/drive/ so files stored there can be read by later cells.
drive.mount('/content/drive/')

Mounted at /content/drive/


In [38]:
import nltk
from nltk import word_tokenize
# Download the 'punkt' tokenizer data; presumably needed by word_tokenize — TODO confirm.
nltk.download('punkt')

from sklearn.model_selection import train_test_split
import numpy as np
import random
import matplotlib.pyplot as plt
from IPython.display import clear_output

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [39]:
# Directory (on the mounted Drive) whose files are read as the training corpus.
training_data = "/content/drive/My Drive/Sussex AI/Spring Semester/Advanced NLP/Week 2/sentence-completion/Holmes_Training_Data"

In [84]:
# Read up to `n_docs` decodable files from `training_data` into one long string.
text = ''
n_docs = 50
i = 0  # number of files read successfully so far
docs = []
# os.listdir returns entries in arbitrary order; sorting makes the chosen subset reproducible.
for f,file in enumerate(sorted(os.listdir(training_data))):
  # Check the quota first so no file is announced without being read.
  if i >= n_docs:
    break
  print('Processing file {}'.format(f))
  try:
    with open(os.path.join(training_data,file)) as instream:
      # Newlines become spaces; deleting them would fuse the last word of a
      # line with the first word of the next one.
      docs.append(instream.read().replace('\n', ' '))
      i += 1
  except UnicodeDecodeError:
    # Undecodable files are skipped and do not count towards n_docs.
    print('Error processing file {}'.format(f))

# Separate documents with a space for the same reason as above.
text = ' '.join(docs)

Processing file 0
Processing file 1
Processing file 2
Processing file 3
Processing file 4
Processing file 5
Processing file 6
Processing file 7
Processing file 8
Processing file 9
Processing file 10
Processing file 11
Processing file 12
Processing file 13
Processing file 14
Processing file 15
Processing file 16
Processing file 17
Processing file 18
Processing file 19
Processing file 20
Processing file 21
Processing file 22
Error processing file 22
Processing file 23
Processing file 24
Processing file 25
Processing file 26
Processing file 27
Processing file 28
Processing file 29
Processing file 30
Processing file 31
Processing file 32
Processing file 33
Processing file 34
Processing file 35
Processing file 36
Processing file 37
Processing file 38
Processing file 39
Processing file 40
Processing file 41
Processing file 42
Processing file 43
Processing file 44
Processing file 45
Processing file 46
Processing file 47
Processing file 48
Processing file 49
Processing file 50
Processing file 

In [85]:
# Tokenise the corpus, keep only purely alphabetic tokens, and lower-case them.
tokenized_text = list(map(str.lower, filter(str.isalpha, word_tokenize(text))))

Get vocabulary from corpus

In [86]:
# Count how many times each token occurs in the corpus.
vocab = {}
for token in tokenized_text:
  if token in vocab:
    vocab[token] += 1
  else:
    vocab[token] = 1

Sort the vocabulary by frequency (most common first) and assign a unique integer id to each token. Note that no cut-off to the $n$ most common words is applied yet.

In [87]:
def word_to_index(tokens):
  """Assign every token an integer id according to its frequency rank.

  Args:
    tokens: dict mapping token -> occurrence count.

  Returns:
    dict mapping token -> id. The token with the highest count gets id 0,
    the next one id 1, and so on. Tokens with equal counts keep the order
    in which they appear in `tokens`.
  """
  # Sorting the keys by their count is stable, so ties keep insertion order.
  ranked = sorted(tokens, key=tokens.get, reverse=True)

  word_to_id = {}
  for rank, word in enumerate(ranked):
    word_to_id[word] = rank
  return word_to_id

# Token -> id lookup for the whole corpus, plus the tokens listed in id order.
word_to_id = word_to_index(vocab)
total_vocab = [token for token in word_to_id]

In [88]:
def index_to_word(dic_inds, ind):
  """Return the token whose id equals `ind`.

  Args:
    dic_inds: dict mapping token -> integer id.
    ind: id to look up (anything comparable with the stored ids via ==).

  Returns:
    The first token, in dict order, whose id equals `ind`.

  Raises:
    ValueError: if no token is mapped to `ind`.
  """
  # Scan the items directly and stop at the first hit instead of copying all
  # keys and all values into two fresh lists on every call.
  for token, token_id in dic_inds.items():
    if token_id == ind:
      return token

  raise ValueError('{} is not a known token id'.format(ind))

Create sequences

In [89]:
def create_seq(text, seq_len = 5):
    """Slide a window over a token list and return the windows as strings.

    Args:
        text: sequence of tokens (e.g. a list of words).
        seq_len: number of context tokens per window; every window holds
            `seq_len + 1` tokens.

    Returns:
        A list of strings, each made of the window's tokens joined by single
        spaces. When `text` has at most `seq_len` tokens, the list contains
        one string made of all of `text`.
    """
    # Too short for a full window: return the whole input as one sequence.
    # It is joined like every other window so callers always receive strings
    # (the previous version returned the raw token list here).
    if len(text) <= seq_len:
        return [" ".join(text)]

    # Window ending at position i covers tokens i-seq_len .. i inclusive.
    return [" ".join(text[i - seq_len:i + 1]) for i in range(seq_len, len(text))]

In [90]:
# Windows of 5 context tokens plus the token that follows them.
sequences = create_seq(tokenized_text, seq_len=5)

Split sequences into inputs and targets

In [91]:
# Input = every token of a sequence except the last;
# target = every token except the first (the input shifted by one position).
X = []
y = []

for seq in sequences:
  words = seq.split()
  X.append(" ".join(words[:-1]))
  y.append(" ".join(words[1:]))

Convert inputs and targets to integers

In [92]:
def to_integer_sequence(sequence):
  """Encode a space-separated string of tokens as a list of vocabulary ids.

  The string is re-tokenised with `word_tokenize`; each token is looked up in
  the global `word_to_id`. Tokens that are missing from the vocabulary, or
  whose id is 0, are left out of the result.
  """
  ids = []
  for token in word_tokenize(sequence):
    token_id = word_to_id.get(token, 0)
    # NOTE(review): 0 is used as the "unknown" marker, but word_to_index hands
    # id 0 to the most frequent token, so that token is dropped here as well
    # — confirm whether this is intended.
    if token_id != 0:
      ids.append(token_id)
  return ids

# Encode every input/target string once (previously each string was tokenised
# and encoded twice: once for the length check and once for the value) and
# keep only rows that still have at least 5 ids.
# NOTE(review): X and y are filtered independently, so a row dropped from one
# list is not necessarily dropped from the other — verify that X_int and y_int
# stay aligned row by row.
X_int = [ids for ids in map(to_integer_sequence, X) if len(ids) >= 5]
y_int = [ids for ids in map(to_integer_sequence, y) if len(ids) >= 5]

X_int = np.array(X_int, dtype=int)
y_int = np.array(y_int, dtype=int)

# RNN

In [49]:
# Make CUDA kernel launches synchronous so device-side errors are reported at
# the call that caused them. Set before torch is imported in the next cell.
os.environ.update({'CUDA_LAUNCH_BLOCKING': "1"})

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data_utils

In [129]:
# Fraction of the data held out for validation.
ts = 0.25
# NOTE(review): `n` is not referenced by any code visible in this notebook — confirm it is still needed.
n = 1000

# NOTE(review): X_int[:-1] only lines up with y_int when X_int has exactly one
# more row than y_int; verify that the rows really correspond pairwise.
X_train, X_val, y_train, y_val = train_test_split(
    X_int[:-1], y_int, test_size=ts, random_state=1)

# Convert every numpy split to an int64 tensor.
features_train, targets_train, features_val, targets_val = (
    torch.from_numpy(split).long()
    for split in (X_train, y_train, X_val, y_val)
)

# Pair inputs with targets so a DataLoader can batch them together.
train_data = data_utils.TensorDataset(features_train, targets_train)
val_data = data_utils.TensorDataset(features_val, targets_val)

In [130]:
class LSTM(nn.Module):
    """Word-level language model: embedding -> stacked LSTM -> dropout -> linear.

    Args:
        vocab_size: number of distinct token ids (size of the output layer).
        embedding_size: dimensionality of the token embeddings.
        n_hidden: hidden size of every LSTM layer.
        n_layers: number of stacked LSTM layers.
        drop_prob: dropout probability, used between LSTM layers and on the
            LSTM output before the final linear layer.
    """

    def __init__(self, vocab_size, embedding_size, n_hidden, n_layers,
                 drop_prob):
        super().__init__()

        # Kept for backward compatibility; the hidden state now follows the
        # device of the model's own parameters instead (see init_hidden).
        self.device = ("cuda" if torch.cuda.is_available() else "cpu")
        self.n_layers = n_layers
        self.n_hidden = n_hidden

        self.emb_layer = nn.Embedding(vocab_size, embedding_size)

        # nn.LSTM applies its dropout only between layers and warns when it is
        # non-zero for a single layer, so pass 0 in that case.
        self.lstm = nn.LSTM(embedding_size, n_hidden, n_layers,
                            dropout=drop_prob if n_layers > 1 else 0.0,
                            batch_first=True)

        self.dropout = nn.Dropout(drop_prob)

        self.fc = nn.Linear(n_hidden, vocab_size)

    def forward(self, x, hidden):
        """Run one forward pass.

        Args:
            x: integer tensor of token ids; batch dimension first
                (the LSTM is built with batch_first=True).
            hidden: tuple (h, c) of LSTM states, e.g. from `init_hidden`.

        Returns:
            (out, hidden): `out` holds one row of vocabulary scores per input
            position, flattened to (batch * seq_len, vocab_size); `hidden` is
            the updated LSTM state tuple.
        """
        embeddings = self.emb_layer(x)

        lstm_output, hidden = self.lstm(embeddings, hidden)

        out = self.dropout(lstm_output)

        # Flatten batch and time so every position is scored independently.
        out = out.reshape(-1, self.n_hidden)

        out = self.fc(out)

        return out, hidden

    def init_hidden(self, batch_size):
        """Return zero-filled (h, c) states for a batch of `batch_size`.

        Each state has shape (n_layers, batch_size, n_hidden) and is created
        with the dtype and device of the model's parameters, so it matches
        the model wherever the model currently lives.
        """
        weight = next(self.parameters())
        shape = (self.n_layers, batch_size, self.n_hidden)
        hidden = (weight.new_zeros(shape), weight.new_zeros(shape))

        return hidden

In [131]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

def train(model, train_data, val_data, optimizer, criterion, bs, epochs, clip):
    """Train `model`, validating and re-plotting the loss curves every epoch.

    Args:
        model: network exposing `init_hidden(batch_size)` and
            `forward(inputs, hidden) -> (output, hidden)`.
        train_data: dataset yielding (inputs, targets) pairs for training.
        val_data: dataset yielding (inputs, targets) pairs for validation.
        optimizer: optimizer already bound to `model.parameters()`.
        criterion: loss called as `criterion(output, targets.view(-1))`.
        bs: batch size; incomplete final batches are dropped.
        epochs: number of passes over the training data.
        clip: maximum gradient norm used for clipping.

    Returns:
        The trained model (the same object that was passed in).
    """
    # Model to device: cuda (if available) or cpu
    model.to(device)

    # drop_last keeps every batch at exactly `bs` rows, matching the shape of
    # the hidden state created by init_hidden(bs).
    train_loader = data_utils.DataLoader(train_data, batch_size=bs,
                                         shuffle=True, drop_last=True)
    validation_loader = data_utils.DataLoader(val_data, batch_size=bs,
                                              shuffle=True, drop_last=True)

    train_losses = []
    val_losses = []
    for e in range(epochs):

        # Re-enable training mode: the validation pass below switches to
        # eval(), which would otherwise leave dropout off from epoch 2 onward.
        model.train()

        # NOTE(review): the hidden state is carried from one shuffled batch to
        # the next — confirm this is intended.
        h = model.init_hidden(bs)
        train_loss_epoch = 0
        for inputs_train, targets_train in train_loader:

            # Tensors to device: cuda (if available) or cpu
            inputs_train, targets_train = inputs_train.to(device), targets_train.to(device)

            # Detach the hidden state so gradients stop at the batch boundary
            h = tuple(each.detach() for each in h)

            # Remove accumulated gradients
            optimizer.zero_grad()

            # Get output and hidden layer from forward pass
            output, h = model(inputs_train, h)

            # Get loss
            loss = criterion(output, targets_train.view(-1))
            train_loss_epoch += loss.item()

            # Backpropagation
            loss.backward()

            # Clip to prevent exploding gradients
            nn.utils.clip_grad_norm_(model.parameters(), clip)

            # Update weights
            optimizer.step()

        # Average over the number of batches (not the batch size); max() keeps
        # an empty loader from dividing by zero.
        train_losses.append(train_loss_epoch / max(len(train_loader), 1))

        valid_loss_epoch = 0
        h = model.init_hidden(bs)
        model.eval()
        with torch.no_grad():
            for inputs_val, targets_val in validation_loader:

                inputs_val, targets_val = inputs_val.to(device), targets_val.to(device)

                h = tuple(each.detach() for each in h)
                # Forward pass
                output, h = model(inputs_val, h)
                # Get loss
                loss = criterion(output, targets_val.view(-1))
                valid_loss_epoch += loss.item()

        val_losses.append(valid_loss_epoch / max(len(validation_loader), 1))

        # Redraw both curves in place of the previous epoch's figure.
        clear_output(wait=True)
        plt.plot(train_losses, label='Training')
        plt.plot(val_losses, label='Validation')
        plt.xlabel('Epoch')
        plt.ylabel('Loss')
        plt.legend(loc='best')
        plt.show()

        # Report the same per-batch averages that are plotted.
        print("Epoch: {}/{}".format(e+1, epochs),
          "| Training Loss: {}".format(train_losses[-1]),
          "| Validation Loss: {}".format(val_losses[-1]))

    return model

def predict(model, tkn, h=None, top_n=5):
  """Feed one token to the model and sample the next token.

  Args:
    model: trained network exposing `init_hidden` and `forward`.
    tkn: input token. If it is not in the vocabulary as given, its
      lower-cased form is looked up (the vocabulary is built from
      lower-cased tokens).
    h: LSTM state tuple from the previous step; a fresh zero state for a
      batch of one is created when omitted.
    top_n: how many of the most probable tokens to sample from (capped at
      the vocabulary size).

  Returns:
    (token, h): the sampled next token and the updated state.

  Raises:
    KeyError: if `tkn` is not in the vocabulary even after lower-casing.
  """
  # Vocabulary keys are lower-case, so fall back to the lower-cased token.
  key = tkn if tkn in word_to_id else tkn.lower()

  # Batch of one sequence holding one token, created directly on the device
  inputs = torch.tensor([[word_to_id[key]]], dtype=torch.long, device=device)

  # The default h=None used to crash on iteration; start from a zero state
  if h is None:
    h = model.init_hidden(1)

  # Detach hidden state from history
  h = tuple(each.detach() for each in h)

  # Inference only: no gradients needed
  with torch.no_grad():
    out, h = model(inputs, h)
    probs = F.softmax(out, dim=1)

  # One row of scores for the single input position -> flat vector
  probs = probs.cpu().numpy().reshape(-1)

  # Indices of the top-n probabilities, most probable first; n is clamped so
  # a vocabulary smaller than top_n cannot cause an out-of-range sample
  n = max(1, min(top_n, probs.shape[0]))
  top_n_idx = probs.argsort()[-n:][::-1]

  # Sample uniformly among the top-n candidates
  sampled_token_index = random.choice(top_n_idx.tolist())

  # Decode the id into its corresponding word
  token = index_to_word(word_to_id, sampled_token_index)
  return token, h


# function to generate text
def sample(model, size, prime='She'):
    """Generate text by repeatedly sampling the next token from the model.

    Args:
        model: trained network exposing `init_hidden` and `forward`.
        size: number of tokens to generate after the prime
            (at least one token is always generated).
        prime: whitespace-separated seed text used to warm up the hidden state.

    Returns:
        The prime tokens followed by the generated tokens, joined by spaces.

    Raises:
        ValueError: if `prime` contains no tokens.
        KeyError: if a prime token is not in the vocabulary.
    """
    # Model to device (cuda or cpu)
    model.to(device)
    # Evaluation mode (disables dropout)
    model.eval()

    # Batch size of 1
    h = model.init_hidden(1)
    # Split input tokens
    toks = prime.split()
    if not toks:
        # Without a prime token there is nothing to predict from.
        raise ValueError('prime must contain at least one token')

    # Warm up the hidden state on the prime. Tokens are lower-cased for the
    # lookup because the vocabulary is built from lower-cased text; the
    # returned string keeps the prime as written.
    for t in toks:
        token, h = predict(model, t.lower(), h)

    # The prediction after the last prime token is the first generated token
    toks.append(token)

    # Predict the remaining tokens, feeding back the latest one each time
    for i in range(size-1):
        token, h = predict(model, toks[-1], h)
        toks.append(token)

    return ' '.join(toks)
            

In [None]:
# Model hyper-parameters: embedding size, hidden size, LSTM layers, dropout
e_s, n_h, n_l, d_o = 100, 512, 2, 0.2
model = LSTM(
    len(total_vocab),
    embedding_size=e_s,
    n_hidden=n_h,
    n_layers=n_l,
    drop_prob=d_o,
)

# Training hyper-parameters: batch size, epochs, learning rate, gradient-norm clip
bs, e, lr, c = 64, 10, 1e-4, 1

# Optimizer
optimizer = optim.Adam(model.parameters(), lr=lr)

# Loss function
criterion = nn.CrossEntropyLoss()

trained_model = train(
    model,
    train_data,
    val_data,
    optimizer,
    criterion,
    bs,
    e,
    clip=c,
)

In [None]:
# Seed text and number of tokens to generate after it
first_tokens, predicted_length = 'She has', 10
sample(trained_model, size=predicted_length, prime=first_tokens)