In [1]:
import numpy as np
import pandas as pd
import pickle
import re
import random

import torch
import torch.nn as nn
import torch.nn.functional as F


https://www.analyticsvidhya.com/blog/2020/08/build-a-natural-language-generation-nlg-system-using-pytorch/

In [3]:
# read pickle file
pickle_in = open("plots_text.pickle","rb")
movie_plots = pickle.load(pickle_in)

# count of movie plot summaries
print(len(movie_plots))

movie_plots = [re.sub("[^a-z' ]", "", i).lower() for i in movie_plots]

500


In [4]:
def create_seq(text, seq_len = 5):

    sequences = []

    # if the number of tokens in 'text' is greater than 5
    if len(text.split()) > seq_len:
      for i in range(seq_len, len(text.split())):
        # select sequence of tokens
        seq = text.split()[i-seq_len:i+1]
        # add to the list
        sequences.append(" ".join(seq))

      return sequences

    # if the number of tokens in 'text' is less than or equal to 5
    else:

      return [text]

In [5]:
seqs = [create_seq(i) for i in movie_plots]

# merge list-of-lists into a single list
seqs = sum(seqs, [])

# count of sequences
len(seqs)

152644

In [5]:
x = []
y = []

for s in seqs:
  x.append(" ".join(s.split()[:-1]))
  y.append(" ".join(s.split()[1:]))

In [6]:
# create integer-to-token mapping
int2token = {}
cnt = 0

for w in set(" ".join(movie_plots).split()):
  int2token[cnt] = w
  cnt+= 1

# create token-to-integer mapping
token2int = {t: i for i, t in int2token.items()}

token2int["the"], int2token[14271]

(591, 'ensue')

In [13]:
def get_integer_seq(seq):
  return [token2int[w] for w in seq.split()]

# convert text sequences to integer sequences
x_int = [get_integer_seq(i) for i in x]
y_int = [get_integer_seq(i) for i in y]

# convert lists to numpy arrays
x_int = np.array(x_int)
y_int = np.array(y_int)

In [14]:
def get_batches(arr_x, arr_y, batch_size):

    # iterate through the arrays
    prv = 0
    for n in range(batch_size, arr_x.shape[0], batch_size):
      x = arr_x[prv:n,:]
      y = arr_y[prv:n,:]
      prv = n
      yield x, y

In [19]:
class ResumeNLP(nn.Module):

    def __init__(self, vocab_size : int, n_hidden=256, n_layers=4, dropout_prob=0.3, lr=0.003):
        super().__init__()

        self.drop_prob = dropout_prob
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.lr = lr

        self.intermediate_layer_size = 200
        self.emb_layer = nn.Embedding(vocab_size, self.intermediate_layer_size)

        self.lstm = nn.LSTM(self.intermediate_layer_size, self.n_hidden, self.n_layers,
                                dropout=self.drop_prob, batch_first=True)

        self.dropout = nn.Dropout(dropout_prob)

        self.fc = nn.Linear(self.n_hidden, vocab_size)

    def forward(self, x, hidden):
        embedded = self.emb_layer(x)
        lstm_output, hidden = self.lstm(embedded, hidden)
        out = self.dropout(lstm_output)
        out = out.reshape(-1, self.n_hidden)
        out = self.fc(out)
        return out, hidden

    def init_hidden(self, batch_size):

        weight = next(self.parameters()).data
        if torch.cuda.is_available():
            hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda(),
                      weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda())
        else :
            hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_(),
                      weight.new(self.n_layers, batch_size, self.n_hidden).zero_())
        return hidden

In [20]:

vocab_size = len(int2token)
# instantiate the model
net = ResumeNLP(vocab_size)

# push the model to GPU (avoid it if you are not using the GPU)
net.cuda()

print(net)

ResumeNLP(
  (emb_layer): Embedding(16592, 200)
  (lstm): LSTM(200, 256, num_layers=32, batch_first=True, dropout=0.3)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=256, out_features=16592, bias=True)
)


In [21]:
def train(net, epochs=10, batch_size=32, lr=0.001, clip=1, print_every=32):

    # optimizer
    opt = torch.optim.Adam(net.parameters(), lr=lr)

    # loss
    criterion = nn.CrossEntropyLoss()

    # push model to GPU
    net.cuda()

    counter = 0

    net.train()

    for e in range(epochs):

        # initialize hidden state
        h = net.init_hidden(batch_size)

        for x, y in get_batches(x_int, y_int, batch_size):
            counter+= 1

            # convert numpy arrays to PyTorch arrays
            inputs, targets = torch.from_numpy(x), torch.from_numpy(y)

            # push tensors to GPU
            inputs, targets = inputs.cuda(), targets.cuda()

            # detach hidden states
            h = tuple([each.data for each in h])

            # zero accumulated gradients
            net.zero_grad()

            # get the output from the model
            output, h = net(inputs, h)

            # calculate the loss and perform backprop
            loss = criterion(output, targets.view(-1))

            # back-propagate error
            loss.backward()

            # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
            nn.utils.clip_grad_norm_(net.parameters(), clip)

            # update weigths
            opt.step()

            if counter % print_every == 0:

              print("Epoch: {}/{}...".format(e+1, epochs),
                    "Step: {}...".format(counter))

In [24]:
train(net, batch_size = 1024, epochs=20, print_every=256)

OutOfMemoryError: CUDA out of memory. Tried to allocate 326.00 MiB (GPU 0; 3.82 GiB total capacity; 2.99 GiB already allocated; 305.31 MiB free; 3.17 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [14]:
# predict next token
def predict(net, tkn, h=None):

  # tensor inputs
  x = np.array([[token2int[tkn]]])
  inputs = torch.from_numpy(x)

  # push to GPU
  inputs = inputs.cuda()

  # detach hidden state from history
  h = tuple([each.data for each in h])

  # get the output of the model
  out, h = net(inputs, h)

  # get the token probabilities
  p = F.softmax(out, dim=1).data

  p = p.cpu()

  p = p.numpy()
  p = p.reshape(p.shape[1],)

  # get indices of top 3 values
  top_n_idx = p.argsort()[-3:][::-1]

  # randomly select one of the three indices
  sampled_token_index = top_n_idx[random.sample([0,1,2],1)[0]]

  # return the encoded value of the predicted char and the hidden state
  return int2token[sampled_token_index], h


# function to generate text
def sample(net, size, prime='it is'):

    # push to GPU
    net.cuda()

    net.eval()

    # batch size is 1
    h = net.init_hidden(1)

    toks = prime.split()

    # predict next token
    for t in prime.split():
      token, h = predict(net, t, h)

    toks.append(token)

    # predict subsequent tokens
    for i in range(size-1):
        token, h = predict(net, toks[-1], h)
        toks.append(token)

    return ' '.join(toks)

In [15]:
sample(net, 15)

'it is the first thing she has not been waiting by his family and that the two'

In [16]:
sample(net, 15, prime = "one of the")

'one of the film is a vampire of divine sports in a small village with a neighboring life'