# Next Word Predictor

## Import Libraries and Packages

In [None]:
import os

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

## Preprocessing/Cleaning data, Build Datasets, One-hot vectors/Embeddings

In [None]:
# Read the training corpus; the context manager closes the file handle
# as soon as the lines are in memory.
with open("train4.txt") as corpus_file:
    f_list = corpus_file.readlines()

# Strip the trailing newline (\n) from every line
f_lines = [line.rstrip("\n") for line in f_list]

# Split each line on single spaces into a list of raw words
lines = [line.split(" ") for line in f_lines]

# Flatten the per-line word lists into one list of raw words
wds = [raw for line_words in lines for raw in line_words]

# Characters kept inside a word (ASCII letters) and punctuation marks that
# become tokens of their own.
valid = list("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz")
p_valid = [".", ",", "!", "?", ":", ";"]

# Set copies give O(1) membership tests inside the per-character loop.
_letters = set(valid)
_marks = set(p_valid)

# The vocabulary is collected in a set (constant-time de-duplication) and
# turned into the sorted list `words_dict` at the end.
_vocab = set(p_valid)
seq = []

for word in wds:
    # Keep only the letters of the raw word. A raw word without any letters
    # (e.g. the empty string produced by consecutive spaces or a blank line)
    # yields the empty-string token "", which is kept as a token.
    s = "".join(char for char in word if char in _letters)

    # Remember the LAST punctuation mark that occurs in the raw word, if any.
    p_s = ""
    for char in word:
        if char in _marks:
            p_s = char

    seq.append(s)
    _vocab.add(s)
    if p_s != "":
        # The punctuation mark follows its word as a separate token.
        seq.append(p_s)

# A single space is always part of the vocabulary.
_vocab.add(" ")
words_dict = sorted(_vocab)

In [None]:
# Build dataset

Tx = 32
len_seq = len(seq)

# Every training example is a window of Tx consecutive tokens; windows start
# at every second token. The target window is the input window shifted one
# token to the right, so position t of Y holds the token that follows
# position t of X.
window_starts = range(0, len_seq - Tx, 2)
X = [seq[start:start + Tx] for start in window_starts]
Y = [seq[start + 1:start + Tx + 1] for start in window_starts]

In [None]:
# One hot vector or Embedding

m = len(X)              # number of training examples
nx = len(words_dict)    # vocabulary size

# Allocate as float32: the values are only ever 0 or 1 and the arrays are
# converted to float32 tensors for training, so float64 would just double
# the memory of two (m, Tx, nx) arrays.
x = np.zeros((m, Tx, nx), dtype=np.float32)
y = np.zeros((m, Tx, nx), dtype=np.float32)

# Lookup tables between vocabulary index and token
idx_word = dict(enumerate(words_dict))
word_idx = {w: i for i, w in enumerate(words_dict)}

# Creating one hot vectors: set the vocabulary slot of each input token in x
# and of the matching target token in y.
for i, (inputs, targets) in enumerate(zip(X, Y)):
    for t, (wrd, nxt) in enumerate(zip(inputs, targets)):
        x[i, t, word_idx[wrd]] = 1
        y[i, t, word_idx[nxt]] = 1

## Model Initialization, Train for number of Epochs, Store/Save Parameters

In [None]:
Ty = Tx
h_size = 64
num_layers = 2
dropout = 0.2
batch_sz = 20
num_batches = int(m/batch_sz)
out_loss = 0

x = torch.from_numpy(x).type(torch.FloatTensor)
y = torch.from_numpy(y).type(torch.FloatTensor)

h_0 = torch.rand(num_layers*2, batch_sz, h_size).type(torch.FloatTensor)
c_0 = torch.rand(num_layers*2, batch_sz, h_size).type(torch.FloatTensor)

optimizer = optim.Adam((h_0, c_0))
        
rnn_lstm = nn.LSTM(input_size=nx, hidden_size=h_size, num_layers=num_layers, bias=False, batch_first=True, dropout=dropout, bidirectional=True)
        
linear = nn.Linear(h_size*2, nx)
        
softmax = nn.Softmax()
        
loss = nn.CrossEntropyLoss()

In [None]:
epochs = int(input("Enter number of epochs: "))

for i in range(epochs):
    for j in range(num_batches):
        x_batch = x[(j*batch_sz):((j+1)*batch_sz), :, :]
        y_batch = y[(j*batch_sz):((j+1)*batch_sz), :, :]
        y_batch_m = torch.mean(y_batch, 1, True)
        y_batch_s = torch.squeeze(y_batch_m, 1)
                
        if torch.cuda.is_available():
            print("CUDA")
            device = torch.device("cuda")
            x_batch = x_batch.to(device)
            y_batch_s = y_batch_s.to(device)
            h_0 = h_0.to(device)
            c_0 = c_0.to(device)
                
        out, (h_0, c_0) = rnn_lstm(x_batch, (h_0, c_0))
        out_l = linear(out)
        y_pred = softmax(out_l)
        out_loss = loss(y_pred, y_batch_s.type(torch.LongTensor))
        out_loss.backward(retain_graph=True)
        optimizer.step()
                
    print("Epoch:", (i+1), "\t Loss:", out_loss, "\n")
            
torch.save(h_0, "Parameters/h.pt")
torch.save(c_0, "Parameters/c.pt")

## Sampling and Predicting