# 1)-Importing key Modules



In [0]:
#support both Python 2 and Python 3 with minimal overhead.
from __future__ import absolute_import, division, print_function

# I am an engineer: I care only about errors, not warnings. So let's be mavericks and ignore all warnings.
import warnings
warnings.filterwarnings('ignore')

In [0]:
from collections import defaultdict
import math
import time
import random
import os, sys

import torch
import torch.nn as nn
from torch.autograd import Variable

In [3]:
! pip install version_information



In [4]:
# first install: pip install version_information
%reload_ext version_information
%version_information pandas,torch,numpy

Software,Version
Python,3.6.8 64bit [GCC 8.3.0]
IPython,5.5.0
OS,Linux 4.14.137+ x86_64 with Ubuntu 18.04 bionic
pandas,0.25.3
torch,1.3.1+cu100
numpy,1.17.4
Sat Nov 16 13:09:20 2019 UTC,Sat Nov 16 13:09:20 2019 UTC


# 2)- Setting up neural network

In [0]:
# Feed-forward Neural Network Language Model
class FNN_LM(nn.Module):
  """Feed-forward neural network language model.

  Each of the `num_hist` history words is embedded, the embeddings are
  flattened into one feature vector per example, and that vector is passed
  through a tanh hidden layer with dropout to produce one score per
  vocabulary word.

  Args:
    nwords: vocabulary size (embedding rows and output width).
    emb_size: dimension of each word embedding.
    hid_size: width of the hidden layer.
    num_hist: number of history words per example.
    dropout: dropout probability applied after the tanh.
  """

  def __init__(self, nwords, emb_size, hid_size, num_hist, dropout):
    super(FNN_LM, self).__init__()
    self.embedding = nn.Embedding(nwords, emb_size)
    # Parameterised layers are created in the order they are applied.
    hidden_layer = nn.Linear(num_hist*emb_size, hid_size)
    output_layer = nn.Linear(hid_size, nwords)
    self.fnn = nn.Sequential(hidden_layer, nn.Tanh(), nn.Dropout(dropout), output_layer)

  def forward(self, words):
    """Return logits of size [batch_size x nwords] for a batch of histories."""
    # [batch_size x num_hist x emb_size]
    embedded = self.embedding(words)
    # Collapse everything after the first axis: [batch_size x (num_hist*emb_size)]
    batch_size = embedded.size(0)
    flat = embedded.view(batch_size, -1)
    return self.fnn(flat)

### 2.1)- Parameters

In [0]:
# Hyper-parameters handed to FNN_LM when the model is built.
N = 2 # Number of history words the model conditions on (passed as num_hist; histories start as [S] * N)
EMB_SIZE = 128 # The size of the embedding
HID_SIZE = 128 # The size of the hidden layer

In [0]:
# True when a CUDA device is available; the model and the input tensors are moved to the GPU in that case.
USE_CUDA = torch.cuda.is_available()

### 2.2)-Functions to read in the corpus

- NOTE: We are using data from the Penn Treebank, which is already converted
into an easy-to-use format with "<unk>" symbols. If we were using other
data we would have to do pre-processing and consider how to choose unknown words, etc.

In [0]:

# Word -> id vocabulary. Looking up an unseen word assigns it the next free
# id (the current size of the dictionary).
w2i = defaultdict(lambda: len(w2i))
# Reserve the first two ids for the sentence-boundary and unknown-word symbols.
S, UNK = w2i["<s>"], w2i["<unk>"]


def read_dataset(filename):
  """Lazily read `filename`, yielding one list of word ids per line.

  Each line is stripped and split on single spaces; every token is looked
  up in the module-level `w2i` at the time the line is consumed.
  """
  with open(filename, "r") as corpus:
    for raw_line in corpus:
      tokens = raw_line.strip().split(" ")
      ids = []
      for token in tokens:
        ids.append(w2i[token])
      yield ids

# 3)- Loading Data

### Loading data using the traditional format
using the `read_dataset()` helper defined above

In [0]:
# Read the training set first: while w2i still grows on lookup, every word
# seen here receives its own id.
train = list(read_dataset("train.txt"))
# Freeze the vocabulary: same entries, but unseen words now default to UNK.
w2i = defaultdict(lambda: UNK, w2i)
# Take the reverse mapping and the vocabulary size BEFORE reading any further
# data. A defaultdict stores the default for every missing key it is asked
# for, so reading the dev set can add extra "word -> UNK" entries; computed
# afterwards, len(w2i) would over-count and i2w[UNK] could become a dev-only
# word instead of "<unk>".
i2w = {v: k for k, v in w2i.items()}
nwords = len(w2i)
dev = list(read_dataset("valid.txt"))

# 4)- Model Building

### 4.1)- Initialize the model and the optimizer

In [0]:
# Build the language model from the vocabulary size and the hyper-parameters above.
model = FNN_LM(nwords=nwords, emb_size=EMB_SIZE, hid_size=HID_SIZE, num_hist=N, dropout=0.2)
if USE_CUDA:
  model = model.cuda()
# The optimizer must exist on CPU-only machines too, so it is created outside
# the CUDA branch (and after the optional move of the model to the GPU).
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

def convert_to_variable(words):
  """Turn a (nested) list of ints into a pytorch Variable holding a LongTensor.

  The result is moved to the GPU when USE_CUDA is true.
  """
  tensor = torch.LongTensor(words)
  wrapped = Variable(tensor)
  return wrapped.cuda() if USE_CUDA else wrapped

### 4.2)-calculate scores

In [0]:
def calc_score_of_histories(words):
  """Score a batch of histories with the module-level `model`.

  `words` is a list of histories (each a list of word ids). It is converted
  to a LongTensor Variable and fed to the model; the logits are returned.
  """
  return model(convert_to_variable(words))

### 4.3)-loss value for the entire sentence

In [0]:
def calc_sent_loss(sent):
  """Return the summed cross-entropy loss of one sentence.

  Args:
    sent: list of word ids (without boundary symbols).

  Returns:
    A scalar tensor: the sum, over every word of `sent` plus the closing
    sentence-boundary symbol S, of the cross-entropy of predicting that
    word from the N preceding ids.
  """
  # The initial history consists of N sentence-boundary symbols
  hist = [S] * N
  # Step through the sentence, including the end of sentence token, and
  # collect one (history, target) pair per position so the whole sentence
  # is scored in a single batch.
  all_histories = []
  all_targets = []
  for next_word in sent + [S]:
    all_histories.append(list(hist))
    all_targets.append(next_word)
    # Slide the window: drop the oldest id, append the newest.
    hist = hist[1:] + [next_word]

  logits = calc_score_of_histories(all_histories)
  # reduction="sum" is the supported spelling of the deprecated size_average=False.
  loss = nn.functional.cross_entropy(logits, convert_to_variable(all_targets), reduction="sum")

  return loss

### 4.4)-Generate a sentence

In [0]:
# Upper bound on the number of words generate_sent() collects before it stops.
MAX_LEN = 100

In [15]:
def generate_sent():
  """Sample one sentence from the module-level model.

  Starting from a history of N sentence-boundary symbols, repeatedly sample
  the next word id from the softmax over the model's logits until the
  boundary symbol S is drawn or MAX_LEN words have been collected.

  Returns:
    A list of plain Python int word ids (S is not included).

  Note: the model contains a dropout layer; call model.eval() beforehand if
  sampling should run without dropout.
  """
  hist = [S] * N
  sent = []
  # Sampling needs no gradients, so skip building the autograd graph.
  with torch.no_grad():
    while True:
      logits = calc_score_of_histories([hist])
      # logits is [1 x nwords]; normalise over the vocabulary axis explicitly.
      prob = nn.functional.softmax(logits, dim=1)
      # num_samples is a required argument; .item() turns the single drawn
      # id into a Python int so it can index i2w and be mixed into `hist`.
      next_word = prob.multinomial(num_samples=1).item()
      if next_word == S or len(sent) == MAX_LEN:
        break
      sent.append(next_word)
      hist = hist[1:] + [next_word]
  return sent

# Dev-loss trackers read by the evaluation cells further down; both start
# at a value larger than any real loss.
last_dev = 1e20
best_dev = 1e20

for ITER in range(5):
  # Perform training
  random.shuffle(train)
  # set the model to training mode
  model.train()
  train_words, train_loss = 0, 0.0
  start = time.time()
  for sent_id, sent in enumerate(train):
    my_loss = calc_sent_loss(sent)
    # .item() keeps the running total a plain Python float instead of a
    # (possibly GPU-resident) tensor, so math.exp and %-formatting below
    # operate on ordinary numbers.
    train_loss += my_loss.item()
    train_words += len(sent)
    # One optimizer step per sentence.
    optimizer.zero_grad()
    my_loss.backward()
    optimizer.step()
    if (sent_id+1) % 5000 == 0:
      print("--finished %r sentences (word/sec=%.2f)" % (sent_id+1, train_words/(time.time()-start)))
  print("iter %r: train loss/word=%.4f, ppl=%.4f (word/sec=%.2f)" % (ITER, train_loss/train_words, math.exp(train_loss/train_words), train_words/(time.time()-start)))

--finished 5000 sentences (word/sec=8074.55)
--finished 10000 sentences (word/sec=8059.94)
--finished 15000 sentences (word/sec=8212.21)
--finished 20000 sentences (word/sec=8202.63)
--finished 25000 sentences (word/sec=8231.49)
--finished 30000 sentences (word/sec=8244.91)
--finished 35000 sentences (word/sec=8234.18)
--finished 40000 sentences (word/sec=8257.45)
iter 0: train loss/word=6.2654, ppl=526.0477 (word/sec=8273.65)
--finished 5000 sentences (word/sec=8264.47)
--finished 10000 sentences (word/sec=8363.97)
--finished 15000 sentences (word/sec=8349.43)
--finished 20000 sentences (word/sec=8290.85)
--finished 25000 sentences (word/sec=8315.94)
--finished 30000 sentences (word/sec=8341.71)
--finished 35000 sentences (word/sec=8390.00)
--finished 40000 sentences (word/sec=8394.06)
iter 1: train loss/word=5.7669, ppl=319.5546 (word/sec=8366.39)
--finished 5000 sentences (word/sec=8633.65)
--finished 10000 sentences (word/sec=8465.88)
--finished 15000 sentences (word/sec=8431.93)
-

# 5)- Evaluate

In [0]:
  # Evaluation mode (disables dropout), then sum the loss over the dev set.
  model.eval()
  dev_words, dev_loss = 0, 0.0
  start = time.time()
  # No parameters are updated here, so skip autograd bookkeeping.
  with torch.no_grad():
    for sent in dev:
      my_loss = calc_sent_loss(sent)
      # Accumulate as a plain Python float rather than a tensor.
      dev_loss += my_loss.item()
      dev_words += len(sent)

In [0]:
 # Keep track of the development accuracy and reduce the learning rate if it got worse
  if last_dev < dev_loss:
    optimizer.learning_rate /= 2
  last_dev = dev_loss

In [0]:
 # Keep track of the best development accuracy, and save the model only if it's the best one
  if best_dev > dev_loss:
    torch.save(model, "model.pt")
    best_dev = dev_loss

In [19]:
# Report dev-set loss per word, perplexity and evaluation throughput.
# ITER still holds the last value assigned by the `for ITER in range(5)` training loop.
  print("iter %r: dev loss/word=%.4f, ppl=%.4f (word/sec=%.2f)" % (ITER, dev_loss/dev_words, math.exp(dev_loss/dev_words), dev_words/(time.time()-start)))

iter 4: dev loss/word=5.7182, ppl=304.3606 (word/sec=16255.00)


# 6)-Generate a few sentences

In [0]:
 # Sample five sentences from the model and print each as space-separated words.
 for _ in range(5):
    sent = generate_sent()
    # int(x) makes the lookup work whether the ids arrive as Python ints or
    # as 0-dim tensors (tensor objects would miss the integer keys of i2w).
    print(" ".join(i2w[int(x)] for x in sent))