In [None]:
!pip install nltk

In [None]:
import re
import random
import time
import math
import numpy as np
import nltk
import matplotlib.pyplot as plt
plt.style.use('default')

In [None]:
import urllib.request
import zipfile
import os

filename = "trainDevTestTrees_PTB.zip"
# Unpack the whole archive into the directory the notebook is running from.
print("Extracting files...")
with zipfile.ZipFile(filename, mode='r') as archive:
    archive.extractall(path=os.getcwd())
print("Extraction complete.")

In [None]:
# this function reads in a textfile and fixes an issue with "\\"
def filereader(path):
  """Lazily yield the lines of the UTF-8 text file at `path`.

  Each line is stripped of surrounding whitespace and has every backslash
  character removed before it is yielded.
  """
  with open(path, mode="r", encoding="utf-8") as handle:
    yield from (raw.strip().replace("\\", "") for raw in handle)

In [None]:
from nltk import Tree
from nltk.treeprettyprinter import TreePrettyPrinter

# Take the first line of the dev set as a running example.
s = next(filereader("trees/dev.txt"))
print(s)
# We can use NLTK to better visualise the tree structure of the sentence
# (Tree and TreePrettyPrinter are imported at the top of this cell).
tree = Tree.fromstring(s)
print(TreePrettyPrinter(tree))

In [None]:
# Let's first make a function that extracts the tokens (the leaves).

def tokens_from_treestring(s):
  """Return the whitespace-separated tokens left in a sentiment-tree string
  after deleting every "(digit " opener and every ")"."""
  brackets = re.compile(r"\([0-9] |\)")
  return brackets.sub("", s).split()

# Run the extractor on the example tree string `s` and show tokens + count.
tokens = tokens_from_treestring(s)
print(tokens, len(tokens), sep="\n")

In [None]:
# We will also need the following function, but you can ignore this for now.
# It is explained later on.

SHIFT = 0   # transition code emitted for a token
REDUCE = 1  # transition code emitted for a remaining closing bracket


def transitions_from_treestring(s):
  """Convert a bracketed sentiment-tree string into a list of transitions.

  Returns a list of ints containing only SHIFT and REDUCE, in the order in
  which they appear when reading the string left to right.
  """
  # every "(label text)" group whose text contains no ")" becomes a SHIFT
  s = re.sub(r"\([0-5] ([^)]+)\)", str(SHIFT), s)
  # put a space before each remaining ")" so it splits off as its own item
  s = re.sub(r"\)", " )", s)
  # drop any "(label " openers that are still left
  s = re.sub(r"\([0-4] ", "", s)
  # each remaining ")" becomes a REDUCE
  s = re.sub(r"\)", str(REDUCE), s)
  return list(map(int, s.split()))

In [None]:
# Now let's first see how large our data sets are.
for path in ("trees/train.txt", "trees/dev.txt", "trees/test.txt"):
  # one line per sentence, so counting lines counts sentences
  num_lines = 0
  for _ in filereader(path):
    num_lines += 1
  print("{:16s} {:4d}".format(path, num_lines))

In [None]:
from collections import namedtuple
from nltk import Tree

# A simple way to define a class is using namedtuple.
Example = namedtuple("Example", ["tokens", "tree", "label", "transitions"])


def examplereader(path, lower=False):
  """Yield one Example for every line of the tree file at `path`.

  If `lower` is true the complete line is lower-cased before it is parsed.
  The label is the integer value of the character at index 1 of the line.
  """
  for raw in filereader(path):
    text = raw.lower() if lower else raw
    yield Example(
        tokens=tokens_from_treestring(text),
        tree=Tree.fromstring(text),  # parsed with NLTK's Tree
        label=int(text[1]),
        transitions=transitions_from_treestring(text),
    )


# Let's load the data into memory.
# Each split is read completely and kept as a list of Example tuples.
LOWER = False  # we will keep the original casing
train_data = list(examplereader("trees/train.txt", lower=LOWER))
dev_data = list(examplereader("trees/dev.txt", lower=LOWER))
test_data = list(examplereader("trees/test.txt", lower=LOWER))

# Report how many examples each split contains.
print("train", len(train_data))
print("dev", len(dev_data))
print("test", len(test_data))

In [None]:
# Here we first define a class that can map a word to an ID (w2i)
# and back (i2w).

from collections import Counter, OrderedDict, defaultdict


class OrderedCounter(Counter, OrderedDict):
  """A Counter that keeps its keys in first-seen order."""

  def __repr__(self):
    # show the contents as an OrderedDict so the ordering is visible
    return "{}({!r})".format(type(self).__name__, OrderedDict(self))

  def __reduce__(self):
    # rebuild from an OrderedDict snapshot of the current contents
    return type(self), (OrderedDict(self),)


class Vocabulary:
  """A vocabulary, assigns IDs to tokens.

  Attributes (set in __init__):
    freqs: per-token occurrence counts, in first-seen order.
    w2i:   dict mapping token -> integer ID.
    i2w:   list mapping integer ID -> token.
  """

  def __init__(self):
    self.freqs = OrderedCounter()
    self.w2i = {}
    self.i2w = []

  def count_token(self, t):
    """Record one more occurrence of token `t`."""
    self.freqs[t] += 1

  def add_token(self, t):
    """Give token `t` the next free ID; a token that already has an ID is
    left untouched so that w2i and i2w always stay aligned."""
    if t in self.w2i:
      return
    self.w2i[t] = len(self.w2i)
    self.i2w.append(t)

  def build(self, min_freq=0):
    '''
    Assign IDs to the two special tokens and then to every counted token,
    most frequent first (ties keep their first-seen order).

    min_freq: minimum number of occurrences for a word to be included
              in the vocabulary
    '''
    self.add_token("<unk>")  # reserve 0 for <unk> (unknown words)
    self.add_token("<pad>")  # reserve 1 for <pad> (discussed later)

    # sorted() is stable, so equally frequent tokens keep insertion order
    by_frequency = sorted(self.freqs.items(), key=lambda x: x[1], reverse=True)
    for tok, freq in by_frequency:
      if freq >= min_freq:
        self.add_token(tok)

In [None]:
# This process should be deterministic and should have the same result
# if run multiple times on the same data set.

v = Vocabulary()
# Only the training split contributes to the token counts.
for ex in train_data:
  for token in ex.tokens:
    v.count_token(token)

v.build()
print("Vocabulary size:", len(v.w2i))

In [None]:
# Map the sentiment labels 0-4 to a more readable form:
# i2t[label] is the name of that label.
i2t = ["very negative", "negative", "neutral", "positive", "very positive"]
print(i2t)
print(i2t[4])

In [None]:
# And let's also create the opposite mapping.
# We won't use a Vocabulary for this (although we could), since the labels
# are already numeric.
# name -> index, in the same order as i2t
t2i = OrderedDict((name, idx) for idx, name in enumerate(i2t))
print(t2i)
print(t2i['very positive'])

In [None]:
import torch
print("Using torch", torch.__version__)

In [None]:
from torch import nn
# Prefer a CUDA GPU, then Apple's MPS backend, and fall back to the CPU.
# MPS availability is queried through torch.backends.mps, guarded so that
# a torch build without that module still falls through to the CPU.
if torch.cuda.is_available():
  device = torch.device('cuda')
elif getattr(torch.backends, 'mps', None) is not None and torch.backends.mps.is_available():
  device = torch.device('mps')
else:
  device = torch.device('cpu')
device

In [None]:
def prepare_example(example, vocab):
  """Convert a single example into input/target tensors on `device`.

  Returns (x, y): x is a LongTensor of shape [1, len(example.tokens)] with
  the token IDs, y is a LongTensor of shape [1] with the label.
  """
  unk_id = 0  # tokens missing from vocab.w2i get ID 0 (<unk>)
  token_ids = [vocab.w2i.get(token, unk_id) for token in example.tokens]

  x = torch.LongTensor([token_ids]).to(device)
  y = torch.LongTensor([example.label]).to(device)
  return x, y

In [None]:
def simple_evaluate(model, data, prep_fn=prepare_example, **kwargs):
  """Evaluate `model` on `data`, one example at a time.

  Returns (correct, total, accuracy). Extra keyword arguments are accepted
  and ignored.
  """
  model.eval()  # switch the model to evaluation mode (disables dropout)
  num_correct = 0
  num_seen = 0

  for num_seen, example in enumerate(data, start=1):
    # turn the example into an input tensor and a target tensor
    x, target = prep_fn(example, model.vocab)

    # forward pass only; no gradients are tracked
    with torch.no_grad():
      logits = model(x)

    # predicted class = index of the largest logit
    prediction = logits.argmax(dim=-1)
    num_correct += (prediction == target).sum().item()

  accuracy = num_correct / float(num_seen)
  return num_correct, num_seen, accuracy

In [None]:
def get_examples(data, shuffle=True, **kwargs):
  """Yield the items of `data` one at a time.

  With `shuffle` true, `data` is first shuffled in place. Extra keyword
  arguments are accepted and ignored.
  """
  if shuffle:
    print("Shuffling training data")
    random.shuffle(data)  # reorders the caller's list
  yield from data

In [None]:
from torch import optim

In [None]:
from tqdm import tqdm

def train_model(model, optimizer, num_iterations=10000,
                print_every=1000, eval_every=1000,
                batch_fn=get_examples,
                prep_fn=prepare_example,
                eval_fn=simple_evaluate,
                batch_size=1, eval_batch_size=None, seed=17):
  """Train `model` with `optimizer` for `num_iterations` parameter updates.

  The data sets are NOT parameters: this function reads the module-level
  `train_data`, `dev_data` and `test_data`.

  Args:
    model: network to train; `model.vocab` is passed on to `prep_fn`.
    optimizer: optimizer used for the updates.
    num_iterations: training stops once this many updates have been made.
    print_every: every this many updates the loss accumulated since the
      last report is printed and appended to `losses`.
    eval_every: every this many updates the model is evaluated on the dev
      set; a new best accuracy triggers a checkpoint.
    batch_fn: generator producing training batches from `train_data`.
    prep_fn: converts one batch into `(inputs, targets)` tensors.
    eval_fn: returns `(correct, total, accuracy)` for a data set.
    batch_size: batch size handed to `batch_fn`.
    eval_batch_size: batch size for evaluation; defaults to `batch_size`.
    seed: only used in the checkpoint file name `<ModelClass>_<seed>.pt`.

  Returns:
    `(losses, accuracies, best_iter, train_acc, dev_acc, test_acc)` where
    `losses` / `accuracies` are the values recorded during training and the
    three accuracies are measured after reloading the best checkpoint
    (or on the current weights if this run never saved one).
  """
  iter_i = 0
  train_loss = 0.
  start = time.time()
  criterion = nn.CrossEntropyLoss() # loss function
  best_eval = 0.
  best_iter = 0

  # checkpoint location; tracked so that a stale file from an earlier run
  # is never loaded when this run did not write one itself
  path = "{}_{}.pt".format(model.__class__.__name__, seed)
  saved_checkpoint = False

  # store train loss and validation accuracy during training
  # so we can plot them afterwards
  losses = []
  accuracies = []

  if eval_batch_size is None:
    eval_batch_size = batch_size

  while True:  # when we run out of examples, shuffle and continue
    for batch in tqdm(batch_fn(train_data, batch_size=batch_size)):

      # forward pass (back in training mode after any evaluation)
      model.train()
      x, targets = prep_fn(batch, model.vocab)
      logits = model(x)

      B = targets.size(0)  # number of examples in this update

      # compute cross-entropy loss (our criterion)
      # note that the cross entropy loss function computes the softmax for us
      loss = criterion(logits.view([B, -1]), targets.view(-1))
      train_loss += loss.item()

      # backward pass: erase previous gradients, compute new ones, then
      # take a small step in the opposite direction of the gradient
      optimizer.zero_grad()
      loss.backward()
      optimizer.step()

      iter_i += 1

      # print info
      if iter_i % print_every == 0:
        print("Iter %r: loss=%.4f, time=%.2fs" %
              (iter_i, train_loss, time.time()-start))
        losses.append(train_loss)
        train_loss = 0.

      # evaluate
      if iter_i % eval_every == 0:
        _, _, accuracy = eval_fn(model, dev_data, batch_size=eval_batch_size,
                                 batch_fn=batch_fn, prep_fn=prep_fn)
        accuracies.append(accuracy)
        print("iter %r: dev acc=%.4f" % (iter_i, accuracy))

        # save best model parameters
        if accuracy > best_eval:
          print("new highscore")
          best_eval = accuracy
          best_iter = iter_i
          ckpt = {
              "state_dict": model.state_dict(),
              "optimizer_state_dict": optimizer.state_dict(),
              "best_eval": best_eval,
              "best_iter": best_iter
          }
          torch.save(ckpt, path)
          saved_checkpoint = True

      # done training
      if iter_i == num_iterations:
        print("Done training")

        # evaluate on train, dev, and test with best model
        if saved_checkpoint:
          print("Loading best model")
          ckpt = torch.load(path)
          model.load_state_dict(ckpt["state_dict"])
        else:
          print("No checkpoint was saved during this run; "
                "evaluating the current weights")

        _, _, train_acc = eval_fn(
            model, train_data, batch_size=eval_batch_size,
            batch_fn=batch_fn, prep_fn=prep_fn)
        _, _, dev_acc = eval_fn(
            model, dev_data, batch_size=eval_batch_size,
            batch_fn=batch_fn, prep_fn=prep_fn)
        _, _, test_acc = eval_fn(
            model, test_data, batch_size=eval_batch_size,
            batch_fn=batch_fn, prep_fn=prep_fn)

        print("best model iter {:d}: "
              "train acc={:.4f}, dev acc={:.4f}, test acc={:.4f}".format(
                  best_iter, train_acc, dev_acc, test_acc))

        return losses, accuracies, best_iter, train_acc, dev_acc, test_acc

In [None]:
import copy
import json
import random

def train_model_w_seed(model, optimizer_class, optimizer_params,
                       train_data, dev_data, test_data,
                       num_iterations=10000,
                       print_every=1000,
                       eval_every=1000,
                       batch_fn=get_examples,
                       prep_fn=prepare_example,
                       eval_fn=simple_evaluate,
                       batch_size=1,
                       eval_batch_size=None):
    """Train three independent copies of `model` (seeds 17, 42, 2025) and
    summarise the results.

    `model` itself is never trained; each run works on a deep copy with its
    own optimizer built as `optimizer_class(params, **optimizer_params)`
    over the parameters that require gradients.

    NOTE(review): `train_data`, `dev_data` and `test_data` are accepted but
    not forwarded; `train_model` is called without them and reads the
    module-level data sets instead.

    The remaining arguments are passed straight through to `train_model`.

    Returns:
        dict with per-run accuracies, best iterations, loss and accuracy
        curves, and their means / standard deviations. The same dict is
        also written to `<ModelClass>.json`.
    """
    test_accs = []
    best_iters = []
    train_accs = []
    dev_accs = []

    list_of_accuracies = []
    list_of_losses = []

    seeds = [17, 42, 2025]

    for seed in seeds:
        # Copy lazily so that only one trained copy is alive at a time;
        # `model` is never modified, so every copy starts from the same weights.
        model_copy = copy.deepcopy(model)

        # Set seeds for reproducibility
        torch.manual_seed(seed)
        np.random.seed(seed)
        random.seed(seed)
        if torch.cuda.is_available():
            torch.backends.cudnn.deterministic = True
            torch.backends.cudnn.benchmark = False
            torch.cuda.manual_seed_all(seed)

        # Fresh optimizer per run, restricted to trainable parameters
        # (frozen ones, e.g. fixed embeddings, are left out).
        optimizer = optimizer_class(
            filter(lambda p: p.requires_grad, model_copy.parameters()),
            **optimizer_params
        )

        losses, accuracies, best_iter, train_acc, dev_acc, test_acc = train_model(
            model_copy, optimizer,
            num_iterations=num_iterations,
            print_every=print_every,
            eval_every=eval_every,
            batch_fn=batch_fn,
            prep_fn=prep_fn,
            eval_fn=eval_fn,
            batch_size=batch_size,
            eval_batch_size=eval_batch_size,
            seed=seed
        )

        list_of_accuracies.append(accuracies)
        list_of_losses.append(losses)
        best_iters.append(best_iter)
        train_accs.append(train_acc)
        dev_accs.append(dev_acc)
        test_accs.append(test_acc)

    # Summary statistics over the runs; the best-iteration statistics are
    # reported alongside the accuracy statistics.
    json_data = {
        'train_accs_mean': np.mean(train_accs),
        'dev_accs_mean': np.mean(dev_accs),
        'test_accs_mean': np.mean(test_accs),

        'train_accs_std': np.std(train_accs),
        'dev_accs_std': np.std(dev_accs),
        'test_accs_std': np.std(test_accs),

        'best_iters_mean': np.mean(best_iters),
        'best_iters_std': np.std(best_iters),

        'train_accs': train_accs,
        'dev_accs': dev_accs,
        'test_accs': test_accs,
        'best_iters': best_iters,

        'list_of_losses': list_of_losses,
        'list_of_accuracies': list_of_accuracies
    }

    # Save JSON file once
    filename = f"{model.__class__.__name__}.json"
    with open(filename, "w") as f:
        json.dump(json_data, f, indent=2)

    return json_data

In [None]:
def _plot_runs(list_of_series, title, ylabel):
    """Draw one line per run on a new 10x6 figure and show it."""
    plt.figure(figsize=(10, 6))
    for i, series in enumerate(list_of_series):
        plt.plot(range(len(series)), series, label=f'Run {i+1}')
    plt.title(title)
    plt.xlabel('Evaluation Steps [every 1000 iterations]')
    plt.ylabel(ylabel)
    plt.legend()
    plt.grid(True)
    plt.show()


# plotting the validation accuracies over time
def plot_accuracies(list_of_accuracies, model_name):
    """Plot each run's recorded validation accuracies as a separate line."""
    _plot_runs(list_of_accuracies,
               f'Validation Accuracies over Time for {model_name}',
               'Validation Accuracy')


# This will plot the training loss over time.
def plot_losses(list_of_losses, model_name):
    """Plot each run's recorded training losses as a separate line."""
    _plot_runs(list_of_losses,
               f'Training Loss over Time for {model_name}',
               'Training Loss')

# Pre-trained word embeddings

In [None]:
# --- 1. Download the Word2Vec file (Replaces !wget) ---
# Check if the file already exists to avoid re-downloading
url = "https://gist.githubusercontent.com/bastings/4d1c346c68969b95f2c34cfbc00ba0a0/raw/76b4fefc9ef635a79d0d8002522543bc53ca2683/googlenews.word2vec.300d.txt"
filename = "googlenews.word2vec.300d.txt"

if not os.path.exists(filename):
    print(f"Downloading {filename}...")
    # Download under a temporary name and rename only on success, so an
    # interrupted download is never mistaken for the finished file later.
    partial_name = filename + ".part"
    try:
        urllib.request.urlretrieve(url, partial_name)
        os.replace(partial_name, filename)
        print("Download complete.")
    except Exception as e:
        print(f"Error during download: {e}")
        if os.path.exists(partial_name):
            os.remove(partial_name)
else:
    print(f"File '{filename}' already exists. Skipping download.")

In [None]:
# --- 2. Read and Process the Word Vectors (Replaces file access from Google Drive) ---
# On your local machine, the file is in the current working directory,
# so you can open it directly by its name.

word2vec_data = {}
lines_to_print = 4

print(f"\nProcessing file and printing first {lines_to_print} lines:")
try:
    with open(filename, encoding="utf-8") as input_file:
      i = 0
      for line in input_file:
        # split once: first field is the word, the rest is its vector
        line_parts = line.split()

        # echo the first few lines together with their vector length
        if i < lines_to_print:
          print(f"Line {i+1}: {line.strip()}")
          print(f"Vector Length: {len(line_parts[1:])}\n")

        # empty lines add no entry
        if line_parts:
          word, *components = line_parts
          word2vec_data[word] = [np.float32(x) for x in components]

        i += 1

    print(f"\nSuccessfully loaded {len(word2vec_data)} vectors into the dictionary.")

except FileNotFoundError:
    print(f"\nERROR: The file '{filename}' was not found. Please check if the download was successful.")
except Exception as e:
    print(f"\nAn error occurred while reading the file: {e}")

In [None]:
# YOUR CODE HERE
word2vec_vocab = Vocabulary()
vectors = []

# dimensionality of the pre-trained word2vec vectors
embedding_dim = 300
print(f'our embed dim is {embedding_dim}')

# <unk> (ID 0): a standard-normal vector, so unknown words get values on a
# similar footing to real embeddings. It is drawn from a dedicated, seeded
# generator so the embedding table is the same on every run.
unk_rng = np.random.default_rng(0)
unk_vector = unk_rng.standard_normal(embedding_dim).astype(np.float32)
word2vec_vocab.add_token('<unk>')
vectors.append(unk_vector)

# <pad> (ID 1): all zeros, so padding does not distort the sentence meaning.
pad_vector = np.zeros(embedding_dim, dtype=np.float32)
word2vec_vocab.add_token('<pad>')
vectors.append(pad_vector)

for token, embedding in word2vec_data.items():
  # a file entry named like a special token must not add a second row,
  # otherwise row numbers and token IDs would no longer match
  if token in word2vec_vocab.w2i:
    continue
  word2vec_vocab.add_token(token)
  vectors.append(np.asarray(embedding, dtype=np.float32))

print("Vocabulary size:", len(word2vec_vocab.w2i))
print(embedding_dim)
# one float32 row per token ID
vectors = np.stack(vectors, axis=0)
assert vectors.shape == (len(word2vec_vocab.w2i), embedding_dim), \
    "embedding table does not line up with the vocabulary"

In [None]:
class MyLSTMCell(nn.Module):
  """Our own LSTM cell.

  Parameters are named W_i<gate> (input_size x hidden_size),
  W_h<gate> (hidden_size x hidden_size) and, when `bias` is true,
  b_i<gate> / b_h<gate> (hidden_size) for the gates i, f, g and o.
  With `bias=False` the bias parameters are registered as None and left
  out of the computation.
  """

  _GATES = ("i", "f", "g", "o")

  def __init__(self, input_size, hidden_size, bias=True):
    """Creates the weights for this LSTM"""
    super(MyLSTMCell, self).__init__()

    self.input_size = input_size
    self.hidden_size = hidden_size
    self.bias = bias

    # Registration order (all W_i*, all W_h*, all b_i*, all b_h*) fixes the
    # order in which reset_parameters() draws the initial values.
    for prefix, rows in (("W_i", input_size), ("W_h", hidden_size)):
      for gate in self._GATES:
        self.register_parameter(
            prefix + gate, nn.Parameter(torch.empty(rows, hidden_size)))
    for prefix in ("b_i", "b_h"):
      for gate in self._GATES:
        param = nn.Parameter(torch.empty(hidden_size)) if bias else None
        self.register_parameter(prefix + gate, param)

    self.reset_parameters()

  def reset_parameters(self):
    """This is PyTorch's default initialization method"""
    stdv = 1.0 / math.sqrt(self.hidden_size)
    for weight in self.parameters():
      weight.data.uniform_(-stdv, stdv)

  def _preactivation(self, gate, input_, prev_h):
    """Input projection plus hidden projection (plus biases, if any) for
    one gate, before the non-linearity is applied."""
    b_i = getattr(self, "b_i" + gate)
    b_h = getattr(self, "b_h" + gate)
    out = torch.matmul(input_, getattr(self, "W_i" + gate))
    if b_i is not None:
      out = out + b_i
    out = out + torch.matmul(prev_h, getattr(self, "W_h" + gate))
    if b_h is not None:
      out = out + b_h
    return out

  def forward(self, input_, hx, mask=None):
    """
    input is (batch, input_size)
    hx is ((batch, hidden_size), (batch, hidden_size))

    Returns the new (h, c) pair. `mask` is accepted but not used.
    """
    prev_h, prev_c = hx

    # gates: input, forget, candidate and output
    i = torch.sigmoid(self._preactivation("i", input_, prev_h))
    f = torch.sigmoid(self._preactivation("f", input_, prev_h))
    g = torch.tanh(self._preactivation("g", input_, prev_h))
    o = torch.sigmoid(self._preactivation("o", input_, prev_h))

    # main LSTM computation
    c = f * prev_c + i * g
    h = o * torch.tanh(c)

    return h, c

  def __repr__(self):
    return "{}({:d}, {:d})".format(
        self.__class__.__name__, self.input_size, self.hidden_size)

In [None]:
class LSTMClassifier(nn.Module):
  """Runs an LSTM cell over the embedded sentence and classifies it from
  the final hidden state."""

  def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, vocab):
    super(LSTMClassifier, self).__init__()
    self.vocab = vocab
    self.hidden_dim = hidden_dim
    # index 1 is the padding token
    self.embed = nn.Embedding(vocab_size, embedding_dim, padding_idx=1)
    self.rnn = MyLSTMCell(embedding_dim, hidden_dim)

    # dropout followed by a linear projection onto the output classes
    self.output_layer = nn.Sequential(
        nn.Dropout(p=0.5),
        nn.Linear(hidden_dim, output_dim)
    )

  def _final_states(self, x, states, batch_size, num_steps):
    """Pick, for every sentence of a padded batch, the hidden state at its
    last non-padding position (padding is token ID 1)."""
    # list of T tensors [B, D] -> [T, B, D] -> [B, T, D]
    stacked = torch.stack(states, dim=0).transpose(0, 1).contiguous()

    # zero the states at padding positions so that a wrong index
    # cannot silently pick up a meaningful value
    is_pad = (x == 1).unsqueeze(-1)
    stacked = stacked.masked_fill_(is_pad, 0.)

    # number of non-padding tokens per sentence
    lengths = (x != 1).sum(dim=1)

    # position of each sentence's last valid state in the flattened [B*T, D] view
    row_offsets = torch.arange(batch_size, device=x.device, dtype=x.dtype) * num_steps
    flat_index = (lengths - 1) + row_offsets
    return stacked.view(-1, self.hidden_dim)[flat_index]  # [B, D]

  def forward(self, x):
    batch_size = x.size(0)  # number of sentences
    num_steps = x.size(1)   # number of timesteps (words per sentence)

    embedded = self.embed(x)

    # zero initial states, created from `embedded` so they share its
    # device and dtype
    hx = embedded.new_zeros(batch_size, self.rnn.hidden_size)
    cx = embedded.new_zeros(batch_size, self.rnn.hidden_size)

    # batch-major input: the words at timestep t are embedded[:, t]
    states = []
    for t in range(num_steps):
      hx, cx = self.rnn(embedded[:, t], (hx, cx))
      states.append(hx)

    if batch_size == 1:
      # a single sentence has no padding, so the last state is the final one
      final = hx
    else:
      final = self._final_states(x, states, batch_size, num_steps)

    # classify the sentence from its final hidden state
    return self.output_layer(final)

In [None]:
def get_minibatch(data, batch_size=25, shuffle=True):
  """Yield consecutive lists of `batch_size` examples from `data`.

  With `shuffle` true, `data` is first shuffled in place. A final shorter
  list is yielded when examples are left over.
  """
  if shuffle:
    print("Shuffling training data")
    random.shuffle(data)  # reorders the caller's list

  pending = []
  for example in data:
    pending.append(example)
    if len(pending) != batch_size:
      continue
    # batch is full: hand it out and start a new one
    yield pending
    pending = []

  # leftovers that did not fill a whole batch
  if pending:
    yield pending

In [None]:
def pad(tokens, length, pad_value=1):
  """Return a new list: `tokens` followed by as many `pad_value` entries
  as are needed to reach `length` (none if it is already long enough)."""
  shortfall = length - len(tokens)
  return tokens + shortfall * [pad_value]

# example
tokens = [2, 3, 4]
pad(tokens, 5)

In [None]:
def prepare_minibatch(mb, vocab):
  """
  Minibatch is a list of examples.
  This function converts words to IDs and returns
  torch tensors to be used as input/targets.

  Returns (x, y), both LongTensors on `device`: x has one row per example,
  padded to the longest example in the minibatch; y holds one label per
  example.
  """
  maxlen = max(len(ex.tokens) for ex in mb)

  # vocab returns 0 if the word is not there
  x = [pad([vocab.w2i.get(t, 0) for t in ex.tokens], maxlen) for ex in mb]

  x = torch.LongTensor(x)
  x = x.to(device)

  y = [ex.label for ex in mb]
  y = torch.LongTensor(y)
  y = y.to(device)

  return x, y

In [None]:
def evaluate(model, data,
             batch_fn=get_minibatch, prep_fn=prepare_minibatch,
             batch_size=16):
  """Evaluate `model` on `data` in mini-batches (without shuffling).

  Returns (correct, total, accuracy).
  """
  model.eval()  # evaluation mode: disable dropout
  n_correct, n_total = 0, 0

  for minibatch in batch_fn(data, batch_size=batch_size, shuffle=False):
    x, targets = prep_fn(minibatch, model.vocab)
    with torch.no_grad():
      logits = model(x)

    # one predicted class per example
    predictions = logits.argmax(dim=-1).view(-1)
    hits = predictions == targets.view(-1)

    n_correct += hits.sum().item()
    n_total += targets.size(0)

  return n_correct, n_total, n_correct / float(n_total)

In [None]:

# Here we print each parameter name, shape, and if it is trainable.
def print_parameters(model):
  """Print name, shape and requires_grad of every parameter of `model`,
  followed by the total number of parameter elements."""
  total = 0
  for name, p in model.named_parameters():
    # numel() is an int for every shape, including 0-dim parameters
    total += p.numel()
    print("{:24s} {:12s} requires_grad={}".format(name, str(list(p.shape)), p.requires_grad))
  print("\nTotal number of parameters: {}\n".format(total))

In [None]:
# Experiment 1: LSTM classifier on top of frozen pre-trained embeddings.
# Arguments: vocabulary size, embedding dim, hidden dim, number of classes,
# and the vocabulary object itself.
lstm_model = LSTMClassifier(
    len(word2vec_vocab.w2i), 300, 168, len(t2i), word2vec_vocab)

# copy pre-trained vectors into embeddings table
with torch.no_grad():
  lstm_model.embed.weight.data.copy_(torch.from_numpy(vectors))
  # frozen: train_model_w_seed only optimises parameters with requires_grad
  lstm_model.embed.weight.requires_grad = False # no fine-tuning yet

print(lstm_model)
print_parameters(lstm_model)

lstm_model = lstm_model.to(device)

# Optimiser settings and batch size; the fine-tuning cell below reuses them.
optimizer_class = optim.Adam
optimizer_params = {"lr": 3e-5}
batch_size = 25
# Trains deep copies of lstm_model (one per seed) with mini-batches and
# returns the collected results as a dict.
json_lstm_minibatch = train_model_w_seed(lstm_model, optimizer_class, optimizer_params,
                       train_data, dev_data, test_data,
                       num_iterations=30000,
                       print_every=1000,
                       eval_every=1000,
                       batch_size=batch_size,
                       batch_fn=get_minibatch,
                       prep_fn=prepare_minibatch,
                       eval_fn=evaluate)

In [None]:
# Store the results of the frozen-embedding runs under their own file name.
filename = "lstm_model_minibatch.json"
with open(filename, mode="w") as out_file:
    json.dump(json_lstm_minibatch, out_file, indent=2)

# Fine-tuning word embeddings

In [None]:
# Experiment 2: same architecture, but the embeddings are trained as well.
lstm_model = LSTMClassifier(
    len(word2vec_vocab.w2i), 300, 168, len(t2i), word2vec_vocab)

# Now fine-tune your embeddings together with the model
# copy pre-trained word vectors into embeddings table
with torch.no_grad():
  lstm_model.embed.weight.data.copy_(torch.from_numpy(vectors))
  # trainable: the embedding table is handed to the optimiser like any
  # other parameter that requires gradients
  lstm_model.embed.weight.requires_grad = True # <- this is where the fine-tuning takes place

print(lstm_model)
print_parameters(lstm_model)

lstm_model = lstm_model.to(device)
# optimizer_class, optimizer_params and batch_size are the values defined
# in the frozen-embedding experiment cell.
json_lstm_minibatch_fine_tuned = train_model_w_seed(lstm_model, optimizer_class, optimizer_params,
                       train_data, dev_data, test_data,
                       num_iterations=30000,
                       print_every=1000,
                       eval_every=1000,
                       batch_size=batch_size,
                       batch_fn=get_minibatch,
                       prep_fn=prepare_minibatch,
                       eval_fn=evaluate)

In [None]:
# Store the results of the fine-tuned runs under their own file name.
filename = "lstm_model_minibatch_fine_tuned.json"
with open(filename, mode="w") as out_file:
    json.dump(json_lstm_minibatch_fine_tuned, out_file, indent=2)