In [9]:
# ! pip install wandb # colab only
import math
import re
from collections import defaultdict, OrderedDict
from itertools import product
from pprint import pprint

import numpy as np
import torch
import wandb
from sklearn.metrics import f1_score
from torch import nn
from torchinfo import summary

import config as cfg

def download_and_unzip(url, save_dir='.'):
  """
    Download a zip archive and extract it, unless it was already extracted.
    Used for downloading the dataset and the glove embeddings.
    Parameters:
      url: address of the zip archive
      save_dir: directory the archive is extracted into. With the default '.', the archive
        is extracted in the working directory and counts as already downloaded when an
        entry named like the archive (file name minus its last 4 characters) exists there;
        otherwise the existence of save_dir itself is what is checked.
    Returns None; progress is reported on stdout.
  """
  import os
  from urllib.request import urlopen
  from io import BytesIO
  from zipfile import ZipFile
  fname = url.split('/')[-1][:-4] if save_dir == '.' else save_dir
  # os.path.exists also recognises nested or absolute save_dir values (e.g. 'data/glove'),
  # which a lookup in os.listdir() never finds, causing a new download on every call
  if os.path.exists(fname):
    print(f'{fname} already downloaded')
    return
  print(f'downloading and unzipping {fname}...', end=' ')
  # context managers release the connection and the archive even when extraction fails
  with urlopen(url) as response:
    payload = BytesIO(response.read())
  with ZipFile(payload) as zipf:
    zipf.extractall(path=save_dir)
  print('completed')

# run on the GPU when torch can see one, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def get_wandbkey():
    """Return the content of 'wandbkey.txt' (looked up in the working directory),
    with leading and trailing whitespace removed."""
    with open('wandbkey.txt') as key_file:
        raw_key = key_file.read()
    return raw_key.strip()

In [10]:
def get_glove(emb_size=100, number_token=False):
  """
    Download (if needed) and load glove embeddings.
    Parameters:
      emb_size: embedding size (50/100/200/300-dimensional vectors).
      number_token: if True, add a '<NUM>' entry right after '<PAD>'; its embedding is the
        mean of the embeddings of all vocabulary words that look like numbers
        (digits, optionally followed by '.' and more digits).
    Returns tuple (voc, emb) where voc is dict from words to idx (in emb) and emb is (numpy) embedding matrix.
    '<PAD>' is mapped to an all-zero row placed after the glove words.
    Raises ValueError if emb_size is not one of the supported sizes.
  """
  if emb_size not in (50, 100, 200, 300):
    raise ValueError(f'wrong size parameter: {emb_size}')

  glove_size = 400000  # glove vocabulary size
  pad_idx = glove_size  # first row after the glove words
  num_idx = glove_size + 1  # only used when number_token is True
  n_tokens = glove_size + (2 if number_token else 1)

  download_and_unzip('http://nlp.stanford.edu/data/glove.6B.zip', save_dir='glove')
  vocabulary = dict()
  embedding_matrix = np.ones((n_tokens, emb_size))

  # each line of the file is: word followed by emb_size space-separated coefficients
  with open(f'glove/glove.6B.{emb_size}d.txt', encoding="utf8") as f:
    for i, line in enumerate(f):
      word, coefs = line.split(maxsplit=1)
      embedding_matrix[i] = np.fromstring(coefs, "f", sep=" ")
      vocabulary[word] = i

  # add embedding for padding and (optionally) for the number token
  embedding_matrix[pad_idx] = 0
  vocabulary['<PAD>'] = pad_idx
  if number_token:
    # raw string: '\d' in a plain literal is an invalid escape sequence
    number_pattern = re.compile(r'\d+(\.\d*)?')
    number_rows = [idx for word, idx in vocabulary.items() if number_pattern.fullmatch(word) is not None]
    embedding_matrix[num_idx] = np.mean(embedding_matrix[number_rows], axis=0)
    vocabulary['<NUM>'] = num_idx
  return vocabulary, embedding_matrix

In [11]:
class Dataset(torch.utils.data.Dataset):
    """Simple dataset class to use dataloaders (batching).
    Item idx is the pair (inputs[idx], labels[idx]); the dataset length is the length of inputs.
    """
    def __init__(self, inputs, labels):
        # the two containers are indexed in parallel
        self.inputs = inputs
        self.labels = labels
    def __getitem__(self, idx):
        return self.inputs[idx], self.labels[idx]
    def __len__(self):
        # len() equals shape[0] for tensors and arrays, and also supports plain sequences
        return len(self.inputs)

In [12]:
# tag set used as classification targets; the last entry labels padded positions
# (open question left by the author: is a class for the padding really needed?)
classes = [
    'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS',
    'MD', 'NN', 'NNS', 'NNP', 'NNPS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB',
    'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN',
    'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB',
    '<PAD>',
]
# position of each tag in the list above, i.e. its class index
class2idx = dict(zip(classes, range(len(classes))))


def add_oov(start_voc, oovs, embedding_matrix, sentences):
  """
    Computes new embedding matrix, adding embeddings for oovs.
    The embedding of an oov is the mean of the embeddings of the words that share a sentence
    with it and are already known (including oovs added earlier in this call); '<PAD>' and
    the oov itself are ignored. An oov without any known context word gets an all-zero embedding.
    Parameters:
      start_voc: dict, starting vocabulary that is extended with oovs (not modified)
      oovs: set of string, oovs to add to the starting vocabulary and embedding matrix;
        words already in start_voc are skipped
      embedding_matrix: starting embedding matrix (numpy), one row per word of start_voc
      sentences: list of list of strings, set used to compute oov embeddings
    Returns tuple (voc, emb) where voc is dict from words to idx (in emb) and emb is (numpy) embedding matrix with oovs
  """
  # sorted: iterating a set of strings directly gives a different index assignment on every run
  new_oovs = sorted(set(oovs) - set(start_voc))
  start_voc_size, emb_size = embedding_matrix.shape
  oov_embeddings = np.zeros((start_voc_size + len(new_oovs), emb_size))
  oov_embeddings[:start_voc_size] = embedding_matrix
  new_voc = dict(start_voc)

  # collect once the sentences containing each oov, instead of scanning all sentences per oov
  pending = set(new_oovs)
  sentences_with = {}
  for sentence in sentences:
    for word in pending.intersection(sentence):
      sentences_with.setdefault(word, []).append(sentence)

  for row, oov in enumerate(new_oovs, start=start_voc_size):
    context_words = [new_voc[word]
                     for sentence in sentences_with.get(oov, ())
                     for word in sentence if word in new_voc and word not in (oov, '<PAD>')]
    # with no known context the mean would be NaN: keep the zero row allocated above
    if context_words:
      oov_embeddings[row] = np.mean(oov_embeddings[context_words], axis=0)
    new_voc[oov] = row
  return new_voc, oov_embeddings
    
def load_data(start, end, start_voc, embedding_matrix, number_token=False,
              drop_punctuation=True, split_docs=True, ret_counts=False):
  """
    Downloads dataset and preprocess data.
    Words are lowercased and an escaped slash ('\\/') inside a word is replaced by '-'.
    Sequences are truncated / right-padded with '<PAD>' to the 0.999 quantile of the
    sequence lengths found in the requested files.
    Params:
      start: idx of first file to include in data
      end: idx of last file to include in data
      start_voc: starting vocabulary that is extended with oov terms; must contain '<PAD>'
        (and '<NUM>' when number_token is True, otherwise '<NUM>' is added as an oov)
      embedding_matrix: embedding matrix matching start_voc, extended with oov embeddings
      number_token: if True, use the single token '<NUM>' for all words made of digits
        (optionally followed by '.' and more digits)
      drop_punctuation: if True, keep only the tokens whose tag is in class2idx.
        If False every token is kept, and a tag missing from class2idx raises KeyError
        when the labels are converted to indices.
      split_docs: if True, each sequence is one sentence; if false, each sequence is one document
      ret_counts: if True, also return counts of each word in the documents
    Returns (inputs, labels, vocabulary, embedding_matrix[, counts]):
      inputs: integer tensor with one row per sequence, holding vocabulary indices
      labels: integer tensor with the same layout, holding class2idx indices
      vocabulary, embedding_matrix: output of add_oov for the words found in the files
      counts: defaultdict word -> number of kept occurrences (only if ret_counts)
  """
  # download dataset
  download_and_unzip('https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/dependency_treebank.zip')

  # raw strings: '\d' and '\/' in plain literals are invalid escape sequences
  number_pattern = re.compile(r'\d+(\.\d*)?')
  escaped_slash = r'\/'

  inputs, labels = [], []
  vocabulary = set()
  counts = defaultdict(int)

  def store(input_seq, label_seq):
    # trailing or repeated blank lines produce empty sequences: storing them would only
    # add rows made entirely of padding
    if input_seq:
      inputs.append(input_seq)
      labels.append(label_seq)

  # build dataset
  for doc in range(start, end+1):
    with open(f'dependency_treebank/wsj_{doc:04d}.dp') as f:
      input_seq, label_seq = [], []
      for line in f:
        if line.strip(): # token line: word, tag and a third field that is ignored
          word, label, _ = line.split('\t')
          word = word.lower().replace(escaped_slash, '-')
          if number_token and number_pattern.fullmatch(word) is not None:
            word = '<NUM>'
          # membership in class2idx rather than label.isalpha(): the latter also
          # discards the 'PRP$' and 'WP$' tags, which are regular classes
          if not drop_punctuation or label in class2idx:
            vocabulary.add(word)
            input_seq.append(word)
            label_seq.append(label)
            counts[word] += 1
        elif split_docs: # blank line: sentence over, store it if splitting documents
          store(input_seq, label_seq)
          input_seq, label_seq = [], []
      store(input_seq, label_seq) # last sentence, or the whole document

  max_seq_len = int(np.quantile([len(seq) for seq in inputs], 0.999))

  def fit(seq):
    # cut to max_seq_len, then pad (a negative repeat count yields no padding)
    return seq[:max_seq_len] + ['<PAD>'] * (max_seq_len - len(seq))

  inputs = [fit(seq) for seq in inputs]
  labels = [fit(seq) for seq in labels]
  vocabulary, embedding_matrix = add_oov(start_voc, vocabulary, embedding_matrix, inputs)
  inputs = torch.as_tensor([[vocabulary[word] for word in sequence] for sequence in inputs])
  labels = torch.as_tensor([[class2idx[label] for label in sequence] for sequence in labels])

  if ret_counts:
    return inputs, labels, vocabulary, embedding_matrix, counts
  return inputs, labels, vocabulary, embedding_matrix

In [13]:
class POSTagger(torch.nn.Module):
  """
    Embedding lookup -> bidirectional recurrent module -> linear layer(s), producing one
    score per class for every position of the input sequences.
  """

  def __init__(self, embedding_matrix, type, rec_size=1, units=None, hid_size=50, n_classes=37):
    """
      A recurrent network performing multiclass classification (POS tagging).
      Params:
        embedding_matrix: embedding matrix for embedding layer (one row per vocabulary index)
        type: type of rnn, either 'lstm' or 'gru'
        rec_size: number of stacked recurrent modules
        units: int or None, if given then add one additional linear layer with given number of units
        hid_size: size of hidden state of recurrent module
        n_classes: number of output classes (size of the last linear layer)
      Raises ValueError if type is neither 'lstm' nor 'gru'.
    """
    super().__init__()

    emb_size = embedding_matrix.shape[1]
    # convert once to float32 (the dtype of the other layers) instead of keeping a float64
    # table and casting the looked-up vectors on every forward pass
    self.emb_layer = nn.Embedding.from_pretrained(torch.as_tensor(embedding_matrix, dtype=torch.float32))

    if type == 'lstm':
      rec_module = nn.LSTM
    elif type == 'gru':
      rec_module = nn.GRU
    else:
      raise ValueError(f'wrong type {type}, either lstm or gru')
    self.rec_modules = rec_module(input_size=emb_size, hidden_size=hid_size, bidirectional=True, batch_first=True, num_layers=rec_size)

    # bidirectional: forward and backward hidden states are concatenated
    # NOTE(review): when units is given, fc_0 and fc_1 are stacked without an activation
    # in between - confirm whether a non-linearity was intended
    fc_params = [2 * hid_size] + ([units, n_classes] if units is not None else [n_classes])
    self.fc_modules = nn.Sequential(
      OrderedDict([(f'fc_{i}', nn.Linear(in_shape, out_shape))
      for i, (in_shape, out_shape) in enumerate(zip(fc_params[:-1], fc_params[1:]))]))

  def forward(self, x):
    """
      Params:
        x: integer tensor of vocabulary indices, batch first
      Returns the output of the last linear layer (no softmax is applied), with the class
      scores on the last dimension.
    """
    vecs = self.emb_layer(x)
    rec_out, _ = self.rec_modules(vecs)
    return self.fc_modules(rec_out)

In [14]:
def train_one_epoch(model, optimizer, loss_fn, data_loader, device):
    """ 
        Trains model for one epoch on the given dataloader.
        Parameters:
            model: torch.nn.Module to train
            optimizer: torch.optim optimizer object
            loss_fn: torch.nn criterion to use to compute loss, given outputs and targets
            data_loader: torch.utils.data.DataLoader (any iterable of (inputs, targets) batches)
            device: torch.device where training is performed
        Returns log dict {'train/loss' : list(loss values for each batch)} 
        Raises RuntimeError as soon as a batch loss is NaN or infinite (before the
        optimizer step for that batch).
    """
    model.train()
    log_dict = {'train/loss': []}

    for inputs, targets in data_loader:
        inputs = inputs.to(device)
        targets = targets.to(device)

        # swap dimensions 1 and 2 of the model output before calling loss_fn
        # (presumably to put the class scores on dim 1, as nn.CrossEntropyLoss expects)
        logits = model(inputs).transpose(1, 2)
        loss = loss_fn(logits, targets)
        loss_value = loss.item()

        if not math.isfinite(loss_value):
            # raise instead of exit(1): inside a notebook kernel exit() does not reliably
            # interrupt the running cell, and an exception lets the caller clean up
            raise RuntimeError(f"Loss is {loss_value}, stopping training")

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        log_dict['train/loss'].append(loss_value)

    return log_dict

def evaluate(model, loss_fn, data_loader, device, metric='accuracy'):
    """ 
        Evaluate model on the given dataloader.
        Parameters:
            model: torch.nn.Module to evaluate
            loss_fn: torch.nn criterion to use to compute loss, given outputs and targets
            data_loader: torch.utils.data.DataLoader (any iterable of (inputs, targets)
                batches for 'accuracy'; for 'f1' it must have length 1)
            device: torch.device where evaluation is performed
            metric: either 'accuracy' or 'f1'
        Returns log dict {'{split}/loss' : mean of the batch losses, '{split}/{metric}': metric value}
        where split is 'valid' for accuracy and 'test' for f1.
        Accuracy is the fraction of correctly predicted positions over all the batches;
        every position of targets is counted, none is masked out.
        Raises ValueError if metric is neither 'accuracy' nor 'f1'.
    """
    if metric not in ('accuracy', 'f1'):
        raise ValueError(f'wrong metric {metric}, either accuracy or f1')
    model.eval()
    if metric == 'f1':
        assert len(data_loader) == 1 # must be a single batch
        split = 'test'
    else:
        split = 'valid'

    batch_losses = []
    n_correct, n_positions = 0, 0
    f1_value = None
    with torch.no_grad():
        for inputs, targets in data_loader:
            inputs = inputs.to(device)
            targets = targets.to(device)

            logits = model(inputs).transpose(1, 2)
            batch_losses.append(loss_fn(logits, targets).item())
            preds = torch.argmax(logits, 1)

            if metric == 'accuracy':
                # count the positions actually present in the batch: dividing by
                # data_loader.batch_size under-estimates accuracy on a smaller last batch
                n_correct += (targets == preds).sum().item()
                n_positions += targets.numel()
            else:
                f1_value = f1_score(targets.cpu().numpy().reshape(-1),
                                    preds.cpu().numpy().reshape(-1),
                                    labels=list(class2idx.values()),
                                    average='macro', zero_division=1)

    if metric == 'accuracy':
        metric_value = n_correct / n_positions if n_positions else float('nan')
    else:
        metric_value = f1_value
    return {f'{split}/loss': np.mean(batch_losses), f'{split}/{metric}': metric_value}


In [15]:
def train(verbose=False, test=False, number_token=False):
    """ Fully trains one model, based on cfg parameters, logging to wandb.
        With test=False the model is trained on files 1-100 and its accuracy on files
        101-150 is computed and logged after every epoch.
        With test=True the model is trained on files 1-150 and its f1 on files 151-200
        is computed and logged once, after the last epoch.
        Parameters:
            verbose: if True, print the configuration and the model summary
            test: select the train / evaluation split as described above
            number_token: forwarded to get_glove and load_data
        Returns trained model.
    """
    cfg_dict = {
        'epochs': cfg.EPOCHS, 'batch_size': cfg.BATCH_SIZE, 'number_token': number_token,
        'model': cfg.TYPE, 'rec_size': cfg.REC_SIZE, 'units': cfg.UNITS, 'hid_size': cfg.HID_SIZE,
        'optim': cfg.OPTIM, 'lr': cfg.LR, 'alpha': cfg.ALPHA, 'betas': cfg.BETAS, 'momentum': cfg.MOMENTUM, 'weight_decay': cfg.WEIGHT_DECAY
    }
    if verbose:
        print('CONFIG PARAMETERS:')
        pprint(cfg_dict)
    metric = 'f1' if test else 'accuracy'
    wandb.login(key=get_wandbkey())
    run = wandb.init(project="assignment-one", entity="nlpetroni", reinit=True, config=cfg_dict)
    try:
        wandb.define_metric("train_step")
        wandb.define_metric("epoch")
        wandb.define_metric('train/loss', step_metric="train_step", summary="min")
        wandb.define_metric("valid/loss", step_metric="epoch", summary="min")
        wandb.define_metric("valid/accuracy", step_metric="epoch", summary="max")

        # file ranges of the train split and of the held-out (validation or test) split
        train_last = 150 if test else 100
        eval_first, eval_last = (151, 200) if test else (101, 150)

        glove_voc, embedding_matrix = get_glove(number_token=number_token)
        train_set, train_labels, train_voc, embedding_matrix = load_data(
            1, train_last, glove_voc, embedding_matrix, number_token=number_token)
        # the vocabulary and embeddings are extended again with the oovs of the held-out split
        eval_set, eval_labels, _, embedding_matrix = load_data(
            eval_first, eval_last, train_voc, embedding_matrix, number_token=number_token)
        train_dl = torch.utils.data.DataLoader(Dataset(train_set, train_labels), batch_size=cfg.BATCH_SIZE, shuffle=True)
        # f1 is computed on a single batch holding the whole test set; no need to shuffle held-out data
        eval_batch_size = eval_set.shape[0] if test else cfg.BATCH_SIZE
        eval_dl = torch.utils.data.DataLoader(Dataset(eval_set, eval_labels), batch_size=eval_batch_size, shuffle=False)

        model = POSTagger(embedding_matrix, type=cfg.TYPE, rec_size=cfg.REC_SIZE, units=cfg.UNITS, hid_size=cfg.HID_SIZE).to(device)
        wandb.watch(model, log_graph=True)
        if verbose:
            print(summary(model))

        params = [p for p in model.parameters() if p.requires_grad]
        if cfg.OPTIM == 'rmsprop':
            optimizer = torch.optim.RMSprop(params, lr=cfg.LR, alpha=cfg.ALPHA, momentum=cfg.MOMENTUM, weight_decay=cfg.WEIGHT_DECAY)
        elif cfg.OPTIM == 'adam':
            optimizer = torch.optim.Adam(params, lr=cfg.LR, betas=cfg.BETAS, weight_decay=cfg.WEIGHT_DECAY)
        else:
            raise ValueError(f'wrong optim {cfg.OPTIM}, either rmsprop or adam')
        loss_fn = nn.CrossEntropyLoss()

        train_step = 0
        print('STARTING TRAINING')

        for epoch in range(cfg.EPOCHS):
            log_dict = train_one_epoch(model, optimizer, loss_fn, train_dl, device)
            for batch_loss in log_dict['train/loss']:
                wandb.log({'train_step': train_step, 'epoch': epoch, 'train/loss': batch_loss})
                train_step += 1
            progress = f'[{epoch:03d}/{cfg.EPOCHS:03d}] train loss: {np.mean(log_dict["train/loss"]):.3f}'
            # validation values only exist when not testing: reading them unconditionally
            # raised KeyError on the first epoch of a test run
            if not test:
                log_dict.update(evaluate(model, loss_fn, eval_dl, device, metric=metric))
                wandb.log({'epoch': epoch, 'valid/loss': log_dict['valid/loss'], 'valid/accuracy': log_dict['valid/accuracy']})
                progress += f', valid loss: {log_dict["valid/loss"]:.3f}, accuracy: {log_dict["valid/accuracy"]:.2f}'
            if (epoch % 25) == 0:
                print(progress)

        if test:
            # wandb.log needs the values to record ('test/loss' and 'test/f1')
            wandb.log(evaluate(model, loss_fn, eval_dl, device, metric=metric))
    finally:
        # close the wandb run even when data loading or training fails
        run.finish()

    return model

In [None]:
# simple hyperparameter tuning: one training run for every combination of the values below
architectures = (('lstm', 1, None), ('lstm', 2, None), ('lstm', 1, 64),
                 ('lstm', 1, 128), ('gru', 1, None))
# product iterates with the rightmost factor varying fastest, like nested for loops
grid = product((False, True),        # number_token
               (0.001, 0.0005),      # lr
               (32, 64, 128),        # batch_size
               ('rmsprop', 'adam'),  # optim
               (64, 128),            # hid_size
               architectures)        # (rnn type, rec_size, units)
# rnn_type rather than type: a top-level loop variable named type would shadow the
# builtin type() for every cell executed afterwards
for number_token, lr, batch_size, optim, hid_size, (rnn_type, rec_size, units) in grid:
    cfg.LR = lr
    cfg.BATCH_SIZE = batch_size
    cfg.OPTIM = optim
    cfg.HID_SIZE = hid_size
    cfg.TYPE = rnn_type
    cfg.REC_SIZE = rec_size
    cfg.UNITS = units
    train(number_token=number_token)

In [None]:
# simple hyperparameter tuning over a wider grid (more learning rates and hidden sizes):
# one training run for every combination of the values below
architectures = (('lstm', 1, None), ('lstm', 2, None), ('lstm', 1, 64),
                 ('lstm', 1, 128), ('gru', 1, None))
# product iterates with the rightmost factor varying fastest, like nested for loops
grid = product((False, True),            # number_token
               (0.005, 0.001, 0.0005),   # lr
               (32, 64, 128),            # batch_size
               ('rmsprop', 'adam'),      # optim
               (32, 64, 128, 256),       # hid_size
               architectures)            # (rnn type, rec_size, units)
# rnn_type rather than type: a top-level loop variable named type would shadow the
# builtin type() for every cell executed afterwards
for number_token, lr, batch_size, optim, hid_size, (rnn_type, rec_size, units) in grid:
    cfg.LR = lr
    cfg.BATCH_SIZE = batch_size
    cfg.OPTIM = optim
    cfg.HID_SIZE = hid_size
    cfg.TYPE = rnn_type
    cfg.REC_SIZE = rec_size
    cfg.UNITS = units
    train(number_token=number_token)

In [None]:
# test best model
#