<a href="https://colab.research.google.com/github/LuigiSigillo/nlp2021-hw/blob/master/hw1/stud/nlp_hw1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# imports

In [30]:
from google.colab import drive
# general
import matplotlib.pyplot as plt
import numpy as np
import os
from collections import Counter, defaultdict
from tqdm.notebook import tqdm
from typing import *

# torch
import torch
import json
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import SGD

torch.manual_seed(42)

drive.mount('/content/drive')
root_folder = '/content/drive/My Drive/NLP/nlp2021-hw1'
dataset_folder = os.path.join(root_folder,'data')
print(dataset_folder)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/My Drive/NLP/nlp2021-hw1/data


In [None]:
#! wget http://nlp.stanford.edu/data/wordvecs/glove.6B.zip
#! unzip -d data/glove.6B
#! cd '/content/drive/My Drive/NLP/nlp2021-hw1'
#!unzip '/content/drive/My Drive/NLP/nlp2021-hw1/glove.6B.zip'

In [38]:
!mv glove.6B.200d.txt '/content/drive/My Drive/NLP/nlp2021-hw1/model'
!ls

drive  sample_data


In [31]:
word_vectors = dict()
words_limit = 100_000
with open('/content/drive/My Drive/NLP/nlp2021-hw1/model/glove.6B.100d.txt') as f:

    next(f)  # skip header

    for i, line in tqdm(enumerate(f), total=words_limit):

        if i == words_limit:
            break

        word, *vector = line.strip().split(' ')
        vector = torch.tensor([float(c) for c in vector])
        
        word_vectors[word] = vector

## alternatives
# for l in f:
#     line = l.decode().split()
#     word = line[0]
#     words.append(word)
#     word2idx[word] = idx
#     idx += 1
#     vect = np.array(line[1:]).astype(np.float)
#     vectors.append(vect)


HBox(children=(FloatProgress(value=0.0, max=100000.0), HTML(value='')))

In [32]:
def cosine_similarity(v1: torch.Tensor, v2: torch.Tensor) -> float:
    num = torch.sum(v1 * v2)
    den = torch.linalg.norm(v1) * torch.linalg.norm(v2)
    return (num / den).item()

In [33]:
cosine_similarity(word_vectors['king'], word_vectors['queen'])

0.7507690787315369

In [34]:
def phrase2vector(phrase: str) -> Optional[torch.Tensor]:
    phrases_word_vector = [word_vectors[w] for w in phrase.split(' ') if w in word_vectors]
    
    if len(phrases_word_vector) == 0:
        return None

    phrases_word_vector = torch.stack(phrases_word_vector)  # tensor shape: (#words X #features)
    return torch.mean(phrases_word_vector, dim=0)

In [58]:
# same class as Notebook 4
class AmazonReviewsDataset(torch.utils.data.Dataset):

    def __init__(self, dataset_path: str, phrase2vector):
        self.data_store = []
        self.init_structures(dataset_path, phrase2vector)

    def init_structures(self, dataset_path: str, phrase2vector) -> None:

        with open(dataset_path) as f:
            for json_string in f:
                single_json = json.loads(json_string)
                s1 = single_json['sentence1']
                s2 =  single_json['sentence2']
                ground_t = 1 if single_json['label'] =='True' else 0
                vector1 = phrase2vector(s1)
                vector2 = phrase2vector(s2)
                if vector is None:
                    continue
                if vector2 is None:
                    continue
                self.data_store.append((vector1,vector2,ground_t))
    

    def __len__(self) -> int:
        return len(self.data_store)

    def __getitem__(self, idx: int) -> torch.Tensor:
        return self.data_store[idx]

In [59]:
# same class as Notebook 4
class AmazonReviewsDataModule(nn.Module):

    def __init__(
        self, 
        data_train_path: str,
        data_dev_path: str,
        data_test_path: str,
        batch_size: int,
        collate_fn=None
    ) -> None:
        super().__init__()
        self.data_train_path = data_train_path
        self.data_dev_path = data_dev_path
        self.data_test_path = data_test_path
        self.batch_size = batch_size
        self.collate_fn = collate_fn

        self.train_dataset = None
        self.validation_dataset = None
        self.test_dataset = None

    def setup(self, stage: Optional[str] = None) -> None:
        if stage == 'fit':
            self.train_dataset = AmazonReviewsDataset(self.data_train_path, phrase2vector)
            self.validation_dataset = AmazonReviewsDataset(self.data_dev_path, phrase2vector)
        elif stage == 'test':
            self.test_dataset = AmazonReviewsDataset(self.data_test_path, phrase2vector)

    def train_dataloader(self, *args, **kwargs) -> DataLoader:
        return DataLoader(self.train_dataset, batch_size=self.batch_size)

    def val_dataloader(self, *args, **kwargs) -> Union[DataLoader, List[DataLoader]]:
        return DataLoader(self.validation_dataset, batch_size=self.batch_size)

    def test_dataloader(self, *args, **kwargs) -> Union[DataLoader, List[DataLoader]]:
        return DataLoader(self.test_dataset, batch_size=self.batch_size)

In [60]:
amazon_review_dm = AmazonReviewsDataModule(
    data_train_path=dataset_folder+'/train.jsonl',
    data_dev_path=dataset_folder+'/dev.jsonl',
    data_test_path=dataset_folder+'/dev.jsonl',
    batch_size=32,
)

In [67]:
amazon_review_dm.setup('test')
test_dataloader = amazon_review_dm.test_dataloader()
# print(word_vectors['test'])

for batch in test_dataloader:
    X, y,z = batch
    print(f"batch X shape: {X.shape}")
    print(f"batch y shape: {z.shape}")
    #print(f"batch y shape: {z[0]}")
    
    break

batch X shape: torch.Size([32, 100])
batch y shape: torch.Size([32])


In [94]:
import sklearn

# same class as Notebook 4
class AmazonReviewClassifier(nn.Module):

    REVIEWS_CLASSES = 2

    def __init__(self, n_features: int, n_hidden: int):
        super().__init__()
        # classification function
        self.lin1 = torch.nn.Linear(n_features, n_hidden)
        self.lin2 = torch.nn.Linear(n_hidden, self.REVIEWS_CLASSES)
        
        # criterion
        self.loss_fn = torch.nn.CrossEntropyLoss()
        
        # metrics
        self.val_f1 = sklearn.metrics.f1_score
        self.test_f1 = sklearn.metrics.f1_score
        



    def forward(self, x: torch.Tensor, y: Optional[torch.Tensor] = None) -> Dict[str, torch.Tensor]:
        # actual forward
        out = self.lin1(x)
        out = torch.relu(out)
        out = self.lin2(out).squeeze(1)

        # compute logits (which are simply the out variable) and the actual probability distribution (pred, as it is the predicted distribution)
        logits = out
        pred = torch.softmax(logits, dim=-1)

        result = {'logits': logits, 'pred': pred}

        # compute loss
        if y is not None:
            # while mathematically the CrossEntropyLoss takes as input the probability distributions,
            # torch optimizes its computation internally and takes as input the logits instead
            loss = self.loss(logits, y)
            result['loss'] = loss

        return result

    def training_step(
        self, 
        batch: Tuple[torch.Tensor], 
        batch_idx: int
    ) -> torch.Tensor:
        forward_output = self.forward(*batch)
        return forward_output['loss']

    def validation_step(
        self, 
        batch: Tuple[torch.Tensor], 
        batch_idx: int
    ):
        forward_output = self.forward(*batch)
        
        self.val_f1(forward_output['pred'], batch[1])

        self.log('val_f1', self.val_f1, prog_bar=True)
        self.log('val_loss', forward_output['loss'], prog_bar=True)

    def test_step(
        self,
        batch: Tuple[torch.Tensor],
        batch_idx: int
    ):
        forward_output = self.forward(*batch)
        self.test_f1(forward_output['pred'], batch[1])
        self.log('test_f1', self.test_f1, prog_bar=True)

    def loss(self, pred, y):
        return self.loss_fn(pred, y)

    def configure_optimizers(self):
        optimizer = torch.optim.SGD(self.parameters(), lr=0.1, momentum=0.0)
        return optimizer

In [95]:
amazon_review_classifier = AmazonReviewClassifier(
    n_features=300, 
    n_hidden=128
)

In [101]:
class Trainer():
    def __init__(self, model, optimizer, device):

        self.device = device

        self.model = model
        self.optimizer = optimizer

        # starts requires_grad for all layers
        self.model.train()  # we are using this model for training (some layers have different behaviours in train and eval mode)
        self.model.to(self.device)  # move model to GPU if available

    def train(self, train_dataset, output_folder, epochs=1):

        train_loss = 0.0
        for epoch in range(epochs):
            epoch_loss = 0.0
            len_train = 0

            # each element (sample) in train_dataset is a batch
            for step, sample in tqdm(enumerate(train_dataset), desc="Batch", leave=False):
                # inputs in the batch
                print(sample)
                inputs = sample['inputs']
                # outputs in the batch
                targets = sample['targets'].to(self.device)

                # one_hot_input : batch size X vocab_size
                one_hot_input = torch.zeros((inputs.shape[0], VOCAB_SIZE), device=self.device)
                # sets the ones corresponding to the input word
                for i, x in enumerate(inputs):
                    one_hot_input[i, x] = 1

                output_distribution = self.model(one_hot_input)
                loss = self.model.loss_function(output_distribution, targets)  # compute loss
                # calculates the gradient and accumulates
                loss.backward()  # we backpropagate the loss
                # updates the parameters
                self.optimizer.step()
                self.optimizer.zero_grad()

                epoch_loss += loss.item()
                len_train += 1
            avg_epoch_loss = epoch_loss / len_train

            print('Epoch: {} avg loss = {:0.4f}'.format(epoch, avg_epoch_loss))

            train_loss += avg_epoch_loss
            torch.save(self.model.state_dict(),
                       os.path.join(output_folder, 'state_{}.pt'.format(epoch)))  # save the model state

        avg_epoch_loss = train_loss / epochs
        return avg_epoch_loss

In [102]:
def get_trainer():

    # the PyTorch Lightning Trainer
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    model = amazon_review_classifier

    # define an optimizer (stochastic gradient descent) to update the parameters
    optimizer = torch.optim.SGD(model.parameters(), lr=0.02)
    trainer = Trainer(model, optimizer, device)


    return trainer

In [103]:
# and finally we can let the "trainer" fit the amazon reviews classifier.
trainer = get_trainer()
avg_loss = trainer.train(test_dataloader, "output", epochs=100)
#trainer.fit(model=amazon_review_classifier, datamodule=amazon_review_dm)

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Batch', max=1.0, style=ProgressStyle(de…

[tensor([[ 0.0777, -0.1355,  0.2992,  ..., -0.2303,  0.3809,  0.0008],
        [ 0.0777, -0.1355,  0.2992,  ..., -0.2303,  0.3809,  0.0008],
        [-0.1202,  0.0973,  0.3522,  ..., -0.4849,  0.4850,  0.1797],
        ...,
        [-0.0030,  0.2531,  0.1949,  ..., -0.1160,  0.6354,  0.3582],
        [-0.0095,  0.0145,  0.0887,  ..., -0.3209,  0.5519,  0.0739],
        [-0.0095,  0.0145,  0.0887,  ..., -0.3209,  0.5519,  0.0739]]), tensor([[ 0.0699,  0.1105,  0.2296,  ..., -0.4752,  0.4540,  0.0984],
        [ 0.0975, -0.2821,  0.2379,  ..., -0.3085,  0.4475,  0.0163],
        [-0.0973,  0.1742,  0.0963,  ..., -0.1884,  0.5794, -0.1349],
        ...,
        [-0.0116,  0.1431,  0.1148,  ..., -0.3956,  0.5592,  0.2019],
        [-0.0281, -0.0365,  0.2674,  ..., -0.3676,  0.7004,  0.1887],
        [-0.0818, -0.0044,  0.1980,  ..., -0.1523,  0.6536, -0.0984]]), tensor([0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1,
        1, 1, 1, 1, 1, 1, 0, 0])]


TypeError: ignored

# load dats

In [7]:
class Word2VecDataset(torch.utils.data.IterableDataset):

    def __init__(self, txt_path, vocab_size, unk_token, window_size):
        """
        Args:
          txt_file (str): Path to the raw-text file.
          vocab_size (int): Maximum amount of words that we want to embed.
          unk_token (str): How will unknown words represented (e.g. 'UNK').
          window_size (int): Number of words to consider as context.
        """
        self.window_size = window_size
        # [[w1,s1, w2,s1, ..., w|s1|,s1], [w1,s2, w2,s2, ..., w|s2|,s2], ..., [w1,sn, ..., w|sn|,sn]]
        self.data_words = self.read_data(txt_path)
        self.build_vocabulary(vocab_size, unk_token)

    def __iter__(self):
        sentences = self.data_words
        for sentence in sentences:
            len_sentence = len(sentence)

            for input_idx in range(len_sentence):
                current_word = sentence[input_idx]
                # must be a word in the vocabulary
                if current_word in self.word2id and self.keep_word(current_word):
                    # left and right window indices
                    min_idx = max(0, input_idx - self.window_size)
                    max_idx = min(len_sentence, input_idx + self.window_size)

                    window_idxs = [x for x in range(min_idx, max_idx) if x != input_idx]
                    for target_idx in window_idxs:
                        # must be a word in the vocabulary
                        if sentence[target_idx] in self.word2id:
                            # index of target word in vocab
                            target = self.word2id[sentence[target_idx]]
                            # index of input word
                            current_word_id = self.word2id[current_word]
                            output_dict = {'targets':target, 'inputs':current_word_id}

                            yield output_dict

    def keep_word(self, word):
        '''Implements negative sampling and returns true if we can keep the occurrence as training instance.'''
        z = self.frequency[word] / self.tot_occurrences
        p_keep = np.sqrt(z / 10e-3) + 1
        p_keep *= 10e-3 / z # higher for less frequent instances
        return np.random.rand() < p_keep # toss a coin and compare it to p_keep to keep the word

    def read_data(self,jsonl_path):
        """Converts each line in the input file into a list of lists of tokenized words."""
        data = []
        total_words = 0
        # tot_lines = self.count_lines(txt_path)
        with open(jsonl_path) as f:
            for json_string in f:
                single_json = json.loads(json_string)
                split = self.tokenize_line(single_json['sentence1'])
                split2 = self.tokenize_line(single_json['sentence2'])
                if split:
                    data.append(split)
                    total_words += len(split)
                if split2:
                    data.append(split2)
                    total_words += len(split2)
        return data


    # "The pen is on the table" -> ["the, "pen", "is", "on", "the", "table"]
    def tokenize_line(self, line, pattern='\W'):
        """Tokenizes a single line."""
        return [word.lower() for word in re.split(pattern, line.lower()) if word]

    def build_vocabulary(self, vocab_size, unk_token):
        """Defines the vocabulary to be used. Builds a mapping (word, index) for
        each word in the vocabulary.

        Args:
          vocab_size (int): size of the vocabolary
          unk_token (str): token to associate with unknown words
        """
        counter_list = []
        # context is a list of tokens within a single sentence
        for context in self.data_words:
            counter_list.extend(context)
        counter = collections.Counter(counter_list)
        counter_len = len(counter)
        print("Number of distinct words: {}".format(counter_len))

        # consider only the (vocab size -1) most common words to build the vocab
        dictionary = {key: index for index, (key, _) in enumerate(counter.most_common(vocab_size - 1))}
        assert unk_token not in dictionary
        # all the other words are mapped to UNK
        dictionary[unk_token] = vocab_size - 1
        self.word2id = dictionary

        # dictionary with (word, frequency) pairs -- including only words that are in the vocabulary
        dict_counts = {x: counter[x] for x in dictionary if x is not unk_token}
        self.frequency = dict_counts
        self.tot_occurrences = sum(dict_counts[x] for x in dict_counts)

        print("Total occurrences of words in dictionary: {}".format(self.tot_occurrences))

        less_freq_word = min(dict_counts, key=counter.get)
        print("Less frequent word in dictionary appears {} times ({})".format(dict_counts[less_freq_word],
                                                                              less_freq_word))

        # index to word
        self.id2word = {value: key for key, value in dictionary.items()}

        # data is the text converted to indexes, as list of lists
        data = []
        # for each sentence
        for sentence in self.data_words:
            paragraph = []
            # for each word in the sentence
            for i in sentence:
                id_ = dictionary[i] if i in dictionary else dictionary[unk_token]
                if id_ == dictionary[unk_token]:
                    continue
                paragraph.append(id_)
            data.append(paragraph)
        # list of lists of indices, where each sentence is a list of indices, ignoring UNK
        self.data_idx = data

In [None]:
def read_data(jsonl_path):
    """Converts each line in the input file into a list of lists of tokenized words."""
    data = []
    total_words = 0
    # tot_lines = self.count_lines(txt_path)
    with open(jsonl_path) as f:
        for json_string in f:
            single_json = json.loads(json_string)
            split = tokenize_line(single_json['sentence1'])
            split2 = tokenize_line(single_json['sentence2'])
            if split:
                data.append(split)
                total_words += len(split)
            if split2:
                data.append(split2)
                total_words += len(split2)
    return data

# "The pen is on the table" -> ["the, "pen", "is", "on", "the", "table"]
def tokenize_line(line, pattern='\W'):
    """Tokenizes a single line."""
    return [word.lower() for word in re.split(pattern, line.lower()) if word]


train_data_path = os.path.join(dataset_folder,'train.jsonl')
read_data(train_data_path)

In [8]:
VOCAB_SIZE = 10_000
UNK = 'UNK'
train_data_path = os.path.join(dataset_folder,'train.jsonl')
dataset = Word2VecDataset(train_data_path, VOCAB_SIZE, UNK, window_size=5)

Number of distinct words: 27007
Total occurrences of words in dictionary: 361796
Less frequent word in dictionary appears 4 times (bolivia)
