<a href="https://colab.research.google.com/github/LuigiSigillo/nlp2021-hw/blob/master/hw1/stud/nlp_hw1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# imports

In [92]:
from google.colab import drive
# general
import matplotlib.pyplot as plt
import numpy as np
import os
from collections import Counter, defaultdict
from tqdm.notebook import tqdm
from typing import *

# torch
import torch
import json
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import SGD

# NLTK
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
# Fetch the NLTK resources used by the stop-word filter and the tokenizer.
for nltk_resource in ('stopwords', 'punkt'):
    nltk.download(nltk_resource)

# Mount Google Drive and derive the project / dataset folders from it.
drive.mount('/content/drive')
root_folder = '/content/drive/My Drive/NLP/nlp2021-hw1'
dataset_folder = os.path.join(root_folder, 'data')
print(dataset_folder)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/My Drive/NLP/nlp2021-hw1/data


In [None]:
# One-off setup cell for the GloVe vectors. The download / unzip commands are
# commented out; only the final move into the Drive `model` folder and the
# directory listing are active.
#! wget http://nlp.stanford.edu/data/wordvecs/glove.6B.zip
#! unzip -d data/glove.6B
#! cd '/content/drive/My Drive/NLP/nlp2021-hw1'
#!unzip '/content/drive/My Drive/NLP/nlp2021-hw1/glove.6B.zip'
# NOTE(review): this moves the 200d file, while the loading cell reads
# glove.6B.100d.txt and the classifier uses 100 features - confirm which file is intended.
!mv glove.6B.200d.txt '/content/drive/My Drive/NLP/nlp2021-hw1/model'
!ls

Load the pre-trained GloVe word embeddings.

In [123]:
# Load at most `words_limit` pre-trained vectors into a {word: tensor} lookup.
word_vectors = dict()
words_limit = 100_000
# The encoding is explicit so loading does not depend on the runtime's locale.
with open('/content/drive/My Drive/NLP/nlp2021-hw1/model/glove.6B.100d.txt', encoding='utf-8') as f:

    for i, line in tqdm(enumerate(f), total=words_limit):

        if i == words_limit:
            break

        word, *vector = line.strip().split(' ')

        # GloVe text files start directly with the first word vector, so the
        # first line must NOT be dropped unconditionally. Only a word2vec-style
        # "<vocab_size> <dim>" header (exactly two fields) is skipped.
        if i == 0 and len(vector) == 1:
            continue

        word_vectors[word] = torch.tensor([float(c) for c in vector])

# Alternative UNK initialisation (mean of all loaded vectors), kept for reference:
# word_vectors["UNK"] = np.mean(np.array(list(word_vectors.values()), dtype=np.float64), axis=0)

# Size the out-of-vocabulary vector from the loaded embeddings instead of
# hard-coding it, so switching to another GloVe file keeps shapes consistent.
embedding_dim = len(next(iter(word_vectors.values()))) if word_vectors else 100
word_vectors["UNK"] = torch.tensor(np.random.random(embedding_dim), dtype=torch.float)

HBox(children=(FloatProgress(value=0.0, max=100000.0), HTML(value='')))

In [116]:
# Sanity check: both the UNK vector and a regular entry are stored as tensors.
# The entries are inspected directly; wrapping an existing tensor in
# torch.tensor(...) would copy it and emit a copy-construct UserWarning.
print(type(word_vectors["UNK"]))
print(type(word_vectors["queen"]))

<class 'torch.Tensor'>
<class 'torch.Tensor'>


The word-embedding-powered function $\phi$ simply converts any phrase into a vector by **averaging the embeddings corresponding to each word in it**.

In [110]:
def phrase2vector(phrase: str, method: str = "avg") -> Optional[torch.Tensor]:
    """Turn a phrase into a single vector by pooling its word embeddings.

    Each whitespace-separated token is looked up in the module-level
    ``word_vectors`` mapping; tokens that are not present use the ``'UNK'``
    entry instead.

    Args:
        phrase: text to embed; tokens are separated by any run of whitespace.
        method: ``"avg"`` (default) averages the token vectors; any other
            value sums them.

    Returns:
        The pooled vector, or ``None`` when the phrase contains no tokens.
    """
    # str.split() without an argument drops empty strings, so repeated or
    # leading/trailing spaces do not turn into spurious UNK vectors and an
    # empty phrase really yields no tokens.
    tokens = phrase.split()
    if not tokens:
        return None

    unknown_vector = word_vectors['UNK']
    # tensor shape: (#words X #features)
    stacked = torch.stack([word_vectors.get(token, unknown_vector) for token in tokens])

    if method == "avg":
        return torch.mean(stacked, dim=0)
    return torch.sum(stacked, dim=0)

In [111]:
class SentencesDataset(torch.utils.data.Dataset):
    """In-memory dataset of (sentence-pair vector, label) tuples read from a JSONL file.

    Every non-blank line of the file is parsed as a JSON object with the keys
    ``sentence1``, ``sentence2`` and ``label``. Both sentences have their
    English stop words removed, are joined with `` <SEP> `` and are converted
    to a vector through the supplied ``phrase2vector`` callable.
    """

    # English stop-word set, built lazily on first use and shared by all
    # instances so the NLTK corpus is not re-read for every sentence.
    _stop_words: Optional[Set[str]] = None

    def __init__(self, dataset_path: str, phrase2vector):
        """Read ``dataset_path`` and vectorise every sample with ``phrase2vector``."""
        self.data_store = []
        self.init_structures(dataset_path, phrase2vector)

    def init_structures(self, dataset_path: str, phrase2vector) -> None:
        """Fill ``self.data_store`` with one ``(vector, label)`` tuple per usable line.

        ``phrase2vector`` is called as ``phrase2vector(sentence, "avg")``;
        samples for which it returns ``None`` are skipped.
        """
        with open(dataset_path, encoding='utf-8') as f:
            for json_string in f:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                if not json_string.strip():
                    continue

                single_json = json.loads(json_string)
                sentence = (
                    self.remove_stopwords(single_json['sentence1'])
                    + " <SEP> "
                    + self.remove_stopwords(single_json['sentence2'])
                )
                # str() makes the check work both for the string 'True' and
                # for a JSON boolean true.
                is_positive = str(single_json['label']) == 'True'
                ground_t = np.float32(1) if is_positive else np.float32(0)

                vector = phrase2vector(sentence, "avg")
                if vector is None:
                    continue
                self.data_store.append((vector, ground_t))

    def remove_stopwords(self, sent):
        """Tokenise ``sent`` and return its non-stop-word tokens joined by spaces."""
        if SentencesDataset._stop_words is None:
            SentencesDataset._stop_words = set(stopwords.words('english'))
        stop_words = SentencesDataset._stop_words

        return " ".join(w for w in word_tokenize(sent) if w not in stop_words)

    def __len__(self) -> int:
        """Number of stored samples."""
        return len(self.data_store)

    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, np.float32]:
        """Return the ``(vector, label)`` tuple stored at position ``idx``."""
        return self.data_store[idx]

In [112]:
class SentencesDataModule(nn.Module):
    """Bundles the train / dev datasets and builds their DataLoaders.

    Args:
        data_train_path: path of the training JSONL file.
        data_dev_path: path of the validation JSONL file.
        batch_size: batch size used by both loaders.
        collate_fn: optional collate function forwarded to both loaders;
            ``None`` keeps the DataLoader default.
    """

    def __init__(
        self,
        data_train_path: str,
        data_dev_path: str,
        batch_size: int,
        collate_fn=None
    ) -> None:
        super().__init__()
        self.data_train_path = data_train_path
        self.data_dev_path = data_dev_path
        self.batch_size = batch_size
        self.collate_fn = collate_fn

        # Populated by setup(); test_dataset is never assigned in this class.
        self.train_dataset = None
        self.validation_dataset = None
        self.test_dataset = None

    def setup(self, stage: Optional[str] = None) -> None:
        """Build the train and validation datasets (``stage`` is not used)."""
        self.train_dataset = SentencesDataset(self.data_train_path, phrase2vector)
        self.validation_dataset = SentencesDataset(self.data_dev_path, phrase2vector)

    def train_dataloader(self, *args, **kwargs) -> DataLoader:
        """Loader over the training set, reshuffled on every pass."""
        # Shuffling prevents SGD from seeing the samples in file order each epoch.
        return DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            shuffle=True,
            collate_fn=self.collate_fn,
        )

    def val_dataloader(self, *args, **kwargs) -> Union[DataLoader, List[DataLoader]]:
        """Loader over the validation set, in dataset order."""
        return DataLoader(
            self.validation_dataset,
            batch_size=self.batch_size,
            collate_fn=self.collate_fn,
        )


In [124]:
# Build the data module from the JSONL files in the dataset folder and
# materialise both datasets.
sentences_dm = SentencesDataModule(
    data_train_path=os.path.join(dataset_folder, 'train.jsonl'),
    data_dev_path=os.path.join(dataset_folder, 'dev.jsonl'),
    batch_size=32,
)
sentences_dm.setup(stage='fit')
val_dataloader = sentences_dm.val_dataloader()
# print(word_vectors['test'])

# Peek at the first validation batch only, to check what the loader yields.
for batch in val_dataloader:
    print(batch)
    X, y = batch
    # The second label says "z" but reports the shape of y.
    print(f"batch X shape: {X.shape}", f"batch z shape: {y.shape}", sep="\n")
    break

[tensor([[ 0.2512,  0.1998,  0.2639,  ..., -0.1565,  0.4290,  0.1367],
        [ 0.2410, -0.0969,  0.2973,  ..., -0.0946,  0.3923,  0.0225],
        [ 0.1122,  0.1507,  0.2331,  ..., -0.1570,  0.5311,  0.0543],
        ...,
        [ 0.1197,  0.3343,  0.2411,  ..., -0.1259,  0.3664,  0.3817],
        [ 0.0013,  0.1617,  0.2219,  ..., -0.1883,  0.6025,  0.0947],
        [-0.0314,  0.0898,  0.2073,  ..., -0.1508,  0.5528, -0.0428]]), tensor([0., 1., 0., 0., 1., 1., 1., 1., 1., 0., 1., 0., 1., 1., 1., 0., 0., 0.,
        1., 0., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0.])]
batch X shape: torch.Size([32, 100])
batch z shape: torch.Size([32])


# Training (not sure)

In [125]:
class SentencesClassifier(nn.Module):
    """Binary classifier: one ReLU hidden layer followed by a single sigmoid unit.

    Args:
        n_features: size of each input vector.
        n_hidden: width of the hidden layer.
    """

    def __init__(self, n_features: int, n_hidden: int):
        super().__init__()
        # feature vector -> hidden representation -> one score per sample
        self.lin1 = nn.Linear(n_features, n_hidden)
        self.output_layer = nn.Linear(n_hidden, 1)

        # binary cross-entropy, applied to probabilities (not to logits)
        self.loss_fn = nn.BCELoss()

    def forward(self, x: torch.Tensor, y: Optional[torch.Tensor] = None) -> Dict[str, torch.Tensor]:
        """Run the network on a batch.

        Returns a dict with ``'logits'`` (raw scores, one per sample),
        ``'pred'`` (their sigmoid) and, when ``y`` is given, ``'loss'``.
        """
        hidden = torch.relu(self.lin1(x))

        # drop the trailing size-1 dimension: (batch, 1) -> (batch,)
        logits = self.output_layer(hidden).squeeze(1)
        probabilities = torch.sigmoid(logits)

        result = {'logits': logits, 'pred': probabilities}
        if y is None:
            return result

        # the criterion receives the probabilities, i.e. the sigmoid output
        result['loss'] = self.loss(probabilities, y)
        return result

    def loss(self, pred, y):
        """Binary cross-entropy between predicted probabilities and targets."""
        return self.loss_fn(pred, y)

In [129]:
class Trainer():
    """Minimal training loop: trains a model and evaluates it after every epoch.

    The model is expected to return a dict containing ``'pred'`` when called
    on a batch of inputs and to expose ``loss(pred, targets)``.
    """

    def __init__(self, model, optimizer, device):
        """Store the collaborators, put the model in train mode and move it to ``device``."""
        self.device = device

        self.model = model
        self.optimizer = optimizer

        # some layers behave differently in train and eval mode
        self.model.train()
        self.model.to(self.device)

    def train(self, train_dataset, eval_dataset, epochs=1):
        """Train for ``epochs`` epochs, printing train/eval loss and accuracy each epoch.

        Args:
            train_dataset: iterable of ``(inputs, targets)`` batches used for the updates.
            eval_dataset: iterable of ``(inputs, targets)`` batches used for evaluation.
            epochs: number of passes over ``train_dataset``.

        Returns:
            The mean over epochs of the per-epoch average training loss
            (``nan`` when there was nothing to average).
        """
        train_loss = 0.0
        for epoch in tqdm(range(epochs)):
            avg_epoch_loss = self._train_epoch(train_dataset)
            avg_eval_loss, eval_accuracy = self._evaluate(eval_dataset)
            print('Epoch: {} avg loss = {:0.4f} eval loss = {:0.4f} ACC = {:0.4f}'.format(epoch, avg_epoch_loss, avg_eval_loss, eval_accuracy))

            train_loss += avg_epoch_loss
            # torch.save(self.model.state_dict(),
            #            os.path.join(output_folder, 'state_{}.pt'.format(epoch)))  # save the model state

        return self._safe_mean(train_loss, epochs)

    def _train_epoch(self, train_dataset):
        """Run one optimisation pass and return the mean loss over its batches."""
        self.model.train()
        epoch_loss = 0.0
        n_batches = 0
        # each element (sample) in train_dataset is a batch
        for sample in train_dataset:
            inputs = sample[0].to(self.device)
            targets = sample[1].to(self.device)

            # clear gradients first so nothing stale leaks into this step
            self.optimizer.zero_grad()
            output_distribution = self.model(inputs)
            loss = self.model.loss(output_distribution['pred'], targets)
            loss.backward()
            self.optimizer.step()

            epoch_loss += loss.item()
            n_batches += 1
        return self._safe_mean(epoch_loss, n_batches)

    def _evaluate(self, eval_dataset):
        """Return ``(mean batch loss, accuracy)`` on ``eval_dataset`` without tracking gradients."""
        self.model.eval()
        eval_loss = 0.0
        n_batches = 0
        n_correct = 0
        n_samples = 0
        # no graph is needed for evaluation; this saves memory and time
        with torch.no_grad():
            for sample in eval_dataset:
                inputs = sample[0].to(self.device)
                targets = sample[1].to(self.device)
                output_distribution = self.model(inputs)
                loss = self.model.loss(output_distribution['pred'], targets)

                # count correct samples instead of averaging per-batch means,
                # so a smaller last batch does not skew the accuracy
                predicted_positive = output_distribution['pred'] > 0.5
                n_correct += (predicted_positive == (targets > 0.5)).sum().item()
                n_samples += targets.numel()

                eval_loss += loss.item()
                n_batches += 1
        return self._safe_mean(eval_loss, n_batches), self._safe_mean(n_correct, n_samples)

    @staticmethod
    def _safe_mean(total, count):
        """Return ``total / count``, or ``nan`` when ``count`` is zero."""
        return total / count if count else float('nan')

In [127]:
# Classifier over 100-feature input vectors with a 64-unit hidden layer.
sent_classifier = SentencesClassifier(n_features=100, n_hidden=64)

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
train_dataloader = sentences_dm.train_dataloader()

# Plain SGD over all classifier parameters.
optimizer = torch.optim.SGD(sent_classifier.parameters(), lr=0.02)
trainer = Trainer(model=sent_classifier, optimizer=optimizer, device=device)

# The validation loader built earlier is evaluated after every epoch.
avg_loss = trainer.train(
    train_dataset=train_dataloader,
    eval_dataset=val_dataloader,
    epochs=150,
)

In [90]:
def predict(model, phrase2vector, review: str):
    """Print the model's predicted probability for a single sentence pair.

    Args:
        model: callable returning a dict with a ``'pred'`` entry for a batch of vectors.
        phrase2vector: callable invoked as ``phrase2vector(review, "avg")``.
        review: the text to score.

    Raises:
        ValueError: if ``phrase2vector`` returns ``None`` for ``review``.
    """
    # The pooling method is passed explicitly: phrase2vector takes it as its
    # second argument and the training data is built with "avg" as well.
    review_vector = phrase2vector(review, "avg")
    if review_vector is None:
        raise ValueError("cannot vectorise an empty phrase: {!r}".format(review))

    # Run on whatever device the model currently lives on, so the input and
    # the weights can never end up on different devices.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    # Inference only: no gradient graph is needed.
    with torch.no_grad():
        # add a dimension to create a one-item batch
        forward_out = model(review_vector.to(device).unsqueeze(0))

    print(f"# Sentences: {review}")
    for prob in forward_out["pred"]:
        # .item() prints the plain float instead of the tensor repr
        print("\n {}".format(prob.item()))
# Two pairs sharing the word "mouse": device sense vs. animal sense.
demo_pairs = (
    "The cat eats the mouse <SEP> Use the mouse to click on the button",
    "The cat eats the mouse <SEP> The mouse escaped from the predator",
)
for demo_pair in demo_pairs:
    predict(sent_classifier, phrase2vector, demo_pair)



# Sentences: The cat eats the mouse <SEP> Use the mouse to click on the button

 0.9545401930809021
# Sentences: The cat eats the mouse <SEP> The mouse escaped from the predator

 0.9759144186973572


# RNN