In [95]:
"""
This example runs a BiLSTM after the word embedding lookup. The output of the BiLSTM is than pooled,
for example with max-pooling (which gives a system like InferSent) or with mean-pooling.
Note, you can also pass BERT embeddings to the BiLSTM.
"""
import torch
from torch.utils.data import DataLoader
import torch.optim as optim
import math
from sentence_transformers import models, losses, util
from sentence_transformers import LoggingHandler, SentenceTransformer
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import *
import logging
from datetime import datetime
import os
import csv
import gzip
import pandas as pd
import numpy as np
import copy
import spacy
from collections import Counter

In [63]:
import torch
from torch import nn
from typing import List
import os
import json



class LSTM(nn.Module):
    """
    Bidirectional LSTM running over word embeddings.
    """
    def __init__(self, word_embedding_dimension: int, hidden_dim: int, num_layers: int = 1, num_classes: int = 13, vocab_size: int = 0, dropout: float = 0, bidirectional: bool = True):
        nn.Module.__init__(self)
        self.config_keys = ['word_embedding_dimension', 'hidden_dim', 'num_layers', 'dropout', 'bidirectional']
        self.word_embedding_dimension = word_embedding_dimension
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.dropout = dropout
        self.bidirectional = bidirectional
        self.vocab_size = vocab_size

        self.embeddings_dimension = hidden_dim
        if self.bidirectional:
            self.embeddings_dimension *= 2
            
        #self.embeddings = nn.Embedding(vocab_size, word_embedding_dimension, padding_idx=0)
        self.encoder = nn.LSTM(word_embedding_dimension, hidden_dim, num_layers=num_layers, dropout=dropout, bidirectional=bidirectional, batch_first=True)
        self.linear = nn.Linear(hidden_dim, num_classes)

    def forward(self, features, sentence_length):
        #token_embeddings = features['token_embeddings']
        #sentence_lengths = torch.clamp(features['sentence_lengths'], min=1)
        #x = self.embeddings(features)
        packed = nn.utils.rnn.pack_padded_sequence(x, sentence_length, batch_first=True, enforce_sorted=False)
        out_pack, (ht, ct) = self.encoder(packed)
        #packed = self.encoder(packed)
        #out_pack = nn.utils.rnn.pad_packed_sequence(packed[0], batch_first=True)
        #out_pack = nn.utils.rnn.pad_packed_sequence(packed[0], batch_first=True)
        #out_pack, (ht, ct) = self.encoder(out_pack)
        out = self.linear(ht[-1])
        
        return out

    def get_word_embedding_dimension(self) -> int:
        return self.embeddings_dimension

    def tokenize(self, text: str) -> List[int]:
        raise NotImplementedError()

    def save(self, output_path: str):
        with open(os.path.join(output_path, 'lstm_config.json'), 'w') as fOut:
            json.dump(self.get_config_dict(), fOut, indent=2)

        torch.save(self.state_dict(), os.path.join(output_path, 'pytorch_model.bin'))

    def get_config_dict(self):
        return {key: self.__dict__[key] for key in self.config_keys}

    @staticmethod
    def load(input_path: str):
        with open(os.path.join(input_path, 'lstm_config.json'), 'r') as fIn:
            config = json.load(fIn)

        weights = torch.load(os.path.join(input_path, 'pytorch_model.bin'))
        model = LSTM(**config)
        model.load_state_dict(weights)
        return model
    
def smart_batching_collate(self, batch):
    """
    Transforms a batch from a SmartBatchingDataset to a batch of tensors for the model
    Here, batch is a list of tuples: [(tokens, label), ...]
    :param batch:
        a batch from a SmartBatchingDataset
    :return:
        a batch of tensors for the model
    """
    num_texts = len(batch[0].texts)
    texts = [[] for _ in range(num_texts)]
    labels = []

    for example in batch:
        for idx, text in enumerate(example.texts):
            texts[idx].append(text)

        labels.append(example.label)

    labels = torch.tensor(labels).to(self._target_device)

    sentence_features = []
    for idx in range(num_texts):
        tokenized = self.tokenize(texts[idx])
        batch_to_device(tokenized, self._target_device)
        sentence_features.append(tokenized)

    return sentence_features, labels

In [21]:
#### Just some code to print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])

# Read the dataset
batch_size = 32
model_save_path = './lstm_data/models/training_stsbenchmark_bilstm-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

#Check if dataset exsist. If not, download and extract  it
sts_dataset_path = "./lstm_data/preprocessed_data_v2.csv"

if not os.path.exists(sts_dataset_path):
    util.http_get('https://sbert.net/datasets/stsbenchmark.tsv.gz', sts_dataset_path)

logging.info("Read STSbenchmark train dataset")

2021-09-16 17:01:23 - Read STSbenchmark train dataset


In [57]:
df = pd.read_csv(sts_dataset_path)

train_samples = []
dev_samples = []
test_samples = []
for row in df.iterrows():
    score = row[1]['label_int']
    inp_example = InputExample(texts=[row[1]['four_pages_processed']], label=score)

    if row[1]['fold'] == 'val':
        dev_samples.append(inp_example)
    elif row[1]['fold'] == 'test':
        test_samples.append(inp_example)
    else:
        train_samples.append(inp_example)

In [23]:
# Map tokens to traditional word embeddings like GloVe
word_embedding_model = models.WordEmbeddings.from_text_file('./lstm_data/modelo_w2v_vec600_wd10_ct5_tec1.txt')

Load Word Embeddings: 623Embeddings [00:00, 6221.43Embeddings/s]

2021-09-16 17:06:06 - Read in embeddings file ./lstm_data/modelo_w2v_vec600_wd10_ct5_tec1.txt


Load Word Embeddings: 38443Embeddings [00:06, 5941.22Embeddings/s]


In [26]:
model = LSTM(
    word_embedding_dimension = word_embedding_model.get_word_embedding_dimension(), 
    hidden_dim = word_embedding_model.get_word_embedding_dimension(), 
    num_classes = 13, 
    #vocab_size = config["vocab_size"],
    num_layers = 2, 
    #dropout = config["dropout"], 
    bidirectional = True
)

In [27]:
model = nn.Sequential(word_embedding_model, model)

In [39]:
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=batch_size)

dev_dataloader = DataLoader(dev_samples, shuffle=False, batch_size=batch_size)

test_dataloader = DataLoader(test_samples, shuffle=True, batch_size=batch_size)

In [52]:
device = "cpu"
if torch.cuda.is_available():
    device = "cuda:0"
model.to(device)

best_model = copy.deepcopy(model)
best_loss = float("inf")
#best_macro = 0.0

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0005)

In [69]:
train_dataloader.collate_fn = smart_batching_collate(batch=batch_size)
train_dataiterator = iter(train_dataloader)

TypeError: smart_batching_collate() missing 1 required positional argument: 'self'

In [67]:
for data, label in train_dataloader:
    break

TypeError: smart_batching_collate() missing 1 required positional argument: 'batch'

In [56]:
# zero the parameter gradients
optimizer.zero_grad()

# forward + backward + optimize
outputs = model(data)

TypeError: tuple indices must be integers or slices, not str

In [121]:
df_emb = pd.read_csv("./lstm_data/modelo_w2v_vec600_wd10_ct5_tec1.txt", header=None, sep = " ", index_col=0)
embedding = {key: val.values for key, val in df_emb.T.items()}

def create_embedding_matrix(word_index,embedding_dict,dimension):
    embedding_matrix=np.zeros((len(word_index)+1,dimension))

    for word,index in word_index.items():
        if word in embedding_dict:
            embedding_matrix[index]=embedding_dict[word]
    return embedding_matrix


tok = spacy.load('pt_core_news_sm')    
# count frequency of each word    
counts = Counter()
"""for index, row in df_data.loc[df_data['fold']=="train"].iterrows():
    counts.update(tokenize(row['four_pages_processed'], tok))"""
texts=["O gato sentou no carpete bliu","nós podemos brincar com modelos"]

for text in texts:
    counts.update([token.text for token in tok.tokenizer(text)])

#creating vocabulary
vocab2index = dict()
words = []
for word in counts:
    vocab2index[word] = len(words)
    words.append(word)

embedding_matrix=create_embedding_matrix(vocab2index,embedding_dict=embedding,dimension=600)

vocab_size=embedding_matrix.shape[0]
vector_size=embedding_matrix.shape[1]

In [125]:
embedding_layer=nn.Embedding(num_embeddings=vocab_size,embedding_dim=vector_size, )

In [131]:
embedding_layer.weight=nn.Parameter(torch.tensor(embedding_matrix,dtype=torch.float32))
embedding_layer.weight.requires_grad=False

In [132]:
embedding_layer(torch.LongTensor([5])).shape

torch.Size([1, 600])