In [9]:
# BAZA
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from argparse import Namespace
from collections import Counter
import string
# OSNOVA
from torch.utils.data import Dataset, DataLoader

In [6]:
class Vocabulary(object):
    """Two-way mapping between tokens and integer indices."""

    def __init__(self, token_to_idx=None):
        """Create a vocabulary, optionally seeded from an existing mapping.

        Args:
            token_to_idx: optional mapping ``token -> index``. It is copied,
                so later ``add_token`` calls never modify the caller's
                object.
        """
        if token_to_idx is None:
            self._token_to_idx = {}
        else:
            self._token_to_idx = dict(token_to_idx)
        # ``.items()`` is required here: iterating a dict directly yields
        # only its keys, which cannot be unpacked into (token, index).
        self._idx_to_token = {
            index: token
            for token, index in self._token_to_idx.items()
        }

    def add_token(self, token):
        """Register ``token`` if it is new and return its index.

        A new token receives ``len(self)`` as its index, so a seeded mapping
        is expected to use contiguous indices ``0..n-1``; otherwise a new
        index may collide with an existing one.
        """
        if token in self._token_to_idx:
            return self._token_to_idx[token]
        index = len(self._token_to_idx)
        self._token_to_idx[token] = index
        self._idx_to_token[index] = token
        return index

    def lookup_token(self, token):
        """Return the index stored for ``token``.

        Raises:
            KeyError: if ``token`` has not been added.
        """
        try:
            return self._token_to_idx[token]
        except KeyError:
            raise KeyError(f"Token {token} is not in Vocabulary") from None

    def lookup_index(self, index):
        """Return the token stored under ``index``.

        Raises:
            KeyError: if no token has that index.
        """
        try:
            return self._idx_to_token[index]
        except KeyError:
            raise KeyError(
                f"No token with index {index} in Vocabulary"
            ) from None

    def __len__(self):
        """Return the number of distinct tokens."""
        return len(self._token_to_idx)

In [7]:
class SequenceVocabulary(Vocabulary):
    """Vocabulary that also registers mask, unknown, begin and end tokens."""

    def __init__(self, token_to_idx=None,
                 mask_token="<MSK>", unk_token="<UNK>",
                 begin_token="<BGN>", end_token="<END>"
                ):
        """Build the base vocabulary, then register the four special tokens.

        Args:
            token_to_idx: optional initial mapping, passed to the base class.
            mask_token: token whose index is stored as ``mask_index``.
            unk_token: token whose index is stored as ``unk_index``.
            begin_token: token whose index is stored as ``begin_index``.
            end_token: token whose index is stored as ``end_index``.
        """
        super().__init__(token_to_idx)

        self._mask_token, self._unk_token = mask_token, unk_token
        self._begin_token, self._end_token = begin_token, end_token

        # The registration order (mask, unk, begin, end) determines which
        # index each special token receives, so it must not be changed.
        specials = (mask_token, unk_token, begin_token, end_token)
        (self.mask_index,
         self.unk_index,
         self.begin_index,
         self.end_index) = (self.add_token(special) for special in specials)

In [None]:
class Vectorizer(object):
    """Turns space-separated text into fixed-length vectors of token indices."""

    def __init__(self, vocab=None):
        """Store the vocabulary used by ``vectorize``.

        Args:
            vocab: object exposing ``lookup_token`` plus the ``begin_index``,
                ``end_index``, ``mask_index`` and ``unk_index`` attributes.
        """
        self.vocab = vocab

    @classmethod
    def from_dataframe(cls, df, cutoff=10):
        """Build a vectorizer whose vocabulary is derived from ``df.text``.

        Args:
            df: object with a ``text`` attribute that iterates over strings.
            cutoff: minimum number of occurrences a token needs to be added
                to the vocabulary.

        Returns:
            A new instance of ``cls`` wrapping the populated vocabulary.
        """
        vocab = SequenceVocabulary()

        tokens_counter = Counter()
        for text in df.text:
            tokens_counter.update(text.split(" "))

        for token, count in tokens_counter.items():
            if count >= cutoff:
                vocab.add_token(token)

        return cls(vocab)

    def vectorize(self, description, vector_length=-1):
        """Convert ``description`` into an int64 array of token indices.

        The result is ``begin_index``, one index per space-separated token,
        then ``end_index``, padded on the right with ``mask_index``.

        Args:
            description: text to vectorize; tokens are split on single
                spaces.
            vector_length: length of the returned array. A negative value
                means "exactly as long as the sequence".

        Returns:
            ``numpy.ndarray`` of dtype ``int64`` and length ``vector_length``.

        Raises:
            ValueError: if a non-negative ``vector_length`` is too short to
                hold the sequence.
        """
        vocab = self.vocab

        indices = [vocab.begin_index]
        for token in description.split(" "):
            try:
                indices.append(vocab.lookup_token(token))
            except KeyError:
                # Tokens dropped by the frequency cutoff (or never seen at
                # all) are represented by the unknown-token index.
                indices.append(vocab.unk_index)
        indices.append(vocab.end_index)

        if vector_length < 0:
            vector_length = len(indices)
        elif vector_length < len(indices):
            raise ValueError(
                f"vector_length={vector_length} is too small for "
                f"{len(indices)} indices (tokens plus begin/end markers)"
            )

        # Start from an all-mask vector so the tail is already padded.
        out_vector = np.full(vector_length, vocab.mask_index, dtype=np.int64)
        out_vector[:len(indices)] = indices
        return out_vector