In [2]:
import string

class Vectorizer:
    """Minimal word-level text vectorizer.

    Text is lowercased, stripped of ASCII punctuation and split on
    whitespace; each distinct token is then mapped to an integer index.
    Index 0 is reserved for the empty string and index 1 for the
    unknown-token marker ``"[UNK]"``; real tokens are numbered from 2 in
    order of first appearance.

    Attributes:
        vocabulary: dict mapping token (str) -> index (int).
        inverse_vocabulary: dict mapping index (int) -> token (str).
    """

    # Marker and index used for any token that is not in the vocabulary.
    UNK_TOKEN = "[UNK]"
    UNK_INDEX = 1

    # Translation table that deletes every character in string.punctuation;
    # built once instead of re-testing membership for each character.
    _STRIP_PUNCTUATION = str.maketrans("", "", string.punctuation)

    def __init__(self):
        # Start with only the reserved entries so that encode()/decode()
        # are usable (mapping everything to [UNK]) before a vocabulary
        # has been built, instead of failing with AttributeError.
        self.vocabulary = self._reserved_entries()
        self.inverse_vocabulary = self._invert(self.vocabulary)

    def _reserved_entries(self):
        """Return a fresh dict holding only the reserved vocabulary slots."""
        return {"": 0, self.UNK_TOKEN: self.UNK_INDEX}

    @staticmethod
    def _invert(mapping):
        """Return the index -> token view of a token -> index mapping."""
        return {index: token for token, index in mapping.items()}

    def standardize(self, text):
        """Return ``text`` lowercased with all ASCII punctuation removed."""
        return text.lower().translate(self._STRIP_PUNCTUATION)

    def tokenize(self, text):
        """Standardize ``text`` and split it on whitespace.

        Returns:
            list[str]: the tokens; empty for blank or punctuation-only text.
        """
        return self.standardize(text).split()

    def make_vocabulary(self, dataset):
        """(Re)build the vocabulary from an iterable of strings.

        Any previously built vocabulary is discarded. Tokens receive
        consecutive indices starting at 2, in order of first appearance.

        Args:
            dataset: iterable of text strings.
        """
        # Build into a local dict and publish at the end, so a failure while
        # iterating the dataset cannot leave the two mappings out of sync.
        vocabulary = self._reserved_entries()
        for text in dataset:
            # tokenize() already standardizes; no separate standardize() call.
            for token in self.tokenize(text):
                # len() is evaluated before insertion, so a new token gets
                # the next free index and a known token is left untouched.
                vocabulary.setdefault(token, len(vocabulary))
        self.vocabulary = vocabulary
        self.inverse_vocabulary = self._invert(vocabulary)

    def encode(self, text):
        """Convert ``text`` into a list of vocabulary indices.

        Tokens missing from the vocabulary are encoded as ``UNK_INDEX``.

        Returns:
            list[int]: one index per token.
        """
        return [
            self.vocabulary.get(token, self.UNK_INDEX)
            for token in self.tokenize(text)
        ]

    def decode(self, int_sequence):
        """Convert an iterable of indices back into a space-joined string.

        Indices missing from the vocabulary are rendered as ``"[UNK]"``.
        """
        return " ".join(
            self.inverse_vocabulary.get(index, self.UNK_TOKEN)
            for index in int_sequence
        )

# Toy corpus: three short lines of text
dataset = [
    "I write, erase, rewrite",
    "Erase again, and then",
    "A poppy blooms.",
]

# Instantiate the vectorizer and index every token found in the corpus
vectorizer = Vectorizer()
vectorizer.make_vocabulary(dataset)


In [3]:
# Round trip on a sample sentence: text -> integer indices -> text
text_to_encode = "I write, erase, rewrite"

encoded_sequence = vectorizer.encode(text_to_encode)
decoded_text = vectorizer.decode(encoded_sequence)

# Show both directions of the mapping
print("Encoded Sequence:", encoded_sequence)
print("Decoded Text:", decoded_text)


Encoded Sequence: [2, 3, 4, 5]
Decoded Text: i write erase rewrite
