# Vectorisation
Conversion of text into a numerical representation, normally a vector in n-dimensional space.

In [6]:
import string

class Vectorizer:
    """Minimal word-level text vectorizer.

    Text is lower-cased, stripped of ASCII punctuation, split on whitespace,
    and each token is mapped to an integer index. Index 0 is reserved for the
    empty string and index 1 for unknown tokens; every other token receives
    the next free integer in order of first appearance.

    NOTE: the unknown-token literal contains a leading and trailing space, so
    decoded text shows two spaces on either side of the marker.
    """

    # Index and placeholder used for tokens that are not in the vocabulary.
    _UNK_INDEX = 1
    _UNK_TOKEN = " [UNK] "
    # Translation table that deletes every character in string.punctuation.
    _PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)

    def __init__(self):
        # Start with only the reserved entries so encode()/decode() are usable
        # (everything maps to unknown) before make_vocabulary() is called.
        self.vocabulary = {"": 0, self._UNK_TOKEN: self._UNK_INDEX}
        self.inverse_vocabulary = {
            index: token for token, index in self.vocabulary.items()
        }

    def standardize(self, text):
        """Return ``text`` lower-cased with ASCII punctuation removed."""
        # Capitalisation is treated as unimportant here, which is slightly
        # surprising given that a capital letter is a useful cue for humans.
        return text.lower().translate(self._PUNCTUATION_TABLE)

    def tokenize(self, text):
        """Standardize ``text`` and split it into a list of tokens."""
        return self.standardize(text).split()

    def make_vocabulary(self, dataset):
        """Rebuild the vocabulary from an iterable of strings.

        Any previously built vocabulary is discarded. Tokens are numbered in
        order of first appearance, starting after the two reserved indices.
        """
        vocabulary = {"": 0, self._UNK_TOKEN: self._UNK_INDEX}
        for text in dataset:
            # tokenize() already standardizes, so no separate call is needed.
            for token in self.tokenize(text):
                if token not in vocabulary:
                    vocabulary[token] = len(vocabulary)
        self.vocabulary = vocabulary
        self.inverse_vocabulary = {
            index: token for token, index in vocabulary.items()
        }

    def encode(self, text):
        """Return the list of vocabulary indices for the tokens in ``text``.

        Tokens missing from the vocabulary map to the unknown index (1).
        """
        return [
            self.vocabulary.get(token, self._UNK_INDEX)
            for token in self.tokenize(text)
        ]

    def decode(self, int_sequence):
        """Return the space-joined tokens for a sequence of indices.

        Indices missing from the vocabulary decode to the unknown token.
        """
        return " ".join(
            self.inverse_vocabulary.get(i, self._UNK_TOKEN) for i in int_sequence
        )
    
# Toy three-line corpus used to build the vocabulary.
dataset = [
    "I write, erase, rewrite",
    "Erase again, and then",
    "A poppy blooms",
]

# Build the token -> index mapping from the corpus.
vectorizer = Vectorizer()
vectorizer.make_vocabulary(dataset)
                    
    

In [7]:
# Round-trip a sentence containing one word ("still") that is not in the corpus.
test = "I write, erase, rewrite, and still rewrite again"

encode = vectorizer.encode(test)    # sentence -> list of integer indices
decode = vectorizer.decode(encode)  # indices -> standardized text

print(encode)
print(decode)

[2, 3, 4, 5, 7, 1, 5, 6]
i write erase rewrite and  [UNK]  rewrite again


Nice to see the above code returns it in lower case. Although I wonder if more complex algorithms learn with capitalised English?

# Tensorflow example
Below is roughly the same, but much simpler, with TensorFlow.

In [9]:
from tensorflow.keras.layers import TextVectorization
# Keras preprocessing layer configured with the "int" output mode.
text_vector = TextVectorization(output_mode="int")

# Fit the layer on the same toy corpus, then show the vocabulary it holds.
text_vector.adapt(dataset)
print(text_vector.get_vocabulary())

['', '[UNK]', 'erase', 'write', 'then', 'rewrite', 'poppy', 'i', 'blooms', 'and', 'again', 'a']


So easy! 