#### Tokenization

1. White space tokenization

In [39]:
class WhiteSpaceTokenizer:
    def __init__(self, lower_case=True, remove_punctuation=True):
        self.whitespace = [' ', '\t', '\n', '\r']
        self.lower_case = lower_case
        self.remove_punctuation = remove_punctuation
        self.vocabulary = {}

    def tokenize(self, text):
        if self.lower_case:
            text = text.lower()
        if self.remove_punctuation:
            text = ''.join([c for c in text if c.isalnum() or c in self.whitespace])
        
        tokens = text.split()
        return tokens
    
    def build_vocabulary(self, texts):
        unique_tokens = set()
        for text in texts:
            tokens = self.tokenize(text)
            unique_tokens.update(tokens)
        print(unique_tokens)
        self.vocabulary = {token: i for i, token in enumerate(unique_tokens)}
        self.vocabulary['<UNK>'] = len(self.vocabulary)
        self.idx2token = {i: token for token, i in self.vocabulary.items()}
    
    def encode(self, text):
        tokens = self.tokenize(text)
        return [self.vocabulary.get(token, self.vocabulary['<UNK>']) for token in tokens if token in tokens]
    
    def decode(self, indices):
        return [self.idx2token[i] for i in indices]
    

In [44]:
text = "Hello, world! This is a test."
tokenizer = WhiteSpaceTokenizer(remove_punctuation=True)
tokens = tokenizer.tokenize(text)
print(tokens)
tokenizer.build_vocabulary([text])
print(tokenizer.vocabulary)
encoded = tokenizer.encode("Hello I am there")
print(encoded)
print(tokenizer.decode(encoded))

['hello', 'world', 'this', 'is', 'a', 'test']
{'a', 'hello', 'this', 'is', 'test', 'world'}
{'a': 0, 'hello': 1, 'this': 2, 'is': 3, 'test': 4, 'world': 5, '<UNK>': 6}
[1, 6, 6, 6]
['hello', '<UNK>', '<UNK>', '<UNK>']


In [45]:
class CharTokenizer:
    def __init__(self, lower_case=True, remove_punctuation=True):
        self.lower_case = lower_case
        self.remove_punctuation = remove_punctuation
        self.vocabulary = {}
    
    def tokenize(self, text):
        if self.lower_case:
            text = text.lower()
        if self.remove_punctuation:
            text = ''.join([c for c in text if c.isalnum()])
        return list(text)
    
    def build_vocabulary(self, texts):
        unique_tokens = set()
        for text in texts:
            tokens = self.tokenize(text)
            unique_tokens.update(tokens)
        self.vocabulary = {token: i for i, token in enumerate(unique_tokens)}
        self.vocabulary['<UNK>'] = len(self.vocabulary)
        self.idx2token = {i: token for token, i in self.vocabulary.items()}
    
    def encode(self, text):
        tokens = self.tokenize(text)
        return [self.vocabulary.get(token, self.vocabulary['<UNK>']) for token in tokens]
    
    def decode(self, indices):
        return [self.idx2token[i] for i in indices]

In [47]:
text = "Hello, world! This is a test."
tokenizer = CharTokenizer()
tokens = tokenizer.tokenize(text)
print(tokens)
tokenizer.build_vocabulary([text])
print(tokenizer.vocabulary)
encoded = tokenizer.encode("Hello I am there")
print(encoded)
print(tokenizer.decode(encoded))

['h', 'e', 'l', 'l', 'o', 'w', 'o', 'r', 'l', 'd', 't', 'h', 'i', 's', 'i', 's', 'a', 't', 'e', 's', 't']
{'d': 0, 'w': 1, 'h': 2, 'i': 3, 'e': 4, 'o': 5, 'l': 6, 'a': 7, 's': 8, 't': 9, 'r': 10, '<UNK>': 11}
[2, 4, 6, 6, 5, 3, 7, 11, 9, 2, 4, 10, 4]
['h', 'e', 'l', 'l', 'o', 'i', 'a', '<UNK>', 't', 'h', 'e', 'r', 'e']


In [57]:
class NgramTokenization:
    def __init__(self, n=2):
        self.n = n
        self.vocabulary = {}
    
    def tokenize(self, text):
        return [text[i:i+self.n] for i in range(len(text)-self.n+1)]
    
    def build_vocabulary(self, texts):
        unique_tokens = set()
        for text in texts:
            tokens = self.tokenize(text)
            unique_tokens.update(tokens)
        self.vocabulary = {token: i for i, token in enumerate(unique_tokens)}
        self.vocabulary['<UNK>'] = len(self.vocabulary)
        self.idx2token = {i: token for token, i in self.vocabulary.items()}

    def encode(self, text):
        tokens = self.tokenize(text)
        return [self.vocabulary.get(token, self.vocabulary['<UNK>']) for token in tokens]
    
    def decode(self, indices):
        return [self.idx2token[i] for i in indices]

In [58]:
text = "Hello, world! This is a test."
tokenizer = NgramTokenization(4)
tokens = tokenizer.tokenize(text)
print(tokens)
tokenizer.build_vocabulary([text])
print(tokenizer.vocabulary)
encoded = tokenizer.encode("Hello I am there")
print(encoded)
print(tokenizer.decode(encoded))

['Hell', 'ello', 'llo,', 'lo, ', 'o, w', ', wo', ' wor', 'worl', 'orld', 'rld!', 'ld! ', 'd! T', '! Th', ' Thi', 'This', 'his ', 'is i', 's is', ' is ', 'is a', 's a ', ' a t', 'a te', ' tes', 'test', 'est.']
{' tes': 0, 'worl': 1, 'a te': 2, 'test': 3, 'rld!': 4, 'o, w': 5, 'This': 6, 'is i': 7, 'ld! ': 8, 'est.': 9, ' Thi': 10, ' a t': 11, 'llo,': 12, 'lo, ': 13, 'd! T': 14, ', wo': 15, ' wor': 16, 'Hell': 17, 'is a': 18, 'ello': 19, 'his ': 20, '! Th': 21, 'orld': 22, 's a ': 23, ' is ': 24, 's is': 25, '<UNK>': 26}
[17, 19, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26]
['Hell', 'ello', '<UNK>', '<UNK>', '<UNK>', '<UNK>', '<UNK>', '<UNK>', '<UNK>', '<UNK>', '<UNK>', '<UNK>', '<UNK>']


In [81]:
import nltk
nltk.download('punkt') 
from nltk.tokenize import word_tokenize, TreebankWordDetokenizer, RegexpTokenizer, WordPunctTokenizer

class WordTokenizer:
    def __init__(self, tokenizer = 'punkt'):
        self.supported_tokenizers = ["punkt", "treebank", "wordpunct", "regexp", "whitespace"]
        assert tokenizer in self.supported_tokenizers, f"Unsupported tokenizer: {tokenizer}"
        self.tokenizer = tokenizer

    # Tokenization using nltk's word_tokenize
    def tokenize(self, text):
        if self.tokenizer == 'punkt':
            tokens = word_tokenize(text.lower())
        elif self.tokenizer == 'regexp':
            tokenizer = RegexpTokenizer(r'\w+')
            tokens = tokenizer.tokenize(text.lower())
        elif self.tokenizer == 'treebank':
            tokenizer = TreebankWordDetokenizer()
            tokens = tokenizer.tokenize(text.lower())
        elif self.tokenizer == 'wordpunct':
            tokenizer = WordPunctTokenizer()
            tokens = tokenizer.tokenize(text.lower())
        return tokens

    def build_vocabulary(self, texts):
        unique_tokens = set()
        for text in texts:
            tokens = self.tokenize(text)
            unique_tokens.update(tokens)
        self.vocabulary = {token: i for i, token in enumerate(unique_tokens)}
        self.vocabulary['<UNK>'] = len(self.vocabulary)
        self.idx2token = {i: token for token, i in self.vocabulary.items()}

    def encode(self, text):
        tokens = self.tokenize(text)
        return [self.vocabulary.get(token, self.vocabulary['<UNK>']) for token in tokens]
    
    def decode(self, indices):
        return [self.idx2token[i] for i in indices]


text = "Hello, world! This is a test."
tokenizer = WordTokenizer("punkt")
tokens = tokenizer.tokenize(text)
print(tokens)
tokenizer.build_vocabulary([text])
print(tokenizer.vocabulary)
encoded = tokenizer.encode("hello I am there")
print(encoded)
print(tokenizer.decode(encoded))

['hello', ',', 'world', '!', 'this', 'is', 'a', 'test', '.']
{',': 0, 'a': 1, 'hello': 2, 'this': 3, '!': 4, 'is': 5, 'test': 6, 'world': 7, '.': 8, '<UNK>': 9}
[2, 9, 9, 9]
['hello', '<UNK>', '<UNK>', '<UNK>']


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\chandk10\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [73]:
from nltk.tokenize import TreebankWordTokenizer

tokenizer = TreebankWordTokenizer()
tokens = tokenizer.tokenize("Hello, how're you doing today?")
print(tokens)

['Hello', ',', 'how', "'re", 'you', 'doing', 'today', '?']
