In [1]:
import re


class Tokenizer:
  
    NULL = u"\u0000"

    @staticmethod
    def tokenize(string):
        return re.findall("\w+", string.lower())

    @staticmethod
    def unique_tokenizer(string):
        return set(Tokenizer.tokenize(string))

    @staticmethod
    def ngram(string, ngram):
        tokens = Tokenizer.tokenize(string)

        ngrams = []

        for i in range(len(tokens)):
            shift = i - ngram + 1
            padding = max(-shift, 0)
            first_idx = max(shift, 0)
            last_idx = first_idx + ngram - padding
            print(i,shift,padding,first_idx,last_idx)

            ngrams.append(Tokenizer.pad(tokens[first_idx:last_idx], padding))

        return ngrams

    @staticmethod
    def pad(tokens, padding):
        padded_tokens = []

        for i in range(padding):
            padded_tokens.append(Tokenizer.NULL)

        return padded_tokens + tokens

In [2]:
new_string = "The quick brown fox jumps over the lazy dog"

In [3]:
Tokenizer.tokenize(new_string)

['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']

In [4]:
result = Tokenizer.ngram(new_string, 3)

0 -2 2 0 1
1 -1 1 0 2
2 0 0 0 3
3 1 0 1 4
4 2 0 2 5
5 3 0 3 6
6 4 0 4 7
7 5 0 5 8
8 6 0 6 9


In [26]:
result

[['\x00', '\x00', 'the'],
 ['\x00', 'the', 'quick'],
 ['the', 'quick', 'brown'],
 ['quick', 'brown', 'fox'],
 ['brown', 'fox', 'jumps'],
 ['fox', 'jumps', 'over'],
 ['jumps', 'over', 'the'],
 ['over', 'the', 'lazy'],
 ['the', 'lazy', 'dog']]

In [5]:
import unittest

In [10]:
class TestTokenizer(unittest.TestCase):
    def setUp(self):
        self.string = "this is a test of the emergency broadcasting system"

    def test_downcasing(self):
        expectation = ["this", "is", "all", "caps"]

        actual = Tokenizer.tokenize("THIS IS ALL CAPS")
        self.assertEqual(actual, expectation,"Wrong")

    def test_ngrams(self):
        expectation = [
          [u'\u0000', "quick"],
          ["quick", "brown"],
          ["brown", "fox"],
        ]

        actual = Tokenizer.ngram("quick brown fox", 2)
        self.assertEqual(actual, expectation)

In [11]:
tt = TestTokenizer()

In [12]:
tt.test_downcasing()