# Tokenization Implementation in Python



We can split the text with white space using .split in python 

In [1]:
# Input
data = """Natural language processing (NLP) is a subfield of linguistics, computer science,\n
        and artificial intelligence concerned with the interactions between computers and human language,\n
        in particular how to program computers to process and analyze large amounts of natural language data."""

     
print(data.split())

['Natural', 'language', 'processing', '(NLP)', 'is', 'a', 'subfield', 'of', 'linguistics,', 'computer', 'science,', 'and', 'artificial', 'intelligence', 'concerned', 'with', 'the', 'interactions', 'between', 'computers', 'and', 'human', 'language,', 'in', 'particular', 'how', 'to', 'program', 'computers', 'to', 'process', 'and', 'analyze', 'large', 'amounts', 'of', 'natural', 'language', 'data.']


Word Space Tokenization can be implemented with nltk library. With the help of nltk.tokenize.WhitespaceTokenizer() method, we are able to extract the tokens from string of words or sentences without whitespaces, new line and tabs.

In [2]:
# import WhitespaceTokenizer from nltk
from nltk.tokenize import WhitespaceTokenizer
     
# To use Class WhitespaceTokenizer create a variable
tokenizer = WhitespaceTokenizer()
     
# Input
data = """Natural language processing (NLP) is a subfield of linguistics, computer science,\n
        and artificial intelligence concerned with the interactions between computers and human language,\n
        in particular how to program computers to process and analyze large amounts of natural language data."""


# Apply tokenization on data
tokens = tokenizer.tokenize(data)
     
print(tokens)

['Natural', 'language', 'processing', '(NLP)', 'is', 'a', 'subfield', 'of', 'linguistics,', 'computer', 'science,', 'and', 'artificial', 'intelligence', 'concerned', 'with', 'the', 'interactions', 'between', 'computers', 'and', 'human', 'language,', 'in', 'particular', 'how', 'to', 'program', 'computers', 'to', 'process', 'and', 'analyze', 'large', 'amounts', 'of', 'natural', 'language', 'data.']


Now if we add space in (NLP) like ( NLP ) the space will be considered and it will do tokenization with considering

In [3]:
# Input
data = """Natural language processing ( NLP ) is a subfield of linguistics, computer science,\n
        and artificial intelligence concerned with the interactions between computers and human language,\n
        in particular how to program computers to process and analyze large amounts of natural language data."""


# Apply tokenization on data
tokens = tokenizer.tokenize(data)
     
print(tokens)

['Natural', 'language', 'processing', '(', 'NLP', ')', 'is', 'a', 'subfield', 'of', 'linguistics,', 'computer', 'science,', 'and', 'artificial', 'intelligence', 'concerned', 'with', 'the', 'interactions', 'between', 'computers', 'and', 'human', 'language,', 'in', 'particular', 'how', 'to', 'program', 'computers', 'to', 'process', 'and', 'analyze', 'large', 'amounts', 'of', 'natural', 'language', 'data.']


We can also perform tokenization using NLTK word_tokenize 

In [4]:
# import word_tokenize from nltk
from nltk.tokenize import word_tokenize
     
# Input
data = """Natural language processing (NLP) is a subfield of linguistics, computer science,\n
        and artificial intelligence concerned with the interactions between computers and human language,\n
        in particular how to program computers to process and analyze large amounts of natural language data."""


# Apply tokenization on data
tokens = word_tokenize(data)
     
print(tokens)

['Natural', 'language', 'processing', '(', 'NLP', ')', 'is', 'a', 'subfield', 'of', 'linguistics', ',', 'computer', 'science', ',', 'and', 'artificial', 'intelligence', 'concerned', 'with', 'the', 'interactions', 'between', 'computers', 'and', 'human', 'language', ',', 'in', 'particular', 'how', 'to', 'program', 'computers', 'to', 'process', 'and', 'analyze', 'large', 'amounts', 'of', 'natural', 'language', 'data', '.']


### We can split the sentences also with the help of regex. Also, we can use regex to split tokens based on regular expressions

In [5]:
import re
data = """Natural language processing ( NLP ) is a subfield of linguistics, computer science,\n
        and artificial intelligence concerned with the interactions between computers and human language.\n
        In particular how to program computers to process and analyze large amounts of natural language data."""

sentences = re.compile('[,!?] ').split(data)
print(sentences)

['Natural language processing ( NLP ) is a subfield of linguistics', 'computer science,\n\n        and artificial intelligence concerned with the interactions between computers and human language.\n\n        In particular how to program computers to process and analyze large amounts of natural language data.']


In [6]:
from nltk.tokenize import sent_tokenize

data = """Natural language processing ( NLP ) is a subfield of linguistics, computer science,\n
        and artificial intelligence concerned with the interactions between computers and human language.\n
        In particular how to program computers to process and analyze large amounts of natural language data."""

# To use Class WhitespaceTokenizer create a variable
#sen_tokenizer = sent_tokenize()

tokens = sent_tokenize(data)
print(tokens)

['Natural language processing ( NLP ) is a subfield of linguistics, computer science,\n\n        and artificial intelligence concerned with the interactions between computers and human language.', 'In particular how to program computers to process and analyze large amounts of natural language data.']


# Tokenization using the spaCy library

In [7]:
import spacy

# Use en_core_web_sm for english language
nlp = spacy.load("en_core_web_sm")

text = """Natural language processing ( NLP ) is a subfield of linguistics, computer science,\n
        and artificial intelligence concerned with the interactions between computers and human language,\n
        in particular how to program computers to process and analyze large amounts of natural language data."""

# English Object is used to create documents
doc = nlp(text)

# Create tokens from document
tokens = []
for token in doc:
    tokens.append(token.text)
print(tokens)

['Natural', 'language', 'processing', '(', 'NLP', ')', 'is', 'a', 'subfield', 'of', 'linguistics', ',', 'computer', 'science', ',', '\n\n        ', 'and', 'artificial', 'intelligence', 'concerned', 'with', 'the', 'interactions', 'between', 'computers', 'and', 'human', 'language', ',', '\n\n        ', 'in', 'particular', 'how', 'to', 'program', 'computers', 'to', 'process', 'and', 'analyze', 'large', 'amounts', 'of', 'natural', 'language', 'data', '.']


In [8]:
# Use en_core_web_sm for english language
nlp = spacy.load("en_core_web_sm")

text = """Natural language processing ( NLP ) is a subfield of linguistics, computer science,\n
        and artificial intelligence concerned with the interactions between computers and human language.\n
        In particular how to program computers to process and analyze large amounts of natural language data."""

# English Object is used to create documents
doc = nlp(text)

# Create tokens from document
sents = []
for sent in doc.sents:
    sents.append(sent.text)
print(sents)

['Natural language processing ( NLP ) is a subfield of linguistics, computer science,\n\n        and artificial intelligence concerned with the interactions between computers and human language.', '\n\n        ', 'In particular how to program computers to process and analyze large amounts of natural language data.']


# Tokenization using Gensim

In [9]:
from gensim.utils import tokenize

text = """Natural language processing ( NLP ) is a subfield of linguistics, computer science,\n
        and artificial intelligence concerned with the interactions between computers and human language.\n
        In particular how to program computers to process and analyze large amounts of natural language data."""

print(list(tokenize(text)))

['Natural', 'language', 'processing', 'NLP', 'is', 'a', 'subfield', 'of', 'linguistics', 'computer', 'science', 'and', 'artificial', 'intelligence', 'concerned', 'with', 'the', 'interactions', 'between', 'computers', 'and', 'human', 'language', 'In', 'particular', 'how', 'to', 'program', 'computers', 'to', 'process', 'and', 'analyze', 'large', 'amounts', 'of', 'natural', 'language', 'data']


# Tokenization using Keras

In [10]:
from keras.preprocessing.text import text_to_word_sequence

text = """Natural language processing ( NLP ) is a subfield of linguistics, computer science,\n
        and artificial intelligence concerned with the interactions between computers and human language.\n
        In particular how to program computers to process and analyze large amounts of natural language data."""

tokens = text_to_word_sequence(text)
print(tokens)

['natural', 'language', 'processing', 'nlp', 'is', 'a', 'subfield', 'of', 'linguistics', 'computer', 'science', 'and', 'artificial', 'intelligence', 'concerned', 'with', 'the', 'interactions', 'between', 'computers', 'and', 'human', 'language', 'in', 'particular', 'how', 'to', 'program', 'computers', 'to', 'process', 'and', 'analyze', 'large', 'amounts', 'of', 'natural', 'language', 'data']


Here, we can see that the punctuations are removed by default with keras while doing tokenization.

# Byte Pair Encoding

In [11]:
text = """Natural language processing ( NLP ) is a subfield of linguistics, computer science,\n
        and artificial intelligence concerned with the interactions between computers and human language.\n
        In particular how to program computers to process and analyze large amounts of natural language data."""


#We will calculate the vacabulary size
words = text.strip().split(" ")
print(len(words))

55


In [12]:
#Get words from tokens and calculate the frequency and add <w> at the end of each word.

import collections
import pandas as pd 
import re
word_freq = collections.defaultdict(int)
for word in words:
    word_freq[" ".join(word)+' <w>'] +=1

word_freq

defaultdict(int,
            {'N a t u r a l <w>': 1,
             'l a n g u a g e <w>': 2,
             'p r o c e s s i n g <w>': 1,
             '( <w>': 1,
             'N L P <w>': 1,
             ') <w>': 1,
             'i s <w>': 1,
             'a <w>': 1,
             's u b f i e l d <w>': 1,
             'o f <w>': 2,
             'l i n g u i s t i c s , <w>': 1,
             'c o m p u t e r <w>': 1,
             's c i e n c e , \n \n <w>': 1,
             ' <w>': 14,
             'a n d <w>': 3,
             'a r t i f i c i a l <w>': 1,
             'i n t e l l i g e n c e <w>': 1,
             'c o n c e r n e d <w>': 1,
             'w i t h <w>': 1,
             't h e <w>': 1,
             'i n t e r a c t i o n s <w>': 1,
             'b e t w e e n <w>': 1,
             'c o m p u t e r s <w>': 2,
             'h u m a n <w>': 1,
             'l a n g u a g e . \n \n <w>': 1,
             'I n <w>': 1,
             'p a r t i c u l a r <w>': 1,
             'h 

In [13]:
#Get char from words and calculate the frequency.
char_freq = collections.defaultdict(int)
for word,freq in word_freq.items():
    chars = word.split()
    for char in chars:
        char_freq["".join(char)]+=freq
    
char_freq

defaultdict(int,
            {'N': 2,
             'a': 27,
             't': 18,
             'u': 13,
             'r': 15,
             'l': 13,
             '<w>': 55,
             'n': 21,
             'g': 11,
             'e': 23,
             'p': 7,
             'o': 14,
             'c': 14,
             's': 13,
             'i': 16,
             '(': 1,
             'L': 1,
             'P': 1,
             ')': 1,
             'b': 2,
             'f': 4,
             'd': 6,
             ',': 2,
             'm': 6,
             'w': 3,
             'h': 4,
             '.': 2,
             'I': 1,
             'y': 1,
             'z': 1})

In [14]:
df = pd.DataFrame(char_freq, index=[0]).T
df = df.rename(columns={0:'freq'})
df

Unnamed: 0,freq
N,2
a,27
t,18
u,13
r,15
l,13
<w>,55
n,21
g,11
e,23


In [15]:
#We have created out Vocabulary. Now, we will merge most frequent pair.
char_pair = collections.defaultdict(int)

for word,freq in word_freq.items():
    chars = word.split()
    for i in range(len(chars)-1):
        char_pair[chars[i],chars[i+1]] += freq
        
char_pair

defaultdict(int,
            {('N', 'a'): 1,
             ('a', 't'): 3,
             ('t', 'u'): 2,
             ('u', 'r'): 2,
             ('r', 'a'): 4,
             ('a', 'l'): 4,
             ('l', '<w>'): 3,
             ('l', 'a'): 5,
             ('a', 'n'): 8,
             ('n', 'g'): 5,
             ('g', 'u'): 4,
             ('u', 'a'): 3,
             ('a', 'g'): 3,
             ('g', 'e'): 5,
             ('e', '<w>'): 6,
             ('p', 'r'): 3,
             ('r', 'o'): 3,
             ('o', 'c'): 2,
             ('c', 'e'): 5,
             ('e', 's'): 2,
             ('s', 's'): 2,
             ('s', 'i'): 1,
             ('i', 'n'): 4,
             ('g', '<w>'): 1,
             ('(', '<w>'): 1,
             ('N', 'L'): 1,
             ('L', 'P'): 1,
             ('P', '<w>'): 1,
             (')', '<w>'): 1,
             ('i', 's'): 2,
             ('s', '<w>'): 6,
             ('a', '<w>'): 1,
             ('s', 'u'): 1,
             ('u', 'b'): 1,
             ('

In [16]:
max(char_pair)

('z', 'e')

In [17]:
#We will merge these tokens and do the same process again. We will continue this process until no pair left.

def get_pairs(word_freq_dict):
    pairs = collections.defaultdict(int)
    for word, freq in word_freq_dict.items():
        chars = word.split()
        for i in range(len(chars)-1):
            pairs[chars[i], chars[i+1]] += freq
    return pairs

def merge_byte_pairs(best_pair, word_freq_dict):
    print(best_pair)
    merged_dict = {}
    bigram = re.escape(' '.join(best_pair))
    p = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')
    for word in word_freq_dict:
        # print(word)
        w_out = p.sub(''.join(best_pair), word)
        merged_dict[w_out] = word_freq_dict[word]
    return merged_dict

def get_subword_tokens(word_freq_dict):
    char_freq_dict = collections.defaultdict(int)
    for word, freq in word_freq_dict.items():
        chars = word.split()
        for char in chars:
            char_freq_dict[char] += freq
    return char_freq_dict

for i in range(1000):
    pairs = get_pairs(word_freq)
    best_pair = max(pairs, key=pairs.get)
    print(best_pair)
    print(f"Iteration {i}: ")
    word_freq = merge_byte_pairs(best_pair, word_freq)
    # print(word_freq_dict)
    subword_tokens = get_subword_tokens(word_freq)
    print(subword_tokens)
    print(len(subword_tokens))
    print("--------")
    
#After 75 epochs we are not getting new pair here.

('a', 'n')
Iteration 0: 
('a', 'n')
defaultdict(<class 'int'>, {'N': 2, 'a': 19, 't': 18, 'u': 13, 'r': 15, 'l': 13, '<w>': 55, 'an': 8, 'g': 11, 'e': 23, 'p': 7, 'o': 14, 'c': 14, 's': 13, 'i': 16, 'n': 13, '(': 1, 'L': 1, 'P': 1, ')': 1, 'b': 2, 'f': 4, 'd': 6, ',': 2, 'm': 6, 'w': 3, 'h': 4, '.': 2, 'I': 1, 'y': 1, 'z': 1})
31
--------
('e', '<w>')
Iteration 1: 
('e', '<w>')
defaultdict(<class 'int'>, {'N': 2, 'a': 19, 't': 18, 'u': 13, 'r': 15, 'l': 13, '<w>': 49, 'an': 8, 'g': 11, 'e<w>': 6, 'p': 7, 'o': 14, 'c': 14, 'e': 17, 's': 13, 'i': 16, 'n': 13, '(': 1, 'L': 1, 'P': 1, ')': 1, 'b': 2, 'f': 4, 'd': 6, ',': 2, 'm': 6, 'w': 3, 'h': 4, '.': 2, 'I': 1, 'y': 1, 'z': 1})
32
--------
('s', '<w>')
Iteration 2: 
('s', '<w>')
defaultdict(<class 'int'>, {'N': 2, 'a': 19, 't': 18, 'u': 13, 'r': 15, 'l': 13, '<w>': 43, 'an': 8, 'g': 11, 'e<w>': 6, 'p': 7, 'o': 14, 'c': 14, 'e': 17, 's': 7, 'i': 16, 'n': 13, '(': 1, 'L': 1, 'P': 1, ')': 1, 's<w>': 6, 'b': 2, 'f': 4, 'd': 6, ',': 2, 'm': 6

--------
('science,', '<w>')
Iteration 182: 
('science,', '<w>')
defaultdict(<class 'int'>, {'Natural<w>': 1, 'language<w>': 2, 'processing<w>': 1, '(<w>': 1, 'NLP<w>': 1, ')<w>': 1, 'is<w>': 1, 'a<w>': 1, 'subfield<w>': 1, 'of<w>': 2, 'linguistics,<w>': 1, 'computer<w>': 1, 'science,': 1, '<w>': 23, 'and<w>': 3, 'ar': 2, 'ti': 2, 'fi': 1, 'ci': 1, 'a': 4, 'l<w>': 1, 'inte': 2, 'l': 3, 'i': 2, 'g': 3, 'en': 2, 'c': 2, 'e<w>': 4, 'co': 1, 'n': 6, 'ce': 1, 'r': 1, 'e': 4, 'd<w>': 1, 'w': 3, 'th': 2, 'ra': 2, 'o': 3, 's<w>': 3, 'b': 1, 't': 2, 'computers<w>': 2, 'h': 2, 'u': 3, 'm': 3, 'an': 2, 'languag': 1, '.': 1, 'I': 1, 'p': 1, 'tic': 1, 'lar': 2, 'to<w>': 2, 'pro': 1, 'proces': 1, 'y': 1, 'z': 1, 'atural<w>': 1, 'd': 1, 'at': 1, '.<w>': 1})
61
--------
('science,', '<w>')
Iteration 183: 
('science,', '<w>')
defaultdict(<class 'int'>, {'Natural<w>': 1, 'language<w>': 2, 'processing<w>': 1, '(<w>': 1, 'NLP<w>': 1, ')<w>': 1, 'is<w>': 1, 'a<w>': 1, 'subfield<w>': 1, 'of<w>': 2, 'linguis

--------
('science,', '<w>')
Iteration 226: 
('science,', '<w>')
defaultdict(<class 'int'>, {'Natural<w>': 1, 'language<w>': 2, 'processing<w>': 1, '(<w>': 1, 'NLP<w>': 1, ')<w>': 1, 'is<w>': 1, 'a<w>': 1, 'subfield<w>': 1, 'of<w>': 2, 'linguistics,<w>': 1, 'computer<w>': 1, 'science,': 1, '<w>': 23, 'and<w>': 3, 'ar': 2, 'ti': 2, 'fi': 1, 'ci': 1, 'a': 4, 'l<w>': 1, 'inte': 2, 'l': 3, 'i': 2, 'g': 3, 'en': 2, 'c': 2, 'e<w>': 4, 'co': 1, 'n': 6, 'ce': 1, 'r': 1, 'e': 4, 'd<w>': 1, 'w': 3, 'th': 2, 'ra': 2, 'o': 3, 's<w>': 3, 'b': 1, 't': 2, 'computers<w>': 2, 'h': 2, 'u': 3, 'm': 3, 'an': 2, 'languag': 1, '.': 1, 'I': 1, 'p': 1, 'tic': 1, 'lar': 2, 'to<w>': 2, 'pro': 1, 'proces': 1, 'y': 1, 'z': 1, 'atural<w>': 1, 'd': 1, 'at': 1, '.<w>': 1})
61
--------
('science,', '<w>')
Iteration 227: 
('science,', '<w>')
defaultdict(<class 'int'>, {'Natural<w>': 1, 'language<w>': 2, 'processing<w>': 1, '(<w>': 1, 'NLP<w>': 1, ')<w>': 1, 'is<w>': 1, 'a<w>': 1, 'subfield<w>': 1, 'of<w>': 2, 'linguis

Iteration 309: 
('science,', '<w>')
defaultdict(<class 'int'>, {'Natural<w>': 1, 'language<w>': 2, 'processing<w>': 1, '(<w>': 1, 'NLP<w>': 1, ')<w>': 1, 'is<w>': 1, 'a<w>': 1, 'subfield<w>': 1, 'of<w>': 2, 'linguistics,<w>': 1, 'computer<w>': 1, 'science,': 1, '<w>': 23, 'and<w>': 3, 'ar': 2, 'ti': 2, 'fi': 1, 'ci': 1, 'a': 4, 'l<w>': 1, 'inte': 2, 'l': 3, 'i': 2, 'g': 3, 'en': 2, 'c': 2, 'e<w>': 4, 'co': 1, 'n': 6, 'ce': 1, 'r': 1, 'e': 4, 'd<w>': 1, 'w': 3, 'th': 2, 'ra': 2, 'o': 3, 's<w>': 3, 'b': 1, 't': 2, 'computers<w>': 2, 'h': 2, 'u': 3, 'm': 3, 'an': 2, 'languag': 1, '.': 1, 'I': 1, 'p': 1, 'tic': 1, 'lar': 2, 'to<w>': 2, 'pro': 1, 'proces': 1, 'y': 1, 'z': 1, 'atural<w>': 1, 'd': 1, 'at': 1, '.<w>': 1})
61
--------
('science,', '<w>')
Iteration 310: 
('science,', '<w>')
defaultdict(<class 'int'>, {'Natural<w>': 1, 'language<w>': 2, 'processing<w>': 1, '(<w>': 1, 'NLP<w>': 1, ')<w>': 1, 'is<w>': 1, 'a<w>': 1, 'subfield<w>': 1, 'of<w>': 2, 'linguistics,<w>': 1, 'computer<w>': 

('science,', '<w>')
defaultdict(<class 'int'>, {'Natural<w>': 1, 'language<w>': 2, 'processing<w>': 1, '(<w>': 1, 'NLP<w>': 1, ')<w>': 1, 'is<w>': 1, 'a<w>': 1, 'subfield<w>': 1, 'of<w>': 2, 'linguistics,<w>': 1, 'computer<w>': 1, 'science,': 1, '<w>': 23, 'and<w>': 3, 'ar': 2, 'ti': 2, 'fi': 1, 'ci': 1, 'a': 4, 'l<w>': 1, 'inte': 2, 'l': 3, 'i': 2, 'g': 3, 'en': 2, 'c': 2, 'e<w>': 4, 'co': 1, 'n': 6, 'ce': 1, 'r': 1, 'e': 4, 'd<w>': 1, 'w': 3, 'th': 2, 'ra': 2, 'o': 3, 's<w>': 3, 'b': 1, 't': 2, 'computers<w>': 2, 'h': 2, 'u': 3, 'm': 3, 'an': 2, 'languag': 1, '.': 1, 'I': 1, 'p': 1, 'tic': 1, 'lar': 2, 'to<w>': 2, 'pro': 1, 'proces': 1, 'y': 1, 'z': 1, 'atural<w>': 1, 'd': 1, 'at': 1, '.<w>': 1})
61
--------
('science,', '<w>')
Iteration 393: 
('science,', '<w>')
defaultdict(<class 'int'>, {'Natural<w>': 1, 'language<w>': 2, 'processing<w>': 1, '(<w>': 1, 'NLP<w>': 1, ')<w>': 1, 'is<w>': 1, 'a<w>': 1, 'subfield<w>': 1, 'of<w>': 2, 'linguistics,<w>': 1, 'computer<w>': 1, 'science,': 1

61
--------
('science,', '<w>')
Iteration 476: 
('science,', '<w>')
defaultdict(<class 'int'>, {'Natural<w>': 1, 'language<w>': 2, 'processing<w>': 1, '(<w>': 1, 'NLP<w>': 1, ')<w>': 1, 'is<w>': 1, 'a<w>': 1, 'subfield<w>': 1, 'of<w>': 2, 'linguistics,<w>': 1, 'computer<w>': 1, 'science,': 1, '<w>': 23, 'and<w>': 3, 'ar': 2, 'ti': 2, 'fi': 1, 'ci': 1, 'a': 4, 'l<w>': 1, 'inte': 2, 'l': 3, 'i': 2, 'g': 3, 'en': 2, 'c': 2, 'e<w>': 4, 'co': 1, 'n': 6, 'ce': 1, 'r': 1, 'e': 4, 'd<w>': 1, 'w': 3, 'th': 2, 'ra': 2, 'o': 3, 's<w>': 3, 'b': 1, 't': 2, 'computers<w>': 2, 'h': 2, 'u': 3, 'm': 3, 'an': 2, 'languag': 1, '.': 1, 'I': 1, 'p': 1, 'tic': 1, 'lar': 2, 'to<w>': 2, 'pro': 1, 'proces': 1, 'y': 1, 'z': 1, 'atural<w>': 1, 'd': 1, 'at': 1, '.<w>': 1})
61
--------
('science,', '<w>')
Iteration 477: 
('science,', '<w>')
defaultdict(<class 'int'>, {'Natural<w>': 1, 'language<w>': 2, 'processing<w>': 1, '(<w>': 1, 'NLP<w>': 1, ')<w>': 1, 'is<w>': 1, 'a<w>': 1, 'subfield<w>': 1, 'of<w>': 2, 'ling

--------
('science,', '<w>')
Iteration 559: 
('science,', '<w>')
defaultdict(<class 'int'>, {'Natural<w>': 1, 'language<w>': 2, 'processing<w>': 1, '(<w>': 1, 'NLP<w>': 1, ')<w>': 1, 'is<w>': 1, 'a<w>': 1, 'subfield<w>': 1, 'of<w>': 2, 'linguistics,<w>': 1, 'computer<w>': 1, 'science,': 1, '<w>': 23, 'and<w>': 3, 'ar': 2, 'ti': 2, 'fi': 1, 'ci': 1, 'a': 4, 'l<w>': 1, 'inte': 2, 'l': 3, 'i': 2, 'g': 3, 'en': 2, 'c': 2, 'e<w>': 4, 'co': 1, 'n': 6, 'ce': 1, 'r': 1, 'e': 4, 'd<w>': 1, 'w': 3, 'th': 2, 'ra': 2, 'o': 3, 's<w>': 3, 'b': 1, 't': 2, 'computers<w>': 2, 'h': 2, 'u': 3, 'm': 3, 'an': 2, 'languag': 1, '.': 1, 'I': 1, 'p': 1, 'tic': 1, 'lar': 2, 'to<w>': 2, 'pro': 1, 'proces': 1, 'y': 1, 'z': 1, 'atural<w>': 1, 'd': 1, 'at': 1, '.<w>': 1})
61
--------
('science,', '<w>')
Iteration 560: 
('science,', '<w>')
defaultdict(<class 'int'>, {'Natural<w>': 1, 'language<w>': 2, 'processing<w>': 1, '(<w>': 1, 'NLP<w>': 1, ')<w>': 1, 'is<w>': 1, 'a<w>': 1, 'subfield<w>': 1, 'of<w>': 2, 'linguis

Iteration 642: 
('science,', '<w>')
defaultdict(<class 'int'>, {'Natural<w>': 1, 'language<w>': 2, 'processing<w>': 1, '(<w>': 1, 'NLP<w>': 1, ')<w>': 1, 'is<w>': 1, 'a<w>': 1, 'subfield<w>': 1, 'of<w>': 2, 'linguistics,<w>': 1, 'computer<w>': 1, 'science,': 1, '<w>': 23, 'and<w>': 3, 'ar': 2, 'ti': 2, 'fi': 1, 'ci': 1, 'a': 4, 'l<w>': 1, 'inte': 2, 'l': 3, 'i': 2, 'g': 3, 'en': 2, 'c': 2, 'e<w>': 4, 'co': 1, 'n': 6, 'ce': 1, 'r': 1, 'e': 4, 'd<w>': 1, 'w': 3, 'th': 2, 'ra': 2, 'o': 3, 's<w>': 3, 'b': 1, 't': 2, 'computers<w>': 2, 'h': 2, 'u': 3, 'm': 3, 'an': 2, 'languag': 1, '.': 1, 'I': 1, 'p': 1, 'tic': 1, 'lar': 2, 'to<w>': 2, 'pro': 1, 'proces': 1, 'y': 1, 'z': 1, 'atural<w>': 1, 'd': 1, 'at': 1, '.<w>': 1})
61
--------
('science,', '<w>')
Iteration 643: 
('science,', '<w>')
defaultdict(<class 'int'>, {'Natural<w>': 1, 'language<w>': 2, 'processing<w>': 1, '(<w>': 1, 'NLP<w>': 1, ')<w>': 1, 'is<w>': 1, 'a<w>': 1, 'subfield<w>': 1, 'of<w>': 2, 'linguistics,<w>': 1, 'computer<w>': 

('science,', '<w>')
defaultdict(<class 'int'>, {'Natural<w>': 1, 'language<w>': 2, 'processing<w>': 1, '(<w>': 1, 'NLP<w>': 1, ')<w>': 1, 'is<w>': 1, 'a<w>': 1, 'subfield<w>': 1, 'of<w>': 2, 'linguistics,<w>': 1, 'computer<w>': 1, 'science,': 1, '<w>': 23, 'and<w>': 3, 'ar': 2, 'ti': 2, 'fi': 1, 'ci': 1, 'a': 4, 'l<w>': 1, 'inte': 2, 'l': 3, 'i': 2, 'g': 3, 'en': 2, 'c': 2, 'e<w>': 4, 'co': 1, 'n': 6, 'ce': 1, 'r': 1, 'e': 4, 'd<w>': 1, 'w': 3, 'th': 2, 'ra': 2, 'o': 3, 's<w>': 3, 'b': 1, 't': 2, 'computers<w>': 2, 'h': 2, 'u': 3, 'm': 3, 'an': 2, 'languag': 1, '.': 1, 'I': 1, 'p': 1, 'tic': 1, 'lar': 2, 'to<w>': 2, 'pro': 1, 'proces': 1, 'y': 1, 'z': 1, 'atural<w>': 1, 'd': 1, 'at': 1, '.<w>': 1})
61
--------
('science,', '<w>')
Iteration 726: 
('science,', '<w>')
defaultdict(<class 'int'>, {'Natural<w>': 1, 'language<w>': 2, 'processing<w>': 1, '(<w>': 1, 'NLP<w>': 1, ')<w>': 1, 'is<w>': 1, 'a<w>': 1, 'subfield<w>': 1, 'of<w>': 2, 'linguistics,<w>': 1, 'computer<w>': 1, 'science,': 1

61
--------
('science,', '<w>')
Iteration 809: 
('science,', '<w>')
defaultdict(<class 'int'>, {'Natural<w>': 1, 'language<w>': 2, 'processing<w>': 1, '(<w>': 1, 'NLP<w>': 1, ')<w>': 1, 'is<w>': 1, 'a<w>': 1, 'subfield<w>': 1, 'of<w>': 2, 'linguistics,<w>': 1, 'computer<w>': 1, 'science,': 1, '<w>': 23, 'and<w>': 3, 'ar': 2, 'ti': 2, 'fi': 1, 'ci': 1, 'a': 4, 'l<w>': 1, 'inte': 2, 'l': 3, 'i': 2, 'g': 3, 'en': 2, 'c': 2, 'e<w>': 4, 'co': 1, 'n': 6, 'ce': 1, 'r': 1, 'e': 4, 'd<w>': 1, 'w': 3, 'th': 2, 'ra': 2, 'o': 3, 's<w>': 3, 'b': 1, 't': 2, 'computers<w>': 2, 'h': 2, 'u': 3, 'm': 3, 'an': 2, 'languag': 1, '.': 1, 'I': 1, 'p': 1, 'tic': 1, 'lar': 2, 'to<w>': 2, 'pro': 1, 'proces': 1, 'y': 1, 'z': 1, 'atural<w>': 1, 'd': 1, 'at': 1, '.<w>': 1})
61
--------
('science,', '<w>')
Iteration 810: 
('science,', '<w>')
defaultdict(<class 'int'>, {'Natural<w>': 1, 'language<w>': 2, 'processing<w>': 1, '(<w>': 1, 'NLP<w>': 1, ')<w>': 1, 'is<w>': 1, 'a<w>': 1, 'subfield<w>': 1, 'of<w>': 2, 'ling

--------
('science,', '<w>')
Iteration 892: 
('science,', '<w>')
defaultdict(<class 'int'>, {'Natural<w>': 1, 'language<w>': 2, 'processing<w>': 1, '(<w>': 1, 'NLP<w>': 1, ')<w>': 1, 'is<w>': 1, 'a<w>': 1, 'subfield<w>': 1, 'of<w>': 2, 'linguistics,<w>': 1, 'computer<w>': 1, 'science,': 1, '<w>': 23, 'and<w>': 3, 'ar': 2, 'ti': 2, 'fi': 1, 'ci': 1, 'a': 4, 'l<w>': 1, 'inte': 2, 'l': 3, 'i': 2, 'g': 3, 'en': 2, 'c': 2, 'e<w>': 4, 'co': 1, 'n': 6, 'ce': 1, 'r': 1, 'e': 4, 'd<w>': 1, 'w': 3, 'th': 2, 'ra': 2, 'o': 3, 's<w>': 3, 'b': 1, 't': 2, 'computers<w>': 2, 'h': 2, 'u': 3, 'm': 3, 'an': 2, 'languag': 1, '.': 1, 'I': 1, 'p': 1, 'tic': 1, 'lar': 2, 'to<w>': 2, 'pro': 1, 'proces': 1, 'y': 1, 'z': 1, 'atural<w>': 1, 'd': 1, 'at': 1, '.<w>': 1})
61
--------
('science,', '<w>')
Iteration 893: 
('science,', '<w>')
defaultdict(<class 'int'>, {'Natural<w>': 1, 'language<w>': 2, 'processing<w>': 1, '(<w>': 1, 'NLP<w>': 1, ')<w>': 1, 'is<w>': 1, 'a<w>': 1, 'subfield<w>': 1, 'of<w>': 2, 'linguis

Iteration 975: 
('science,', '<w>')
defaultdict(<class 'int'>, {'Natural<w>': 1, 'language<w>': 2, 'processing<w>': 1, '(<w>': 1, 'NLP<w>': 1, ')<w>': 1, 'is<w>': 1, 'a<w>': 1, 'subfield<w>': 1, 'of<w>': 2, 'linguistics,<w>': 1, 'computer<w>': 1, 'science,': 1, '<w>': 23, 'and<w>': 3, 'ar': 2, 'ti': 2, 'fi': 1, 'ci': 1, 'a': 4, 'l<w>': 1, 'inte': 2, 'l': 3, 'i': 2, 'g': 3, 'en': 2, 'c': 2, 'e<w>': 4, 'co': 1, 'n': 6, 'ce': 1, 'r': 1, 'e': 4, 'd<w>': 1, 'w': 3, 'th': 2, 'ra': 2, 'o': 3, 's<w>': 3, 'b': 1, 't': 2, 'computers<w>': 2, 'h': 2, 'u': 3, 'm': 3, 'an': 2, 'languag': 1, '.': 1, 'I': 1, 'p': 1, 'tic': 1, 'lar': 2, 'to<w>': 2, 'pro': 1, 'proces': 1, 'y': 1, 'z': 1, 'atural<w>': 1, 'd': 1, 'at': 1, '.<w>': 1})
61
--------
('science,', '<w>')
Iteration 976: 
('science,', '<w>')
defaultdict(<class 'int'>, {'Natural<w>': 1, 'language<w>': 2, 'processing<w>': 1, '(<w>': 1, 'NLP<w>': 1, ')<w>': 1, 'is<w>': 1, 'a<w>': 1, 'subfield<w>': 1, 'of<w>': 2, 'linguistics,<w>': 1, 'computer<w>': 