# Deriving N-Grams from Text

## 1. Tokenization

In [11]:
# Example sentence, lowercased up front so that n-grams are case-insensitive.
s = "The quick brown fox jumps over the lazy dog.".lower()

In [17]:
from nltk.tokenize import RegexpTokenizer
# Each maximal run of ASCII letters becomes one token; everything else
# (spaces, punctuation, digits) is not matched and is therefore dropped.
tokenizer = RegexpTokenizer("[a-zA-Z]+")
s_tokenized = tokenizer.tokenize(s)  # word strings, in sentence order
s_tokenized

['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']

In [18]:
from nltk.util import ngrams
# Character-level 4-grams, computed word by word. Each word is padded with '_'
# on both sides so that word starts and ends are captured ('___t', ..., 'e___').
# The result holds one list of 4-gram tuples per word (it is still nested here).
generated_4grams = [
    list(ngrams(word, 4,  # n = 4.
                pad_left=True, pad_right=True,
                left_pad_symbol='_', right_pad_symbol='_'))
    for word in s_tokenized
]
generated_4grams

[[('_', '_', '_', 't'),
  ('_', '_', 't', 'h'),
  ('_', 't', 'h', 'e'),
  ('t', 'h', 'e', '_'),
  ('h', 'e', '_', '_'),
  ('e', '_', '_', '_')],
 [('_', '_', '_', 'q'),
  ('_', '_', 'q', 'u'),
  ('_', 'q', 'u', 'i'),
  ('q', 'u', 'i', 'c'),
  ('u', 'i', 'c', 'k'),
  ('i', 'c', 'k', '_'),
  ('c', 'k', '_', '_'),
  ('k', '_', '_', '_')],
 [('_', '_', '_', 'b'),
  ('_', '_', 'b', 'r'),
  ('_', 'b', 'r', 'o'),
  ('b', 'r', 'o', 'w'),
  ('r', 'o', 'w', 'n'),
  ('o', 'w', 'n', '_'),
  ('w', 'n', '_', '_'),
  ('n', '_', '_', '_')],
 [('_', '_', '_', 'f'),
  ('_', '_', 'f', 'o'),
  ('_', 'f', 'o', 'x'),
  ('f', 'o', 'x', '_'),
  ('o', 'x', '_', '_'),
  ('x', '_', '_', '_')],
 [('_', '_', '_', 'j'),
  ('_', '_', 'j', 'u'),
  ('_', 'j', 'u', 'm'),
  ('j', 'u', 'm', 'p'),
  ('u', 'm', 'p', 's'),
  ('m', 'p', 's', '_'),
  ('p', 's', '_', '_'),
  ('s', '_', '_', '_')],
 [('_', '_', '_', 'o'),
  ('_', '_', 'o', 'v'),
  ('_', 'o', 'v', 'e'),
  ('o', 'v', 'e', 'r'),
  ('v', 'e', 'r', '_'),
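  ('e', 'r', '_', '_'),
  ('r', '_', '_', '_')],
 [('_', '_', '_', 't'),
  ('_', '_', 't', 'h'),
  ('_', 't', 'h', 'e'),
  ('t', 'h', 'e', '_'),
  ('h', 'e', '_', '_'),
  ('e', '_', '_', '_')],
 [('_', '_', '_', 'l'),
  ('_', '_', 'l', 'a'),
  ('_', 'l', 'a', 'z'),
  ('l', 'a', 'z', 'y'),
  ('a', 'z', 'y', '_'),
  ('z', 'y', '_', '_'),
  ('y', '_', '_', '_')],
 [('_', '_', '_', 'd'),
  ('_', '_', 'd', 'o'),
  ('_', 'd', 'o', 'g'),
  ('d', 'o', 'g', '_'),
  ('o', 'g', '_', '_'),
  ('g', '_', '_', '_')]]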

At this point `generated_4grams` is a list of lists (one list of 4-grams per word). Since we want a single flat list of 4-grams, we flatten it:

In [19]:
# Collapse the per-word lists into one flat list of 4-gram tuples,
# keeping the original word order and the order of n-grams within each word.
generated_4grams = [
    gram
    for grams_of_word in generated_4grams
    for gram in grams_of_word
]
generated_4grams[:10]

[('_', '_', '_', 't'),
 ('_', '_', 't', 'h'),
 ('_', 't', 'h', 'e'),
 ('t', 'h', 'e', '_'),
 ('h', 'e', '_', '_'),
 ('e', '_', '_', '_'),
 ('_', '_', '_', 'q'),
 ('_', '_', 'q', 'u'),
 ('_', 'q', 'u', 'i'),
 ('q', 'u', 'i', 'c')]

## 2. Obtaining n-grams (n = 4)

In [20]:
# Join every 4-gram tuple into a 4-character string, e.g. ('_', 't', 'h', 'e') -> '_the'.
# A new list is built on purpose: a plain `ng_list_4grams = generated_4grams`
# would only alias the same list object, so writing the joined strings into it
# would also overwrite the tuples stored in generated_4grams.
ng_list_4grams = [''.join(gram) for gram in generated_4grams]
ng_list_4grams

['___t',
 '__th',
 '_the',
 'the_',
 'he__',
 'e___',
 '___q',
 '__qu',
 '_qui',
 'quic',
 'uick',
 'ick_',
 'ck__',
 'k___',
 '___b',
 '__br',
 '_bro',
 'brow',
 'rown',
 'own_',
 'wn__',
 'n___',
 '___f',
 '__fo',
 '_fox',
 'fox_',
 'ox__',
 'x___',
 '___j',
 '__ju',
 '_jum',
 'jump',
 'umps',
 'mps_',
 'ps__',
 's___',
 '___o',
 '__ov',
 '_ove',
 'over',
 'ver_',
 'er__',
 'r___',
 '___t',
 '__th',
 '_the',
 'the_',
 'he__',
 'e___',
 '___l',
 '__la',
 '_laz',
 'lazy',
 'azy_',
 'zy__',
 'y___',
 '___d',
 '__do',
 '_dog',
 'dog_',
 'og__',
 'g___']

## 3. Sorting n-grams by frequency (n = 4)

In [21]:
from collections import Counter
from operator import itemgetter  # itemgetter(1) picks the count out of each (ngram, count) pair.

# Count the occurrences of every 4-gram. Counter is a dict subclass, so keys stay
# in first-seen order on Python 3.7+; dict() keeps freq_4grams a plain dictionary.
freq_4grams = dict(Counter(ng_list_4grams))

# We only keep the 300 most popular n-grams. This was suggested in the original paper written about n-grams.
# sorted() is stable, so n-grams with equal counts remain in first-seen order.
freq_4grams_sorted = sorted(freq_4grams.items(), key=itemgetter(1), reverse=True)[:300]
freq_4grams_sorted

[('___t', 2),
 ('__th', 2),
 ('_the', 2),
 ('the_', 2),
 ('he__', 2),
 ('e___', 2),
 ('___q', 1),
 ('__qu', 1),
 ('_qui', 1),
 ('quic', 1),
 ('uick', 1),
 ('ick_', 1),
 ('ck__', 1),
 ('k___', 1),
 ('___b', 1),
 ('__br', 1),
 ('_bro', 1),
 ('brow', 1),
 ('rown', 1),
 ('own_', 1),
 ('wn__', 1),
 ('n___', 1),
 ('___f', 1),
 ('__fo', 1),
 ('_fox', 1),
 ('fox_', 1),
 ('ox__', 1),
 ('x___', 1),
 ('___j', 1),
 ('__ju', 1),
 ('_jum', 1),
 ('jump', 1),
 ('umps', 1),
 ('mps_', 1),
 ('ps__', 1),
 ('s___', 1),
 ('___o', 1),
 ('__ov', 1),
 ('_ove', 1),
 ('over', 1),
 ('ver_', 1),
 ('er__', 1),
 ('r___', 1),
 ('___l', 1),
 ('__la', 1),
 ('_laz', 1),
 ('lazy', 1),
 ('azy_', 1),
 ('zy__', 1),
 ('y___', 1),
 ('___d', 1),
 ('__do', 1),
 ('_dog', 1),
 ('dog_', 1),
 ('og__', 1),
 ('g___', 1)]

## 4. Obtaining n-grams for multiple values of n

To get the character n-grams for every n from 1 to 4 in a single pass, we can use NLTK's `everygrams`:

In [22]:
from nltk import everygrams

# Re-join the tokens with single spaces. This gives a lowercased, punctuation-free
# version of the sentence as one string, which is what the code below operates on
# (it works on characters of a string rather than on a list of tokens).
s_clean = ' '.join(s_tokenized)
s_clean

'the quick brown fox jumps over the lazy dog'

In [23]:
def ngram_extractor(sent, min_len=1, max_len=4):
    """Return the character n-grams of ``sent`` as a list of strings.

    Every space is rewritten as ``'_ _'``: the word before the space gains a
    trailing ``'_'``, the word after it gains a leading ``'_'``, and the space
    left in between makes it possible to discard n-grams that would otherwise
    straddle two words.

    Args:
        sent: Text to process; expected to have its words separated by single
            spaces (as produced by ``' '.join(tokens)``).
        min_len: Smallest n-gram length to generate. Defaults to 1.
        max_len: Largest n-gram length to generate. Defaults to 4.

    Returns:
        The n-grams of length ``min_len`` to ``max_len`` joined into strings,
        in the order ``everygrams`` yields them. N-grams containing a space or
        a newline, and the lone ``'_'`` unigram, are left out.

    Raises:
        ValueError: If ``min_len`` is smaller than 1 or ``max_len`` is smaller
            than ``min_len``.
    """
    if min_len < 1 or max_len < min_len:
        raise ValueError(
            "expected 1 <= min_len <= max_len, got "
            "min_len=%r, max_len=%r" % (min_len, max_len)
        )

    marked = sent.replace(' ', '_ _')
    return [''.join(ng) for ng in everygrams(marked, min_len, max_len)
            # ng is a tuple of single characters, so these are per-character checks.
            if ' ' not in ng and '\n' not in ng and ng != ('_',)]

ngram_extractor(s_clean)

['t',
 'h',
 'e',
 'q',
 'u',
 'i',
 'c',
 'k',
 'b',
 'r',
 'o',
 'w',
 'n',
 'f',
 'o',
 'x',
 'j',
 'u',
 'm',
 'p',
 's',
 'o',
 'v',
 'e',
 'r',
 't',
 'h',
 'e',
 'l',
 'a',
 'z',
 'y',
 'd',
 'o',
 'g',
 'th',
 'he',
 'e_',
 '_q',
 'qu',
 'ui',
 'ic',
 'ck',
 'k_',
 '_b',
 'br',
 'ro',
 'ow',
 'wn',
 'n_',
 '_f',
 'fo',
 'ox',
 'x_',
 '_j',
 'ju',
 'um',
 'mp',
 'ps',
 's_',
 '_o',
 'ov',
 've',
 'er',
 'r_',
 '_t',
 'th',
 'he',
 'e_',
 '_l',
 'la',
 'az',
 'zy',
 'y_',
 '_d',
 'do',
 'og',
 'the',
 'he_',
 '_qu',
 'qui',
 'uic',
 'ick',
 'ck_',
 '_br',
 'bro',
 'row',
 'own',
 'wn_',
 '_fo',
 'fox',
 'ox_',
 '_ju',
 'jum',
 'ump',
 'mps',
 'ps_',
 '_ov',
 'ove',
 'ver',
 'er_',
 '_th',
 'the',
 'he_',
 '_la',
 'laz',
 'azy',
 'zy_',
 '_do',
 'dog',
 'the_',
 '_qui',
 'quic',
 'uick',
 'ick_',
 '_bro',
 'brow',
 'rown',
 'own_',
 '_fox',
 'fox_',
 '_jum',
 'jump',
 'umps',
 'mps_',
 '_ove',
 'over',
 'ver_',
 '_the',
 'the_',
 '_laz',
 'lazy',
 'azy_',
 '_dog']