# Import packages

In [0]:
! pip -q install unidecode

In [0]:
! pip -q install nltk

In [0]:
! pip -q install  num2words

[?25l[K    10% |███▎                            | 10kB 21.1MB/s eta 0:00:01[K    20% |██████▋                         | 20kB 3.4MB/s eta 0:00:01[K    31% |██████████                      | 30kB 4.9MB/s eta 0:00:01[K    41% |█████████████▎                  | 40kB 3.1MB/s eta 0:00:01[K    51% |████████████████▌               | 51kB 3.8MB/s eta 0:00:01[K    62% |███████████████████▉            | 61kB 4.5MB/s eta 0:00:01[K    72% |███████████████████████▏        | 71kB 5.1MB/s eta 0:00:01[K    82% |██████████████████████████▌     | 81kB 5.8MB/s eta 0:00:01[K    93% |█████████████████████████████▊  | 92kB 6.4MB/s eta 0:00:01[K    100% |████████████████████████████████| 102kB 4.7MB/s 
[?25h

In [0]:
import nltk
# Fetch the NLTK data packages used further down in this notebook. Re-running
# the cell is harmless: packages that are already present are reported as up to date.
nltk.download('brown')      # Brown corpus, read later through nltk.corpus.brown
nltk.download('punkt')      # presumably the model behind word_tokenize / sent_tokenize -- confirm in the NLTK docs
nltk.download('stopwords')  # stopword lists, read later through nltk.corpus.stopwords
nltk.download('wordnet')    # presumably the data needed by WordNetLemmatizer -- confirm in the NLTK docs

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [0]:
import re
import string
import unidecode

import numpy as np
import pandas as pd

from collections import Counter

### Sentence for examples

In [0]:
# Deliberately messy input reused by every demo cell below: it mixes a contraction,
# accented letters, a number, stray symbols, an all-caps word and irregular spacing.
example_sentence = "That's àn example: In 1950 , Alan Turing PÚBLISHED an %& & article t¨¨ïtled Intelligence"

# Tokenization

### Word tokenization

In [0]:
from nltk.tokenize import word_tokenize, TreebankWordTokenizer, WordPunctTokenizer

In [0]:
def tokenize_text(text, word=True, tokenizer=None):
    """Split `text` into tokens.

    Parameters
    ----------
    text : str
        The text to tokenize.
    word : bool, default True
        Only consulted when `tokenizer` is None: True selects NLTK's
        `word_tokenize`, False selects NLTK's `sent_tokenize`.
    tokenizer : object, optional
        Any object exposing a `tokenize(text)` method. When given, it takes
        precedence and `word` is ignored.

    Returns
    -------
    Whatever the selected tokenizer returns (a list of strings for the NLTK
    tokenizers used in this notebook).
    """
    if tokenizer is not None:
        return tokenizer.tokenize(text)
    if word:
        return word_tokenize(text)
    # Imported locally: the notebook only imports sent_tokenize in a later cell,
    # so relying on the global name would raise NameError if this branch were
    # reached before that cell has been executed.
    from nltk.tokenize import sent_tokenize
    return sent_tokenize(text)

In [0]:
# Original sentence
print('Original:\n {}'.format(example_sentence))

# Naive baseline: split on whitespace only, so punctuation stays attached to
# the neighbouring word (e.g. 'example:')
tokens = example_sentence.split()
print('\nWith a naive split:\n {}'.format(tokens))

# NLTK's default word tokenizer: punctuation and clitics such as 's come out
# as separate tokens
tokens = tokenize_text(example_sentence)
print('\nWith NLTK word tokenizer:\n {}'.format(tokens))

# Explicit TreebankWordTokenizer instance; on this sentence it yields the same
# tokens as the default word tokenizer above
tokens = tokenize_text(example_sentence, tokenizer=TreebankWordTokenizer())
print('\nWith TreebankWordTokenizer:\n {}'.format(tokens))

# Tokenize a text into a sequence of alphabetic and non-alphabetic character runs using
# the regex \w+|[^\w\s]+
tokens = tokenize_text(example_sentence, tokenizer=WordPunctTokenizer())
print('\nWith NLTK WordPunctTokenizer:\n {}'.format(tokens))

Original:
 That's àn example: In 1950 , Alan Turing PÚBLISHED an %& & article t¨¨ïtled Intelligence

With a naive split:
 ["That's", 'àn', 'example:', 'In', '1950', ',', 'Alan', 'Turing', 'PÚBLISHED', 'an', '%&', '&', 'article', 't¨¨ïtled', 'Intelligence']

With NLTK word tokenizer:
 ['That', "'s", 'àn', 'example', ':', 'In', '1950', ',', 'Alan', 'Turing', 'PÚBLISHED', 'an', '%', '&', '&', 'article', 't¨¨ïtled', 'Intelligence']

With TreebankWordTokenizer:
 ['That', "'s", 'àn', 'example', ':', 'In', '1950', ',', 'Alan', 'Turing', 'PÚBLISHED', 'an', '%', '&', '&', 'article', 't¨¨ïtled', 'Intelligence']

With NLTK WordPunctTokenizer:
 ['That', "'", 's', 'àn', 'example', ':', 'In', '1950', ',', 'Alan', 'Turing', 'PÚBLISHED', 'an', '%&', '&', 'article', 't', '¨¨', 'ïtled', 'Intelligence']


### Sentence tokenization

In [0]:
from nltk.tokenize import sent_tokenize, PunktSentenceTokenizer

In [0]:
# Original sentence
print('Original:\n {}'.format(example_sentence))

# Tokenize a text into a list of sentences by using NLTK's recommended sentence tokenizer (currently
# PunktSentenceTokenizer)
tokens = tokenize_text(example_sentence, word=False)
print('\nWith NLTK sentence tokenizer:\n {}'.format(tokens))

# Tokenize a text into a list of sentences by using an unsupervised algorithm to
# build a model for abbreviation words, collocations, and words that start sentences.
# The example contains no sentence-ending punctuation, so both calls return it as one sentence.
tokens = tokenize_text(example_sentence, word=False, tokenizer=PunktSentenceTokenizer())
print('\nWith NLTK PunktSentenceTokenizer:\n {}'.format(tokens))

Original:
 That's àn example: In 1950 , Alan Turing PÚBLISHED an %& & article t¨¨ïtled Intelligence

With NLTK sentence tokenizer:
 ["That's àn example: In 1950 , Alan Turing PÚBLISHED an %& & article t¨¨ïtled Intelligence"]

With NLTK PunktSentenceTokenizer:
 ["That's àn example: In 1950 , Alan Turing PÚBLISHED an %& & article t¨¨ïtled Intelligence"]


# Text Normalization

### Convert characters to lower or upper case

In [0]:
def convert_characters(tokens, style='lower'):
    """Return a new list with every token converted to a single case.

    Parameters
    ----------
    tokens : iterable of str
        Tokens to convert.
    style : str, default 'lower'
        'lower' lowercases each token; any other value uppercases it.

    Returns
    -------
    list of str
    """
    to_lower = style == 'lower'
    converted = []
    for token in tokens:
        converted.append(token.lower() if to_lower else token.upper())
    return converted

In [0]:
# Original sentence
print('Original:\n {}'.format(example_sentence))

# Default style: every whitespace-separated token is lowercased
tokens = convert_characters(example_sentence.split())
print('\nTo lowercase:\n {}'.format(tokens))

# Same tokens, uppercased instead
tokens = convert_characters(example_sentence.split(), style='upper')
print('\nTo uppercase:\n {}'.format(tokens))

Original:
 That's àn example: In 1950 , Alan Turing PÚBLISHED an %& & article t¨¨ïtled Intelligence

To lowercase:
 ["that's", 'àn', 'example:', 'in', '1950', ',', 'alan', 'turing', 'públished', 'an', '%&', '&', 'article', 't¨¨ïtled', 'intelligence']

To uppercase:
 ["THAT'S", 'ÀN', 'EXAMPLE:', 'IN', '1950', ',', 'ALAN', 'TURING', 'PÚBLISHED', 'AN', '%&', '&', 'ARTICLE', 'T¨¨ÏTLED', 'INTELLIGENCE']


### Removing blanks 

In [0]:
def remove_blanks(tokens):
    """Return a new list in which each token has its leading and trailing whitespace stripped.

    Tokens that consist only of whitespace are kept, as empty strings.
    """
    stripped = []
    for token in tokens:
        stripped.append(token.strip())
    return stripped

In [0]:
# Keep the untouched split in its own variable so the "Original" line shows the
# input rather than the already-processed tokens.
original_tokens = example_sentence.split()
tokens = remove_blanks(original_tokens)
print('Original:\n {}'.format(original_tokens))
# str.split() without a separator never leaves whitespace around tokens, so for
# this sentence both lines print the same list.
print('\nRemoving blanks:\n {}'.format(tokens))

Original:
 ["That's", 'àn', 'example:', 'In', '1950', ',', 'Alan', 'Turing', 'PÚBLISHED', 'an', '%&', '&', 'article', 't¨¨ïtled', 'Intelligence']

Removing blanks:
 ["That's", 'àn', 'example:', 'In', '1950', ',', 'Alan', 'Turing', 'PÚBLISHED', 'an', '%&', '&', 'article', 't¨¨ïtled', 'Intelligence']


### Removing punctuation, diacritics, etc.

In [0]:
def remove_punctuation(sentence, keep_apostrophe=False):
    """Replace every character that is not an ASCII letter or digit with a space.

    Each removed character becomes one space, so the result has the same length
    as the input. Whitespace, punctuation and non-ASCII letters (for example
    accented characters) are all replaced.

    Parameters
    ----------
    sentence : str
        Text to clean.
    keep_apostrophe : bool, default False
        When True, the ASCII apostrophe (') is preserved so contractions such
        as "that's" stay intact.

    Returns
    -------
    str
    """
    allowed = "a-zA-Z0-9'" if keep_apostrophe else 'a-zA-Z0-9'
    return re.sub('[^{}]'.format(allowed), ' ', sentence)

In [0]:
# NOTE: despite its name, `tokens` holds a single string here, because
# remove_punctuation works on the whole sentence rather than on a token list.
tokens = remove_punctuation(example_sentence)
print('Original:\n {}'.format(example_sentence))
print('\nRemoving punctuation:\n {}'.format(tokens))

Original:
 That's àn example: In 1950 , Alan Turing PÚBLISHED an %& & article t¨¨ïtled Intelligence

Removing punctuation:
 That s  n example  In 1950   Alan Turing P BLISHED an      article t   tled Intelligence


In [0]:
def remove_accents(tokens):
    """Return a new list with each token passed through `unidecode.unidecode`.

    In the demo below this turns 'àn' into 'an' and 'PÚBLISHED' into 'PUBLISHED'.
    """
    transliterated = []
    for token in tokens:
        transliterated.append(unidecode.unidecode(token))
    return transliterated

In [0]:
tokens = remove_accents(example_sentence.split())
print('Original:\n {}'.format(example_sentence.split()))
# Label fixed: this cell demonstrates accent removal, not punctuation removal.
print('\nRemoving accents:\n {}'.format(tokens))

Original:
 ["That's", 'àn', 'example:', 'In', '1950', ',', 'Alan', 'Turing', 'PÚBLISHED', 'an', '%&', '&', 'article', 't¨¨ïtled', 'Intelligence']

Removing punctuation:
 ["That's", 'an', 'example:', 'In', '1950', ',', 'Alan', 'Turing', 'PUBLISHED', 'an', '%&', '&', 'article', 't""itled', 'Intelligence']


### Expanding contractions 

In [0]:
# Lookup table from an English contraction to its expanded form.
# Keys are lowercase except for the first-person "I..." forms, which are listed
# in both cases. Ambiguous contractions (e.g. "he's", "he'd") map to a single reading.
english_contractions_mapping = {
    "ain't": "is not", "aren't": "are not","can't": "cannot", 
    "can't've": "cannot have", "'cause": "because", "could've": "could have", 
    "couldn't": "could not", "couldn't've": "could not have","didn't": "did not", 
    "doesn't": "does not", "don't": "do not", "hadn't": "had not", 
    "hadn't've": "had not have", "hasn't": "has not", "haven't": "have not", 
    "he'd": "he would", "he'd've": "he would have", "he'll": "he will", 
    "he'll've": "he will have", "he's": "he is", "how'd": "how did", 
    "how'd'y": "how do you", "how'll": "how will", "how's": "how is", 
    "I'd": "I would", "I'd've": "I would have", "I'll": "I will", 
    "I'll've": "I will have","I'm": "I am", "I've": "I have", 
    "i'd": "i would", "i'd've": "i would have", "i'll": "i will", 
    "i'll've": "i will have","i'm": "i am", "i've": "i have", 
    "isn't": "is not", "it'd": "it would", "it'd've": "it would have", 
    "it'll": "it will", "it'll've": "it will have","it's": "it is", 
    "let's": "let us", "ma'am": "madam", "mayn't": "may not", 
    "might've": "might have","mightn't": "might not","mightn't've": "might not have", 
    "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", 
    "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", 
    "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not",
    "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", 
    "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", 
    "she's": "she is", "should've": "should have", "shouldn't": "should not", 
    "shouldn't've": "should not have", "so've": "so have","so's": "so as", 
    "this's": "this is",
    "that'd": "that would", "that'd've": "that would have","that's": "that is", 
    "there'd": "there would", "there'd've": "there would have","there's": "there is", 
    "they'd": "they would", "they'd've": "they would have", "they'll": "they will", 
    "they'll've": "they will have", "they're": "they are", "they've": "they have", 
    "to've": "to have", "wasn't": "was not", "we'd": "we would", 
    "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", 
    "we're": "we are", "we've": "we have", "weren't": "were not", 
    "what'll": "what will", "what'll've": "what will have", "what're": "what are", 
    "what's": "what is", "what've": "what have", "when's": "when is", 
    "when've": "when have", "where'd": "where did", "where's": "where is", 
    "where've": "where have", "who'll": "who will", "who'll've": "who will have", 
    "who's": "who is", "who've": "who have", "why's": "why is", 
    "why've": "why have", "will've": "will have", "won't": "will not", 
    "won't've": "will not have", "would've": "would have", "wouldn't": "would not", 
    "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would",
    "y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",
    "you'd": "you would", "you'd've": "you would have", "you'll": "you will", 
    "you'll've": "you will have", "you're": "you are", "you've": "you have"
} 

In [0]:
def expand_match(contraction, mapping=None):
    """Return the expansion for one regex match of a contraction.

    Parameters
    ----------
    contraction : re.Match
        Match object whose whole match (group 0) is the contraction text.
    mapping : dict, optional
        Contraction -> expansion table. Defaults to the module-level
        `english_contractions_mapping`.

    Returns
    -------
    str
        The expansion. If the text matched a key only case-insensitively and
        starts with an uppercase letter, the expansion is capitalised to match.
        If no key corresponds to the text, the text is returned unchanged.
    """
    if mapping is None:
        mapping = english_contractions_mapping
    match = contraction.group(0)

    # Exact key: use the stored expansion verbatim. Splicing the first character
    # of the match onto the expansion would corrupt entries whose expansion
    # starts with a different character ("ain't" -> "as not", "'cause" -> "'ecause").
    expanded = mapping.get(match)
    if expanded is not None:
        return expanded

    # Case-insensitive fallback: lowercase key first, then any key that differs
    # from the match only by case (e.g. a table that only contains "I'm").
    folded = match.lower()
    expanded = mapping.get(folded)
    if expanded is None:
        expanded = next(
            (value for key, value in mapping.items() if key.lower() == folded),
            None,
        )
    if expanded is None:
        return match

    # Carry over the capitalisation of the matched text ("That's" -> "That is").
    if expanded and match[0].isupper():
        expanded = expanded[0].upper() + expanded[1:]
    return expanded


def expand_contractions(sentence, english_contractions_mapping):
    """Replace every contraction found in `sentence` by its expanded form.

    Parameters
    ----------
    sentence : str
        Text to process.
    english_contractions_mapping : dict
        Contraction -> expansion table; matching is case-insensitive.

    Returns
    -------
    str
        `sentence` with contractions expanded; returned unchanged when the
        mapping is empty.
    """
    # An empty alternation would match the empty string at every position.
    if not english_contractions_mapping:
        return sentence
    # Longest keys first: regex alternation takes the first alternative that
    # matches, so "can't" must not be tried before "can't've".
    keys = sorted(english_contractions_mapping, key=len, reverse=True)
    # Escape keys so they are always matched literally.
    contractions_pattern = re.compile(
        '({})'.format('|'.join(re.escape(key) for key in keys)),
        flags=re.IGNORECASE,
    )
    # Pass the caller's mapping through to the callback instead of silently
    # falling back to the module-level table.
    return contractions_pattern.sub(
        lambda found: expand_match(found, english_contractions_mapping),
        sentence,
    )

In [0]:
# Expand contractions one sentence at a time, then stitch the sentences back
# together with single spaces for display.
expanded = [expand_contractions(txt, english_contractions_mapping) 
            for txt in sent_tokenize(example_sentence)]     
print ('Original:\n {}'.format(example_sentence))
print ('\nAfter expanding contractions:\n {}'.format(' '.join(expanded)))

Original:
 That's àn example: In 1950 , Alan Turing PÚBLISHED an %& & article t¨¨ïtled Intelligence

After expanding contractions:
 That is àn example: In 1950 , Alan Turing PÚBLISHED an %& & article t¨¨ïtled Intelligence


### Removing Stopwords 

In [0]:
from nltk.corpus import stopwords

In [0]:
# NLTK's English stopword list; the preview below shows the entries are lowercase.
english_nltk_stopwords = stopwords.words('english')
english_nltk_stopwords[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [0]:
def remove_stopwords(tokens, stopwords_list, ignore_case=False):
    """Return the tokens that are not stopwords, in their original order.

    Parameters
    ----------
    tokens : iterable of str
        Tokens to filter.
    stopwords_list : iterable of str
        Words to drop.
    ignore_case : bool, default False
        When False, comparison is exact, so a capitalised token such as 'In'
        is kept even if 'in' is a stopword. When True, both sides are
        lowercased before comparing.

    Returns
    -------
    list of str
        The surviving tokens, unmodified.
    """
    # A set gives constant-time membership tests instead of scanning the list
    # once per token.
    if ignore_case:
        blocked = frozenset(word.lower() for word in stopwords_list)
        return [token for token in tokens if token.lower() not in blocked]
    blocked = frozenset(stopwords_list)
    return [token for token in tokens if token not in blocked]

In [0]:
# Matching is exact, so only the lowercase 'an' is dropped; 'In' survives
# because the stopword list holds the lowercase form.
tokens = remove_stopwords(example_sentence.split(), english_nltk_stopwords)
print('Original:\n {}'.format(example_sentence.split()))
print('\nRemoving stopwords:\n {}'.format(tokens))

Original:
 ["That's", 'àn', 'example:', 'In', '1950', ',', 'Alan', 'Turing', 'PÚBLISHED', 'an', '%&', '&', 'article', 't¨¨ïtled', 'Intelligence']

Removing stopwords:
 ["That's", 'àn', 'example:', 'In', '1950', ',', 'Alan', 'Turing', 'PÚBLISHED', '%&', '&', 'article', 't¨¨ïtled', 'Intelligence']


### Correcting words

#### Comparing with a corpus 

In [0]:
from nltk.corpus import brown

In [0]:
# Corpus with 500 samples of English-language text
# brown.words() exposes the corpus as one flat sequence of word tokens
# (about 1.16 million of them, per the length printed below).
word_list = brown.words()
print(len(word_list))
print(word_list[:10])

1161192
['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of']


In [0]:
# Deduplicate the tokens into a vocabulary. The set is turned back into a list so
# it can be sliced for display; note that `in` checks against a list are linear
# scans. Case is preserved, so differently-cased spellings are separate entries.
brown_vocabulary = list(set(word_list))
print(len(brown_vocabulary))
print(brown_vocabulary[:10])

56057
['1803-1895', 'Toobin', 'mates', '7-1', 'operetta', 'Tareytown', 'observance', 'leather-hard', '1515', 'trembles']


In [0]:
def verify_word(word, corpus_vocabulary):
    """Return True when `word` is contained in `corpus_vocabulary`, else False.

    The check is an exact, case-sensitive membership test.
    """
    is_known = word in corpus_vocabulary
    return is_known

In [0]:
# A correctly spelled word is found in the Brown vocabulary; an elongated variant is not.
print('Word "house" in the corpus?:\n {}'.format(verify_word('house', brown_vocabulary)))
print('Word "houuuuuse" in the corpus?:\n {}'.format(verify_word('houuuuuse', brown_vocabulary)))

Word "house" in the corpus?:
 True
Word "houuuuuse" in the corpus?:
 False


#### Using an algorithm / model

In [0]:
# Spelling corrector (source: http://norvig.com/spell-correct.html)

In [0]:
! wget http://norvig.com/big.txt

--2019-04-03 10:08:44--  http://norvig.com/big.txt
Resolving norvig.com (norvig.com)... 66.96.146.129
Connecting to norvig.com (norvig.com)|66.96.146.129|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6488666 (6.2M) [text/plain]
Saving to: ‘big.txt’


2019-04-03 10:08:52 (871 KB/s) - ‘big.txt’ saved [6488666/6488666]



In [0]:
def words(text):
    """Return every run of word characters in `text`, lowercased, in order of appearance."""
    lowered = text.lower()
    return re.findall(r'\w+', lowered)
# Word-frequency table built from the downloaded corpus. The file is read inside
# a context manager so the handle is closed as soon as the text is loaded.
with open('big.txt') as corpus_file:
    WORDS = Counter(words(corpus_file.read()))

def P(word, N=sum(WORDS.values())): 
    """Relative frequency of `word`: its count in WORDS divided by `N`.

    `N` defaults to the total number of word occurrences in WORDS; the default
    is computed once, when this function is defined.
    """
    occurrences = WORDS[word]
    return occurrences / N

def correction(word): 
    """Return the candidate correction for `word` that has the highest probability `P`."""
    options = candidates(word)
    best = max(options, key=P)
    return best

def candidates(word): 
    """Return the possible spelling corrections for `word`.

    Tried in order, returning the first non-empty result: the word itself if
    known, known words one edit away, known words two edits away, and finally
    the word itself as a one-element list.
    """
    exact = known([word])
    if exact:
        return exact
    one_edit = known(edits1(word))
    if one_edit:
        return one_edit
    two_edits = known(edits2(word))
    if two_edits:
        return two_edits
    return [word]

def known(words): 
    """Return, as a set, the members of `words` that are present in WORDS."""
    return {candidate for candidate in words if candidate in WORDS}

def edits1(word):
    """Return the set of strings exactly one simple edit away from `word`.

    The edits are: deleting one character, swapping two adjacent characters,
    replacing one character with a lowercase ASCII letter, and inserting one
    lowercase ASCII letter at any position.
    """
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    # Every way of cutting the word in two, including an empty head or an empty tail.
    pairs = [(word[:cut], word[cut:]) for cut in range(len(word) + 1)]
    # Cuts that leave at least one character on the right-hand side.
    with_tail = [(head, tail) for head, tail in pairs if tail]
    deletes = [head + tail[1:] for head, tail in with_tail]
    transposes = [head + tail[1] + tail[0] + tail[2:]
                  for head, tail in with_tail if len(tail) > 1]
    replaces = [head + letter + tail[1:]
                for head, tail in with_tail for letter in alphabet]
    inserts = [head + letter + tail
               for head, tail in pairs for letter in alphabet]
    return set(deletes + transposes + replaces + inserts)

def edits2(word): 
    """Return a generator over the strings two edits away from `word`.

    It applies `edits1` to every result of `edits1(word)`; duplicates are not removed.
    """
    first_pass = edits1(word)
    return (twice for once in first_pass for twice in edits1(once))

In [0]:
# Try the corrector on a few misspellings: a dropped letter, repeated letters,
# and transposed letters.
print('speling\t   ->\t', correction('speling'))
print('houuuse\t   ->\t', correction('houuuse'))
print('fial\t   ->\t', correction('fial'))
print('misstkaes  ->\t', correction("misstkaes"))

speling	   ->	 spelling
houuuse	   ->	 house
fial	   ->	 final
misstkaes  ->	 mistakes


### Converting numbers to words

In [0]:
import num2words as n2w

In [0]:
def num_to_words(tokens, lang='en'):
    """Spell out the tokens that are non-negative integers written in decimal digits.

    Parameters
    ----------
    tokens : iterable of str
        Tokens to process.
    lang : str, default 'en'
        Language code forwarded to `num2words`.

    Returns
    -------
    list of str
        Tokens made only of decimal digits are replaced by their spelled-out
        form; every other token (including signed or fractional numbers) is
        returned unchanged.
    """
    # isdecimal() rather than isdigit(): isdigit() also accepts characters such
    # as superscript digits ('²'), which int() cannot parse.
    return [n2w.num2words(int(token), lang=lang) if token.isdecimal() else token
            for token in tokens]

In [0]:
tokens = num_to_words(example_sentence.split())
print('Original:\n {}'.format(example_sentence.split()))
# Label fixed: this cell demonstrates number-to-word conversion, not punctuation removal.
print('\nConverting numbers to words:\n {}'.format(tokens))

Original:
 ["That's", 'àn', 'example:', 'In', '1950', ',', 'Alan', 'Turing', 'PÚBLISHED', 'an', '%&', '&', 'article', 't¨¨ïtled', 'Intelligence']

Removing punctuation:
 ["That's", 'àn', 'example:', 'In', 'one thousand, nine hundred and fifty', ',', 'Alan', 'Turing', 'PÚBLISHED', 'an', '%&', '&', 'article', 't¨¨ïtled', 'Intelligence']


# Lexical Normalization

In [0]:
# Inflected forms of a single word: reducing them to one stem is the desired outcome.
words_ok = ['study', 'studies', 'studying', 'studied']
# Different words that all three stemmers below reduce to the same stem ('univers').
words_wrong = ['university', 'universal', 'universe']

### Stemming 

In [0]:
from nltk.stem import PorterStemmer, SnowballStemmer, LancasterStemmer

In [0]:
def stem_word(word, stemmer):
    """Print one line showing the stem a given stemmer produces for `word`.

    Parameters
    ----------
    word : str
        Word to stem.
    stemmer : sequence
        Pair whose first item is a display label and whose second item is an
        object exposing `stem(word)`.

    Returns
    -------
    None
    """
    label = stemmer[0]
    stem = stemmer[1].stem(word)
    template = 'Stemmer: {}\toriginal word: {}\t\tstem: {}'
    print(template.format(label, word, stem))

In [0]:
# (label, stemmer) pairs consumed by stem_word. The tab inside 'Porter\t' is
# presumably there to pad the shortest label so the printed columns line up.
stemmers = [('Porter\t', PorterStemmer()),
            ('Snowball', SnowballStemmer(language='english')),
            ('Lancaster', LancasterStemmer())]

In [0]:
# Run every stemmer on each inflected form; a blank block separates the words.
for word in words_ok:
    for stemmer in stemmers:
        stem_word(word, stemmer)
    print('\n')

Stemmer: Porter		original word: study		stem: studi
Stemmer: Snowball	original word: study		stem: studi
Stemmer: Lancaster	original word: study		stem: study


Stemmer: Porter		original word: studies		stem: studi
Stemmer: Snowball	original word: studies		stem: studi
Stemmer: Lancaster	original word: studies		stem: study


Stemmer: Porter		original word: studying		stem: studi
Stemmer: Snowball	original word: studying		stem: studi
Stemmer: Lancaster	original word: studying		stem: study


Stemmer: Porter		original word: studied		stem: studi
Stemmer: Snowball	original word: studied		stem: studi
Stemmer: Lancaster	original word: studied		stem: study




In [0]:
# Same comparison on distinct words, showing that stemming can conflate them.
for word in words_wrong:
    for stemmer in stemmers:
        stem_word(word, stemmer)
    print('\n')

Stemmer: Porter		original word: university		stem: univers
Stemmer: Snowball	original word: university		stem: univers
Stemmer: Lancaster	original word: university		stem: univers


Stemmer: Porter		original word: universal		stem: univers
Stemmer: Snowball	original word: universal		stem: univers
Stemmer: Lancaster	original word: universal		stem: univers


Stemmer: Porter		original word: universe		stem: univers
Stemmer: Snowball	original word: universe		stem: univers
Stemmer: Lancaster	original word: universe		stem: univers




### Lemmatization 

In [0]:
from nltk.stem import WordNetLemmatizer

In [0]:
lemmatizer = WordNetLemmatizer()
# lemmatize() is called without a part-of-speech argument; presumably it then
# treats every word as a noun (confirm in the NLTK docs), which would explain
# why verb forms such as 'studying' come back unchanged.
for word in words_ok:
    # Column label fixed: the value shown is a lemma, not a stem.
    print('Lemmatizer: {}\toriginal word: {}\t\tlemma: {}'.format('WordNet', word, lemmatizer.lemmatize(word)))

Lemmatizer: WordNet	original word: study		stem: study
Lemmatizer: WordNet	original word: studies		stem: study
Lemmatizer: WordNet	original word: studying		stem: studying
Lemmatizer: WordNet	original word: studied		stem: studied
Lemmatizer: WordNet	original word: saw		stem: saw


# ----

# To Do: Build an end-to-end preprocessing pipeline

In [0]:
text = "Stemming isn't funier than attendign  to a MACHINE LEARNING  class. In  eaister I'm going to Andalucía . But, I can't stay there more 