In [1]:
import nltk
from nltk import regexp_tokenize, word_tokenize
# nltk.download('punkt')
print("Text Processing With NLTK")

Text Processing With NLTK


In [2]:
# Sample input texts for the tokenizer checks, keyed by the phenomenon each exercises.
corpus = {
    "simple_Text": "The brown fox jumps over the lazy dog",
    "punctuation": "Hello. Who am I speaking with? Dear Lord!",
    "contractions": "I'am am the one who knocks. Yes you're it, it's obvious",
    "numbers": "12.30$ 77.5% 499.99€ 0,77%",
    "compound_words": "guarda-chuva, nao-sei-mais",
    "abreviaturas": "U.S.A."
}
# Expected token list for every `corpus` entry (same keys). test_tokenize compares a
# tokenizer's output against these lists with ==, so order and exact spelling matter.
corpus_tokens = {
    'simple_Text': ['The', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog'],
    'punctuation': ['Hello', '.', 'Who', 'am', 'I', 'speaking', 'with', '?', 'Dear', 'Lord', '!'],
    'contractions': ["I'am", 'am', 'the', 'one', 'who', 'knocks', '.', 'Yes', "you're", 'it', ',', "it's", 'obvious'],
    'numbers': ['12.30$', '77.5%', '499.99€', '0,77%'],
    'compound_words': ['guarda-chuva', ',', 'nao-sei-mais'],
    'abreviaturas': ['U.S.A.'],
}

In [3]:
def print_tests(tests):
    """Print a report of test outcomes: every passing case first, then every failure.

    `tests` maps a test name to a dict with the keys "passed" (truthy on success),
    "result" (what the tokenizer produced) and "expected" (the reference tokens).
    Within each group the cases keep the iteration order of `tests`.
    """
    successes = []
    failures = []
    for name, outcome in tests.items():
        bucket = successes if outcome["passed"] else failures
        bucket.append((name, outcome))

    for name, outcome in successes:
        print(f"✅ {name}")
        print(f"      Output: {outcome['result']}")

    for name, outcome in failures:
        print(f"🚨 {name}")
        print(f"     Expected: {outcome['expected']}")
        print(f"     Got     : {outcome['result']}")


def test_tokenize(tokenizer):
    """Run `tokenizer` over every text in `corpus` and print a pass/fail report.

    `tokenizer` is called with a single text argument; its return value is compared
    with == against the matching entry of `corpus_tokens`. The collected outcomes
    are handed to print_tests for display.
    """
    def evaluate(name, text):
        # Tokenize first, then look up the reference tokens for the same key.
        produced = tokenizer(text)
        wanted = corpus_tokens[name]
        return {"result": produced, "expected": wanted, "passed": produced == wanted}

    print_tests({name: evaluate(name, text) for name, text in corpus.items()})

In [4]:
# Verbose-mode (?x) tokenizer pattern. Alternatives are tried left to right, so the
# specific token shapes come first and the generic word/punctuation fallbacks last.
# The second alternative keeps contractions such as you're / it's as one token.
# In the punctuation class the hyphen sits last so that it is a literal '-': written
# as `:-?` it formed a character range from ':' to '?', which also matched ; < = >
# and did not match '-' itself.
pattern = r'''(?x)
    (?:[a-zA-Z]\.)+    # Abreviaturas
    | (?:\w+)'(?:am|re|s|t)
    | (?:\w+(?:-\w+)+) # Compound words
    | \d+(?:[.,]\d+)?[$£%€]? # Numbers and currencies
    | \w+              # Normal words
    | [.,:?!-]         # Punctuation
    '''

# regexp_tokenize takes (text, pattern); the lambda adapts it to the one-argument
# tokenizer interface that test_tokenize calls.
test_tokenize(lambda text: regexp_tokenize(text, pattern))

✅ simple_Text
      Output: ['The', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']
✅ punctuation
      Output: ['Hello', '.', 'Who', 'am', 'I', 'speaking', 'with', '?', 'Dear', 'Lord', '!']
✅ contractions
      Output: ["I'am", 'am', 'the', 'one', 'who', 'knocks', '.', 'Yes', "you're", 'it', ',', "it's", 'obvious']
✅ numbers
      Output: ['12.30$', '77.5%', '499.99€', '0,77%']
✅ compound_words
      Output: ['guarda-chuva', ',', 'nao-sei-mais']
✅ abreviaturas
      Output: ['U.S.A.']


### Attention to Alternation

When writing a disjunction (`|`) in a regular expression, remember that the order in which the alternatives are listed matters: they are tried from left to right and the first one that matches wins.

So *regexes* that are more **general** should be placed further down the list.

In [5]:
# General alternative first: at "69.99" the leading \w+ already matches "69", so the
# number alternative is never reached and the price is split into three tokens.
pattern = r'''(?x)
    \w+
    | [.,?!]
    | \d+(?:[.,]\d+)?
'''
# Same alternatives, reordered so the number shape is tried before the generic \w+.
pattern_better = r'''(?x)
    [.,?!]
    |\d+(?:[.,]\d+)? 
    |\w+   # Most general in the end
'''
will_cause_problem = "That will be 69.99"

# Tokenize the same sentence with both orderings to compare the results.
print(regexp_tokenize(will_cause_problem, pattern))
print(regexp_tokenize(will_cause_problem, pattern_better))

['That', 'will', 'be', '69', '.', '99']
['That', 'will', 'be', '69.99']


In [6]:
from nltk.tokenize import wordpunct_tokenize

# Run the same corpus checks against NLTK's wordpunct_tokenize for comparison.
test_tokenize(wordpunct_tokenize)

✅ simple_Text
      Output: ['The', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']
✅ punctuation
      Output: ['Hello', '.', 'Who', 'am', 'I', 'speaking', 'with', '?', 'Dear', 'Lord', '!']
🚨 contractions
     Expected: ["I'am", 'am', 'the', 'one', 'who', 'knocks', '.', 'Yes', "you're", 'it', ',', "it's", 'obvious']
     Got     : ['I', "'", 'am', 'am', 'the', 'one', 'who', 'knocks', '.', 'Yes', 'you', "'", 're', 'it', ',', 'it', "'", 's', 'obvious']
🚨 numbers
     Expected: ['12.30$', '77.5%', '499.99€', '0,77%']
     Got     : ['12', '.', '30', '$', '77', '.', '5', '%', '499', '.', '99', '€', '0', ',', '77', '%']
🚨 compound_words
     Expected: ['guarda-chuva', ',', 'nao-sei-mais']
     Got     : ['guarda', '-', 'chuva', ',', 'nao', '-', 'sei', '-', 'mais']
🚨 abreviaturas
     Expected: ['U.S.A.']
     Got     : ['U', '.', 'S', '.', 'A', '.']


### Punkt Sentence Tokenizer

This tokenizer divides a text into a list of sentences by using an **unsupervised algorithm** to build a model for *abbreviation words*, *collocations*, and *words that start sentences*.

It must be trained on a large collection of plaintext in the target language before it can be used.

The NLTK data package includes a pre-trained Punkt tokenizer for English.

`punkt` is a **sentence segmentation** tokenizer.

In [7]:
import nltk.data

# Load the pre-trained English Punkt sentence tokenizer from the NLTK data directory.
# NOTE(review): this is a pickle resource and presumably requires the 'punkt' data
# package to be downloaded first (see the commented-out nltk.download call) — confirm
# it is still available in the installed NLTK version.
punkt = nltk.data.load('tokenizers/punkt/english.pickle')

In [8]:
lines = '''
Punkt knows that the periods in Mr. Smith and Johann S. Bach
do not mark sentence boundaries.  and sometimes sentences
can start with non-capitalized words.  (How does it deal with 
this parenthesis?)  "It should be part of the
previous sentence." "(And the same with this one.)" ('And this one!')
"('(And (this)) '?)" [(and this. )]
'''

def test_sent_tokenizer(sent_tokenizer):
    print("\n------\n".join(sent_tokenizer(lines)))

In [9]:
# The bound method already has the one-argument shape test_sent_tokenizer expects.
test_sent_tokenizer(punkt.tokenize)


Punkt knows that the periods in Mr. Smith and Johann S. Bach
do not mark sentence boundaries.
------
and sometimes sentences
can start with non-capitalized words.
------
(How does it deal with 
this parenthesis?)
------
"It should be part of the
previous sentence."
------
"(And the same with this one.)"
------
('And this one!')
------
"('(And (this)) '?)"
------
[(and this. )]


In [10]:
from nltk.tokenize import sent_tokenize
# Same sample text through NLTK's sent_tokenize helper, for comparison with the
# output of the explicitly loaded Punkt model.
test_sent_tokenizer(sent_tokenize)


Punkt knows that the periods in Mr. Smith and Johann S. Bach
do not mark sentence boundaries.
------
and sometimes sentences
can start with non-capitalized words.
------
(How does it deal with 
this parenthesis?)
------
"It should be part of the
previous sentence."
------
"(And the same with this one.)"
------
('And this one!')
------
"('(And (this)) '?)"
------
[(and this. )]


## Sentence Segmentation on Long Texts


In [11]:
from urllib import request

url = "http://www.gutenberg.org/files/2554/2554-0.txt"
# Use the response as a context manager so the HTTP connection is closed as soon as
# the body has been read; the timeout (seconds) keeps "Run All" from hanging forever
# on a stalled server.
with request.urlopen(url, timeout=30) as response:
    raw = response.read().decode('utf8')

In [12]:
# Size of the downloaded text: character count, then newline-delimited line count.
print(f"Characters {len(raw)}")
# The label now reads "split" (it said "plit"); it is a plain string because nothing
# is interpolated into it.
print("Lines using python string.split('\\n'):", len(raw.split('\n')))

Characters 1176812
Lines using python string.plit('\n'): 22444


In [13]:
# Segment the raw text into sentences, then tokenize each sentence into words.
# The sentence list gets its own name: `lines` already names the sample string used by
# the sentence-tokenizer demo cells, and rebinding it to a list here would break
# re-running those cells.
sentences = punkt.tokenize(raw)
tokens_by_lines = [word_tokenize(sent) for sent in sentences]

# Token -> number of occurrences, counted by hand.
vocabulary = {}
for line in tokens_by_lines:
    for token in line:
        vocabulary[token] = vocabulary.get(token, 0) + 1

print("Lines:", len(sentences))
# A generator is enough for sum(); no intermediate list is needed.
print("Tokens: ", sum(len(sent_tokens) for sent_tokens in tokens_by_lines))
print("Vocabulary", len(vocabulary))

Lines: 12060
Tokens:  257058
Vocabulary 11516


In [14]:
# First token of the second sentence (both indices are 0-based).
print(tokens_by_lines[1][0])

You


In [15]:
# Rank the (token, count) pairs from most to least frequent and show the top ten.
sorted_by_frequence = list(vocabulary.items())
sorted_by_frequence.sort(key=lambda pair: pair[1], reverse=True)
print(sorted_by_frequence[:10])

[(',', 16177), ('.', 8908), ('the', 7447), ('and', 6279), ('to', 5280), ('a', 4469), ('I', 4397), ('’', 4039), ('“', 3980), ('”', 3929)]


### Using the counter Container from the builtin [collections](https://docs.python.org/3/library/collections.html#collections.Counter)

It is very handy and as we'll see it will spare us this previous work

In [16]:
from collections import Counter

# Count every token, feeding the counter one sentence's tokens at a time.
frequency = Counter()
for sentence_tokens in tokens_by_lines:
    frequency.update(sentence_tokens)

In [17]:
# Ten most frequent tokens with their counts — the same ranking as the manual sort.
frequency.most_common(10)

[(',', 16177),
 ('.', 8908),
 ('the', 7447),
 ('and', 6279),
 ('to', 5280),
 ('a', 4469),
 ('I', 4397),
 ('’', 4039),
 ('“', 3980),
 ('”', 3929)]

In [18]:
# Sum of all counts, i.e. the total number of tokens.
# NOTE(review): Counter.total() was added in Python 3.10 — confirm the kernel version.
frequency.total()

257058

In [19]:
# Number of distinct tokens: a Counter's length is its number of keys.
len(frequency)

11516

## Multi Word Expressions

As we know, this is a problem similar to **named entity recognition**: we can give a tokenizer a **dictionary of multi-word expressions** that it should keep together as single tokens.

In [20]:
from nltk import MWETokenizer

text = "Good muffins cost $3.88\nin New York."


# Known multi-word expressions, each written as a tuple of its component tokens.
multi_word_expressions = [('New','York'),('Real','Madrid')]

# Components of a recognised expression are joined with "_" into one token.
mwe = MWETokenizer(multi_word_expressions,separator="_")

# Word-tokenize first; the MWE tokenizer then works on that token list.
tokens = word_tokenize(text)
mwes_tokens = mwe.tokenize(tokens)

# Expressions listed in `multi_word_expressions` now come out as single tokens.
print(mwes_tokens)


['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New_York', '.']


## Lemmatization and Stemming

Both *lemmatization* and *stemming* are techniques for normalizing the text and reducing the size of the vocabulary.

However, *lemmatization* is a more expensive process that aims to find the **lemma** (the dictionary form, or root) of each word.

*Stemming*, on the other hand, applies a set of transformations that aim to cut off word suffixes.

### Stemming

`nltk` includes the **Porter stemmer** that we've talked about

In [21]:
from nltk import PorterStemmer

# Shared stemmer instance used by the stemming cells that follow.
porter_stemmer = PorterStemmer()

# The piece of text from the slides
sentence = '''The European Commission has funded a numerical study to analyze the purchase of a pipe organ with no noise
for Europe's organization. Numerous donations have followed the analysis after a noisy debate.'''

In [22]:
# Word-level tokens of the example sentence; the stemming cells below start from these.
tokens = word_tokenize(sentence)
def show_statistics(tokens):
    cnt = Counter(tokens)
    print(f"Number of Tokens:",cnt.total())
    print(f"Vocabulary:",len(set(cnt)))
# Baseline figures for the unstemmed tokens.
show_statistics(tokens)

Number of Tokens: 35
Vocabulary: 31


In [23]:
# Now applying the Porter Stemmer to normalize the text.
# show_statistics builds its own Counter, so no separate counter is created here.
stemmed_tokens = [porter_stemmer.stem(token) for token in tokens]
show_statistics(stemmed_tokens)

Number of Tokens: 35
Vocabulary: 28


As we can see, the size of the vocabulary was reduced, but let's check out what happened to the sentence:

In [24]:
# Show the sentence before and after stemming, tokens re-joined with single spaces.
for label, sequence in (("Original: ", tokens), ("Stemmed: ", stemmed_tokens)):
    print(label, " ".join(sequence))

Original:  The European Commission has funded a numerical study to analyze the purchase of a pipe organ with no noise for Europe 's organization . Numerous donations have followed the analysis after a noisy debate .
Stemmed:  the european commiss ha fund a numer studi to analyz the purchas of a pipe organ with no nois for europ 's organ . numer donat have follow the analysi after a noisi debat .


In [25]:
# Group the original tokens under the stem each of them reduces to.
mapping = {}
for token in tokens:
    mapping.setdefault(porter_stemmer.stem(token), set()).add(token)

# Stems that absorbed the most distinct surface forms come first; show the top five.
sorted(mapping.items(), key=lambda entry: len(entry[1]), reverse=True)[:5]

[('the', {'The', 'the'}),
 ('numer', {'Numerous', 'numerical'}),
 ('organ', {'organ', 'organization'}),
 ('european', {'European'}),
 ('commiss', {'Commission'})]

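As we can see, some words suffered from **overgeneralization** (over-stemming): *organ* and *organization* were reduced to the same stem even though they mean different things.

Other words suffered from **undergeneralization** (under-stemming): *European* and *Europe* didn't get the same stem (`european` vs. `europ`).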

### Lemmatization

`nltk` includes ways of finding the root of words

In [26]:
# WordNet lemmatizer
from nltk.stem import WordNetLemmatizer 
# Fetch the WordNet data package (reported as up-to-date when already present).
nltk.download('wordnet')
# Init the Wordnet Lemmatizer
lemmatizer = WordNetLemmatizer()

# Example containing irregular plurals (men, women, feet, teeth) and two forms of "study".
sentence = "Men and women love to study artificial intelligence while studying data science. Don't you? My feet and teeth are clean!"

[nltk_data] Downloading package wordnet to /home/martim/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [27]:
# Re-tokenize: `tokens` now holds the tokens of the lemmatization example sentence.
tokens = word_tokenize(sentence)
show_statistics(tokens)

Number of Tokens: 24
Vocabulary: 23


In [28]:
# Lemmatize every token with the lemmatizer's default settings and report the counts.
lemmatized_tokens = list(map(lemmatizer.lemmatize, tokens))
print(lemmatized_tokens)
show_statistics(lemmatized_tokens)

# Same tokens through the Porter stemmer, for comparison.
stemmed_tokens = list(map(porter_stemmer.stem, tokens))
show_statistics(stemmed_tokens)


import pandas as pd
def compare_lemma_stemmer(tokens, pos="n"):
    """Tabulate each token next to its Porter stem and its WordNet lemma.

    Args:
        tokens: iterable of token strings.
        pos: WordNet part-of-speech tag handed to the lemmatizer. "n" (noun) is the
            lemmatizer's own default, so omitting it gives the same table as before;
            pass e.g. "v" to lemmatize the tokens as verbs.

    Returns:
        A pandas DataFrame with the columns "Original", "Stemmed" and "Lemmatized",
        one row per token, in input order.
    """
    rows = [
        [token, porter_stemmer.stem(token), lemmatizer.lemmatize(token, pos=pos)]
        for token in tokens
    ]
    return pd.DataFrame(rows, columns=["Original", "Stemmed", "Lemmatized"])

compare_lemma_stemmer(word_tokenize("cats corpora mice")) 


['Men', 'and', 'woman', 'love', 'to', 'study', 'artificial', 'intelligence', 'while', 'studying', 'data', 'science', '.', 'Do', "n't", 'you', '?', 'My', 'foot', 'and', 'teeth', 'are', 'clean', '!']
Number of Tokens: 24
Vocabulary: 23
Number of Tokens: 24
Vocabulary: 22


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Unnamed: 0,Original,Stemmed,Lemmatized
0,cats,cat,cat
1,corpora,corpora,corpus
2,mice,mice,mouse


To improve its results, the `lemmatizer` needs the **part of speech** of each token. By default it treats every token as a *noun*, so here it only reduced plurals to their singular form (and left the verb *studying* unchanged).

# spaCy


`spaCy` is a python library that provides several **language processing pipelines** that streamline and facilitate the process of language processing.

Loading a `language pipeline` returns a `Language` object, which contains all the components and data needed to process text.

Those components are:
- Binary Weights of a model for the **part-of-speech tagger**, the **dependency parser** and the **named entity recognizer** to predict the annotations in the text
- Lexical Entries in the vocabulary: words and their context independent attributes like shape and spelling
- Data files for lemmatization  rules and look up tables
- Word Vectors multidimensional meaning representations of the words that let you determine how similar words are

Among others.

In [29]:
# Setup (run once in a shell):
# pip install -U spacy
# python -m spacy download en_core_web_sm
import spacy
from spacy import displacy

# Load the small English pipeline; `nlp` is the callable used to process text below.
nlp = spacy.load("en_core_web_sm")

### Processing With SpaCy

Now we just need to call `nlp(text)`, which returns the processed text as an object of type `Doc`. It is worth mentioning that the `Doc` still holds all the information about the original text, so the text can be reconstructed from it.

In [30]:
# Run the loaded pipeline on one sentence; `doc` is inspected in the next cells.
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

In [31]:
# pandas is only used here to render the tokens as a table.
import pandas as pd

# One row per token: surface text, coarse part-of-speech tag, dependency label.
token_rows = [[t.text, t.pos_, t.dep_] for t in doc]
pd.DataFrame(token_rows, columns=['Text', 'POS', 'Dependency'])

Unnamed: 0,Text,POS,Dependency
0,Apple,PROPN,nsubj
1,is,AUX,aux
2,looking,VERB,ROOT
3,at,ADP,prep
4,buying,VERB,pcomp
5,U.K.,PROPN,dobj
6,startup,NOUN,dep
7,for,ADP,prep
8,$,SYM,quantmod
9,1,NUM,compound


As we can see, `spaCy` is easy to use: a single call to the **language processing pipeline** gives us lots of information.

### Spacy's Tokenizer

It is **language dependent**, as different languages differ in how they should be tokenized (*for example, some languages do not separate words with spaces*).

![](https://spacy.io/images/tokenization.svg)

Here we show the flow of the tokenizer: it first splits on whitespace, then checks whether each "word" matches an exception rule and should be further divided.

It also tries to split off infixes, like punctuation.

In [32]:
# (column label, token attribute name) pairs that define the table below.
columns_func = [
    ['text', 'text'], ['lemma', 'lemma_'],
    ['pos', 'pos_'], ['tag', 'tag_'],
    ['dep', 'dep_'], ['shape', 'shape_'],
    ['alpha', 'is_alpha'], ['stop', 'is_stop']
]

# One row per token, reading each listed attribute off the token by name.
data = [
    [getattr(tok, attr) for _, attr in columns_func]
    for tok in doc
]

pd.DataFrame(data, columns=[label for label, _ in columns_func])

Unnamed: 0,text,lemma,pos,tag,dep,shape,alpha,stop
0,Apple,Apple,PROPN,NNP,nsubj,Xxxxx,True,False
1,is,be,AUX,VBZ,aux,xx,True,True
2,looking,look,VERB,VBG,ROOT,xxxx,True,False
3,at,at,ADP,IN,prep,xx,True,True
4,buying,buy,VERB,VBG,pcomp,xxxx,True,False
5,U.K.,U.K.,PROPN,NNP,dobj,X.X.,False,False
6,startup,startup,NOUN,NN,dep,xxxx,True,False
7,for,for,ADP,IN,prep,xxx,True,True
8,$,$,SYM,$,quantmod,$,False,False
9,1,1,NUM,CD,compound,d,False,False


In [33]:
# Process a new sentence and draw its dependency parse with displaCy.
doc = nlp("I ate fish and chips, in the U.K.")
displacy.render(doc, style="dep")

In [34]:
# spacy.explain returns a human-readable description of a tag or label string.
spacy.explain("ADJ")

'adjective'

In [38]:
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

# One row per named entity: text, character span, label and the label's description.
data = [
    [ent.text, ent.start_char, ent.end_char, ent.label_, spacy.explain(ent.label_)]
    for ent in doc.ents
]

pd.DataFrame(data, columns=["Text", "Start", "End", "Label", "Description"])


Unnamed: 0,Text,Start,End,Label,Description
0,Apple,0,5,ORG,"Companies, agencies, institutions, etc."
1,U.K.,27,31,GPE,"Countries, cities, states"
2,$1 billion,44,54,MONEY,"Monetary values, including unit"


In [41]:
# Render the same document with displaCy's entity style.
displacy.render(doc,style="ent")

## Word Vectors and similarity

**Similarity** is determined by comparing *word vectors* or *word embeddings*, these are multi dimensional representations of a word.

spaCy pipelines that end in `sm` don't ship with *word vectors*; to get accurate similarity results, a larger pipeline (one ending in `md` or `lg`) is required.

It is important to note that word similarity depends on the application needs as the sentence `I like pasta` and `I like pizza` are both similar 
because they both talk about food preferences but if our application is about differences in food, pizza and pasta are very different.

`Doc` and `Span` vector values are the **average** of the vectors of their constituent tokens, meaning that the ordering of the words is lost.

In [42]:
import spacy

# Setup (run once in a shell):
# python3 -m spacy download en_core_web_md
# Rebinds `nlp` to the medium pipeline for the similarity examples.
nlp = spacy.load("en_core_web_md")  # make sure to use larger package!
doc1 = nlp("I like salty fries and hamburgers.")
doc2 = nlp("Fast food tastes very good.")

# Similarity of two documents
print(doc1, "<->", doc2, doc1.similarity(doc2))

I like salty fries and hamburgers. <-> Fast food tastes very good. 0.691649353055761


In [43]:
# Similarity of tokens and spans
# doc1[2:4] is the two-token span "salty fries"; doc1[5] is the single token "hamburgers".
french_fries = doc1[2:4]
burgers = doc1[5]
print(french_fries, "<->", burgers, french_fries.similarity(burgers))

salty fries <-> hamburgers 0.6938489675521851


In [47]:
# Similarity between two one-word documents. The variable names are only labels:
# `good` holds "sweet" and `bad` holds "salt".
good = nlp('sweet')
bad = nlp('salt')
print(good,"<->",bad,good.similarity(bad))

sweet <-> salt 0.3181479327298317


![Pipeline](https://spacy.io/images/pipeline.svg)

![Architecture](https://spacy.io/images/architecture.svg)