# Text Cleaning

In [0]:
url = 'http://www.gutenberg.org/ebooks/1661.txt.utf-8'
file_name = 'sherlock.txt'
!rm -rf sample_data

In [0]:
import requests
# Download the file from `url` and save it locally under `file_name`:
data = requests.get(url)
with open(file_name, 'w+') as out_file:
    out_file.write(data.text)

In [3]:
!ls

sherlock.txt


In [4]:
!head -2 sherlock.txt

﻿Project Gutenberg's The Adventures of Sherlock Holmes, by Arthur Conan Doyle



In [0]:
# Remove the first 33 lines inplace from the file
!sed -i 1,33d sherlock.txt

In [6]:
!head -5 sherlock.txt

THE ADVENTURES OF SHERLOCK HOLMES

by

SIR ARTHUR CONAN DOYLE


## Load Data

In [7]:
#let's the load data to RAM
text = open(file_name, 'r', encoding='utf-8').read()  # note that I add an encoding='utf-8' parameter to preserve information
print(text[:5])

THE A


In [8]:
print(f'The file is loaded as datatype: {type(text)} and has {len(text)} characters in it')

The file is loaded as datatype: <class 'str'> and has 581204 characters in it


### Exploring Loaded Data

In [44]:
# how many unique characters do we see? 
# For reference, ASCII has 127 characters in it - so we expect this to have at most 127 characters
unique_chars = list(set(text))
unique_chars.sort()
print(unique_chars)
print(f'There are {len(unique_chars)} unique characters, including both ASCII and Unicode character')

['\n', ' ', '!', '"', '$', '%', '&', "'", '(', ')', '*', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'à', 'â', 'è', 'é']
There are 85 unique characters, including both ASCII and Unicode character


## Tokenization 

### Split by Whitespace

In [45]:
words = text.split()
print(len(words))

107431


In [46]:
print(words[90:200])  #start with the first chapeter, ignoring the index for now

['To', 'Sherlock', 'Holmes', 'she', 'is', 'always', 'THE', 'woman.', 'I', 'have', 'seldom', 'heard', 'him', 'mention', 'her', 'under', 'any', 'other', 'name.', 'In', 'his', 'eyes', 'she', 'eclipses', 'and', 'predominates', 'the', 'whole', 'of', 'her', 'sex.', 'It', 'was', 'not', 'that', 'he', 'felt', 'any', 'emotion', 'akin', 'to', 'love', 'for', 'Irene', 'Adler.', 'All', 'emotions,', 'and', 'that', 'one', 'particularly,', 'were', 'abhorrent', 'to', 'his', 'cold,', 'precise', 'but', 'admirably', 'balanced', 'mind.', 'He', 'was,', 'I', 'take', 'it,', 'the', 'most', 'perfect', 'reasoning', 'and', 'observing', 'machine', 'that', 'the', 'world', 'has', 'seen,', 'but', 'as', 'a', 'lover', 'he', 'would', 'have', 'placed', 'himself', 'in', 'a', 'false', 'position.', 'He', 'never', 'spoke', 'of', 'the', 'softer', 'passions,', 'save', 'with', 'a', 'gibe', 'and', 'a', 'sneer.', 'They', 'were', 'admirable', 'things', 'for']


In [47]:
# Let's look at another example: 
'red-headed woman on the street'.split()

['red-headed', 'woman', 'on', 'the', 'street']

### Split by Word Extraction
**Introducing Regex**

In [37]:
import re
re.split('\W+', 'Words, words, words.')

['Words', 'words', 'words', '']

In [0]:
words_alphanumeric = re.split('\W+', text)

In [39]:
len(words_alphanumeric), len(words)

(109111, 14)

In [40]:
print(words_alphanumeric[90:200])

['BOHEMIA', 'I', 'To', 'Sherlock', 'Holmes', 'she', 'is', 'always', 'THE', 'woman', 'I', 'have', 'seldom', 'heard', 'him', 'mention', 'her', 'under', 'any', 'other', 'name', 'In', 'his', 'eyes', 'she', 'eclipses', 'and', 'predominates', 'the', 'whole', 'of', 'her', 'sex', 'It', 'was', 'not', 'that', 'he', 'felt', 'any', 'emotion', 'akin', 'to', 'love', 'for', 'Irene', 'Adler', 'All', 'emotions', 'and', 'that', 'one', 'particularly', 'were', 'abhorrent', 'to', 'his', 'cold', 'precise', 'but', 'admirably', 'balanced', 'mind', 'He', 'was', 'I', 'take', 'it', 'the', 'most', 'perfect', 'reasoning', 'and', 'observing', 'machine', 'that', 'the', 'world', 'has', 'seen', 'but', 'as', 'a', 'lover', 'he', 'would', 'have', 'placed', 'himself', 'in', 'a', 'false', 'position', 'He', 'never', 'spoke', 'of', 'the', 'softer', 'passions', 'save', 'with', 'a', 'gibe', 'and', 'a', 'sneer', 'They', 'were', 'admirable']


In [50]:
# words_break = re.split('\W+', ":::::And for the second time of asking, when")
# print(words_break)
print(' '.join(re.split('\W+', "::::::And for the second time of asking, when")))

 And for the second time of asking when


### spaCy for Tokenization

In [8]:
%%time
import spacy
nlp = spacy.load('en')

CPU times: user 740 ms, sys: 197 ms, total: 937 ms
Wall time: 3.73 s


In [0]:
doc = nlp(text)

In [10]:
print(list(doc)[150:200])

[whole, of, her, sex, ., It, was, not, that, he, felt, 
, any, emotion, akin, to, love, for, Irene, Adler, ., All, emotions, ,, and, that, 
, one, particularly, ,, were, abhorrent, to, his, cold, ,, precise, but, 
, admirably, balanced, mind, ., He, was, ,, I, take, it, ,]


Conveniently, spaCy tokenizes all *punctuations and words* and returned those as individual tokens as well. Let's try the example which we didn't like earlier:

In [11]:
words = nlp("Isn't he coming home for dinner with the red-headed girl?")
print([token for token in words])

[Is, n't, he, coming, home, for, dinner, with, the, red, -, headed, girl, ?]


In [12]:
sentences = list(doc.sents)
print(sentences[35:45])

[(for I had now returned to
civil practice), when my way led me through Baker Street., As I
passed the well-remembered door, which must always be associated
in my mind with my wooing, and with the dark incidents of the
Study in Scarlet, I was seized with a keen desire to see Holmes
again, and to know how he was employing his extraordinary powers.
, His rooms were brilliantly lit, and, even as I looked up, I saw
his tall, spare figure pass twice in a dark silhouette against
the blind., He was pacing the room swiftly, eagerly, with his head
sunk upon his chest and his hands clasped behind him., To me, who
knew his every mood and habit, his attitude and manner told their
own story., He was at work again., He had risen out of his
drug-created dreams and was hot upon the scent of some new
problem., I rang the bell and was shown up to the chamber which
had formerly been in part my own.

, His manner was not effusive., It seldom was; but he was glad, I
think, to see me.]


#### STOP WORD REMOVAL & CASE CHANGE

spaCy has already marked each token as a stop word or not and stored it in `is_stop` attribute of each token. This makes it very handy for text cleaning. Let's take a quick look: 

In [0]:
sentence_example = "the AI/AGI uprising cannot happen without the progress of NLP"

In [14]:
[(token, token.is_stop, token.is_punct) for token in nlp(sentence_example)]

[(the, True, False),
 (AI, False, False),
 (/, False, True),
 (AGI, False, False),
 (uprising, False, False),
 (can, True, False),
 (not, True, False),
 (happen, False, False),
 (without, True, False),
 (the, True, False),
 (progress, False, False),
 (of, True, False),
 (NLP, False, False)]

In [15]:
for token in doc[:5]:
    print(token, token.is_stop, token.is_punct)

THE True False
ADVENTURES False False
OF True False
SHERLOCK False False
HOLMES False False


In [0]:
text_lower = text.lower()  # native python function
doc_lower = nlp(text_lower)

In [17]:
for token in doc_lower[:5]:
    print(token, token.is_stop)

the True
adventures False
of True
sherlock False
holmes False


In [18]:
from spacy.lang.en.stop_words import STOP_WORDS
f'spaCy has a dictionary of {len(list(STOP_WORDS))} stop words'

'spaCy has a dictionary of 326 stop words'

In [0]:
domain_stop_words = ["NLP", "Processing", "AGI"]
for word in domain_stop_words:
    STOP_WORDS.add(word)

In [20]:
f'spaCy has a dictionary of {len(list(STOP_WORDS))} stop words'

'spaCy has a dictionary of 329 stop words'

In [34]:
# !pip install --upgrade spacy
# spacy.__version__
nlp.vocab['NLP'].is_stop = True
print(nlp.vocab['NLP'].is_stop)
print(nlp.vocab['uprising'].is_stop)
f'spaCy has a dictionary of {len(list(STOP_WORDS))} stop words'
nlp.vocab['uprising'].is_stop = True
print (f'spaCy has a dictionary of {len(list(STOP_WORDS))} stop words')
print( 'uprising' in STOP_WORDS)
print(nlp.vocab['uprising'].is_stop)

True
True
spaCy has a dictionary of 329 stop words
False
True


In [32]:
'NLP' in STOP_WORDS

True

In [0]:
import spacy
nlp = spacy.load('en')

In [25]:
[(token, token.is_stop, token.is_punct) for token in nlp("the AI/AGI uprising cannot happen without the progress of NLP")]

[(the, True, False),
 (AI, False, False),
 (/, False, True),
 (AGI, False, False),
 (uprising, False, False),
 (can, True, False),
 (not, True, False),
 (happen, False, False),
 (without, True, False),
 (the, True, False),
 (progress, False, False),
 (of, True, False),
 (NLP, False, False)]

In [82]:
[str(token) for token in nlp(sentence_example) if not token.is_stop and not token.is_punct]

['AI', 'AGI', 'uprising', 'happen', 'progress', 'NLP']

In [83]:
[str(token) for token in nlp(sentence_example) if not token.is_stop]

['AI', '/', 'AGI', 'uprising', 'happen', 'progress', 'NLP']

## Stemming and Lemmatization

### spaCy for Lemmatization
**spaCy only supports lemmatization** 

An underscore at end, such as `lemma_` tells spaCy we are looking for something which is human readable. spaCy stores the internal hash or identifier which spaCy stores in `token.lemma`. 

In [35]:
lemma_sentence_example = "Their Apples & Banana fruit salads are amazing. Would you like meeting me at the cafe?"
[(token, token.lemma_, token.lemma, token.pos_ ) for token in nlp(lemma_sentence_example)]

[(Their, '-PRON-', 561228191312463089, 'DET'),
 (Apples, 'Apples', 9297668116247400838, 'PROPN'),
 (&, '&', 15473034735919704609, 'CCONJ'),
 (Banana, 'Banana', 7617506991971869807, 'PROPN'),
 (fruit, 'fruit', 17674554054627885835, 'NOUN'),
 (salads, 'salad', 16382906660984395826, 'NOUN'),
 (are, 'be', 10382539506755952630, 'VERB'),
 (amazing, 'amazing', 12968186374132960503, 'ADJ'),
 (., '.', 12646065887601541794, 'PUNCT'),
 (Would, 'Would', 10299253490465169573, 'VERB'),
 (you, '-PRON-', 561228191312463089, 'PRON'),
 (like, 'like', 18194338103975822726, 'VERB'),
 (meeting, 'meet', 6880656908171229526, 'VERB'),
 (me, '-PRON-', 561228191312463089, 'PRON'),
 (at, 'at', 11667289587015813222, 'ADP'),
 (the, 'the', 7425985699627899538, 'DET'),
 (cafe, 'cafe', 10569699879655997926, 'NOUN'),
 (?, '?', 8205403955989537350, 'PUNCT')]