In [1]:
from notebook.services.config import ConfigManager
cm = ConfigManager()
cm.update('livereveal', {
        'width': 1300,
        'height': 300,
        'scroll': True,
})

{'width': 1300, 'height': 300, 'scroll': True}

## Topics will be covered in this session

1. Regular Expressions
2. Tokenization
3. Stop words
4. Lemmatization
5. Stemming
6. Challenges in Tokenization

## Regular Expressions

* A RegEx, or Regular Expression, is a sequence of characters that forms a search pattern.
* RegEx can be used to check if a string contains the specified search pattern.
* __re__ module is used to work with regular expressions.

In [4]:
# import re module
import re

import nltk
# nltk.download('punkt')

### Metacharacters

* A metacharacter is a character that has a special meaning during pattern processing. 
* Metacharacters can be used in regular expressions to define the search criteria and any text manipulations.

#__^__ &emsp; --> &emsp; Beginning of line<br>
__$__ &emsp; --> &emsp; End of line<br>
__[abc]__ &emsp; --> &emsp; Match any character enclosed in the brackets<br>
__[^abc]__ &emsp; --> &emsp; Match any character not enclosed in the brackets<br>
__[a-z]__ &emsp; --> &emsp; Match the range of characters specified by the hyphen<br>
__.__ &emsp; --> &emsp; Match any single character<br>
__\__ &emsp; --> &emsp; Use the literal meaning of the metacharacter<br>
__?__ &emsp; --> &emsp; Match zero or one of the preceding expressio<br>
__*__ &emsp; --> &emsp; Match zero, one, or many of the preceding expression<br>
__+__ &emsp; --> &emsp; Match one or many of the preceding expression<br>
__\d__ &emsp; --> &emsp; Match digit (0-9)<br>
__\D__ &emsp; --> &emsp; Not a digit (0-9)<br>
__\w__ &emsp; --> &emsp; Word character<br>
__\W__ &emsp; --> &emsp; Not a word character<br>
__\s__ &emsp; --> &emsp; Whitespace (space, tab, newline)<br>
__\S__ &emsp; --> &emsp; Not a whitespace<br>

In [3]:
text_to_search = '''
abcdefghijklmnopqurtuvwxyz
ABCDEFGHIJKLMNOPQRSTUVWXYZ

1234567890

Ha HazHa

MetaCharacters (Need to be escaped):
. ^ $ * + ? { } [ ] \ | ( )

coreyms.com

321-555-4321
123.555.1234
123*555*1234
800-555-1234
900-555-1234

Mr. Schafer
Mr Smith
Ms Davis
Mrs. Robinson
Mr. T

cat
mat
pat
bat
'''

emails = '''
CoreyMSchafer@gmail.com
corey.schafer@university.edu
corey-321-schafer@my-work.net
'''

urls = '''
https://www.google.com
http://coreyms.com
https://youtube.com
https://www.nasa.gov
'''

sentence = 'Start a sentence and then bring it to an end'


### First Type RegularExpressions

#### re.compile()

We can combine a regular expression pattern into pattern objects, which can be used for pattern matching. It also helps to search a pattern again without rewriting it.

In [4]:
# defining a pattern
text = 'abc is the text here'
pattern = re.compile(r'abc')

In [5]:
# match
match = pattern.match(text)   #Match searches only at the start of the string
match

<re.Match object; span=(0, 3), match='abc'>

In [6]:
pattern_m = re.compile(r'here')
print(text)
match = pattern_m.match(text)
match

abc is the text here


No matches because match() is used match a pattern at the start of a string.

In [7]:
# find the matches of pattern
# findall method
pattern = re.compile(r'\w+')   #\w+ searches all the charecters and groups them as words || while \w matches the charecters
match = pattern.findall(text)
print(type(match))
match

<class 'list'>


['abc', 'is', 'the', 'text', 'here']

In [8]:
# search method
# looks for the first location where the pattern matches
match = pattern.search(text)  # If a match is found, then re.search() returns a match object. Otherwise, it returns None
match

<re.Match object; span=(0, 3), match='abc'>

In [9]:
# match any character
pattern = re.compile(r'\.')
match = pattern.findall(text) 
match

[]

In [10]:
# match period character
pattern = re.compile(r'\.')
text1 = "something is useful."
match = pattern.findall(text1)  #findall returns the charecter specified after \
match

['.']

In [11]:
# match digits
pattern = re.compile(r'\d+')
match = pattern.findall(text_to_search) #\d+ selects and groups the numbers || while \d selects only the charecters
match

['1234567890',
 '321',
 '555',
 '4321',
 '123',
 '555',
 '1234',
 '123',
 '555',
 '1234',
 '800',
 '555',
 '1234',
 '900',
 '555',
 '1234']

In [45]:
# match non digits
pattern = re.compile(r'\D')match = pattern.findall(text_to_search) # while \d selects the digits 
match                                                              #while \D+ selects non digits and groups


['\n',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'u',
 'r',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 '\n',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'X',
 'Y',
 'Z',
 '\n',
 '\n',
 '\n',
 '\n',
 'H',
 'a',
 ' ',
 'H',
 'a',
 'z',
 'H',
 'a',
 '\n',
 '\n',
 'M',
 'e',
 't',
 'a',
 'C',
 'h',
 'a',
 'r',
 'a',
 'c',
 't',
 'e',
 'r',
 's',
 ' ',
 '(',
 'N',
 'e',
 'e',
 'd',
 ' ',
 't',
 'o',
 ' ',
 'b',
 'e',
 ' ',
 'e',
 's',
 'c',
 'a',
 'p',
 'e',
 'd',
 ')',
 ':',
 '\n',
 '.',
 ' ',
 '^',
 ' ',
 '$',
 ' ',
 '*',
 ' ',
 '+',
 ' ',
 '?',
 ' ',
 '{',
 ' ',
 '}',
 ' ',
 '[',
 ' ',
 ']',
 ' ',
 '\\',
 ' ',
 '|',
 ' ',
 '(',
 ' ',
 ')',
 '\n',
 '\n',
 'c',
 'o',
 'r',
 'e',
 'y',
 'm',
 's',
 '.',
 'c',
 'o',
 'm',
 '\n',
 '\n',
 '-',
 '-',
 '\n',
 '.',
 '.',
 '\n',
 '*',
 '*',
 '\n',
 '-',
 '-',
 '\n',
 '-',
 '-',
 '\n',
 '\n',
 '

In [13]:
# match word character
pattern = re.compile(r'\w+')
match = pattern.findall(text_to_search)
match

['abcdefghijklmnopqurtuvwxyz',
 'ABCDEFGHIJKLMNOPQRSTUVWXYZ',
 '1234567890',
 'Ha',
 'HazHa',
 'MetaCharacters',
 'Need',
 'to',
 'be',
 'escaped',
 'coreyms',
 'com',
 '321',
 '555',
 '4321',
 '123',
 '555',
 '1234',
 '123',
 '555',
 '1234',
 '800',
 '555',
 '1234',
 '900',
 '555',
 '1234',
 'Mr',
 'Schafer',
 'Mr',
 'Smith',
 'Ms',
 'Davis',
 'Mrs',
 'Robinson',
 'Mr',
 'T',
 'cat',
 'mat',
 'pat',
 'bat']

In [14]:
# match not a word character
pattern = re.compile(r'\W')
print(text_to_search)
match = pattern.findall(text_to_search)
match


abcdefghijklmnopqurtuvwxyz
ABCDEFGHIJKLMNOPQRSTUVWXYZ

1234567890

Ha HazHa

MetaCharacters (Need to be escaped):
. ^ $ * + ? { } [ ] \ | ( )

coreyms.com

321-555-4321
123.555.1234
123*555*1234
800-555-1234
900-555-1234

Mr. Schafer
Mr Smith
Ms Davis
Mrs. Robinson
Mr. T

cat
mat
pat
bat



['\n',
 '\n',
 '\n',
 '\n',
 '\n',
 '\n',
 ' ',
 '\n',
 '\n',
 ' ',
 '(',
 ' ',
 ' ',
 ' ',
 ')',
 ':',
 '\n',
 '.',
 ' ',
 '^',
 ' ',
 '$',
 ' ',
 '*',
 ' ',
 '+',
 ' ',
 '?',
 ' ',
 '{',
 ' ',
 '}',
 ' ',
 '[',
 ' ',
 ']',
 ' ',
 '\\',
 ' ',
 '|',
 ' ',
 '(',
 ' ',
 ')',
 '\n',
 '\n',
 '.',
 '\n',
 '\n',
 '-',
 '-',
 '\n',
 '.',
 '.',
 '\n',
 '*',
 '*',
 '\n',
 '-',
 '-',
 '\n',
 '-',
 '-',
 '\n',
 '\n',
 '.',
 ' ',
 '\n',
 ' ',
 '\n',
 ' ',
 '\n',
 '.',
 ' ',
 '\n',
 '.',
 ' ',
 '\n',
 '\n',
 '\n',
 '\n',
 '\n',
 '\n']

In [15]:
# match space, tab, newline
pattern = re.compile(r'\s')
match = pattern.findall(text_to_search)
match

['\n',
 '\n',
 '\n',
 '\n',
 '\n',
 '\n',
 ' ',
 '\n',
 '\n',
 ' ',
 ' ',
 ' ',
 ' ',
 '\n',
 ' ',
 ' ',
 ' ',
 ' ',
 ' ',
 ' ',
 ' ',
 ' ',
 ' ',
 ' ',
 ' ',
 ' ',
 ' ',
 '\n',
 '\n',
 '\n',
 '\n',
 '\n',
 '\n',
 '\n',
 '\n',
 '\n',
 '\n',
 ' ',
 '\n',
 ' ',
 '\n',
 ' ',
 '\n',
 ' ',
 '\n',
 ' ',
 '\n',
 '\n',
 '\n',
 '\n',
 '\n',
 '\n']

In [46]:
# match not whitespace character
pattern = re.compile(r'\S+')
match = pattern.findall(text_to_search)
match

['abcdefghijklmnopqurtuvwxyz',
 'ABCDEFGHIJKLMNOPQRSTUVWXYZ',
 '1234567890',
 'Ha',
 'HazHa',
 'MetaCharacters',
 '(Need',
 'to',
 'be',
 'escaped):',
 '.',
 '^',
 '$',
 '*',
 '+',
 '?',
 '{',
 '}',
 '[',
 ']',
 '\\',
 '|',
 '(',
 ')',
 'coreyms.com',
 '321-555-4321',
 '123.555.1234',
 '123*555*1234',
 '800-555-1234',
 '900-555-1234',
 'Mr.',
 'Schafer',
 'Mr',
 'Smith',
 'Ms',
 'Davis',
 'Mrs.',
 'Robinson',
 'Mr.',
 'T',
 'cat',
 'mat',
 'pat',
 'bat']

 ### Second Type RegularExpressions

In [50]:
# beginning of line
pattern = re.compile(r'^[Ss]tart')
sentence="Start with new end"
match = pattern.findall(sentence)
match

['Start']

In [18]:
# ending of line
pattern = re.compile(r'end$')
match = pattern.findall(sentence)
match

['end']

In [52]:
# check phone number pattern
pattern = re.compile(r'\d{3}.\d\d\d.\d{4}') #. matches with the any charecter of the given context
match = pattern.findall(text_to_search)
match

['321-555-4321',
 '123.555.1234',
 '123*555*1234',
 '800-555-1234',
 '900-555-1234']

In [20]:
# check phone number starts with 800 or 900
pattern = re.compile(r'[89]00.\d\d\d.\d\d\d\d')
match = pattern.findall(text_to_search)
match

['800-555-1234', '900-555-1234']

In [58]:
# Check name pattern only starts with Mr
pattern = re.compile(r'Mr\.?\s*[A-Za-z]\w+')
match = pattern.findall(text_to_search)
match

['Mr. Schafer', 'Mr Smith']

In [22]:
# Check name pattern
pattern = re.compile(r'M[rs][s]?\.?\s[A-Z]\w*')
match = pattern.findall(text_to_search)
match

['Mr. Schafer', 'Mr Smith', 'Ms Davis', 'Mrs. Robinson', 'Mr. T']

In [60]:
# check email pattern
pattern = re.compile(r'[a-zA-Z]+@[a-zA-Z]+\.com')
match = pattern.findall(emails)
match

['CoreyMSchafer@gmail.com']

### finditer()

* Get advantage of grouping

In [24]:
# Check name pattern
pattern = re.compile(r'(Ms|Mr|Mrs)\.?\s[A-Z]\w*')
matchs = pattern.finditer(text_to_search)
for match in matchs:
    print(match)

<re.Match object; span=(223, 234), match='Mr. Schafer'>
<re.Match object; span=(235, 243), match='Mr Smith'>
<re.Match object; span=(244, 252), match='Ms Davis'>
<re.Match object; span=(253, 266), match='Mrs. Robinson'>
<re.Match object; span=(267, 272), match='Mr. T'>


In [62]:
# check email pattern
pattern = re.compile(r'[a-zA-Z0-9.-]+@[a-zA-Z-]+\.(com|net|edu)')
matches = pattern.finditer(emails)
for match in matches:
    print(match)

<re.Match object; span=(1, 24), match='CoreyMSchafer@gmail.com'>
<re.Match object; span=(25, 53), match='corey.schafer@university.edu'>
<re.Match object; span=(54, 83), match='corey-321-schafer@my-work.net'>


## Review Questions
1. What does the function re.match do?<br>
    a. matches a pattern at the start of the string<br>
    b. matches a pattern at any position in the string<br>
    c. such a function does not exist<br>
    d. none of the mentioned<br>

## Answer: a

2. What is the datatype of returned data from findall()?

## Answer: list

# Tokenization

The process of segmenting running text into linguistic units such as words, punctuation, numbers, alpha-numerics, etc. This process is called tokenization.

In [26]:
text = """Backgammon is one of the oldest known board games. Its history can be traced back nearly
5,000 years to archeological discoveries in the Middle East. It is a two player game where
each player has fifteen checkers which move between twenty-four points according to the
roll of two dice."""

* Sentence Tokenization
* Word Tokenization

In [27]:
# Sentence Tokenization
sentences = text.split(".")

sentences

['Backgammon is one of the oldest known board games',
 ' Its history can be traced back nearly\n5,000 years to archeological discoveries in the Middle East',
 ' It is a two player game where\neach player has fifteen checkers which move between twenty-four points according to the\nroll of two dice',
 '']

In [28]:
# Word Tokenization
words = text.split()

words

['Backgammon',
 'is',
 'one',
 'of',
 'the',
 'oldest',
 'known',
 'board',
 'games.',
 'Its',
 'history',
 'can',
 'be',
 'traced',
 'back',
 'nearly',
 '5,000',
 'years',
 'to',
 'archeological',
 'discoveries',
 'in',
 'the',
 'Middle',
 'East.',
 'It',
 'is',
 'a',
 'two',
 'player',
 'game',
 'where',
 'each',
 'player',
 'has',
 'fifteen',
 'checkers',
 'which',
 'move',
 'between',
 'twenty-four',
 'points',
 'according',
 'to',
 'the',
 'roll',
 'of',
 'two',
 'dice.']

# Stop words

In computing, stop words are words that are filtered out before or after the natural language data (text) are processed. While “stop words” typically refers to the most common words in a language, all-natural language processing tools don’t use a single universal list of stop words.

In [29]:
# define your own stop words
stopwords = ['a', 'the', 'we', 'such','this', 'an','will', 'is', 'are', 'were', 'am', 'was', 'being', 'to', 'for', 'on','you', 'I', 'me', 'he', 'him', 'she','her']

In [30]:
text = "Oh man, this is pretty cool. We will do more such things."

In [31]:
# split the text into words using split method
words = text.lower().split()

In [32]:
words

['oh',
 'man,',
 'this',
 'is',
 'pretty',
 'cool.',
 'we',
 'will',
 'do',
 'more',
 'such',
 'things.']

In [33]:
# words without stopwords
words_filtered = [w for w in words if not w in stopwords]

In [34]:
words_filtered

['oh', 'man,', 'pretty', 'cool.', 'do', 'more', 'things.']

## Review Questions
1. Is stopwords list unique for a language?

## Answer:No

# Lemmatization

* Lemmatization is the process of grouping together the different inflected forms of a word so they can be analysed as a single item.
* Used in comprehensive retrieval systems like search engines.

For example, in English, the verb 'to walk' may appear as 'walk', 'walked', 'walks' or 'walking'. The base form, 'walk', that one might look up in a dictionary, is called the lemma for the word. 

In [35]:
# import required module for lemmatiztion
from nltk.stem import WordNetLemmatizer 

In [36]:
# create an instance of WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [37]:
print("cats: ", lemmatizer.lemmatize("cats"))
print("feet: ", lemmatizer.lemmatize("feet"))
print("walks: ", lemmatizer.lemmatize("walks"))

cats:  cat
feet:  foot
walks:  walk


# Stemming

* Stemming is a technique used to extract the base form of the words by removing affixes from them. It is just like cutting down the branches of a tree to its stems. 
* For example, the stem of the words eating, eats, eaten is eat.

In [38]:
# import a stemmer
from nltk.stem import PorterStemmer

In [64]:
# create an instance of stemmer
stemmer = PorterStemmer()

print("googly: ", stemmer.stem("googly"))
print("likely: ", stemmer.stem("likely"))
print("walking: ", stemmer.stem("walking"))
print("feet: ", stemmer.stem("foot"))

googly:  googli
likely:  like
walking:  walk
feet:  foot


## Review Questions:
1. Which of the following are keyword Normalization techniques in NLP<br>
a.  Stemming<br>
b.  Part of Speech<br>
c. Named entity recognition<br>
d. Lemmatization<br>

## Answer: a,d

# Challenges in Tokenization

In [68]:
text = "He has completed his Ph. D. degree. He is happy."
print(text)

# sentences = text.split(". ")
pattern_i = re.compile(r'Ph\.\s+D\.\s+degree')
text_after = re.sub(pattern_i, '[Degree]', text)
print(text_after)
sentences = text_after.split(". ")
print("Before: ", sentences)
sentences = [sent.replace('[Degree]', 'Ph. D. degree') for sent in sentences]

sentences

He has completed his Ph. D. degree. He is happy.
He has completed his [Degree]. He is happy.
Before:  ['He has completed his [Degree]', 'He is happy.']


['He has completed his Ph. D. degree', 'He is happy.']

In the above example we should split the sentece after degree not after Ph.

In [7]:
# Abbreviation

text = "Dr. S. P. Kishore is the primary faculty of this course"
words = text.split()
words
print()
pat=re.compile(r'Dr\. \S\. P.')
text_after=re.sub(pat,"SP",text)
print(text_after)
sentences=text_after.split(".")
print("before: ",sentences)
sentences=[sent.replace("SP","Dr.S.P") for sent in sentences]

sentences


SP Kishore is the primary faculty of this course
before:  ['SP Kishore is the primary faculty of this course']


['Dr.S.P Kishore is the primary faculty of this course']

In the above example Dr. S. P. Kishore should be one token instead of multiple.

In [69]:
# hyphenation

text = "We would deal with the state-of-the-art."

words = text.split()

words

['We', 'would', 'deal', 'with', 'the', 'state', '-of-the-art.']

In [43]:
# numbers

text = "The value of gravity is 9.8 m/s/s"

words = text.split(".")

words

['The value of gravity is 9', '8 m/s/s']

## Review Questions:
1. Which one of the following is not a pre-processing technique in NLP<br>
    a. Stemming and Lemmatization<br>
    b. converting to lowercase<br>
    c. removing punctuations<br>
    d. removal of stop words<br>
    e. Sentiment analysis<br>

## Answer:e

### Next Week Content

* Spelling mistake detection and correction with minimum edit distance
* Chunking and NER
* POS tagging