# Text Tokenization Exercise

This exercise explores the challenges of splitting text into sentences and words when dealing with complex real-world text containing dates, amounts, URLs, emails, acronyms, and multi-word expressions.

In [1]:
# Sample paragraph packed with tokenizer traps: honorifics and degrees
# ("Dr.", "Ph.D."), a currency amount, an abbreviated date, dotted acronyms,
# an e-mail address, a URL, a percentage and a hyphenated compound.
text = """Dr. John Smith, Ph.D., earned $1,250.50 on Jan. 15, 2024, for his work at A.I. Corp. You can reach him at j.smith@ai-corp.co.uk or visit https://www.ai-corp.co.uk/team/dr-smith for more info. The U.S.A.-based company reported a 23.5% increase in Q3 revenue, totaling €2.5M."""

# Echo the raw input so the tokenizer outputs below can be compared against it.
print("Original text:", text, sep="\n")

Original text:
Dr. John Smith, Ph.D., earned $1,250.50 on Jan. 15, 2024, for his work at A.I. Corp. You can reach him at j.smith@ai-corp.co.uk or visit https://www.ai-corp.co.uk/team/dr-smith for more info. The U.S.A.-based company reported a 23.5% increase in Q3 revenue, totaling €2.5M.


Before

In [2]:
import re

# Naive baseline: cut wherever whitespace follows '.', '!' or '?'.
# The lookbehind leaves the punctuation attached to the preceding chunk.
# This over-splits abbreviations such as "Dr." and "Jan.".
sentence_boundary = re.compile(r'(?<=[.!?])\s+')
sentences = sentence_boundary.split(text)

# One chunk per line.
print(*sentences, sep="\n")

Dr.
John Smith, Ph.D., earned $1,250.50 on Jan.
15, 2024, for his work at A.I.
Corp.
You can reach him at j.smith@ai-corp.co.uk or visit https://www.ai-corp.co.uk/team/dr-smith for more info.
The U.S.A.-based company reported a 23.5% increase in Q3 revenue, totaling €2.5M.


In [3]:
# Naive baseline: break on runs of whitespace only, so punctuation stays glued
# to the neighbouring word (e.g. 'Smith,' and 'info.').
words = text.split(sep=None)
print(words)

['Dr.', 'John', 'Smith,', 'Ph.D.,', 'earned', '$1,250.50', 'on', 'Jan.', '15,', '2024,', 'for', 'his', 'work', 'at', 'A.I.', 'Corp.', 'You', 'can', 'reach', 'him', 'at', 'j.smith@ai-corp.co.uk', 'or', 'visit', 'https://www.ai-corp.co.uk/team/dr-smith', 'for', 'more', 'info.', 'The', 'U.S.A.-based', 'company', 'reported', 'a', '23.5%', 'increase', 'in', 'Q3', 'revenue,', 'totaling', '€2.5M.']


After

In [None]:
# Sentences
import spacy

# Run the small English pipeline over the sample text and collect the text of
# every sentence span it yields.
nlp = spacy.load("en_core_web_sm")
doc = nlp(text)
sentences = [sent.text for sent in doc.sents]

sentences

['Dr. John Smith, Ph.D., earned $1,250.50 on Jan. 15, 2024, for his work at A.I. Corp.',
 'You can reach him at j.smith@ai-corp.co.uk or visit https://www.ai-corp.co.uk/team/dr-smith for more info.',
 'The U.S.A.-based company reported a 23.5% increase in Q3 revenue, totaling €2.5M.']

In [10]:
# Tokens or words
import spacy

# Run the small English pipeline over the sample text and collect the surface
# text of every token in the parsed document.
nlp = spacy.load("en_core_web_sm")
doc = nlp(text)
tokens = [tok.text for tok in doc]

tokens

['Dr.',
 'John',
 'Smith',
 ',',
 'Ph.D.',
 ',',
 'earned',
 '$',
 '1,250.50',
 'on',
 'Jan.',
 '15',
 ',',
 '2024',
 ',',
 'for',
 'his',
 'work',
 'at',
 'A.I.',
 'Corp.',
 'You',
 'can',
 'reach',
 'him',
 'at',
 'j.smith@ai-corp.co.uk',
 'or',
 'visit',
 'https://www.ai-corp.co.uk/team/dr-smith',
 'for',
 'more',
 'info',
 '.',
 'The',
 'U.S.A.-based',
 'company',
 'reported',
 'a',
 '23.5',
 '%',
 'increase',
 'in',
 'Q3',
 'revenue',
 ',',
 'totaling',
 '€',
 '2.5M.']

# Corpus Tokenization Exercise

This exercise explores the challenges of splitting large corpora into words and finding the most common words.

In [12]:
import spacy
from collections import Counter

# Named-entity recognition is not necessary for counting words, so that
# pipeline component is disabled when loading the model.
nlp = spacy.load("en_core_web_sm", disable=["ner"])

# Parse `text` with the pipeline loaded in this cell. Without this step the
# loop below would read whatever `doc` an earlier cell left in the kernel and
# the freshly loaded `nlp` would never be used.
doc = nlp(text)

# Keep purely alphabetic tokens, lowercased so that differently-cased
# occurrences of a word are counted together.
tokens = [token.text.lower() for token in doc if token.is_alpha]

counts = Counter(tokens)
count = counts.most_common(20)  # the 20 most frequent words with their counts

count

[('for', 2),
 ('at', 2),
 ('john', 1),
 ('smith', 1),
 ('earned', 1),
 ('on', 1),
 ('his', 1),
 ('work', 1),
 ('you', 1),
 ('can', 1),
 ('reach', 1),
 ('him', 1),
 ('or', 1),
 ('visit', 1),
 ('more', 1),
 ('info', 1),
 ('the', 1),
 ('company', 1),
 ('reported', 1),
 ('a', 1)]

## The Challenge

Given the file `shakes.txt` in the book folder, find the most common words in Shakespeare's book.

In [15]:
import re
from collections import Counter

# Read the whole file inside a context manager so the handle is closed even if
# reading fails. The encoding is explicit so the result does not depend on the
# platform default; undecodable bytes are replaced instead of raising, since
# only ASCII letters and apostrophes are matched below.
with open("shakes.txt", encoding="utf-8", errors="replace") as file:
    text = file.read()

text = text.lower()  # lowercase so that case variants are counted together

# A word is a run of letters, optionally followed by an apostrophe and more
# letters, so forms such as "king's" stay in one token.
pattern = r"[a-z]+(?:'[a-z]+)?"
tokens = re.findall(pattern, text)

# Count word frequencies and keep the 20 most frequent.
counts = Counter(tokens)
count = counts.most_common(20)

# Results
count

[('the', 27843),
 ('and', 26845),
 ('i', 20717),
 ('to', 19773),
 ('of', 18299),
 ('a', 14798),
 ('you', 13732),
 ('my', 12489),
 ('that', 11160),
 ('in', 11067),
 ('is', 9626),
 ('not', 8760),
 ('for', 8281),
 ('with', 8053),
 ('me', 7776),
 ('it', 7717),
 ('be', 7117),
 ('this', 6898),
 ('your', 6891),
 ('his', 6859)]