In [1]:
import nltk
import spacy

# Tokenization #
### Tokenization is the process of breaking up the original text into components (tokens) ###

## Spacy code ##

In [2]:
# Load spaCy's "en_core_web_sm" English pipeline. The returned object is
# called on raw text in the cells below to produce processed documents.
nlp=spacy.load("en_core_web_sm")

In [3]:
# Create a document: running the pipeline on the raw string splits it
# into tokens that can be iterated over.
doc1=nlp(u'Lewis University located in U.S., Since 1932, Lewis University has been grounded in intentionality, guided by truth, and inspired by innovation.')

In [4]:
# Each token carries attributes we can read.
# For example, `text` is the raw text that the token covers.
# The tokenizer is smart enough to keep "U.S." as a single token.
print(*(token.text for token in doc1), sep="\n")

Lewis
University
located
in
U.S.
,
Since
1932
,
Lewis
University
has
been
grounded
in
intentionality
,
guided
by
truth
,
and
inspired
by
innovation
.


In [5]:
# Show every token with its part-of-speech tag: `pos_` is the readable
# label and `pos` is the integer ID printed for that same label.
print(*(f"{token.text} {token.pos_} {token.pos}" for token in doc1), sep="\n")

Lewis PROPN 96
University PROPN 96
located VERB 100
in ADP 85
U.S. PROPN 96
, PUNCT 97
Since SCONJ 98
1932 NUM 93
, PUNCT 97
Lewis PROPN 96
University PROPN 96
has AUX 87
been AUX 87
grounded VERB 100
in ADP 85
intentionality NOUN 92
, PUNCT 97
guided VERB 100
by ADP 85
truth NOUN 92
, PUNCT 97
and CCONJ 89
inspired VERB 100
by ADP 85
innovation NOUN 92
. PUNCT 97


In [6]:
# `lemma_` gives the lemmatization of each token, i.e. the base form of the word.
print(*(f'{token.text} ===>  {token.lemma_}' for token in doc1), sep="\n")

Lewis ===>  Lewis
University ===>  University
located ===>  locate
in ===>  in
U.S. ===>  U.S.
, ===>  ,
Since ===>  since
1932 ===>  1932
, ===>  ,
Lewis ===>  Lewis
University ===>  University
has ===>  have
been ===>  be
grounded ===>  ground
in ===>  in
intentionality ===>  intentionality
, ===>  ,
guided ===>  guide
by ===>  by
truth ===>  truth
, ===>  ,
and ===>  and
inspired ===>  inspire
by ===>  by
innovation ===>  innovation
. ===>  .


In [7]:
# Sentence tokenization: build a document that contains several sentences
# so it can be split sentence by sentence in the next cell.
doc2=nlp(u"Welcome! this is the firstsentence. this is the second sentence. This is the last sentence ")

In [8]:
# `sents` yields the document one sentence at a time; print each on its own line.
print(*doc2.sents, sep="\n")

Welcome!
this is the firstsentence.
this is the second sentence.
This is the last sentence


## NLTK code ##


In [46]:
from nltk.tokenize import word_tokenize
# Tokenize the same sentence that was given to spaCy earlier, so the output of
# the two tokenizers can be compared token by token.
text="Lewis University located in U.S., Since 1932, Lewis University has been grounded in intentionality, guided by truth, and inspired by innovation."
tokens = word_tokenize(text)
print(tokens)

['Lewis', 'University', 'located', 'in', 'U.S.', ',', 'Since', '1932', ',', 'Lewis', 'University', 'has', 'been', 'grounded', 'in', 'intentionality', ',', 'guided', 'by', 'truth', ',', 'and', 'inspired', 'by', 'innovation', '.']


In [47]:
# A second example with punctuation and a contraction; in the output below
# "Isn't" is split into the two tokens "Is" and "n't".
text2 = "Hello, world! NLP is amazing. Isn't it?"
tokens = word_tokenize(text2)
print(tokens)

['Hello', ',', 'world', '!', 'NLP', 'is', 'amazing', '.', 'Is', "n't", 'it', '?']


In [59]:
from nltk.tokenize import sent_tokenize
# Sentence tokenization: split text2 into a list of sentence strings.
sentences = sent_tokenize(text2)
print(sentences)

['Hello, world!', 'NLP is amazing.', "Isn't it?"]


### Custom tokenization using regular expressions: ###


In [60]:
from nltk.tokenize import RegexpTokenizer
# Build a tokenizer from a regular expression; each match of the pattern
# becomes one token in the output.
tokenizer = RegexpTokenizer(r'\w+')  # Words only (no punctuation)
print(tokenizer.tokenize("Hello, world!"))  # ['Hello', 'world']

['Hello', 'world']


In [61]:
# Example: a sentence with a price, a contraction and a percentage.
# In the output below, word_tokenize separates "$" from "12.50",
# "that" from "'s", and "50" from "%".
text3 = "This costs $12.50 and that's 50% off!"
tokens3 = word_tokenize(text3)
print(tokens3)

['This', 'costs', '$', '12.50', 'and', 'that', "'s", '50', '%', 'off', '!']


In [63]:
# Custom pattern; the alternatives are tried left to right at each position:
#   \$?\d+(?:\.\d+)?   a number with an optional leading "$" and an optional decimal part
#   \w+                a run of word characters
#   \S                 any other single non-whitespace character (punctuation, symbols)
# The decimal part is an optional non-capturing group, so single-digit amounts
# such as "$5" stay in one token instead of being split into "$" and "5".
tokenizer = RegexpTokenizer(r'\$?\d+(?:\.\d+)?|\w+|\S')
print(tokenizer.tokenize("Hello, world!"))

['Hello', ',', 'world', '!']


### Common Use Cases ###

1- Text preprocessing for NLP pipelines

2- Feature extraction for machine learning

3- Word frequency analysis

4- Sentiment analysis

# Stemming #
### Porter Stemmer ###
One of the most common - and effective - stemming tools is Porter's Algorithm developed by Martin Porter in 1980. The algorithm employs five phases of word reduction, each with its own set of mapping rules. 

In [32]:
# Import the toolkit and the full Porter Stemmer library
import nltk

from nltk.stem.porter import *

In [33]:
# Create the Porter stemmer instance used to stem the word list below.
pstemmer = PorterStemmer()


In [42]:
# Sample words for the stemmers: inflected and derived forms of "jump" and
# "run", plus several "-ly" adverbs and the noun "happiness".
words = ['jump','run','running','runner', 'jumping', 'jumps', 'jumped', 'fairly','easily', 'happily', 'happiness']

In [43]:
# Apply the Porter stemmer to every sample word and show the resulting stems.
stemmed_words = list(map(pstemmer.stem, words))
print(stemmed_words)

['jump', 'run', 'run', 'runner', 'jump', 'jump', 'jump', 'fairli', 'easili', 'happili', 'happi']


Note how the stemmer leaves "runner" intact, treating it like a noun rather than a verb form or participle of "run". Also, the adverbs "fairly", "easily" and "happily" and the noun "happiness" are stemmed to the unusual roots "fairli", "easili", "happili" and "happi".

### Snowball Stemmer ###
The algorithm implemented here is technically known as the "English Stemmer" or "Porter2 Stemmer", an enhanced version of the original Porter stemmer with optimizations in both logic and computational efficiency. Because the nltk library refers to this implementation as SnowballStemmer, we will adhere to that naming convention for consistency in this context.

In [44]:
from nltk.stem.snowball import SnowballStemmer

# Create the Snowball stemmer used in the next cell.
# The Snowball Stemmer requires that you pass a language parameter
sstemmer = SnowballStemmer(language='english')

In [45]:
# Apply the Snowball stemmer to the same sample words for comparison with Porter.
stemmed_words = list(map(sstemmer.stem, words))
print(stemmed_words)

['jump', 'run', 'run', 'runner', 'jump', 'jump', 'jump', 'fair', 'easili', 'happili', 'happi']


<font color=blue>In this instance, the stemmer produced results nearly identical to the Porter Stemmer, with one key improvement: it correctly derived the stem "fair" from "fairly", demonstrating more accurate handling of adverbial forms.</font>

# Lemmatization #

Unlike stemming, which simply truncates word endings, lemmatization employs comprehensive linguistic analysis to determine a word's canonical form (lemma) based on its morphological structure and syntactic context. This process:

1- References a language's complete lexical database

2- Accounts for part-of-speech and semantic context

3- Produces dictionary-valid words

### Examples: ###

'was' → 'be' (verb inflection)

'mice' → 'mouse' (noun pluralization)

'meeting' → either 'meet' (verb) or 'meeting' (noun), depending on usage

## Spacy ##
Key Advantage: Uses part-of-speech (POS) context to determine the correct lemma.


In [73]:
# Run the spaCy pipeline on a sentence containing an irregular plural ("mice")
# and several verb forms ("were", "running", "meeting").
text = "The mice were running in circles, meeting their friends."
doc = nlp(text)

# One aligned row per token: surface form, lemma and part-of-speech tag.
print(
    *(
        f"Word: {token.text:<8} | Lemma: {token.lemma_:<8} | POS: {token.pos_}"
        for token in doc
    ),
    sep="\n",
)

Word: The      | Lemma: the      | POS: DET
Word: mice     | Lemma: mouse    | POS: NOUN
Word: were     | Lemma: be       | POS: AUX
Word: running  | Lemma: run      | POS: VERB
Word: in       | Lemma: in       | POS: ADP
Word: circles  | Lemma: circle   | POS: NOUN
Word: ,        | Lemma: ,        | POS: PUNCT
Word: meeting  | Lemma: meet     | POS: VERB
Word: their    | Lemma: their    | POS: PRON
Word: friends  | Lemma: friend   | POS: NOUN
Word: .        | Lemma: .        | POS: PUNCT


## Using NLTK ##

In [72]:
from nltk.stem import WordNetLemmatizer
#from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize

In [66]:
# WordNet-based lemmatizer; it is also reused by the sentence helper further down.
lemmatizer = WordNetLemmatizer()

# lemmatize() is called here without a part-of-speech argument; in the output
# below only "mice" changes, while "running", "better" and "meeting" come back as-is.
words = ["mice", "running", "better", "meeting"]
lemmas = list(map(lemmatizer.lemmatize, words))

print("Lemmatization Results:", lemmas)

Lemmatization Results: ['mouse', 'running', 'better', 'meeting']


### Lemmatization sentence ###

In [71]:
# Define the input explicitly so this cell does not depend on whichever earlier
# cell last assigned `text` (the name is reused for different sentences above).
text = "The mice were running in circles, meeting their friends."


def lemmatize_sentence(sentence):
    """Tokenize ``sentence`` and lemmatize every token.

    The sentence is split with ``word_tokenize`` and each token is passed to
    ``lemmatizer.lemmatize`` without a part-of-speech argument. As the output
    below shows, plural nouns are reduced ("mice" -> "mouse") while verb forms
    such as "were" and "running" are returned unchanged.

    Args:
        sentence: The raw text to process.

    Returns:
        A list with one lemma string per token, in token order.
    """
    return [lemmatizer.lemmatize(word) for word in word_tokenize(sentence)]


print(lemmatize_sentence(text))

['The', 'mouse', 'were', 'running', 'in', 'circle', ',', 'meeting', 'their', 'friend', '.']
