In [73]:
import spacy


In [74]:
# Load spaCy's small English pipeline once; later cells reuse `nlp`.
nlp = spacy.load("en_core_web_sm")

In [75]:
# Example inputs: the first is used for part-of-speech tagging,
# the second for named-entity recognition.
pos_sentence, ner_sentence = (
    "The quick brown fox jumps over the lazy dog.",
    "Barack Obama was the 44th President of the United States.",
)

In [76]:
import pandas as pd
# HTML is needed by this cell; import it here so the cell runs on a fresh
# kernel instead of relying on an import made further down the notebook.
from IPython.display import HTML, display

# Tag every token of the example sentence with its coarse part of speech
# and render the (token, tag) pairs as an HTML table without the index column.
pos_doc = nlp(pos_sentence)
pos_tags = [(token.text, token.pos_) for token in pos_doc]
df = pd.DataFrame(pos_tags, columns=['TOKEN', 'POS'])
display(HTML(df.to_html(index=False)))

TOKEN,POS
The,DET
quick,ADJ
brown,ADJ
fox,NOUN
jumps,VERB
over,ADP
the,DET
lazy,ADJ
dog,NOUN
.,PUNCT


In [77]:
import pandas as pd
from IPython.display import HTML

# Collect every named entity spaCy found in the sentence together with its
# label, then show the pairs as an HTML table without the index column.
# (The list keeps the name `pos_tags`, although it holds entity labels here.)
ner_doc = nlp(ner_sentence)
pos_tags = []
for entity in ner_doc.ents:
    pos_tags.append((entity.text, entity.label_))
df = pd.DataFrame(data=pos_tags, columns=('TOKEN', 'ENT'), index=None)
ner_table_html = df.to_html(index=False)
display(HTML(ner_table_html))

TOKEN,ENT
Barack Obama,PERSON
44th,ORDINAL
the United States,GPE


In [78]:
from nltk.stem import PorterStemmer, SnowballStemmer

# The two stemmers being compared.
porter = PorterStemmer()
snowball = SnowballStemmer("english")

# Sample vocabulary for the demonstration.
words = ["running", "jumps", "happily", "organization", "universal", "generously"]

# Porter results, one "Original/Stemmed" line per word.
print("Porter Stemmer:")
porter_lines = [f"Original: {w}, Stemmed: {porter.stem(w)}" for w in words]
print("\n".join(porter_lines))

# Snowball results for the same words, in the same format.
print("\nSnowball Stemmer (English):")
snowball_lines = [f"Original: {w}, Stemmed: {snowball.stem(w)}" for w in words]
print("\n".join(snowball_lines))

Porter Stemmer:
Original: running, Stemmed: run
Original: jumps, Stemmed: jump
Original: happily, Stemmed: happili
Original: organization, Stemmed: organ
Original: universal, Stemmed: univers
Original: generously, Stemmed: gener

Snowball Stemmer (English):
Original: running, Stemmed: run
Original: jumps, Stemmed: jump
Original: happily, Stemmed: happili
Original: organization, Stemmed: organ
Original: universal, Stemmed: univers
Original: generously, Stemmed: generous


In [79]:
from nltk.tokenize import word_tokenize

# Sentence to run through the stemmer.
sample_sentence = "The quick brown foxes are jumping over the lazy dogs."

# Split the sentence into tokens, then stem each token with the Porter
# stemmer created in an earlier cell.
tokens = word_tokenize(sample_sentence)
stemmed_words = list(map(porter.stem, tokens))

# Show the input next to the stemmed token list.
print(f"Original sentence: {sample_sentence}")
print(f"Stemmed words: {stemmed_words}")

Original sentence: The quick brown foxes are jumping over the lazy dogs.
Stemmed words: ['the', 'quick', 'brown', 'fox', 'are', 'jump', 'over', 'the', 'lazi', 'dog', '.']


In [80]:
# Words to lemmatize one at a time, i.e. without any sentence context.
sample_words = ["running", "ran", "better", "geese", "organization", "flies", "flying", "fly"]

print("Applying lemmatization to individual words:")

# Run each word through the spaCy pipeline on its own and report the lemma of
# the first token (every input here is expected to be exactly one token).
for word in sample_words:
  doc = nlp(word)
  token = doc[0]
  print(f"Original: '{word}', Lemma: '{token.lemma_}'")

print("\nDemonstrating POS influence on lemmatization:")

# 'better' and 'flies' as bare words: the POS is whatever spaCy infers for the
# isolated word, so it is printed rather than assumed.
token_better = nlp("better")[0]
token_flies = nlp("flies")[0]

# 'flies' inside a short sentence, giving the tagger context to work with.
doc_flies_in_sentence = nlp("He flies.")
flies_matches = [tok for tok in doc_flies_in_sentence if tok.text == "flies"]
token_flies_in_sentence = flies_matches[0]

# Report the three cases in the same format: word, inferred POS, lemma.
for shown_word, shown_token in (
    ("better", token_better),
    ("flies", token_flies),
    ("flies", token_flies_in_sentence),
):
  print(f"Original: '{shown_word}' (inferred POS: {shown_token.pos_}), Lemma: '{shown_token.lemma_}'")

# Comparison: NLTK's WordNetLemmatizer with an explicit part of speech.
# Note: spaCy's lemmatizer is generally more robust and handles POS internally;
# this part only illustrates the concept of passing an explicit POS to NLTK's lemmatizer.
# Doing so requires either converting spaCy POS tags to WordNet POS tags (when spaCy
# tokens are fed to the NLTK lemmatizer) or tokenizing and POS tagging with NLTK.
# For simplicity, the example below uses a common spaCy-to-WordNet conversion.
#
# WordNetLemmatizer uses 'n', 'v', 'a', 'r' for noun, verb, adjective, adverb,
# whereas spaCy's POS tags are more detailed (NOUN, VERB, ADJ, ADV, ...),
# so the helper below maps the spaCy tags onto the WordNet ones.
def get_wordnet_pos(spacy_pos):
    """Translate a spaCy coarse POS tag into the matching WordNet POS constant.

    Args:
        spacy_pos: spaCy POS tag string, e.g. "NOUN", "VERB", "ADJ" or "ADV".

    Returns:
        The attribute of ``wordnet`` with the same name as the tag
        (``wordnet.NOUN``, ``wordnet.VERB``, ``wordnet.ADJ`` or ``wordnet.ADV``).
        Any other tag falls back to ``wordnet.NOUN``.
    """
    # The four spaCy tags handled here share their names with the wordnet
    # attributes, so the attribute can be looked up by the tag itself.
    for tag in ("NOUN", "VERB", "ADJ", "ADV"):
        if spacy_pos == tag:
            return getattr(wordnet, tag)
    # No clear mapping: default to noun.
    return wordnet.NOUN

# Bring the WordNet lemmatizer and the `wordnet` POS constants into scope here:
# nothing earlier in the notebook imports them, and the lemmatizer instance was
# previously referenced under a misspelled, never-defined name.
# NOTE: WordNetLemmatizer needs the WordNet corpus (nltk.download('wordnet')).
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

print("\nDemonstrating explicit POS with NLTK's WordNetLemmatizer (for comparison):")
word_to_lemma = "better"
# Use spaCy to get the token and its inferred POS, then convert that POS
# to the WordNet tag the NLTK lemmatizer expects.
doc_nltk = nlp(word_to_lemma)
token_nltk = doc_nltk[0]
spacy_pos = token_nltk.pos_
wordnet_pos = get_wordnet_pos(spacy_pos)

# Apply the NLTK lemmatizer with the converted POS.
lemma_nltk = lemmatizer.lemmatize(word_to_lemma, pos=wordnet_pos)
print(f"Original: '{word_to_lemma}', Inferred spaCy POS: '{spacy_pos}', Converted WordNet POS: '{wordnet_pos}', Lemma (NLTK): '{lemma_nltk}'")

# Same procedure for 'flies'.
word_to_lemma_flies = "flies"
doc_nltk_flies = nlp(word_to_lemma_flies)
token_nltk_flies = doc_nltk_flies[0]
spacy_pos_flies = token_nltk_flies.pos_
wordnet_pos_flies = get_wordnet_pos(spacy_pos_flies)
lemma_nltk_flies = lemmatizer.lemmatize(word_to_lemma_flies, pos=wordnet_pos_flies)
print(f"Original: '{word_to_lemma_flies}', Inferred spaCy POS: '{spacy_pos_flies}', Converted WordNet POS: '{wordnet_pos_flies}', Lemma (NLTK): '{lemma_nltk_flies}'")

# 'flies' again, this time passing the verb POS explicitly instead of
# deriving it from spaCy.
lemma_nltk_flies_verb = lemmatizer.lemmatize("flies", pos=wordnet.VERB)
print(f"Original: 'flies', Explicit WordNet POS: '{wordnet.VERB}', Lemma (NLTK): '{lemma_nltk_flies_verb}'")


Applying lemmatization to individual words:
Original: 'running', Lemma: 'run'
Original: 'ran', Lemma: 'run'
Original: 'better', Lemma: 'well'
Original: 'geese', Lemma: 'geese'
Original: 'organization', Lemma: 'organization'
Original: 'flies', Lemma: 'fly'
Original: 'flying', Lemma: 'fly'
Original: 'fly', Lemma: 'fly'

Demonstrating POS influence on lemmatization:
Original: 'better' (inferred POS: ADV), Lemma: 'well'
Original: 'flies' (inferred POS: VERB), Lemma: 'fly'
Original: 'flies' (inferred POS: VERB), Lemma: 'fly'

Demonstrating explicit POS with NLTK's WordNetLemmatizer (for comparison):
Original: 'better', Inferred spaCy POS: 'ADV', Converted WordNet POS: 'r', Lemma (NLTK): 'well'
Original: 'flies', Inferred spaCy POS: 'VERB', Converted WordNet POS: 'v', Lemma (NLTK): 'fly'
Original: 'flies', Explicit WordNet POS: 'v', Lemma (NLTK): 'fly'


In [81]:
# Sentence to lemmatize, processed as a whole so spaCy sees the context.
sample_sentence_lemmatization = "The quick brown foxes are jumping over the lazy dogs."
lemmatization_doc = nlp(sample_sentence_lemmatization)

# One (text, POS, lemma) triple per token, kept so the lemmas can be listed
# again after the per-token report.
lemmatized_words_with_pos = [
    (tok.text, tok.pos_, tok.lemma_) for tok in lemmatization_doc
]

print(f"Original sentence: {sample_sentence_lemmatization}")
print("Lemmatized words with POS:")
for tok_text, tok_pos, tok_lemma in lemmatized_words_with_pos:
  print(f"Token: {tok_text}, POS: {tok_pos}, Lemma: {tok_lemma}")

# Just the lemmas, in token order.
lemmas_only = [triple[-1] for triple in lemmatized_words_with_pos]
print(f"\nLemmas only: {lemmas_only}")

Original sentence: The quick brown foxes are jumping over the lazy dogs.
Lemmatized words with POS:
Token: The, POS: DET, Lemma: the
Token: quick, POS: ADJ, Lemma: quick
Token: brown, POS: ADJ, Lemma: brown
Token: foxes, POS: NOUN, Lemma: fox
Token: are, POS: AUX, Lemma: be
Token: jumping, POS: VERB, Lemma: jump
Token: over, POS: ADP, Lemma: over
Token: the, POS: DET, Lemma: the
Token: lazy, POS: ADJ, Lemma: lazy
Token: dogs, POS: NOUN, Lemma: dog
Token: ., POS: PUNCT, Lemma: .

Lemmas only: ['the', 'quick', 'brown', 'fox', 'be', 'jump', 'over', 'the', 'lazy', 'dog', '.']


In [82]:
# Select a few sentences (e.g., 3) from the 'sentences' list
selected_sentences = sentences[:3]

for sentence in selected_sentences:
  # Join the words back into a single string.
  sentence_string = " ".join(sentence)

  print(f"\nOriginal Sentence: {sentence_string}")

  # Apply the Porter Stemmer to each word
  porter_stemmed_words = [porter.stem(word) for word in sentence]
  print(f"Porter Stemmed Words: {porter_stemmed_words}")

  # Apply the Snowball Stemmer (English) to each word
  snowball_stemmed_words = [snowball.stem(word) for word in sentence]
  print(f"Snowball Stemmed Words: {snowball_stemmed_words}")

  # Process the sentence string using the spaCy nlp object for lemmatization
  lemmatization_doc = nlp(sentence_string)
  print("SpaCy Lemmatization (Token, POS, Lemma):")
  for token in lemmatization_doc:
    print(f"  {token.text} ({token.pos_}): {token.lemma_}")



Original Sentence: The Fulton County Grand Jury said Friday an investigation of Atlanta's recent primary election produced `` no evidence '' that any irregularities took place .
Porter Stemmed Words: ['the', 'fulton', 'counti', 'grand', 'juri', 'said', 'friday', 'an', 'investig', 'of', "atlanta'", 'recent', 'primari', 'elect', 'produc', '``', 'no', 'evid', "''", 'that', 'ani', 'irregular', 'took', 'place', '.']
Snowball Stemmed Words: ['the', 'fulton', 'counti', 'grand', 'juri', 'said', 'friday', 'an', 'investig', 'of', 'atlanta', 'recent', 'primari', 'elect', 'produc', '``', 'no', 'evid', "''", 'that', 'ani', 'irregular', 'took', 'place', '.']
SpaCy Lemmatization (Token, POS, Lemma):
  The (DET): the
  Fulton (PROPN): Fulton
  County (PROPN): County
  Grand (PROPN): Grand
  Jury (PROPN): Jury
  said (VERB): say
  Friday (PROPN): Friday
  an (DET): an
  investigation (NOUN): investigation
  of (ADP): of
  Atlanta (PROPN): Atlanta
  's (PART): 's
  recent (ADJ): recent
  primary (ADJ):

In [91]:
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

# Words used to compare the two stemmers and the two lemmatizers.
practice_words = ["cats", "troubled", "having", "stemming", "stemmed", "stemmer", "stemmers"]

# Fresh stemmer and lemmatizer instances for this comparison.
lemmatizer = WordNetLemmatizer()
porter = PorterStemmer()
snowball = SnowballStemmer(language='english')

# spaCy processes the words as one space-joined text; the table below relies on
# it yielding one token per word.
doc = nlp(" ".join(practice_words))

stemm_words_port = list(map(porter.stem, practice_words))
stemm_words_snow = list(map(snowball.stem, practice_words))
lemm_words_spacy = [tok.lemma_ for tok in doc]
# WordNetLemmatizer is called without a POS argument here.
lemm_words_NLTK = list(map(lemmatizer.lemmatize, practice_words))

# One row per word, one column per method.
comparison_columns = {
    'Original Word': practice_words,
    'Porter': stemm_words_port,
    'Snowball': stemm_words_snow,
    'Spacy' : lemm_words_spacy,
    'NLTK' : lemm_words_NLTK,
}
df = pd.DataFrame(comparison_columns)
display(HTML(df.to_html(index=False)))

Original Word,Porter,Snowball,Spacy,NLTK
cats,cat,cat,cat,cat
troubled,troubl,troubl,trouble,troubled
having,have,have,have,having
stemming,stem,stem,stem,stemming
stemmed,stem,stem,stem,stemmed
stemmer,stemmer,stemmer,stemmer,stemmer
stemmers,stemmer,stemmer,stemmer,stemmer


In [98]:
from pickle import FALSE
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, SnowballStemmer

# Sentence to tokenize and stem.
corpus = 'The researchers are analyzing the data to understand the underlying patterns.'

tokens = word_tokenize(corpus)

# Stem every token with both stemmers.
porter = PorterStemmer()
snowball = SnowballStemmer('english')
porter_stemm = list(map(porter.stem, tokens))
snow_stemm = list(map(snowball.stem, tokens))

# Column name -> values; rendered as an HTML table without the index column.
dataframe = dict(Original=tokens, Porter=porter_stemm, Snowball=snow_stemm)
df = pd.DataFrame(dataframe)
display(HTML(df.to_html(index=False)))


Original,Porter,Snowball
The,the,the
researchers,research,research
are,are,are
analyzing,analyz,analyz
the,the,the
data,data,data
to,to,to
understand,understand,understand
the,the,the
underlying,underli,under


In [101]:
import nltk
from nltk.corpus import brown

# Make sure the Brown corpus is available. nltk.data.find raises LookupError
# when the resource is missing; nltk.downloader has no `DownloadError`, so the
# previous handler raised AttributeError instead of downloading the corpus.
try:
    nltk.data.find('corpora/brown')
except LookupError:
    nltk.download('brown')

# First ten sentences of the 'news' category.
sentences = brown.sents(categories='news')[:10]

In [104]:
import spacy
nlp = spacy.load('en_core_web_sm')

# Flatten the sentences into one space-separated string and run it through
# spaCy as a single document.
all_words = [word for sent in sentences for word in sent]
text = " ".join(all_words)
doc = nlp(text)

# Per-token lemma and coarse POS, in document order.
lemm_sent_spacy = [tok.lemma_ for tok in doc]
POS = [tok.pos_ for tok in doc]

# Column name -> values; rendered as an HTML table without the index column.
dataframe = dict(
    Original=[tok.text for tok in doc],
    POS=POS,
    Lemm=lemm_sent_spacy,
)
df = pd.DataFrame(dataframe)
display(HTML(df.to_html(index=False)))

Original,POS,Lemm
The,DET,the
Fulton,PROPN,Fulton
County,PROPN,County
Grand,PROPN,Grand
Jury,PROPN,Jury
said,VERB,say
Friday,PROPN,Friday
an,DET,an
investigation,NOUN,investigation
of,ADP,of
