### Text Preprocessing
Tokenization, Stemming and Lemmatization


In [23]:
# Tokenization
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

# Sample text
text = "Tokenization is the process of breaking down text into individual words or sentences. It's a crucial step in NLP."

# Tokenize into words
word_tokens = word_tokenize(text)
print("Word Tokens:", word_tokens)

# Tokenize into sentences
sentence_tokens = sent_tokenize(text)
print("Sentence Tokens:", sentence_tokens)

Word Tokens: ['Tokenization', 'is', 'the', 'process', 'of', 'breaking', 'down', 'text', 'into', 'individual', 'words', 'or', 'sentences', '.', 'It', "'s", 'a', 'crucial', 'step', 'in', 'NLP', '.']
Sentence Tokens: ['Tokenization is the process of breaking down text into individual words or sentences.', "It's a crucial step in NLP."]


In [24]:
# Stemming
from nltk.stem import PorterStemmer, SnowballStemmer

# Initialize stemmers
porter_stemmer = PorterStemmer()
snowball_stemmer = SnowballStemmer("english")

# Words to stem
words = ["running", "ran", "runner", "easily", "fairly"]

# Apply Porter Stemmer
porter_stemmed = [porter_stemmer.stem(word) for word in words]
print("Porter Stemmer Results:", porter_stemmed)

# Apply Snowball Stemmer
snowball_stemmed = [snowball_stemmer.stem(word) for word in words]
print("Snowball Stemmer Results:", snowball_stemmed)

Porter Stemmer Results: ['run', 'ran', 'runner', 'easili', 'fairli']
Snowball Stemmer Results: ['run', 'ran', 'runner', 'easili', 'fair']


In [None]:
# Lemmatization
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

# Words to lemmatize
words = [("running", "v"), ("better", "a"), ("cats", "n")]

# Lemmatize words with POS tags
lemmatized_words = [lemmatizer.lemmatize(word, pos=getattr(wordnet, pos.upper())) for word, pos in words]
print("Lemmatized Words:", lemmatized_words)

Stopwords and rarewords removal

In [12]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk. stem import PorterStemmer
from nltk. probability import FreqDist
nltk. download ( 'punkt')
nltk. download ('stopwords')
def preprocess_text(text, rare_threshold=1) :
      tokens = word_tokenize (text)
      stop_words = set(stopwords.words('english'))
      filtered_tokens = [token for token in tokens if token. lower () not in stop_words]
      frequency_dist = FreqDist(filtered_tokens)
      rare_words = set(word for word, freq in frequency_dist. items () if freq <= rare_threshold)
      filtered_tokens = [token for token in filtered_tokens if token not in rare_words]
      stemmer = PorterStemmer ( )
      stemmed_tokens = [stemmer .stem(token) for token in filtered_tokens]
      return filtered_tokens, stemmed_tokens
def main():
     input_text = """Your input text goes here. NLP is a machine learning technology.
                   Applications of NLP are very useful in real life."""
     rare_threshold = 1
     filtered_tokens, stemmed_tokens = preprocess_text (input_text, rare_threshold)
     print("Filtered Tokens (after removing stopwords and rare words):")
     print(filtered_tokens)
     print("\nStemmed Tokens:")
     print(stemmed_tokens)
if __name__=="__main__":
  main()

Filtered Tokens (after removing stopwords and rare words):
['.', 'NLP', '.', 'NLP', '.']

Stemmed Tokens:
['.', 'nlp', '.', 'nlp', '.']


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


### Part-of-Speech (POS) Tagging:
 Identify the parts of speech

In [14]:
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
def identify_parts_of_speech(text):
   tokens = word_tokenize (text)
   pos_tags = nltk.pos_tag (tokens)
   return pos_tags
def main():
   input_text = """NLP is a machine learning technology.
                Applications of NLP are very useful in real life."""
   pos_tags = identify_parts_of_speech(input_text)
   print ("Parts of Speech:")
   for token, pos_tag in pos_tags:
       print(f'{token}: {pos_tag}')
if __name__ == "__main__":
  main()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


Parts of Speech:
NLP: NNP
is: VBZ
a: DT
machine: NN
learning: VBG
technology: NN
.: .
Applications: NNS
of: IN
NLP: NNP
are: VBP
very: RB
useful: JJ
in: IN
real: JJ
life: NN
.: .


UnigramTagger, BigramTagger, TrigramTagger

In [16]:
#UnigramTagger, BigramTagger, TrigramTagger
import nltk
from nltk.corpus import brown
from nltk.tag import UnigramTagger,BigramTagger,TrigramTagger
from nltk.tokenize import word_tokenize
nltk.download('brown')
nltk.download('punkt')

brown_corpus=brown.tagged_sents()
train_size=int(0.8*len(brown_corpus))
train_set=brown_corpus[:train_size]
test_set=brown_corpus[train_size:]

unigram_tagger=UnigramTagger(train_set)
bigram_tagger=BigramTagger(train_set)
trigram_tagger=TrigramTagger(train_set)

sentence="the quick brown fox jumps over the lazy dog"
tokens=word_tokenize(sentence)

unigram_tagged_sentence=unigram_tagger.tag(tokens)
bigram_tagged_sentence=bigram_tagger.tag(tokens)
trigram_tagged_sentence=trigram_tagger.tag(tokens)
print("The unigram tagged sentence is ",unigram_tagged_sentence)
print("The bigram tagged sentence is ",bigram_tagged_sentence)
print("The trigram tagged sentence is ",trigram_tagged_sentence)

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


The unigram tagged sentence is  [('the', 'AT'), ('quick', 'JJ'), ('brown', 'JJ'), ('fox', 'NN'), ('jumps', 'NNS'), ('over', 'IN'), ('the', 'AT'), ('lazy', 'JJ'), ('dog', 'NN')]
The bigram tagged sentence is  [('the', 'AT'), ('quick', 'JJ'), ('brown', 'JJ'), ('fox', 'NN'), ('jumps', 'VBZ'), ('over', 'IN'), ('the', 'AT'), ('lazy', 'JJ'), ('dog', 'NN')]
The trigram tagged sentence is  [('the', 'AT'), ('quick', 'JJ'), ('brown', 'JJ'), ('fox', None), ('jumps', None), ('over', None), ('the', None), ('lazy', None), ('dog', None)]


Brill Tagger

In [15]:
#Brill tagger
import nltk
from nltk.tag import brill
from nltk.tag import UnigramTagger, BrillTaggerTrainer
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
# Sample sentence
sentence = "This is a sample sentence."

# Tokenize the sentence
tokens = nltk.word_tokenize(sentence)

# Step 1: Prepare data (a single sentence)
tagged_sentence = nltk.pos_tag(tokens)

# Step 2: Initial tagging
baseline_tagger = UnigramTagger([tagged_sentence])

# Step 3: Error Analysis
initial_tags = baseline_tagger.tag(tokens)
print("Initial tags:", initial_tags)

# Step 4: Generate Transformation Rules
templates = brill.nltkdemo18()

# Step 5: Train the Brill Tagger
trainer = BrillTaggerTrainer(baseline_tagger, templates, trace=3)
brill_tagger = trainer.train([tagged_sentence], max_rules=10)

# Step 6: Tag the sentence using Brill Tagger
final_tags = brill_tagger.tag(tokens)
print("Final tags:", final_tags)

Initial tags: [('This', 'DT'), ('is', 'VBZ'), ('a', 'DT'), ('sample', 'JJ'), ('sentence', 'NN'), ('.', '.')]
TBL train (fast) (seqs: 1; tokens: 6; tpls: 18; min score: 2; min acc: None)
Finding initial useful rules...
    Found 0 useful rules.

           B      |
   S   F   r   O  |        Score = Fixed - Broken
   c   i   o   t  |  R     Fixed = num tags changed incorrect -> correct
   o   x   k   h  |  u     Broken = num tags changed correct -> incorrect
   r   e   e   e  |  l     Other = num tags changed incorrect -> incorrect
   e   d   n   r  |  e
------------------+-------------------------------------------------------
Final tags: [('This', 'DT'), ('is', 'VBZ'), ('a', 'DT'), ('sample', 'JJ'), ('sentence', 'NN'), ('.', '.')]


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Affix Tagger

In [18]:
#Affix tagger
import nltk
from nltk.corpus import brown
from nltk.tag import AffixTagger
from nltk.tokenize import word_tokenize
nltk.download('brown')
nltk.download('punkt')

brown_corpus=brown.tagged_sents()
train_size=int(0.8*len(brown_corpus))
train_set=brown_corpus[:train_size]
test_set=brown_corpus[train_size:]
affix_tagger=AffixTagger(train_set)

sentence="the quick brown fox jumps over a lazy dog"
tokens=word_tokenize(sentence)
tagged_sentence=affix_tagger.tag(tokens)
print("the tagged sentence is: ",tagged_sentence)

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


the tagged sentence is:  [('the', None), ('quick', 'JJ'), ('brown', 'VBN'), ('fox', None), ('jumps', 'NNS'), ('over', None), ('a', None), ('lazy', None), ('dog', None)]


### Named Entity Recognition (NER):

In [21]:
import nltk
nltk.download('maxent_ne_chunker')
nltk.download('words')

class NERTagger:
  def __init__(self):
    pass
  def tag(self, sentence):
    tokens = nltk.word_tokenize(sentence)
    tagged_tokens = nltk.pos_tag(tokens)
    ne_tagged_tokens = nltk.ne_chunk(tagged_tokens)
    return ne_tagged_tokens
ner_tagger = NERTagger()

sentence = "Miami is known for its stunning beaches and beautiful skyline views making it a popular tourist destination"
tagged_sentence = ner_tagger.tag(sentence)
print(tagged_sentence)


(S
  (GPE Miami/NNP)
  is/VBZ
  known/VBN
  for/IN
  its/PRP$
  stunning/JJ
  beaches/NNS
  and/CC
  beautiful/JJ
  skyline/JJ
  views/NNS
  making/VBG
  it/PRP
  a/DT
  popular/JJ
  tourist/NN
  destination/NN)


[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
