<a href="https://colab.research.google.com/github/MapariPrajwal/NLP/blob/main/PoS_tagging.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import indian
from nltk.tag import DefaultTagger, UnigramTagger, BigramTagger, TrigramTagger

In [2]:
# Step 1: Inbuilt PoS Tagging
def inbuilt_pos_tagging(sentence):
    tokens = word_tokenize(sentence)
    tagged = nltk.pos_tag(tokens)
    return tagged

In [3]:
# Step 2: Regular Expression PoS Tagger
def regex_pos_tagging(sentence):
    patterns = [
        (r'\b(?:N\w+)\b', 'NOUN'),      # Nouns
        (r'\b(?:VB\w+)\b', 'VERB'),     # Verbs
        (r'\b(?:JJ\w*)\b', 'ADJ'),      # Adjectives
        (r'\b(?:RB\w*)\b', 'ADV'),      # Adverbs
        (r'\b(?:DT\w*)\b', 'DET')       # Determiners
    ]
    regex_tagger = nltk.RegexpTagger(patterns)
    tokens = word_tokenize(sentence)
    tagged = regex_tagger.tag(tokens)
    return tagged

In [5]:
def dict_pos_tagging(sentence):
    dictionary = {
        'dog': 'NOUN',
        'run': 'VERB',
        'fast': 'ADV',
        'beautiful': 'ADJ',
        'the': 'DET',
        'cat': 'NOUN',
        'jump': 'VERB',
        'quickly': 'ADV',
        'brown': 'ADJ',
        'lazy': 'ADJ',
        'apple': 'NOUN',
        'eat': 'VERB',
        'deliciously': 'ADV',
        'red': 'ADJ',
        'car': 'NOUN',
        'drive': 'VERB',
        'carefully': 'ADV',
        'big': 'ADJ',
        'book': 'NOUN',
        'read': 'VERB',
        'slowly': 'ADV',
        'interesting': 'ADJ',
    }
    tokens = word_tokenize(sentence)
    tagged = [(token, dictionary.get(token.lower(), 'UNK')) for token in tokens]
    return tagged

In [6]:
# Step 4: N-Gram Model based PoS Tagger
def ngram_pos_tagging(sentence):
    tagged_sents = indian.tagged_sents()
    train_size = int(len(tagged_sents) * 0.9)
    train_sents = tagged_sents[:train_size]
    test_sents = tagged_sents[train_size:]

    default_tagger = DefaultTagger('NN')  # Default tagger
    unigram_tagger = UnigramTagger(train_sents, backoff=default_tagger)  # Unigram tagger
    bigram_tagger = BigramTagger(train_sents, backoff=unigram_tagger)    # Bigram tagger
    trigram_tagger = TrigramTagger(train_sents, backoff=bigram_tagger)    # Trigram tagger

    tokens = word_tokenize(sentence)
    tagged = trigram_tagger.tag(tokens)
    return tagged

In [7]:
# Step 5: Extend to Indian languages (Hindi and Marathi)
def indian_languages_pos_tagging(sentence, language='hindi'):
    if language.lower() == 'hindi':
        # Use Hindi corpus for training n-gram model
        tagged_sents = indian.tagged_sents('hindi.pos')
    elif language.lower() == 'marathi':
        # Use Marathi corpus for training n-gram model
        tagged_sents = indian.tagged_sents('marathi.pos')
    else:
        return "Language not supported."

    train_size = int(len(tagged_sents) * 0.9)
    train_sents = tagged_sents[:train_size]
    test_sents = tagged_sents[train_size:]

    default_tagger = DefaultTagger('NN')
    unigram_tagger = UnigramTagger(train_sents, backoff=default_tagger)
    bigram_tagger = BigramTagger(train_sents, backoff=unigram_tagger)
    trigram_tagger = TrigramTagger(train_sents, backoff=bigram_tagger)

    tokens = word_tokenize(sentence)
    tagged = trigram_tagger.tag(tokens)
    return tagged

In [9]:
 nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [11]:
 nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [13]:
nltk.download('indian')

[nltk_data] Downloading package indian to /root/nltk_data...
[nltk_data]   Unzipping corpora/indian.zip.


True

In [14]:
english_sentence = "The quick brown fox jumps over the lazy dog."
print("Inbuilt PoS Tagging:", inbuilt_pos_tagging(english_sentence))
print("Regex PoS Tagging:", regex_pos_tagging(english_sentence))
print("Dictionary PoS Tagging:", dict_pos_tagging(english_sentence))
print("N-Gram PoS Tagging:", ngram_pos_tagging(english_sentence))

hindi_sentence = "राम बहुत तेज़ भागता है।"
print("Hindi PoS Tagging:", indian_languages_pos_tagging(hindi_sentence, language='hindi'))

marathi_sentence = "राम खूप जल्द धावतो."
print("Marathi PoS Tagging:", indian_languages_pos_tagging(marathi_sentence, language='marathi'))

Inbuilt PoS Tagging: [('The', 'DT'), ('quick', 'JJ'), ('brown', 'NN'), ('fox', 'NN'), ('jumps', 'VBZ'), ('over', 'IN'), ('the', 'DT'), ('lazy', 'JJ'), ('dog', 'NN'), ('.', '.')]
Regex PoS Tagging: [('The', None), ('quick', None), ('brown', None), ('fox', None), ('jumps', None), ('over', None), ('the', None), ('lazy', None), ('dog', None), ('.', None)]
Dictionary PoS Tagging: [('The', 'DET'), ('quick', 'UNK'), ('brown', 'ADJ'), ('fox', 'UNK'), ('jumps', 'UNK'), ('over', 'UNK'), ('the', 'DET'), ('lazy', 'ADJ'), ('dog', 'NOUN'), ('.', 'UNK')]
N-Gram PoS Tagging: [('The', 'NN'), ('quick', 'NN'), ('brown', 'NN'), ('fox', 'NN'), ('jumps', 'NN'), ('over', 'NN'), ('the', 'NN'), ('lazy', 'NN'), ('dog', 'NN'), ('.', 'SYM')]
Hindi PoS Tagging: [('राम', 'NN'), ('बहुत', 'INTF'), ('तेज़', 'NN'), ('भागता', 'NN'), ('है।', 'NN')]
Marathi PoS Tagging: [('राम', 'NN'), ('खूप', 'QF'), ('जल्द', 'NN'), ('धावतो', 'NN'), ('.', 'SYM')]
