#*6.Implement a basic N-gram model for text generation. For example, generate text using a bigram model using python.*#

In [8]:
import random

def build_bigram_model(corpus):
    bigram_model = {}
    for i in range(len(corpus) - 1):
        current_word = corpus[i]
        next_word = corpus[i + 1]
        if current_word in bigram_model:
            bigram_model[current_word].append(next_word)
        else:
            bigram_model[current_word] = [next_word]
    return bigram_model

def generate_text(bigram_model, start_word, length=10):
    current_word = start_word
    generated_text = [current_word]

    for _ in range(length - 1):
        if current_word in bigram_model:
            next_word = random.choice(bigram_model[current_word])
            generated_text.append(next_word)
            current_word = next_word
        else:
            break

    return ' '.join(generated_text)

# Example usage:
corpus = "This is a simple example for building a bigram model for text generation".split()
bigram_model = build_bigram_model(corpus)

start_word = "This"
generated_text = generate_text(bigram_model, start_word, length=10)
print(generated_text)


This is a simple example for text generation


#*7.Write program using the NLTK library to perform part-of-speech tagging on a text.*#

In [12]:
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag

# Download NLTK data (if not already downloaded)
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

def pos_tagging(text):
    # Tokenize the text into words
    words = word_tokenize(text)

    # Perform part-of-speech tagging
    pos_tags = pos_tag(words)

    return pos_tags

# Example usage
text = "NLTK is a powerful library for natural language processing."

pos_tags = pos_tagging(text)
print("Part-of-Speech Tags:")
print(pos_tags)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Part-of-Speech Tags:
[('NLTK', 'NNP'), ('is', 'VBZ'), ('a', 'DT'), ('powerful', 'JJ'), ('library', 'NN'), ('for', 'IN'), ('natural', 'JJ'), ('language', 'NN'), ('processing', 'NN'), ('.', '.')]


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


#*8 Implement a simple stochastic part-of-speech tagging algorithm using a basic probabilistic model to assign POS tags using python*#

In [22]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.tag import DefaultTagger, UnigramTagger

# Sample training data
training_data = [
    ("The cat is on the mat", [('The', 'DT'), ('cat', 'NN'), ('is', 'VB'), ('on', 'IN'), ('the', 'DT'), ('mat', 'NN')]),
    ("A dog is chasing a ball", [('A', 'DT'), ('dog', 'NN'), ('is', 'VB'), ('chasing', 'VBG'), ('a', 'DT'), ('ball', 'NN')]),
    # Add more training data as needed
]

# Function to train a stochastic POS tagger
def train_pos_tagger(training_data):
    # Calculate word frequencies
    word_freq = FreqDist(word_tag for sentence in training_data for word_tag in sentence[1])

    # Calculate the most likely POS tag for each word
    default_tagger = DefaultTagger('NN')  # Default tag is 'NN' (noun)
    pos_tagger = UnigramTagger(model=word_freq, backoff=default_tagger)

    return pos_tagger

# Function to tag a new sentence using the trained tagger
def stochastic_pos_tag(sentence, tagger):
    words = word_tokenize(sentence)
    pos_tags = tagger.tag(words)
    return pos_tags

# Example usage
pos_tagger = train_pos_tagger(training_data)

new_sentence = "The cat is chasing a ball"
tags = stochastic_pos_tag(new_sentence, pos_tagger)

print("Stochastic Part-of-Speech Tags:")
print(tags)

Stochastic Part-of-Speech Tags:
[('The', 'NN'), ('cat', 'NN'), ('is', 'NN'), ('chasing', 'NN'), ('a', 'NN'), ('ball', 'NN')]


#*9.Implement a rule-based part-of-speech tagging system using regular expressions using python*#

In [14]:
import nltk

# Create a RegexpTagger with rules
regexp_tagger = nltk.RegexpTagger([
    (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),  # Cardinal numbers
    (r'(The|the|A|a|An|an)$', 'AT'),  # Articles
    (r'.*ing$', 'VBG'),               # Gerunds
    (r'.*ed$', 'VBD'),                # Past tense verbs
    (r'.*es$', 'VBZ'),                # 3rd singular present verbs
    (r'.*ould$', 'MD'),               # Modal verbs
    (r'.*\'s$', 'NN$'),               # Possessive nouns
    (r'.*s$', 'NNS'),                 # Plural nouns
    (r'.*', 'NN')                     # Default to nouns
])

# Example usage
text = "The cat sat on the mat."
tokens = nltk.word_tokenize(text)
tagged_tokens = regexp_tagger.tag(tokens)
print(tagged_tokens)


[('The', 'AT'), ('cat', 'NN'), ('sat', 'NN'), ('on', 'NN'), ('the', 'AT'), ('mat', 'NN'), ('.', 'NN')]


#*10  Implement transformation-based tagging using a set of transformation rules, apply a simple rule to tag words using python.*#


In [20]:

# Define a simple transformation rule
def transform_rule(word, tag):
    if word.endswith('ed'):
        return ('VBD', tag)
    else:
        return (word, tag)

# Apply the transformation rule to a tagged sentence
tagged_sentence = [('The', 'DT'), ('cat', 'NN'), ('sat', 'VBD'), ('on', 'IN'), ('the', 'DT'), ('mat', 'NN')]
transformed_sentence = [(word, transform_rule(word, tag)[0]) for word, tag in tagged_sentence]

# Print the transformed sentence
print(transformed_sentence)


[('The', 'The'), ('cat', 'cat'), ('sat', 'sat'), ('on', 'on'), ('the', 'the'), ('mat', 'mat')]
