In [None]:
!pip install nltk
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


**Tokenization**

In [None]:
text = "She sings beautifully at the concert"
tokens = word_tokenize(text)
print(tokens)

['She', 'sings', 'beautifully', 'at', 'the', 'concert']


**Stemming**

In [None]:
stemmer = PorterStemmer()
words = ["She", "sings", "beautifully", "at", "the", "concert"]
stemmed_words = [stemmer.stem(word) for word in words]
print(stemmed_words)


['she', 'sing', 'beauti', 'at', 'the', 'concert']


**Morphemes Identification**

In [None]:
def identify_morphemes(word):
    morphemes = []
    prefix = word[:2]
    root = word[2:-2]
    suffix = word[-2:]
    morphemes.append(prefix)
    morphemes.append(root)
    morphemes.append(suffix)
    return morphemes

word = "undoable"
morphemes = identify_morphemes(word)
print("Morphemes:", morphemes)

Morphemes: ['un', 'doab', 'le']


**Parse Tree**

In [None]:
def parse_morphology(word):
    parse_tree = {}
    parse_tree['prefix'] = word[:2]
    parse_tree['root'] = word[2:-2]
    parse_tree['suffix'] = word[-2:]
    return parse_tree

word = "inactive"
parse_tree = parse_morphology(word)
print("Parse Tree:", parse_tree)

Parse Tree: {'prefix': 'in', 'root': 'acti', 'suffix': 've'}


**Morpheme Frequencies**

In [None]:
from collections import Counter

corpus = ["redo", "unnatural", "outcome", "understandable"]

def identify_morphemes(word):
    prefix = word[:2]
    suffix = word[-2:]
    return prefix, suffix

morpheme_counts = Counter()
for word in corpus:
    prefix, suffix = identify_morphemes(word)
    morpheme_counts[prefix] += 1
    morpheme_counts[suffix] += 1

print("Morpheme Frequencies:")
for morpheme, count in morpheme_counts.items():
    print(f"{morpheme}: {count}")


Morpheme Frequencies:
re: 1
do: 1
un: 2
al: 1
ou: 1
me: 1
le: 1


**Overstemming Error & Correction**

In [None]:
def correct_overstemming(word):
    if word.endswith("ing") or word.endswith("ly"):
        corrected_word = word[:-2]
    else:
        corrected_word = word
    return corrected_word

words = ["running","quickly","happily", "friendship"]

corrected_words = [correct_overstemming(word) for word in words]

print("Corrected Words:", corrected_words)

Corrected Words: ['runni', 'quick', 'happi', 'friendship']
