In [None]:
# from google.colab import drive
# drive.mount('content/')

Mounted at content/


Certainly! Here are some advanced linguistic processing techniques that can be used to enhance lemmatization:

1. Part-of-Speech (POS) Tagging: POS tagging is the process of assigning grammatical tags to words in a sentence, such as noun, verb, adjective, etc. POS tags can provide valuable information for disambiguating the correct lemmas. For example, the word "running" can be a verb or a noun, but knowing its POS tag can help determine the appropriate lemma.

2. Morphological Analysis: Morphological analysis involves breaking down words into their constituent morphemes, such as prefixes, suffixes, and roots. By identifying and manipulating these morphemes, lemmatizers can handle inflectional and derivational forms more accurately. Tools like morphological analyzers and finite-state transducers can be utilized for this purpose.

3. Statistical Lemmatization: Statistical approaches leverage machine learning algorithms to learn lemmatization patterns from annotated training data. These models can capture complex relationships between word forms and their corresponding lemmas, improving accuracy for irregular and less common words. Techniques like sequence labeling (e.g., Conditional Random Fields) and sequence-to-sequence models (e.g., Recurrent Neural Networks, Transformers) can be employed.

4. Language-Specific Resources: Language-specific linguistic resources, such as dictionaries, lexicons, and specialized corpora, can provide valuable insights into lemmatization. These resources may include irregular word forms, exceptions to general rules, and specific linguistic phenomena unique to a particular language.

5. Contextual Information: Incorporating contextual information, such as surrounding words or syntactic structure, can aid in disambiguating the correct lemmas. Context-aware lemmatizers can take advantage of contextual cues to make more informed decisions. Dependency parsing, word embeddings, and contextual language models (e.g., BERT, GPT) can assist in capturing and utilizing contextual information.

6. Evaluation and Error Analysis: It is crucial to evaluate the performance of lemmatizers on annotated test data and conduct error analysis to identify and address common errors. This iterative process helps refine the lemmatization techniques and fine-tune parameters, ensuring better accuracy and coverage.

It's important to note that implementing these techniques can be complex, and it often requires specialized libraries, linguistic resources, and domain expertise. Several open-source NLP libraries, such as NLTK, spaCy, and StanfordNLP, offer pre-built components and models for various linguistic tasks, including lemmatization, that incorporate many of these advanced techniques.

By combining multiple approaches and techniques, you can develop a more accurate and robust lemmatizer that handles a wide range of word forms, linguistic phenomena, and language-specific challenges.

In [7]:
import nltk
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [2]:
from nltk.corpus import wordnet as wn

class CorpusBasedLemmatizer:
    def __init__(self):
        self.lemmatizer = nltk.WordNetLemmatizer()
    
    def lemmatize(self, word):
        lemma = self.lemmatizer.lemmatize(word)
        if lemma == word:
            synsets = wn.synsets(word)
            if synsets:
                lemma = synsets[0].lemmas()[0].name()
        return lemma


corpus_lemmatizer = CorpusBasedLemmatizer()
words = "The striped bat are hanging on their foot for best".split()

for word in words:
    # rule_lemma = rule_lemmatizer.lemmatize(word)
    corpus_lemma = corpus_lemmatizer.lemmatize(word)
    print(f"Word: {word}\tCorpus Lemma: {corpus_lemma}")


Word: The	Corpus Lemma: The
Word: striped	Corpus Lemma: stripe
Word: bat	Corpus Lemma: bat
Word: are	Corpus Lemma: are
Word: hanging	Corpus Lemma: hanging
Word: on	Corpus Lemma: on
Word: their	Corpus Lemma: their
Word: foot	Corpus Lemma: foot
Word: for	Corpus Lemma: for
Word: best	Corpus Lemma: best


## On sentence identifies POS and lemmatize accordingly 

In [11]:
import re
import nltk
from nltk.corpus import wordnet

class RuleBasedLemmatizer:
    def __init__(self):
        self.rules = [
            (r"(s|es|ies)$", ""),
            (r"([^aeiou])s$", r"\1"),
            (r"ing$", ""),
            (r"([aeiou].+)ed$", r"\1"),
            (r"([aeiou].+)ed$", r"\1"),
            (r"([aeiou].+)y$", r"\1"),
        ]
    
    def lemmatize(self, word, pos):
        for rule in self.rules:
            pattern, replacement = rule
            if re.search(pattern, word):
                return re.sub(pattern, replacement, word)
        
        # Handle complex cases based on POS
        if pos.startswith('V'):  # Verb
            return self.lemmatize_verb(word)
        elif pos.startswith('N'):  # Noun
            return self.lemmatize_noun(word)
        elif pos.startswith('J'):  # Adjective
            return self.lemmatize_adjective(word)
        elif pos.startswith('R'):  # Adverb
            return self.lemmatize_adverb(word)
        
        return word
    
    def lemmatize_verb(self, word):
        lemmatizer = nltk.WordNetLemmatizer()
        lemma = lemmatizer.lemmatize(word, 'v')
        return lemma
    
    def lemmatize_noun(self, word):
        lemmatizer = nltk.WordNetLemmatizer()
        lemma = lemmatizer.lemmatize(word, 'n')
        return lemma
    
    def lemmatize_adjective(self, word):
        lemmatizer = nltk.WordNetLemmatizer()
        lemma = lemmatizer.lemmatize(word, 'a')
        return lemma
    
    def lemmatize_adverb(self, word):
        lemmatizer = nltk.WordNetLemmatizer()
        lemma = lemmatizer.lemmatize(word, 'r')
        return lemma


rule_lemmatizer = RuleBasedLemmatizer()

sentence = "The cats are running and jumping playfully."
tokens = nltk.word_tokenize(sentence)

lemmas = []
for token in tokens:
    pos = nltk.pos_tag([token])[0][1]
    rule_lemma = rule_lemmatizer.lemmatize(token, pos)
    lemmas.append((token, rule_lemma))

print("Original sentence:", sentence)
print("Token\tRule Lemma")
for lemma in lemmas:
    print(f"{lemma[0]}\t{lemma[1]}")


Original sentence: The cats are running and jumping playfully.
Token	Rule Lemma
The	The
cats	cat
are	be
running	runn
and	and
jumping	jump
playfully	playfull
.	.


## Using Spacy

In [15]:
import spacy

nlp = spacy.load('en_core_web_sm', disable = ['parser','ner'])

sentence = "The cats are running and jumping playfully."
doc = nlp(sentence)

print("Original sentence:", sentence)
print("Token\tRule Lemma")
for token in doc:
  print(f"{token} \t {token.lemma_}")


Original sentence: The cats are running and jumping playfully.
Token	Rule Lemma
The 	 the
cats 	 cat
are 	 be
running 	 run
and 	 and
jumping 	 jump
playfully 	 playfully
. 	 .


# RuleBasedLemmatizer & CorpusBasedLemmatizer

In [3]:
import re

class RuleBasedLemmatizer:
    def __init__(self):
        self.rules = [
            (r"(s|es|ies)$", ""),
            (r"([^aeiou])s$", r"\1"),
            (r"ing$", ""),
            (r"([aeiou].+)ed$", r"\1"),
            (r"([aeiou].+)ed$", r"\1"),
            (r"([aeiou].+)y$", r"\1"),
        ]
    
    def lemmatize(self, word):
        for rule in self.rules:
            pattern, replacement = rule
            if re.search(pattern, word):
                return re.sub(pattern, replacement, word)
        return word


class CorpusBasedLemmatizer:
    def __init__(self, corpus):
        self.lemmas = {}
        for line in corpus:
            word, lemma = line.strip().split("\t")
            self.lemmas[word] = lemma
    
    def lemmatize(self, word):
        return self.lemmas.get(word, word)


rule_lemmatizer = RuleBasedLemmatizer()
corpus = [
    "cats\tcat",
    "dogs\tdog",
    "running\trun",
    "jumping\tjump",
    "children\tchild",
]
corpus_lemmatizer = CorpusBasedLemmatizer(corpus)

words = ["cats", "running", "jumping", "children"]

for word in words:
    rule_lemma = rule_lemmatizer.lemmatize(word)
    corpus_lemma = corpus_lemmatizer.lemmatize(word)
    print(f"Word: {word}\tRule Lemma: {rule_lemma}\tCorpus Lemma: {corpus_lemma}")


Word: cats	Rule Lemma: cat	Corpus Lemma: cat
Word: running	Rule Lemma: runn	Corpus Lemma: run
Word: jumping	Rule Lemma: jump	Corpus Lemma: jump
Word: children	Rule Lemma: children	Corpus Lemma: child


# Morphological Analysis using Finite-State Transducers (FSTs)

Finite-State Transducers (FSTs) can be used for morphological analysis by encoding morphological rules and transformations into a computational model. Here's an example of how to use the OpenFST library in Python to construct and apply an FST for morphological analysis:

First, you'll need to install the OpenFST library. You can find installation instructions specific to your operating system on the OpenFST website (http://www.openfst.org).

Once OpenFST is installed, you can use the pywrapfst module in Python to work with FSTs. Here's an example code snippet:

In [17]:
!pip install -q pynini

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m161.3/161.3 MB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In this example, we use the pynini.Fst() class to create an FST and add transitions based on the provided rules. The morphological_analysis() function takes an input word and the morphological FST, composes them, and retrieves the morphological analysis using the shortest path algorithm.

Please note that pynini may have some differences in usage compared to hfst, but the basic concepts and functionality for working with FSTs remain similar. This example demonstrates a simple morphological analysis where "a" remains unchanged, "b" is replaced with "c," and "c" is replaced with "d" when the word "abc" is analyzed.

In [23]:
import pynini

def build_morphological_fst(rules):
    morph_fst = pynini.Fst()

    # Define the input and output symbol tables
    input_syms = pynini.SymbolTable()
    output_syms = pynini.SymbolTable()

    # Iterate over the rules and add transitions to the FST
    for rule in rules:
        input_label, output_label = rule

        # Add the labels to the symbol tables
        input_id = input_syms.add_symbol(input_label)
        output_id = output_syms.add_symbol(output_label)

        # Add a state and transition to the FST
        state = morph_fst.add_state()
        morph_fst.add_arc(state, pynini.Arc(input_id, output_id, None, state))

    # Set the start and final states
    morph_fst.set_start(0)
    morph_fst.set_final(morph_fst.add_state())

    # Set the input and output symbol tables
    morph_fst.set_input_symbols(input_syms)
    morph_fst.set_output_symbols(output_syms)

    return morph_fst

def morphological_analysis(word, morph_fst):
    input_fst = pynini.Fst()
    input_fst.add_state()
    input_fst.set_start(0)

    # Add transitions to the FST based on input word
    for i, char in enumerate(word):
        input_id = morph_fst.input_symbols().find(char)
        input_fst.add_arc(i, pynini.Arc(input_id, input_id, None, i + 1))
        input_fst.set_final(i + 1)

    # Compose the input FST with the morphological FST
    composed_fst = pynini.compose(input_fst, morph_fst)

    # Get the shortest path to extract the analysis
    shortest_path = pynini.shortestpath(composed_fst)

    # Extract the analysis results
    analysis = shortest_path.paths().ostrings()
    return analysis

# Example usage
rules = [
    ('a', 'a'),   # Rule: a -> a
    ('b', 'c'),   # Rule: b -> c
    ('c', 'd')    # Rule: c -> d
]

morph_fst = build_morphological_fst(rules)

word = "abc"
analysis = morphological_analysis(word, morph_fst)
print(f"Morphological analysis of '{word}': {analysis}")


FstIndexError: ignored