<a href="https://colab.research.google.com/github/NeoZ666/classroom_NLP/blob/main/NLP_exp6_25.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Known inflected forms, grouped under the lemma each one reduces to
_FORMS_BY_LEMMA = {
    "run": ("running", "ran"),
    "eat": ("eaten", "ate"),
    "good": ("better",),
    "bad": ("worse",),
    "happy": ("happier", "happiest"),
    "cat": ("cats",),
    "watch": ("watches",),
}

# Flatten the groups into the form -> lemma lookup table used for exact matches
lemma_dict = {
    form: lemma
    for lemma, forms in _FORMS_BY_LEMMA.items()
    for form in forms
}

# Suffix -> tag rules, tried in order; the first matching suffix wins
_SUFFIX_TAGS = (
    ('ing', 'VBG'),  # present participle / gerund
    ('ed', 'VBD'),   # past tense
    ('s', 'NNS'),    # plural noun (any word ending in 'es' also ends in 's')
)

# Adjective forms recognised only by exact match
_GRADED_ADJECTIVES = ("better", "worse", "happier", "happiest")


def pos_tag(word):
    """Return a coarse part-of-speech tag guessed from the spelling of *word*.

    The suffix rules are checked first, in the order listed in
    ``_SUFFIX_TAGS``; after that the word is compared against a short list of
    comparative/superlative adjectives, which are all tagged ``'JJR'``.
    Anything that matches neither is tagged ``'NN'``.
    """
    for suffix, tag in _SUFFIX_TAGS:
        if word.endswith(suffix):
            return tag
    if word in _GRADED_ADJECTIVES:
        return 'JJR'
    return 'NN'

# Function to lemmatize a word with POS tagging
def lemmatize(word):
    """Return a rough lemma for *word*.

    Resolution order:
      1. An exact match in ``lemma_dict`` is returned as-is.
      2. Otherwise the tag from ``pos_tag(word)`` selects a suffix-stripping
         rule (``ing``/``ed`` for verbs, ``es``/``s`` for plural nouns,
         ``est``/``er`` for adjectives).
      3. If no rule fires, the word is returned unchanged.

    The verb and plural rules only fire when at least two characters would
    remain after stripping, so very short inputs are left alone.

    Args:
        word: A single token. It is matched exactly as given; no case
            folding or punctuation stripping is performed here.

    Returns:
        The lemma string, or ``word`` itself when nothing applies.
    """
    # Step 1: known forms take priority over the heuristics
    if word in lemma_dict:
        return lemma_dict[word]

    # Step 2: guess the part of speech from the spelling
    pos = pos_tag(word)

    # Step 3: strip the suffix that belongs to that part of speech
    if pos in ('VBG', 'VBD'):  # Verbs
        if word.endswith('ing') and len(word) > 4:
            return word[:-3]
        if word.endswith('ed') and len(word) > 3:
            return word[:-2]
    elif pos == 'NNS':  # Plural nouns
        # Same minimum-stem guard as the other rules; without it the
        # two-letter input "es" was reduced to an empty string.
        if word.endswith('es') and len(word) > 3:
            return word[:-2]
        if word.endswith('s') and len(word) > 2:
            return word[:-1]
    elif pos == 'JJR':  # Comparative/Superlative adjectives
        # 'est' is three characters long, so it needs its own slice;
        # removing only two would leave a trailing 'e' behind.
        if word.endswith('est'):
            return word[:-3]
        if word.endswith('er'):
            return word[:-2]

    # Step 4: no rule applied
    return word

# Demo: run the lemmatizer over a handful of sample words
words = ["running", "ran", "eaten", "ate", "better", "happier", "cats", "watches", "played", "thinking"]
lemmatized_words = list(map(lemmatize, words))

# Tabulate each sample word next to its lemma and heuristic tag
from prettytable import PrettyTable

table = PrettyTable()
table.field_names = ["Original Word", "Lemmatized Word", "Part of Speech"]

for word, lemma in zip(words, lemmatized_words):
    pos = pos_tag(word)
    table.add_row([word, lemma, pos])

print(table)

# Demo: lemmatize a sentence token by token (tokens come from a plain
# whitespace split, so punctuation stays attached to its word)
sentence = "The happy cats were thinking about running and playing in the garden."
lemmatized_sentence = " ".join(map(lemmatize, sentence.split()))
print(f"Original Sentence: {sentence}")
print(f"Lemmatized Sentence: {lemmatized_sentence}")


In [5]:
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from collections import Counter

# Step 2: Download the NLTK resources this cell relies on: 'punkt' for
# word_tokenize and 'averaged_perceptron_tagger' for pos_tag.
# NOTE(review): newer NLTK releases appear to look for 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' instead -- confirm against the installed
# NLTK version before relying on these names.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# Step 3: Define the sample text to analyse. It is a single multi-line
# string; the line breaks and the blank line inside it are part of the value.
paragraph = """The Faludi Ferenc Jesuit Academy launched in 2022 a new dialogue between faith and science through a series of eight “mirror” conferences held between January and June. The novelty of this dialogue is that it brought to the same table representatives of religious institutions and men and women with a scientific background to debate on selected topics relevant both for the protection of creation and sustainable development. The following topics were selected for social reflection and debate: partnership and dialogue, green economy, sustainable lifestyle, climate change, poverty, sustainable communities, environmental change, social justice. The original approach of these series of mirror conferences, entitled “Forum for an Integral Ecology,” was embedded in the format of all the conferences, so that each selected topic was addressed by two specialists, one presenting the religious angle and the other the scientific point of view. Interactive debates open to the public, people attending in situ as well as those participating on line, followed the keynote presentations. Through this dynamic, we tried to generate a social reflection on the created world and sustainable development.
The result of this first round of debates was the book entitled Integral Ecology. Dialogue between faith and science in the spirit of Laudato si’, published by the Jesuit Printing House in Budapest. It includes 16 reflections on the eight major topics selected for the forum. Each chapter of the publication offers many ways of finding authentic individual and collective answers to the multiple socio-economic and ecological crises, in the specific cultural and territorial context of Hungary.
The need to supplement the three classical dimensions of sustainable development (the social, economic, and environmental dimensions) with a spiritual dimension, adding specific Christian values to each sustainable development goal, as defined in the 2030 Agenda of the United Nations. Sustainable Development Goals without clearly assumed or agreed upon values cannot mobilize individuals or lead to collective actions. This “supplement” will help provide clearly defined orientation for institutions. Faith provides, in these circumstances, an immanent motivation for the authentic enforcement of the protection of creation by giving us an internal moral guidance. This spiritual dimension of sustainability may lead to the ecological conversion stressed in the papal encyclical Laudato si’.

Ignatian spirituality could play a special role in giving shape to the spiritual dimension of sustainability, specifically through the Spiritual Exercises. The Ignatian method and orientations may help to distinguish between ecological sins and ecological virtues. The strengthening of the relationship between God, humanity, and nature may help in a meaningful ecological conversion process."""

# Step 4: split the paragraph into tokens and attach a tag to each one
tokens = word_tokenize(paragraph)
tagged_tokens = pos_tag(tokens)

# Step 5: tally how often each fine-grained tag occurs
tag_counts = Counter(tag for _, tag in tagged_tokens)

# Notebook display of the raw per-tag counts
tag_counts

# Coarse display categories, each listing the fine-grained tags it absorbs
pos_categories = {
    'VERB': ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'],
    'NOUN': ['NN', 'NNS', 'NNP', 'NNPS'],
    'ADJ': ['JJ', 'JJR', 'JJS'],
    'ADV': ['RB', 'RBR', 'RBS'],
    'CONJ': ['CC'],
    'PRON': ['PRP', 'PRP$', 'WP', 'WP$'],
    'DET': ['DT'],
    'ADP': ['IN'],
    'PART': ['RP'],
    'NUM': ['CD'],
}

# Roll the per-tag counts up into one total per category
pos_summary = {}
for pos_category, tags in pos_categories.items():
    # A Counter reports 0 for tags that never occurred, so categories with
    # no matching tokens still get an entry.
    pos_summary[pos_category] = sum(tag_counts[tag] for tag in tags)

# Notebook display of the summarized counts
pos_summary


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


{'VERB': 46,
 'NOUN': 142,
 'ADJ': 64,
 'ADV': 6,
 'CONJ': 19,
 'PRON': 4,
 'DET': 59,
 'ADP': 62,
 'PART': 0,
 'NUM': 8}

In [6]:
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from collections import defaultdict
import json

# Fetch the NLTK resources needed for tokenizing and tagging
for _resource in ('punkt', 'averaged_perceptron_tagger'):
    nltk.download(_resource)

# Short sample text to classify
paragraph = (
    "The Faludi Ferenc Jesuit Academy launched in 2022 a new dialogue "
    "between faith and science through a series of eight “mirror."
)

# Split the text into tokens, then tag every token
tokens = word_tokenize(paragraph)
tagged_tokens = pos_tag(tokens)

# Coarse categories and the fine-grained tags that belong to each
pos_categories = {
    'VERB': ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'],
    'NOUN': ['NN', 'NNS', 'NNP', 'NNPS'],
    'ADJ': ['JJ', 'JJR', 'JJS'],
    'ADV': ['RB', 'RBR', 'RBS'],
    'CONJ': ['CC'],
    'PRON': ['PRP', 'PRP$', 'WP', 'WP$'],
    'DET': ['DT'],
    'ADP': ['IN'],
    'PART': ['RP'],
    'NUM': ['CD'],
}


# Function to classify POS tags
def classify_pos(tag):
    """Return the coarse category name that lists *tag*.

    Categories are searched in the order they appear in ``pos_categories``;
    a tag that no category lists is reported as ``'OTHER'``.
    """
    matches = (
        category
        for category, members in pos_categories.items()
        if tag in members
    )
    return next(matches, 'OTHER')

# Pair every token with the coarse category of its tag
classified_words = [
    dict(word=word, pos=classify_pos(tag))
    for word, tag in tagged_tokens
]

# Persist the result as indented JSON
with open('classified_words.json', 'w') as json_file:
    json_file.write(json.dumps(classified_words, indent=4))

# Notebook display of the classified words (for verification)
classified_words


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


[{'word': 'The', 'pos': 'DET'},
 {'word': 'Faludi', 'pos': 'NOUN'},
 {'word': 'Ferenc', 'pos': 'NOUN'},
 {'word': 'Jesuit', 'pos': 'NOUN'},
 {'word': 'Academy', 'pos': 'NOUN'},
 {'word': 'launched', 'pos': 'VERB'},
 {'word': 'in', 'pos': 'ADP'},
 {'word': '2022', 'pos': 'NUM'},
 {'word': 'a', 'pos': 'DET'},
 {'word': 'new', 'pos': 'ADJ'},
 {'word': 'dialogue', 'pos': 'NOUN'},
 {'word': 'between', 'pos': 'ADP'},
 {'word': 'faith', 'pos': 'NOUN'},
 {'word': 'and', 'pos': 'CONJ'},
 {'word': 'science', 'pos': 'NOUN'},
 {'word': 'through', 'pos': 'ADP'},
 {'word': 'a', 'pos': 'DET'},
 {'word': 'series', 'pos': 'NOUN'},
 {'word': 'of', 'pos': 'ADP'},
 {'word': 'eight', 'pos': 'NUM'},
 {'word': '“', 'pos': 'NOUN'},
 {'word': 'mirror', 'pos': 'NOUN'},
 {'word': '.', 'pos': 'OTHER'}]