In [21]:
import nltk
from nltk.tokenize import word_tokenize # For tokenization
from nltk.stem import WordNetLemmatizer, PorterStemmer # For lemmatization
from nltk.tag import pos_tag # For POS tagging
from nltk.corpus import cmudict # For pronunciation
import spacy # For syntactic parsing
from spacy import displacy
from nltk.corpus import brown

In [22]:
# Load the CMU Pronouncing Dictionary once; get_pronunciation() indexes it by lowercased word
# and takes element [0] of the stored value as the first pronunciation variant.
# NOTE(review): presumably needs the `cmudict` corpus downloaded beforehand (nltk.download("cmudict")) — confirm
pronouncing_dict = cmudict.dict()

# Function to get pronunciation using CMU Pronouncing Dictionary
def get_pronunciation(words):
    """Return the first pronunciation variant of a word from the CMU dictionary.

    Args:
        words: The word to look up, as a single string. (The parameter name is
            plural for backward compatibility; it is not a list of words.)
            The lookup is case-insensitive because the word is lowercased
            before indexing ``pronouncing_dict``.

    Returns:
        The first entry stored for the word in ``pronouncing_dict`` (for
        "friend": ``['F', 'R', 'EH1', 'N', 'D']``), or
        ``["No pronunciation found"]`` when the word is not in the dictionary.
    """
    # Look up the argument itself. The previous body read the notebook-level
    # global `word`, so the function ignored whatever the caller passed in and
    # raised NameError whenever that global had not been defined.
    try:
        variants = pronouncing_dict[words.lower()]
    except KeyError:
        return ["No pronunciation found"]
    return variants[0]  # Returning the first pronunciation variant
        
# Example lookup; as the last expression of the cell, the result is displayed as the cell output
word = "friend"

get_pronunciation(word)


['F', 'R', 'EH1', 'N', 'D']

Pronunciation variants can significantly impact speech recognition or synthesis systems in several ways:
- Accent Variations: Different regions and speakers may pronounce words differently, leading to variations in pronunciation variants. Speech recognition systems need to be robust enough to recognize and interpret various accents accurately.
- Homophones: Words that sound the same but have different meanings (homophones) may have the same pronunciation variants. This ambiguity can introduce challenges for speech recognition systems in accurately understanding the intended word based on context.
- Ambiguous Pronunciations: Some words may have multiple valid pronunciation variants, leading to ambiguity in interpretation. Speech recognition systems need to handle such cases by considering contextual cues to determine the correct pronunciation.
- Error Handling: In cases where a word is not found in the pronunciation dictionary, speech recognition systems may need to fall back to alternative strategies, such as spell correction or context-based prediction, to accurately transcribe speech.
Overall, understanding and effectively handling pronunciation variants are essential for robust and accurate speech recognition and synthesis systems.

In [23]:
# Sample word forms (regular and irregular inflections) used to compare stemming with lemmatization
tokens = ["running", "ran", "runs", "leaves", "left", "leftover"]

# Stem word
def stem_word(word):
    """Return the Porter stem of ``word``.

    A fresh ``PorterStemmer`` is created on every call and its ``stem``
    result is returned unchanged.
    """
    return PorterStemmer().stem(word)
# Lemmatize word
def lemmatize_word(word, pos="n"):
    """Return the WordNet lemma of ``word`` for the given part of speech.

    Args:
        word: The word form to lemmatize.
        pos: WordNet part-of-speech tag passed to the lemmatizer. Defaults to
            ``"n"`` (noun), which matches the lemmatizer's own default, so
            existing single-argument calls behave exactly as before. Pass
            ``"v"`` to lemmatize verb forms such as "running" or "ran".

    Returns:
        The lemma returned by ``WordNetLemmatizer.lemmatize``.
    """
    lemmatizer = WordNetLemmatizer()
    # Forward the POS explicitly: without it every word is treated as a noun,
    # which is why verb forms come back unchanged by default.
    return lemmatizer.lemmatize(word, pos=pos)

# Print every sample token next to the result of stem_word()
print("Stemming:")
for token in tokens:
    stemmed_token = stem_word(token)
    print(f"{token}: {stemmed_token}")

# Print every sample token next to the result of lemmatize_word();
# the leading "\n" puts a blank line between the two listings
print("\nLemmatization:")
for token in tokens:
    lemmatized_token = lemmatize_word(token)
    print(f"{token}: {lemmatized_token}")


Stemming:
running: run
ran: ran
runs: run
leaves: leav
left: left
leftover: leftov

Lemmatization:
running: running
ran: ran
runs: run
leaves: leaf
left: left
leftover: leftover


Comparison:
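- Stemming: Stemming strips suffixes using heuristic rules to approximate a root form, which need not be a real word. For example, "running" and "runs" both stem to "run" and "leaves" becomes "leav", but the irregular form "ran" is left unchanged.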
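- Lemmatization: Lemmatization maps a word to its base or dictionary form (lemma) using a vocabulary and the word's part of speech. In the run above the lemmatizer is called without a POS tag, so every word is treated as a noun: "runs" becomes "run" and "leaves" becomes "leaf", while "running" and "ran" are returned unchanged. Supplying the verb tag (pos="v") would map both of them to "run".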

Preferred Scenarios:
- Stemming is generally faster and simpler than lemmatization, making it suitable for applications where speed is crucial, such as information retrieval or text classification.
- Lemmatization produces more accurate results by considering the context of words, which is important for tasks requiring precision, such as language understanding or sentiment analysis.
- Stemming may be preferred in scenarios where the goal is to reduce words to their base form for indexing or similarity calculations, while lemmatization may be preferred when maintaining the grammatical integrity and semantics of words is essential.
- In applications where the distinction between different forms of words (e.g., verb tense or pluralization) is important, lemmatization is usually preferred over stemming.

In [24]:

# Function for Context Analysis
def find_common_pos_context(corpus, top_n=10, tagger=None):
    """Count adjacent adjective/noun word pairs in a corpus and return the most frequent.

    Every sentence is POS-tagged, then each pair of neighbouring tokens is
    inspected. A pair is counted as adjective-noun when the first tag starts
    with ``JJ`` and the second with ``NN``, and as noun-adjective when the
    first starts with ``NN`` and the second with ``JJ``.

    Args:
        corpus: Iterable of sentences, each a sequence of token strings.
        top_n: How many of the most frequent pairs to return per category.
            Defaults to 10, the value that used to be hard-coded.
        tagger: Callable that takes one sentence and returns a sequence of
            ``(word, tag)`` pairs. Defaults to ``nltk.pos_tag``.

    Returns:
        A tuple ``(top_adj_noun_pairs, top_noun_adj_pairs)``. Each element is
        a list of ``((first_word, second_word), count)`` tuples sorted by
        descending count; pairs with equal counts keep first-seen order.
    """
    # Function-scope import so the cell does not depend on the import cell.
    from collections import Counter

    if tagger is None:
        tagger = nltk.pos_tag

    adj_noun_pairs = Counter()
    noun_adj_pairs = Counter()

    # Iterate over sentences in the corpus
    for sentence in corpus:
        tagged_sentence = list(tagger(sentence))

        # Walk over neighbouring (word, tag) pairs; sentences with fewer than
        # two tokens produce no pairs.
        for (current_word, current_pos), (next_word, next_pos) in zip(
            tagged_sentence, tagged_sentence[1:]
        ):
            # Adjective followed by a noun
            if current_pos.startswith('JJ') and next_pos.startswith('NN'):
                adj_noun_pairs[(current_word, next_word)] += 1
            # Noun followed by an adjective
            elif current_pos.startswith('NN') and next_pos.startswith('JJ'):
                noun_adj_pairs[(current_word, next_word)] += 1

    # most_common(n) yields the same ordering as a stable sort by descending
    # count followed by a slice of the first n items.
    return adj_noun_pairs.most_common(top_n), noun_adj_pairs.most_common(top_n)

# Analyze context: run the pair analysis over every sentence returned by brown.sents()
top_adj_noun_pairs, top_noun_adj_pairs = find_common_pos_context(brown.sents())

# Each entry is ((first_word, second_word), count), already sorted by descending count
print("Top Adjective-Noun Pairs:")
for pair, count in top_adj_noun_pairs:
    print(f"{pair}: {count}")

# The leading "\n" puts a blank line between the two listings
print("\nTop Noun-Adjective Pairs:")
for pair, count in top_noun_adj_pairs:
    print(f"{pair}: {count}")


Top Adjective-Noun Pairs:
('same', 'time'): 94
('last', 'year'): 68
('first', 'time'): 67
('other', 'hand'): 60
('fiscal', 'year'): 57
('last', 'night'): 56
('high', 'school'): 54
('old', 'man'): 52
('few', 'years'): 49
('young', 'man'): 47

Top Noun-Adjective Pairs:
('years', 'old'): 32
('nothing', 'more'): 16
('feet', 'high'): 11
('hearing', "officer's"): 8
('years', 'older'): 7
('electron', 'optical'): 6
('feet', 'tall'): 5
('feet', 'long'): 5
('something', 'less'): 5
('body', 'weight'): 5


- The analysis reveals the most common adjective-noun pairs and noun-adjective pairs found in the corpus.
- Understanding the context of POS tags can enhance natural language understanding systems by providing insights into the syntactic patterns and relationships between words in text.
- For example, identifying common adjective-noun pairs can help in tasks such as sentiment analysis or descriptive text generation, where adjectives often provide additional information about nouns.
- Similarly, recognizing common noun-adjective pairs can aid in tasks like topic modeling or text summarization, where nouns represent key entities or concepts and adjectives provide attributes or characteristics.
- By analyzing the context of POS tags, NLP systems can better comprehend the semantics and structure of text, leading to more accurate and nuanced language processing.

In [26]:
# Inspect the corpus structure: the printed result shows a list of sentences,
# each sentence being a list of token strings (punctuation included as tokens)
print(brown.sents())

[['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.'], ['The', 'jury', 'further', 'said', 'in', 'term-end', 'presentments', 'that', 'the', 'City', 'Executive', 'Committee', ',', 'which', 'had', 'over-all', 'charge', 'of', 'the', 'election', ',', '``', 'deserves', 'the', 'praise', 'and', 'thanks', 'of', 'the', 'City', 'of', 'Atlanta', "''", 'for', 'the', 'manner', 'in', 'which', 'the', 'election', 'was', 'conducted', '.'], ...]
