In [1]:
import nltk
from nltk import word_tokenize

In [2]:
text = word_tokenize("And now for something completely different")
print(nltk.pos_tag(text))

[('And', 'CC'), ('now', 'RB'), ('for', 'IN'), ('something', 'NN'), ('completely', 'RB'), ('different', 'JJ')]


In [3]:
text = nltk.Text(word.lower() for word in nltk.corpus.brown.words())
print(text.similar('woman'))

man time day year car moment world house family child country boy
state job place way war girl work word
None


In [6]:
tagged_token = nltk.tag.str2tuple('fly/NN')
print(tagged_token)

('fly', 'NN')
('And now for something completely', 'DIFFERENT')


In [7]:
print(nltk.corpus.brown.tagged_words())
print(nltk.corpus.brown.tagged_words(tagset='universal'))

[('The', 'AT'), ('Fulton', 'NP-TL'), ...]
[('The', 'DET'), ('Fulton', 'NOUN'), ...]


In [12]:
from nltk.corpus import brown
brown_news_tagged = brown.tagged_words(categories='news', tagset='universal')
tag_fd = nltk.FreqDist(tag for (word, tag) in brown_news_tagged)
print(tag_fd.most_common())

[('NOUN', 30654), ('VERB', 14399), ('ADP', 12355), ('.', 11928), ('DET', 11389), ('ADJ', 6706), ('ADV', 3349), ('CONJ', 2717), ('PRON', 2535), ('PRT', 2264), ('NUM', 2166), ('X', 92)]


In [13]:
from nltk.probability import ConditionalFreqDist
from nltk.tokenize import word_tokenize
sent = "the the the dog dog some other words that we do not care about"
cfdist = ConditionalFreqDist()
for word in word_tokenize(sent):
    condition = len(word)
    cfdist[condition][word] += 1

# Task 4
Find all POS tags for word ‘promised’ in the Brown POS-tagged corpus (available in NLTK).
The lecture slides provide some suggestions as how to start. How long did it take to process the
whole corpus? Compare that with your colleagues – is your solution efficient.

In [9]:
text = nltk.Text(word.lower() for word in nltk.corpus.brown.words())
text_tags = nltk.pos_tag(text)
promised_POS = []
for tag in text_tags:
    if tag[0] == 'promised':
        promised_POS.append(tag[1])
print(promised_POS)

['VBD', 'VBD', 'VBD', 'VBD', 'VBD', 'VBN', 'VBN', 'VBD', 'VBN', 'VBD', 'VBN', 'VBD', 'VBN', 'VBN', 'VBD', 'VBN', 'VBN', 'VBN', 'VBD', 'VBN', 'VBN', 'VBN', 'VBN', 'JJ', 'VBN', 'VBN', 'VBD', 'VBD', 'VBD', 'VBN', 'VBD', 'VBN', 'VBN', 'VBN', 'VBN', 'VBD', 'VBD', 'VBD', 'VBD', 'VBD', 'VBD', 'VBD', 'VBN', 'VBN', 'VBD']


In [10]:
import numpy as np
text = nltk.Text(word.lower() for word in nltk.corpus.brown.words())
text_tags = nltk.pos_tag(text)
a = np.array(text_tags)
print(a)

[['the' 'DT']
 ['fulton' 'NN']
 ['county' 'NN']
 ...
 ['was' 'VBD']
 ['stupefying' 'VBG']
 ['.' '.']]


# Task 5
Use the POS-tagged Brown corpus (available in NLTK) to estimate word likelihood and tag transition probabilities you would need to be able to disambiguate which of the following two POS tagging results is more likely:

    (1) People/NNS continue/VB to/TO inquire/VB the/DT reason/NN for/IN the/AT race/NN for/IN outer/JJ space/NN
    
    (2) People/NNS continue/VB to/TO inquire/VB the/DT reason/NN for/IN the/AT race/VB for/IN outer/JJ space/NN

If necessary, you can use add-one smoothing to estimate the probabilities of words not seen in the corpus.

What do the probabilities you have obtained tell you – which of the two POS tags (NN or VB) for word ‘race’ are more likely in this context? Is that the correct POS tag?

In [26]:
import nltk
from nltk.corpus import brown
from nltk.probability import FreqDist, ConditionalFreqDist

nltk.download('brown')
brown_corpus = brown.tagged_sents()
word_tag_fd = nltk.FreqDist()
tag_transition_fd = nltk.FreqDist()


[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\neoni\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!


In [27]:
for sentence in brown_corpus:
    previous_tag = None
    for word, tag in sentence:
        word_tag_fd[(word, tag)] += 1
        if (previous_tag != None):
            tag_transition_fd[(previous_tag, tag)] += 1
        previous_tag = tag

In [31]:
def calculate_word_likelihood(word, tag):
    # Smoothing parameters
    V = len(word_tag_fd)  # Vocabulary size
    N = sum(word_tag_fd.values())  # Total word-tag pairs
    smoothing_term = 1.0 / (N + V)

    # Calculate smoothed probability
    return (word_tag_fd[(word, tag)] + 1) / (N + V) if (word, tag) in word_tag_fd else smoothing_term

# Calculate tag transition probabilities with add-one smoothing
def calculate_tag_transition_probability(tag1, tag2):
    # Smoothing parameters
    tags = set(tag for (_, tag) in tag_transition_fd)
    V = len(tags)  # Number of unique tags
    N = sum(tag_transition_fd.values())  # Total tag transitions
    smoothing_term = 1.0 / (N + V)

    # Calculate smoothed probability
    return (tag_transition_fd[(tag1, tag2)] + 1) / (N + V) if (tag1, tag2) in tag_transition_fd else smoothing_term

In [32]:
def calc_sentence_probability(sentence):
    word_tag_pairs = [word_tag_pair.split('/') for word_tag_pair in sentence]
    sentence_probability = 1
    previous_tag = None
    for word, tag in word_tag_pairs:
        word_probability = calculate_word_likelihood(word, tag)
        if (previous_tag == None):
            sentence_probability *= word_probability
        else:
            transition_probability = calculate_tag_transition_probability(previous_tag, tag)
            sentence_probability *= word_probability * transition_probability
        previous_tag = tag
    return sentence_probability

In [50]:
sentence1 = ["People/NNS", "continue/VB", "to/TO", "inquire/VB", "the/DT", "reason/NN", "for/IN", "the/AT", "race/NN", "for/IN", "outer/JJ", "space/NN"]
sentence2 = ["People/NNS", "continue/VB", "to/TO", "inquire/VB", "the/DT", "reason/NN", "for/IN", "the/AT", "race/VB", "for/IN", "outer/JJ", "space/NN"]
probability1 = calc_sentence_probability(sentence1)
probability2 = calc_sentence_probability(sentence2)
print(probability1, probability2)
if probability1 > probability2:
    print("Tagging Result 1 is more likely.")
elif probability2 > probability1:
    print("Tagging Result 2 is more likely.")
else:
    print("Both tagging results are equally likely.")

1.832576371905368e-67 3.8057610018359524e-73
Tagging Result 1 is more likely.


# New weird idea

In [2]:

import nltk
from nltk.probability import FreqDist, ConditionalFreqDist
text = nltk.Text(word.lower() for word in nltk.corpus.brown.words())

In [3]:
text_tags = nltk.pos_tag(text)

In [None]:
print(len(text_tags))