In [1]:
import nltk
from nltk.corpus import brown

nltk.download('brown')

# Load the tagged sentences from the 'news' category
news_sentences = brown.tagged_sents(categories='news')

# Calculate the split index for the last 10% of the sentences
split_index = int(len(news_sentences) * 0.9)

# Divide the sentences into training and test sets
train_set = news_sentences[:split_index]
test_set = news_sentences[split_index:]

# Optionally, you can print the sizes of the train and test sets
print(f"Training set size: {len(train_set)} sentences")
print(f"Test set size: {len(test_set)} sentences")

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\inbar\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!


Training set size: 4160 sentences
Test set size: 463 sentences


In [39]:
train_set

[[('The', 'AT'), ('Fulton', 'NP-TL'), ('County', 'NN-TL'), ('Grand', 'JJ-TL'), ('Jury', 'NN-TL'), ('said', 'VBD'), ('Friday', 'NR'), ('an', 'AT'), ('investigation', 'NN'), ('of', 'IN'), ("Atlanta's", 'NP$'), ('recent', 'JJ'), ('primary', 'NN'), ('election', 'NN'), ('produced', 'VBD'), ('``', '``'), ('no', 'AT'), ('evidence', 'NN'), ("''", "''"), ('that', 'CS'), ('any', 'DTI'), ('irregularities', 'NNS'), ('took', 'VBD'), ('place', 'NN'), ('.', '.')], [('The', 'AT'), ('jury', 'NN'), ('further', 'RBR'), ('said', 'VBD'), ('in', 'IN'), ('term-end', 'NN'), ('presentments', 'NNS'), ('that', 'CS'), ('the', 'AT'), ('City', 'NN-TL'), ('Executive', 'JJ-TL'), ('Committee', 'NN-TL'), (',', ','), ('which', 'WDT'), ('had', 'HVD'), ('over-all', 'JJ'), ('charge', 'NN'), ('of', 'IN'), ('the', 'AT'), ('election', 'NN'), (',', ','), ('``', '``'), ('deserves', 'VBZ'), ('the', 'AT'), ('praise', 'NN'), ('and', 'CC'), ('thanks', 'NNS'), ('of', 'IN'), ('the', 'AT'), ('City', 'NN-TL'), ('of', 'IN-TL'), ('Atlant

In [2]:
import pandas as pd
import ex3

In [3]:
list_of_pd = []
for i in range(len(train_set)):
    list_of_pd.append(pd.DataFrame(train_set[i]))

In [4]:
list_of_pd[1]

Unnamed: 0,0,1
0,The,AT
1,jury,NN
2,further,RBR
3,said,VBD
4,in,IN
5,term-end,NN
6,presentments,NNS
7,that,CS
8,the,AT
9,City,NN-TL


In [5]:
df_train = pd.concat(list_of_pd)

In [6]:
df_train.columns = ["name","tag"]

In [7]:
count_per_tag = df_train.groupby(["name", "tag"]).size().reset_index(name='appearances')


In [8]:
count_per_word = df_train.groupby(["name"]).size().reset_index(name='total_word_appearances')


In [9]:
count_per_tag = count_per_tag.merge(count_per_word,on ="name",how = "left")

In [10]:
count_per_tag["rate"] = count_per_tag["appearances"] /count_per_tag["total_word_appearances"]

In [11]:
count_dict = {}

for _, row in count_per_tag.iterrows():
    word = row['name']
    tag = row['tag']
    appearances = row['rate']
    
    if word not in count_dict:
        count_dict[word] = []
    
    # Append the (tag, appearances) tuple
    count_dict[word].append((tag, appearances))

# Sort each list of tags by appearances in descending order
for word in count_dict:
    count_dict[word] = sorted(count_dict[word], key=lambda x: x[1], reverse=True)


In [12]:
list_of_pd_test = []
for i in range(len(test_set)):
    list_of_pd_test.append(pd.DataFrame(test_set[i]))
df_test = pd.concat(list_of_pd_test)

In [13]:
df_test.columns = ["name","tag"]

In [14]:
def get_prob(word):
    if count_dict.get(word):
        return count_dict[word][0][0]
    else:
        return "NN"

In [15]:
df_test["predicted tag"] = df_test.apply(lambda x:get_prob(x["name"]),axis = 1)

In [16]:
accuracy = len(df_test[df_test["tag"] == df_test["predicted tag"]]) / len(df_test)

In [17]:
accuracy

0.8377354729393003

In [18]:
error_rate = 1 - accuracy

In [19]:
error_rate

0.16226452706069971

In [20]:
train_set[0]

[('The', 'AT'),
 ('Fulton', 'NP-TL'),
 ('County', 'NN-TL'),
 ('Grand', 'JJ-TL'),
 ('Jury', 'NN-TL'),
 ('said', 'VBD'),
 ('Friday', 'NR'),
 ('an', 'AT'),
 ('investigation', 'NN'),
 ('of', 'IN'),
 ("Atlanta's", 'NP$'),
 ('recent', 'JJ'),
 ('primary', 'NN'),
 ('election', 'NN'),
 ('produced', 'VBD'),
 ('``', '``'),
 ('no', 'AT'),
 ('evidence', 'NN'),
 ("''", "''"),
 ('that', 'CS'),
 ('any', 'DTI'),
 ('irregularities', 'NNS'),
 ('took', 'VBD'),
 ('place', 'NN'),
 ('.', '.')]

In [21]:
from collections import defaultdict

def compute_transition_emission_probabilities(train_set):
    # Transition and emission counts
    transition_counts = defaultdict(lambda: defaultdict(int))
    emission_counts = defaultdict(lambda: defaultdict(int))
    tag_counts = defaultdict(int)

    # Process each sentence
    for sentence in train_set:
        previous_tag = '<s>'  # Start-of-sentence tag
        for word, tag in sentence:
            # Update counts
            transition_counts[previous_tag][tag] += 1
            emission_counts[tag][word] += 1
            tag_counts[tag] += 1
            previous_tag = tag
        # Handle end-of-sentence transition
        transition_counts[previous_tag]['</s>'] += 1

    # Compute probabilities
    transition_probabilities = {}
    for prev_tag, next_tags in transition_counts.items():
        total = sum(next_tags.values())
        transition_probabilities[prev_tag] = {tag: count / total for tag, count in next_tags.items()}

    emission_probabilities = {}
    for tag, words in emission_counts.items():
        total = tag_counts[tag]
        emission_probabilities[tag] = {word: count / total for word, count in words.items()}

    return transition_probabilities, emission_probabilities

In [22]:
transition_probabilities, emission_probabilities = compute_transition_emission_probabilities(train_set)

In [23]:
transition_probabilities

{'<s>': {'AT': 0.18798076923076923,
  '``': 0.062259615384615385,
  'PPS': 0.07475961538461538,
  'NN-HL': 0.015625,
  'WRB': 0.010576923076923078,
  'NN': 0.028365384615384615,
  'RB': 0.03894230769230769,
  'NNS-HL': 0.007211538461538462,
  'IN': 0.06826923076923076,
  'VB-HL': 0.001201923076923077,
  'CD': 0.014423076923076924,
  '(': 0.005048076923076923,
  'NP': 0.13317307692307692,
  'NN-TL': 0.022115384615384617,
  'PP$': 0.014663461538461538,
  'PPSS': 0.021875,
  'NNS': 0.02548076923076923,
  'NP$': 0.0069711538461538465,
  'AT-HL': 0.0016826923076923078,
  'CS': 0.02283653846153846,
  'CC': 0.03413461538461538,
  'NP-HL': 0.019471153846153846,
  '--': 0.018028846153846152,
  'AP': 0.0125,
  'EX': 0.010336538461538461,
  'VBG': 0.012740384615384615,
  'PPSS+BEM': 0.0016826923076923078,
  'DT': 0.01971153846153846,
  'JJ': 0.01730769230769231,
  'ABN': 0.0033653846153846156,
  'DTS': 0.0038461538461538464,
  'VBZ-HL': 0.005048076923076923,
  'VBN': 0.005528846153846154,
  'VBD-

In [24]:
emission_probabilities

{'AT': {'The': 0.0855508579955235,
  'an': 0.03320069634419299,
  'no': 0.011937329022631187,
  'the': 0.622855011191246,
  'a': 0.22767968167122607,
  'A': 0.014175578214374533,
  'every': 0.002611290723700572,
  'An': 0.0011191245958716736,
  'Every': 0.0002486943546381497,
  'No': 0.0006217358865953743},
 'NP-TL': {'Fulton': 0.014534883720930232,
  'Atlanta': 0.005813953488372093,
  'Grady': 0.005813953488372093,
  'Georgia': 0.00872093023255814,
  'Jackson': 0.0014534883720930232,
  'Miller': 0.0029069767441860465,
  'Colquitt': 0.0014534883720930232,
  'Texas': 0.015988372093023256,
  'Dallas': 0.015988372093023256,
  'Beaumont': 0.0014534883720930232,
  'Lamar': 0.0029069767441860465,
  'Texan': 0.0014534883720930232,
  'Wise': 0.0014534883720930232,
  'Paris': 0.0014534883720930232,
  'Oklahoma': 0.00436046511627907,
  'Rhode': 0.01308139534883721,
  'Massachusetts': 0.00436046511627907,
  'U.S.': 0.01308139534883721,
  'Denton': 0.0014534883720930232,
  'York': 0.0625,
  'St.':

In [28]:
freq_threshold = 5

In [33]:
unknown_words = df_test[~df_test["name"].isin(df_train["name"])]["name"].tolist()
low_freq_words = df_test.groupby("name").value_counts().to_frame("cnt").query("cnt < @freq_threshold").reset_index()["name"].tolist()



In [36]:
words_for_pseudo_tagging = low_freq_words+unknown_words

In [37]:
df_test.groupby("name").value_counts().to_frame("cnt").query("cnt < @freq_threshold").reset_index()

Unnamed: 0,name,tag,predicted tag,cnt
0,"$10,000-per-year",NNS,NN,1
1,"$100,000",NNS,NN,1
2,$139.3,NNS,NN,1
3,"$16,000",NNS,NN,1
4,$2.80,NNS,NN,1
...,...,...,...,...
2752,you,PPSS,PPSS,1
2753,young,JJ,JJ,2
2754,your,PP$,PP$,2
2755,youth,NN,NN,2


In [52]:
from ex3 import *
import ex3
import imp
imp.reload(ex3)

<module 'ex3' from 'C:\\Users\\inbar\\Documents\\nlp\\nlp-ex3\\ex3.py'>

In [51]:
for sentence in test_set:
    for word, tag in sentence:
        print(word)

But
in
all
its
175
years
,
not
a
single
Negro
student
has
entered
its
classrooms
.
Last
week
Federal
District
Judge
William
A.
Bootle
ordered
the
university
to
admit
immediately
a
``
qualified
''
Negro
boy
and
girl
.
Their
entry
will
crack
the
total
segregation
of
all
public
education
,
from
kindergarten
through
graduate
school
,
in
Georgia
--
and
in
Alabama
,
Mississippi
and
South
Carolina
as
well
.
For
18
months
,
Hamilton
Holmes
,
19
,
and
Charlayne
Hunter
,
18
,
had
tried
to
get
into
the
university
.
They
graduated
together
from
Atlanta's
Turner
High
School
,
where
Valedictorian
Holmes
was
first
in
the
class
and
Charlayne
third
.
The
university
rejected
them
on
a
variety
of
pretexts
,
but
was
careful
never
to
mention
the
color
of
their
skins
.
Holmes
went
to
Atlanta's
Morehouse
(
Negro
)
College
,
where
he
is
a
B
student
and
star
halfback
.
Charlayne
studied
journalism
at
Detroit's
Wayne
State
University
.
Last
fall
,
after
they
took
their
hopes
for
entering
Georgia
to
court
,
Judg

In [53]:
pseudo_train, pseudo_test = ex3.create_pseudo_words(train_set, test_set, 5)


AttributeError: 'NoneType' object has no attribute 'seek'

In [59]:
words_for_pseudo_tagging = get_unknown_and_low_freq_words(df_train, df_test, freq_threshold)
# Assign pseudo-words to each word
pseudo_words = {word: assign_pseudo_word(word) for word in words_for_pseudo_tagging}

In [62]:
new_train = []
for sentence in train_set:
    new_sentence = []
    for word, tag in sentence:

        if word in words_for_pseudo_tagging:
            # Replace word with its pseudo-word and retain the tag
            new_sentence.append((word, pseudo_words[word]))
        else:
            # Keep the original word and tag
            new_sentence.append((word, tag))
    new_train.append(new_sentence)

new_test = []
try:
    for i, sentence in enumerate(test_set):
        new_sentence = []
        # print(f"Sentenec is {sentence}, index {i}")
        for word, tag in sentence:
            # print(f"Processing word: {word}, tag: {tag}")
            if word in words_for_pseudo_tagging:
                # Replace word with its pseudo-word and retain the tag
                new_sentence.append((word, pseudo_words[word]))
                # If the word is unknown (from test set) and not in the training set,
                # add it to training data
                if word not in train_set:
                    new_train.append([(word, pseudo_words[word])])
            else:
                # Keep the original word and tag
                new_sentence.append((word, tag))
        new_test.append(new_sentence)
except Exception as err:
    print(f"the error is {err}")
    print(f"idx is {i}")
    print(f"train set in the point is {test_set[i]}")
    print(f"sentence is {sentence}")


the error is 'NoneType' object has no attribute 'seek'
idx is 139
train set in the point is [('Though', 'CS'), ('President', 'NN-TL'), ('John', 'NP'), ('F.', 'NP'), ('Kennedy', 'NP'), ('was', 'BEDZ'), ('primarily', 'RB'), ('concerned', 'VBN'), ('with', 'IN'), ('the', 'AT'), ('crucial', 'JJ'), ('problems', 'NNS'), ('of', 'IN'), ('Berlin', 'NP'), ('and', 'CC'), ('disarmament', 'NN'), ('adviser', 'NN'), ("McCloy's", 'NP$'), ('unexpected', 'JJ'), ('report', 'NN'), ('from', 'IN'), ('Khrushchev', 'NP'), (',', ','), ('his', 'PP$'), ('new', 'JJ'), ('enthusiasm', 'NN'), ('and', 'CC'), ('reliance', 'NN'), ('on', 'IN'), ('personal', 'JJ'), ('diplomacy', 'NN'), ('involved', 'VBD'), ('him', 'PPO'), ('in', 'IN'), ('other', 'AP'), ('key', 'NN'), ('problems', 'NNS'), ('of', 'IN'), ('U.S.', 'NP'), ('foreign', 'JJ'), ('policy', 'NN'), ('last', 'AP'), ('week', 'NN'), ('.', '.')]
sentence is [('Though', 'CS'), ('President', 'NN-TL'), ('John', 'NP'), ('F.', 'NP'), ('Kennedy', 'NP'), ('was', 'BEDZ'), ('prim

In [None]:
test_set[i]

In [56]:
test_set[44]

[('The', 'AT'),
 ('religions', 'NNS'),
 ('of', 'IN'),
 ('the', 'AT'),
 ('people', 'NNS'),
 ('include', 'VB'),
 ('Christianity', 'NP'),
 (',', ','),
 ('Mohammedanism', 'NP'),
 (',', ','),
 ('paganism', 'NN'),
 (',', ','),
 ('ancestor', 'NN'),
 ('worship', 'NN'),
 ('and', 'CC'),
 ('animism', 'NN'),
 ('.', '.')]

In [106]:
import ex3
import imp
imp.reload(ex3)

<module 'ex3' from 'C:\\Users\\inbar\\Documents\\nlp\\nlp-ex3\\ex3.py'>

In [65]:
transition_probabilities, emission_probabilities = ex3.compute_transition_emission_probabilities(train_set)

In [68]:
len(emission_probabilities.keys())

212

In [107]:
transition_probabilities, emission_probabilities = ex3.compute_transition_emission_probabilities(
    train_set, smoothing_param=1)
# all_tags = ex3.get_all_tags(train_set)
# train_words = set(word for sentence in train_set for word, _ in sentence)
# ex3.run_viterbi_on_test_set(test_set, transition_probabilities, emission_probabilities, all_tags,
#                         train_words)

In [109]:
emission_probabilities

{'AT': {'The': 0.08556880278191753,
  'an': 0.033283656234475906,
  'no': 0.01204669647292598,
  'the': 0.6222056631892697,
  'a': 0.2275211127670144,
  'A': 0.014282165921510184,
  'every': 0.00273224043715847,
  'An': 0.0012419274714356682,
  'Every': 0.00037257824143070045,
  'No': 0.0007451564828614009},
 'NP-TL': {'Fulton': 0.010466222645099905,
  'Atlanta': 0.004757373929590866,
  'Grady': 0.004757373929590866,
  'Georgia': 0.006660323501427212,
  'Jackson': 0.0019029495718363464,
  'Miller': 0.0028544243577545195,
  'Colquitt': 0.0019029495718363464,
  'Texas': 0.011417697431018078,
  'Dallas': 0.011417697431018078,
  'Beaumont': 0.0019029495718363464,
  'Lamar': 0.0028544243577545195,
  'Texan': 0.0019029495718363464,
  'Wise': 0.0019029495718363464,
  'Paris': 0.0019029495718363464,
  'Oklahoma': 0.003805899143672693,
  'Rhode': 0.009514747859181731,
  'Massachusetts': 0.003805899143672693,
  'U.S.': 0.009514747859181731,
  'Denton': 0.0019029495718363464,
  'York': 0.04186489

In [91]:
smoothing_param = 1
END_TAG= ex3.END_TAG
START_TAG = ex3.START_TAG

In [92]:
transition_counts = defaultdict(lambda: defaultdict(int))
emission_counts = defaultdict(lambda: defaultdict(int))
tag_counts = defaultdict(int)
word_set = set()

# Process each sentence
for sentence in train_set:
    previous_tag = START_TAG
    for word, tag in sentence:
        # Update counts
        transition_counts[previous_tag][tag] += 1
        emission_counts[tag][word] += 1
        tag_counts[tag] += 1
        previous_tag = tag

        if smoothing_param:
            word_set.add(word)

    # Handle end-of-sentence transition
    transition_counts[previous_tag][END_TAG] += 1

In [105]:
tag_counts[tag]

3970

In [103]:
emission_counts[tag]["?"]+1 /3974

82.0002516356316

In [99]:
sum(emission_counts[tag].values())+smoothing_param*len(emission_counts[tag])

3974

In [75]:
len(transition_probabilities[ex3.START_TAG])

106

In [78]:
df_test[~df_test["tag"].isin(df_train["tag"])]

Unnamed: 0,name,tag,predicted tag
14,la,FW-AT,NN
19,le,FW-AT,NN
9,Pas,FW-*,NN
10,une,FW-CD,NN
0,However,WQL,WRB
14,quo,FW-WDT,NN
4,la,FW-AT,NN
0,What's,WDT+BEZ,NN


In [76]:
df_train["name"].nunique()

13574

In [71]:
df_train["tag"].nunique()

212

In [72]:
emission_probabilities["OD-TL"]

{'Sixth': 0.034482758620689655,
 '2d': 0.034482758620689655,
 '61st': 0.034482758620689655,
 'First': 0.41379310344827586,
 'Fifth': 0.034482758620689655,
 '30th': 0.034482758620689655,
 '51st': 0.034482758620689655,
 '14th': 0.06896551724137931,
 '50th': 0.034482758620689655,
 '3rd': 0.034482758620689655,
 'Ninth': 0.034482758620689655,
 '3': 0.034482758620689655,
 'Eighteenth': 0.034482758620689655,
 '24th': 0.034482758620689655,
 '16th': 0.06896551724137931,
 'Second': 0.034482758620689655}

In [110]:
transition_counts = defaultdict(lambda: defaultdict(int))
emission_counts = defaultdict(lambda: defaultdict(int))
tag_counts = defaultdict(int)

# Process each senten

unique_words ,unique_tags = set(), set()
for dataset in [train_set, test_set]:
    for sentence in dataset:
        for word, tag in sentence:
            unique_words.add(word)
            unique_tags.add(tag)

# Initialize emission counts with all words and tags set to 0
for tag in unique_tags:
    for word in unique_words:
        emission_counts[tag][word] = 0

In [119]:
sum([emission_counts[tag][word] + smoothing_param for word in emission_counts[tag].keys()])


14394

In [133]:
emission_counts[tag]['inconsistencies']

0

In [134]:
for x , y in  emission_counts.items():
    print (x)
    print(y)
    break


)


In [116]:
sum(emission_counts[tag].values())

0

In [118]:
sum([
    emission_counts[tag][word] + (smoothing_param if emission_counts[tag][word] > 0 else 0)
    for word in emission_counts[tag].keys()
])

0

In [135]:
train_words = set(df_train["name"].unique())

In [144]:
import imp
imp.reload(ex3)

<module 'ex3' from 'C:\\Users\\inbar\\Documents\\nlp\\nlp-ex3\\ex3.py'>

In [145]:
transition_probabilities,emission_probabilities = ex3.compute_transition_emission_probabilities(train_set,test_set,smoothing_param=1)

KeyboardInterrupt: 

In [139]:
sentence = test_set[0]
# Separate words and true tags
words = [word for word, _ in sentence]
true_tags = [tag for _, tag in sentence]

# Run Viterbi algorithm to get predicted tags
predicted_tags = ex3.viterbi_algorithm(words, transition_probabilities, emission_probabilities,
                                   all_tags, train_words, 1)

In [143]:
pd.DataFrame(sentence).columns = ["name","true_tag"]

In [141]:
predicted_tags

['CC',
 'IN',
 'ABN',
 'PP$',
 'NN',
 'NNS',
 ',',
 '*',
 'AT',
 'AP',
 'NP',
 'NN',
 'HVZ',
 'VBN',
 'PP$',
 'NN',
 '.']