In [1]:
import nltk
from nltk.corpus import brown

nltk.download('brown')

# Load the tagged sentences from the 'news' category
news_sentences = brown.tagged_sents(categories='news')

# Calculate the split index for the last 10% of the sentences
split_index = int(len(news_sentences) * 0.9)

# Divide the sentences into training and test sets
train_set = news_sentences[:split_index]
test_set = news_sentences[split_index:]

# Optionally, you can print the sizes of the train and test sets
print(f"Training set size: {len(train_set)} sentences")
print(f"Test set size: {len(test_set)} sentences")

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\inbar\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\brown.zip.


Training set size: 4160 sentences
Test set size: 463 sentences


In [4]:
import pandas as pd

In [6]:
list_of_pd = []
for i in range(len(train_set)):
    list_of_pd.append(pd.DataFrame(train_set[i]))

In [8]:
list_of_pd[1]

Unnamed: 0,0,1
0,The,AT
1,jury,NN
2,further,RBR
3,said,VBD
4,in,IN
5,term-end,NN
6,presentments,NNS
7,that,CS
8,the,AT
9,City,NN-TL


In [9]:
df_train = pd.concat(list_of_pd)

In [11]:
df_train.columns = ["name","tag"]

In [28]:
count_per_tag = df_train.groupby(["name", "tag"]).size().reset_index(name='appearances')


In [32]:
count_per_word = df_train.groupby(["name"]).size().reset_index(name='total_word_appearances')


In [33]:
count_per_tag = count_per_tag.merge(count_per_word,on ="name",how = "left")

In [36]:
count_per_tag["rate"] = count_per_tag["appearances"] /count_per_tag["total_word_appearances"]

In [37]:
count_dict = {}

for _, row in count_per_tag.iterrows():
    word = row['name']
    tag = row['tag']
    appearances = row['rate']
    
    if word not in count_dict:
        count_dict[word] = []
    
    # Append the (tag, appearances) tuple
    count_dict[word].append((tag, appearances))

# Sort each list of tags by appearances in descending order
for word in count_dict:
    count_dict[word] = sorted(count_dict[word], key=lambda x: x[1], reverse=True)


In [45]:
count_dict[0][0]

{'!': [('.', 0.9375), ('.-HL', 0.0625)],
 '$1': [('NN', 1.0)],
 '$1,000': [('NNS', 1.0)],
 '$1,000,000,000': [('NNS', 1.0)],
 '$1,500': [('NNS', 1.0)],
 '$1,500,000': [('NNS', 1.0)],
 '$1,600': [('NNS', 1.0)],
 '$1,800': [('NNS', 1.0)],
 '$1.1': [('NNS', 1.0)],
 '$1.4': [('NNS', 1.0)],
 '$1.5': [('NNS', 1.0)],
 '$1.80': [('NNS', 1.0)],
 '$10': [('NNS', 1.0)],
 '$10,000': [('NNS', 1.0)],
 '$100': [('NNS', 1.0)],
 '$102,285,000': [('NNS', 1.0)],
 '$109': [('NNS', 1.0)],
 '$11.50': [('NNS', 1.0)],
 '$115,000': [('NNS', 1.0)],
 '$12': [('NNS', 1.0)],
 '$12,192,865': [('NNS', 1.0)],
 '$12,500': [('NNS', 0.5), ('NNS-HL', 0.5)],
 '$12.50': [('NNS', 1.0)],
 '$12.7': [('NNS', 1.0)],
 '$120': [('NNS', 1.0)],
 '$125': [('NNS', 1.0)],
 '$135': [('NNS', 1.0)],
 '$14': [('NNS', 0.5), ('NNS-HL', 0.5)],
 '$15': [('NNS', 1.0)],
 '$15,000': [('NNS', 1.0)],
 '$15,000,000': [('NNS', 1.0)],
 '$150': [('NNS', 1.0)],
 '$157,460': [('NNS', 1.0)],
 '$16': [('NNS', 1.0)],
 '$17': [('NNS', 1.0)],
 '$17,000': [('

In [41]:
list_of_pd_test = []
for i in range(len(test_set)):
    list_of_pd_test.append(pd.DataFrame(test_set[i]))
df_test = pd.concat(list_of_pd_test)

In [44]:
df_test.columns = ["name","tag"]

In [48]:
def get_prob(word):
    if count_dict.get(word):
        return count_dict[word][0][0]
    else:
        return "NN"

In [51]:
df_test["predicted tag"] = df_test.apply(lambda x:get_prob(x["name"]),axis = 1)

In [56]:
accuracy = len(df_test[df_test["tag"] == df_test["predicted tag"]]) / len(df_test)

In [57]:
error_rate = 1 - accuracy