In [16]:
import nltk

In [17]:
def get_reader(filename: str = None) -> nltk.corpus.reader.tagged.TaggedCorpusReader:
    """Create a tagged-corpus reader rooted at the ``input/`` directory.

    Args:
        filename: File to read, relative to ``input/``. NLTK interprets a
            string ``fileids`` argument as a regular expression, so this value
            is matched as a pattern. When omitted (``None``), every file whose
            name ends in ``.sent`` is selected.

    Returns:
        A ``TaggedCorpusReader`` over the selected file(s).
    """
    # The default has to be a valid regular expression. The previous default,
    # r'*\.sent', was glob syntax and cannot be compiled by ``re``
    # ("nothing to repeat"), so calling get_reader() without arguments failed.
    fileids = filename if filename is not None else r'.*\.sent'
    return nltk.corpus.reader.tagged.TaggedCorpusReader('input/', fileids)

In [18]:
def read_corpus_from_file(reader: nltk.corpus.reader.tagged.TaggedCorpusReader = None) -> list[list[tuple[str, str]]]:
    """Return the tagged sentences exposed by a corpus reader.

    Args:
        reader: Reader to pull sentences from. When ``None``, a reader is
            obtained from ``get_reader()`` called with no arguments.

    Returns:
        Whatever ``reader.tagged_sents()`` yields: the corpus sentences, each
        made up of ``(token, tag)`` pairs.
    """
    corpus_reader = get_reader() if reader is None else reader
    return corpus_reader.tagged_sents()

In [19]:
# Build a reader for the MIM-GOLD corpus file under input/ and fetch its
# sentences; each sentence is a sequence of (token, tag) pairs.
mim_gold_reader = get_reader('MIM-GOLD.sent')
mim_gold_sents = read_corpus_from_file(mim_gold_reader)

In [20]:
# Sanity-check the loaded corpus: report the sentence count and show the
# (token, tag) pairs of the 100th sentence (index 99, because indexing is 0-based).
print("Number of sentences in MIM-GOLD: ", len(mim_gold_sents))
print("Tokens of sentence 100: ", mim_gold_sents[99])

Number of sentences in MIM-GOLD:  58412
Tokens of sentence 100:  [('Púttað', 'SÞGHEN'), ('á', 'AF'), ('Listatúni', 'NHEÞ-S'), ('í', 'AF'), ('dag', 'NKEO'), (',', 'PK'), ('laugardag', 'NKEO'), (',', 'PK'), ('kl.', 'KS'), ('10.30', 'TA'), ('.', 'PL')]


In [21]:
# Flatten the corpus into two parallel lists: every word form and every tag,
# in corpus order. Note that "types" in this notebook means the tags.
total_tokens, total_types = [], []
for sentence in mim_gold_sents:
    # One pass over the corpus; each sentence is split into its two columns.
    total_tokens.extend(word for word, _ in sentence)
    total_types.extend(pos for _, pos in sentence)

# Distinct values of each column (order is arbitrary because it comes from a set).
unique_tokens = list(set(total_tokens))
unique_types = list(set(total_types))

# Report the four sizes, label first, in the same order as before.
for label, count in (
    ("Total number of tokens in MIM-GOLD: ", len(total_tokens)),
    ("Total number of types in MIM-GOLD: ", len(total_types)),
    ("Total number of unique tokens in MIM-GOLD: ", len(unique_tokens)),
    ("Total number of unique types in MIM-GOLD: ", len(unique_types)),
):
    print(label, count)

Total number of tokens in MIM-GOLD:  1000218
Total number of types in MIM-GOLD:  1000218
Total number of unique tokens in MIM-GOLD:  106529
Total number of unique types in MIM-GOLD:  558


In [22]:
# Count how often each token and each tag occurs across the whole corpus.
# Note: "types" in this notebook refers to the tags, not to distinct word forms.
tokens_frequency_distribution = nltk.FreqDist(total_tokens)
types_frequency_distribution = nltk.FreqDist(total_types)

In [23]:
# Show the 10 most frequent tokens and the 20 most frequent tags with their counts.
print("Most common tokens: ", tokens_frequency_distribution.most_common(10))
print("Most common types: ", types_frequency_distribution.most_common(20))

Most common tokens:  [('.', 49066), ('að', 35749), ('og', 33813), (',', 29990), ('í', 27622), ('á', 21833), ('er', 16604), ('sem', 15199), ('til', 9888), ('um', 8799)]
Most common types:  [('AF', 109899), ('AA', 74151), ('C', 68278), ('PL', 53212), ('SFG3EN', 36289), ('PK', 30647), ('SNG', 26345), ('TA', 22992), ('SFG3EÞ', 19773), ('CN', 19540), ('SÞGHEN', 17190), ('PA', 13043), ('N----S', 12783), ('CT', 12664), ('SFG3FN', 12294), ('NKEN', 11411), ('NVEN', 11171), ('NVEO', 11123), ('NHEÞ', 10979), ('NVEÞ', 10281)]


In [24]:
# Pair every tag with the tag that follows it and tabulate, for each tag,
# how often each successor occurs.
# NOTE(review): total_types is one flat list over all sentences, so these
# bigrams also pair the last tag of a sentence with the first tag of the next
# one — confirm that cross-sentence pairs are intended.
# NOTE(review): nltk.bigrams presumably returns a one-shot iterator, so
# types_bigrams would be exhausted after the next line — verify before reusing it.
types_bigrams = nltk.bigrams(total_types)
conditional_frequency_distribution = nltk.probability.ConditionalFreqDist(types_bigrams)

In [27]:
# Look up the 10 tags that most often come directly after the tag 'AF',
# together with their counts.
most_common_following_af = conditional_frequency_distribution['AF'].most_common(10)
print("Most common following AF: ", most_common_following_af)

Most common following AF:  [('NVEÞ', 5714), ('NHEÞ', 5361), ('NKEÞ', 4222), ('CN', 3903), ('NVEO', 3556), ('NKEO', 3347), ('NHEO', 3046), ('NHEÞG', 2867), ('NVEÞG', 2723), ('FPHEÞ', 2703)]
