In [19]:
import numpy as np
import time
import re
import postagging as post

In [20]:
# Start the wall-clock timer; the elapsed time is printed after the accuracy test.
start_time = time.time()

# Train the model: collect the counts from the training file, then turn them
# into the transition and emission matrices (each comes with its index lookup).
(
    transition_count,
    emission_count,
    tag_count,
    word_count,
    f_tag_counter,
    f_tag_total,
) = post.process_file('data/WSJ_02-21.pos')
trans_mx, tag_to_idx = post.transition_matrix(transition_count, tag_count)
emiss_mx, word_to_idx = post.emission_matrix(emission_count, tag_count, word_count)

# Display the transition matrix.
print(f'Transition matrix:\n {trans_mx}\n')
# Display the emission matrix.
print(f'Emission matrix:\n {emiss_mx}\n')

Transition matrix:
 [[2.04557907e-02 3.28449378e-01 1.49725024e-01 ... 1.01467216e-05
  1.01467216e-05 2.02934432e-05]
 [9.68940153e-03 1.58842648e-03 1.10542264e-01 ... 0.00000000e+00
  2.44373305e-05 0.00000000e+00]
 [4.06271183e-02 2.61299281e-03 3.77495463e-01 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 ...
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [1.03092784e-02 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  3.09278351e-02 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 6.54761905e-02 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]]

Emission matrix:
 [[1.69419810e-01 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 4.01871900e-02 0.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [3.27990729e-05 1.09330243e-05 3.46576870e-03 ... 1.09330243e-05
  0.00000000e+00 0.00000000e+00]
 ...
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.000000

In [21]:
# Initial probabilities, built from the tag counts gathered during training.
init_prob = post.init(tag_count, f_tag_counter, f_tag_total)

# Score the tagger on the test file and print whatever report post.test returns.
accuracy_report = post.test(
    'data/WSJ_24.pos',
    trans_mx,
    emiss_mx,
    tag_count,
    tag_to_idx,
    word_to_idx,
    init_prob,
)
print(accuracy_report)

# Total wall-clock time since start_time was set (i.e. training + evaluation).
print()
elapsed_seconds = time.time() - start_time
print("--- %s seconds ---" % elapsed_seconds)

Accuracy:0.9364137217301312; # Correct: 30764; # Incorrect:2089; Total: 32853

--- 29.996575117111206 seconds ---


In [22]:
# Tag two hand-written sentences with the trained model.

# Tokeniser pattern: a run of word characters, or one character that is
# neither a word character nor whitespace (so punctuation becomes its own token).
TOKEN_PATTERN = r'\w+|[^\w\s]'


def most_probable_tags(observations):
    """Run post.viterbi on `observations` with the model objects built in the earlier cells.

    The matrices, lookups and initial probabilities are read from the notebook's
    global namespace at call time; the value returned by post.viterbi is passed
    back unchanged.
    """
    return post.viterbi(
        trans_mx, emiss_mx, tag_count, tag_to_idx, word_to_idx, init_prob, observations
    )


simple_sentence = 'The quick brown fox jumps over the lazy dog.'
complex_sentence = "Despite the heavy rain, the determined marathon runners continued their race through the winding streets of the historic city, undeterred by the challenging weather conditions."

# Split each sentence into the observation sequence handed to the decoder.
obs_simple = re.findall(TOKEN_PATTERN, simple_sentence)
obs_complex = re.findall(TOKEN_PATTERN, complex_sentence)

# Short sentence first ...
path = most_probable_tags(obs_simple)
print(f'simple sample: {simple_sentence}')
print(f'output: {path}')

# ... then a blank separator line and the longer sentence.
print()
path = most_probable_tags(obs_complex)
print(f'complex sample: {complex_sentence}')
print(f'output: {path}')

simple sample: The quick brown fox jumps over the lazy dog.
output: ['DT', 'JJ', 'NNP', 'NNP', 'VBZ', 'IN', 'DT', 'JJ', 'NN', '.']

complex sample: Despite the heavy rain, the determined marathon runners continued their race through the winding streets of the historic city, undeterred by the challenging weather conditions.
output: ['IN', 'DT', 'JJ', 'NN', ',', 'DT', 'VBN', 'NNP', 'NNS', 'VBD', 'PRP$', 'NN', 'IN', 'DT', 'VBG', 'NNS', 'IN', 'DT', 'JJ', 'NN', ',', 'JJ', 'IN', 'DT', 'JJ', 'NN', 'NNS', '.']
