# Part-of-Speech Tagging - Working with Tags and NumPy

In [11]:
import numpy as np
import pandas as pd
import math


In [12]:
# Tags used in this notebook, one per line with its meaning.
tags = [
    'RB',  # adverb
    'NN',  # noun
    'TO',  # to
]

In [13]:
# Count recorded for each ordered pair of tags, keyed by a 2-tuple of tag strings.
# NOTE(review): presumably (previous tag, following tag) - confirm against the data source.
transition_counts = {
    # pairs where both tags are the same
    ('NN', 'NN'): 16241,
    ('RB', 'RB'): 2263,
    ('TO', 'TO'): 2,
    # pairs of two different tags
    ('NN', 'TO'): 5256,
    ('RB', 'TO'): 855,
    ('TO', 'NN'): 734,
    ('NN', 'RB'): 2431,
    ('RB', 'NN'): 358,
    ('TO', 'RB'): 200,
}

## Using NumPy for matrix creation

In [14]:
# Start from an all-zero square matrix with one row and one column per tag.
num_tags = len(tags)
transition_matrix = np.zeros(shape=(num_tags, num_tags))

print(transition_matrix)
print(f"Matrix shape: {transition_matrix.shape}")

[[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]
Matrix shape: (3, 3)


In [15]:
# Sort a copy of the tag list; `tags` itself keeps its original order.
sorted_tags = list(tags)
sorted_tags.sort()
sorted_tags

['NN', 'RB', 'TO']

In [16]:
# Fill cell (i, j) with the count stored for the pair
# (sorted_tags[i], sorted_tags[j]); pairs absent from the dict are set to 0.
for i, row_tag in enumerate(sorted_tags):
    for j, col_tag in enumerate(sorted_tags):
        transition_matrix[i, j] = transition_counts.get((row_tag, col_tag), 0)

In [17]:
# Display the populated matrix; rows and columns follow the order of sorted_tags.
transition_matrix

array([[1.6241e+04, 2.4310e+03, 5.2560e+03],
       [3.5800e+02, 2.2630e+03, 8.5500e+02],
       [7.3400e+02, 2.0000e+02, 2.0000e+00]])

## NumPy matrix manipulation

In [19]:
# Divide every entry by 10, writing the result back into the same array.
# The array is modified in place, so re-running this cell divides it again.
np.divide(transition_matrix, 10, out=transition_matrix)
print(transition_matrix)

[[1.6241e+02 2.4310e+01 5.2560e+01]
 [3.5800e+00 2.2630e+01 8.5500e+00]
 [7.3400e+00 2.0000e+00 2.0000e-02]]


In [25]:
# All-ones matrix with one more column than rows, used to demonstrate
# row-wise normalisation.
test_matrix = np.ones(shape=(len(tags), len(tags) + 1))
print(test_matrix)

# keepdims=True keeps the sums as a column (one entry per row), so the
# division below broadcasts each row's sum across that row.
rows_sum = np.sum(test_matrix, axis=1, keepdims=True)
print(rows_sum)

# Divide each entry by the sum of its row.
normalized_matrix = np.divide(test_matrix, rows_sum)
print(normalized_matrix)

[[1. 1. 1. 1.]
 [1. 1. 1. 1.]
 [1. 1. 1. 1.]]
[[4.]
 [4.]
 [4.]]
[[0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]]


In [26]:
# Work on a copy so test_matrix itself is left unchanged.
test_matrix2 = np.copy(test_matrix)

# Add log(row sum) to each diagonal entry.
# rows_sum was built with keepdims=True, so rows_sum[i] is a one-element
# array rather than a scalar. Index the scalar explicitly: passing the array
# to math.log() relies on implicit array-to-scalar conversion, which NumPy
# has deprecated.
for i in range(num_tags):
    test_matrix2[i, i] += math.log(rows_sum[i, 0])

print(test_matrix2)

[[2.38629436 1.         1.         1.        ]
 [1.         2.38629436 1.         1.        ]
 [1.         1.         2.38629436 1.        ]]
