In [2]:
import nltk
from nltk.corpus import treebank
import random


# Set random seed for reproducibility
random.seed(42)
# Make sure you have the corpora
nltk.download('treebank')
nltk.download('universal_tagset')


# Convert to list so we can sample
test_sentences = list(treebank.tagged_sents(tagset='universal'))

# Now sample 100
gpt_test_sentences = random.sample(test_sentences, 100)

print(f"Sampled {len(test_sentences)} sentences.")
print("\nExample sentence:\n", test_sentences[0])

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


Sampled 3914 sentences.

Example sentence:
 [('Pierre', 'NOUN'), ('Vinken', 'NOUN'), (',', '.'), ('61', 'NUM'), ('years', 'NOUN'), ('old', 'ADJ'), (',', '.'), ('will', 'VERB'), ('join', 'VERB'), ('the', 'DET'), ('board', 'NOUN'), ('as', 'ADP'), ('a', 'DET'), ('nonexecutive', 'ADJ'), ('director', 'NOUN'), ('Nov.', 'NOUN'), ('29', 'NUM'), ('.', '.')]


In [3]:
len(gpt_test_sentences)

100

In [4]:
!nvidia-smi

Sat Aug 30 06:51:32 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.54.03              Driver Version: 535.54.03    CUDA Version: 12.5     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100 80GB PCIe          Off | 00000000:17:00.0 Off |                    0 |
| N/A   53C    P0              68W / 300W |  74293MiB / 81920MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA A100 80GB PCIe          Off | 00000000:31:00.0 Off |  

In [5]:
def get_answer(input_words):
    # Use only the last fold
    pos_tags = viterbi_algo(input_words, 0)
    return pos_tags

In [6]:
def viterbi_algo(sentence, fold_index):
    sentence = sentence.copy()
    sentence.append('.')
    # if re.search(r'[a-zA-Z]',sentence[-1]):
    #   sentence.append('.')
    #   print("Sentence didn't ended with a .")
    viterbi_table = [{} for _ in range(len(sentence))]
    backpointer = [{} for _ in range(len(sentence))]
    emission_prob = emission_prob_list[fold_index]
    transition_prob = transition_prob_list[fold_index]

    for tag in transition_prob[start_tag]:
        word = sentence[0].lower()
        viterbi_table[0][tag] = transition_prob[start_tag].get(tag, 1e-6) * emission_prob.get(word, {}).get(tag, 1e-6)
        backpointer[0][tag] = start_tag

    for t in range(1, len(sentence)):
        word = sentence[t].lower()
        for curr_tag in transition_prob:
            max_prob, best_prev_tag = max(
                (viterbi_table[t-1][prev_tag] * transition_prob[prev_tag].get(curr_tag, 1e-6) * emission_prob.get(word, {}).get(curr_tag, 1e-6), prev_tag)
                for prev_tag in viterbi_table[t-1]
            )
            viterbi_table[t][curr_tag] = max_prob
            backpointer[t][curr_tag] = best_prev_tag

    best_path = []
    best_last_tag = max(viterbi_table[-1], key=viterbi_table[-1].get)
    best_path.append(best_last_tag)
    for t in range(len(sentence) - 1, 0, -1):
        best_last_tag = backpointer[t][best_last_tag]
        best_path.insert(0, best_last_tag)
    best_path.pop()
    return best_path

In [7]:
import json
import numpy as np
from nltk.tag import map_tag
from collections import defaultdict, Counter
import re

start_tag = '^'
# Load JSON from file
with open('emission_prob_list.json', 'r') as f:
    json_data = f.read()

# Convert JSON to Python object
emission_prob_list = json.loads(json_data)
num_folds = len(emission_prob_list)
# Load JSON from file
with open('transition_prob_list.json', 'r') as f:
    json_data = f.read()

# Convert JSON to Python object
transition_prob_list = json.loads(json_data)
sentence = "This is the course."
input_words = re.findall(r"\w+|[^\w\s]", sentence)

get_answer(input_words)

['DET', 'VERB', 'DET', 'NOUN', '.']

In [9]:
sorted_tags = ['.',
 'ADJ',
 'ADP',
 'ADV',
 'CONJ',
 'DET',
 'NOUN',
 'NUM',
 'PRON',
 'PRT',
 'VERB',
 'X',
 '^']

In [12]:
test_sentences = gpt_test_sentences
X_test = [[word for word, tag in sent] for sent in test_sentences]
y_test = [[tag for word, tag in sent] for sent in test_sentences]

In [13]:
import re
from collections import defaultdict, Counter
from tqdm import tqdm

# Initialize confusion matrix
per_pos_accuracy = defaultdict(lambda: defaultdict(int))
for each_tag1 in sorted_tags:
    for each_tag2 in sorted_tags:
        per_pos_accuracy[each_tag1][each_tag2] = 0

tag_predicted = []
actual_tags = []

def get_answer(input_words):
    # Use only the last fold
    pos_tags = viterbi_algo(input_words, 0)
    return pos_tags

# Evaluate on test set with tqdm
num_folds = len(emission_prob_list)

for sent, true_tags in tqdm(zip(X_test, y_test), total=len(X_test), desc="Evaluating"):
    # sent is already a list of words
    pred_tags = get_answer(sent)
    
    actual_tags.extend(true_tags)
    tag_predicted.extend(pred_tags)

    # Update confusion matrix
    for t_true, t_pred in zip(true_tags, pred_tags):
        per_pos_accuracy[t_true][t_pred] += 1

Evaluating: 100%|████████████████████████| 100/100 [00:00<00:00, 335.74it/s]


In [14]:
def calc_acc(actual, predicted):
    correct = sum(1 for a, p in zip(actual, predicted) if a == p)
    total = len(actual)
    return (correct / total) * 100

In [15]:
# Compute overall accuracy
test_accuracy = calc_acc(actual_tags, tag_predicted)
print(f"Test accuracy: {test_accuracy:.2f}%")

Test accuracy: 81.51%


### HMM Analysis (Penn Dataset)

In [8]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

accuracy = accuracy_score(actual_tags, tag_predicted)
precision = precision_score(actual_tags, tag_predicted, average="weighted")
recall = recall_score(actual_tags, tag_predicted, average="weighted")
f1 = f1_score(actual_tags, tag_predicted, average="weighted")

print(f"Accuracy : {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall   : {recall:.2f}")
print(f"F1 Score : {f1:.2f}")

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy : 0.81
Precision: 0.76
Recall   : 0.81
F1 Score : 0.78


In [9]:
from sklearn.metrics import classification_report
print(classification_report(actual_tags, tag_predicted))

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           .       0.89      0.91      0.90     11715
         ADJ       0.77      0.82      0.79      6397
         ADP       0.74      0.97      0.84      9857
         ADV       0.87      0.78      0.83      3171
        CONJ       0.99      0.99      0.99      2265
         DET       0.71      0.94      0.81      8725
        NOUN       0.87      0.85      0.86     28867
         NUM       0.72      0.64      0.68      3546
        PRON       0.67      0.69      0.68      2737
         PRT       0.65      0.40      0.49      3219
        VERB       0.80      0.92      0.86     13564
           X       0.00      0.00      0.00      6613
           ^       0.00      0.00      0.00         0

    accuracy                           0.81    100676
   macro avg       0.67      0.69      0.67    100676
weighted avg       0.76      0.81      0.78    100676



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
