<a href="https://colab.research.google.com/github/LindaSekhoasha/POS_Tagging/blob/main/POS_Tagging.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Meet Our Group



*   Linda Sekhoasha (Student Number)
*   Wandile Ngobese (222056013)
*   Khonzinkosi Mkhize (Student Number)
*   Samukelo Mkhize (Student Number)

# Imports

In [None]:
!pip install datasets

In [2]:
import pandas as pd
from datasets import Dataset
from collections import Counter, defaultdict, namedtuple
import nltk
from nltk.tag import hmm

# Data Pre-processing

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
df = pd.read_csv(r"/content/drive/MyDrive/dataset/zu.gold.seg.data", delimiter='\t')
df.head(3)

Unnamed: 0,TOKEN,MORPH ANALYSIS,UPOS
0,Ukwengeza,u-kw-engez-a,V
1,kulokhu,ku-lokhu,CDEM
2,",",",",PUNC


In [5]:
# drop/remove the MORPH ANALYSIS column
df = df.drop(columns=['MORPH ANALYSIS'])

# convert all tokens to lowercase for a consistent vocab
df['TOKEN'] = [token.lower() for token in df['TOKEN']]
df.head(10)

Unnamed: 0,TOKEN,UPOS
0,ukwengeza,V
1,kulokhu,CDEM
2,",",PUNC
3,imibandela,N
4,iyenziwa,V
5,ukwakha,V
6,amakomiti,N
7,amawadi,N
8,",",PUNC
9,njengezinhlangano,ADV


In [6]:
# convert panda dataframe to HuggingFace dataset
dataset = Dataset.from_pandas(df)
print(dataset.features)

{'TOKEN': Value(dtype='string', id=None), 'UPOS': Value(dtype='string', id=None)}


In [7]:
data = dataset.train_test_split(test_size=0.2)

print(f"There are {len(data['train'])} tokens in the training set.")
print(f"There are {len(data['test'])} tokens in the testing set.\n")

print(f"First 3 training tokens:\n{data['train'][:3]}\n")
print(f"First 3 test tokens:\n{data['test'][:3]}")

There are 39277 tokens in the training set.
There are 9820 tokens in the testing set.

First 3 training tokens:
{'TOKEN': ['kanye', '.', 'iqhaza'], 'UPOS': ['ADV', 'PUNC', 'N']}

First 3 test tokens:
{'TOKEN': ['amalungu', 'abawumthombo', '.'], 'UPOS': ['N', 'REL', 'PUNC']}


# Most Frequent Class Tagger (Base)

## MFC Model Implementation

In [8]:
"""A function to map either the tags to the count of words with said tag or
the words to the count of tags corresponding to the word."""
# keys -> list of either tags or words
# values -> is a similar, corresponding list
def pair_counts(keys, values):
    out = defaultdict(Counter)

    for key, value in zip(keys, values):
        out[key][value] += 1

    return {key: dict(value_counts) for key, value_counts in out.items()}

In [9]:
subset = data['train'].select(range(7))
tags = [example['UPOS'] for example in subset]
words = (example['TOKEN'] for example in subset)

emission_counts = pair_counts(tags, words)
print(emission_counts.keys())

dict_keys(['ADV', 'PUNC', 'N', 'V', 'PRO'])


In [10]:
"""MFC model class.
FakeState is used as the 'states' are actually strings to mimmick a complete tagger like HMM tagger.
FakeState has a named field 'name' which could be a Tag or word."""
FakeState = namedtuple("FakeState", "name")

class MFCTagger:
    missing = FakeState(name="<UNK>")

    def __init__(self, table):
        self.table = defaultdict(lambda: MFCTagger.missing)
        self.table.update({word: FakeState(name=tag) for word, tag in table.items()})

    def viterbi(self, seq):
        """This method simplifies predictions by matching the Pomegranate viterbi() interface"""
        return 0., list(enumerate(["<s>"] + [self.table[w] for w in seq] + ["</s>"]))

In [11]:
train_set = data['train']
test_set = data['test']

tags = [example['UPOS'] for example in train_set]
words = (example['TOKEN'] for example in train_set)

word_counts = pair_counts(words, tags)

mfc_table = dict()
for word, tags in word_counts.items():
    mfc_table[word] = max(tags.keys(), key = lambda key: tags[key])

mfc_model = MFCTagger(mfc_table) # Create a Most Frequent Class tagger instance

In [12]:
# function that returns the vocab of a given dataset
def vocab(data_s):
    vocab = set()
    for token in data_s['TOKEN']:
        vocab.add(token)
    return sorted(vocab)

vocab_list = vocab(train_set)

# show vocab starting from 50 to hide punct
print(vocab_list[50:60])

['ababacabangeli', 'ababambe', 'ababambiqhaza', 'ababambisene', 'ababamele', 'ababandakanyekayo', 'ababanga', 'ababebambe', 'ababelwe', 'ababencishwe']


In [13]:
def replace_unknown(sequence):
    """Return a copy of the input sequence where each unknown word is replaced
    by the literal string value 'nan'. Pomegranate will ignore these values
    during computation.
    """
    return [w if w in vocab(train_set) else 'nan' for w in sequence]

def simplify_decoding(observations, model):
    """observations are the sequences (words) for the model to predict"""
    _, state_path = model.viterbi(replace_unknown(observations))
    return [state[1].name for state in state_path[1:-1]]  # do not show the start/end state predictions

In [14]:
print("Sentence:\n-----------------")
print(test_set['TOKEN'][:10])
print()
print("Predicted labels:\n-----------------")
print(simplify_decoding(test_set['TOKEN'][:10], mfc_model))
print()
print("Actual labels:\n--------------")
print(test_set['UPOS'][:10])
print("\n")

Sentence:
-----------------
['amalungu', 'abawumthombo', '.', '.', '.', 'intela', 'imali', 'kanjani', 'umsebenzi', 'owodwa']

Predicted labels:
-----------------
['N', 'REL', 'PUNC', 'PUNC', 'PUNC', 'N', 'N', 'ADV', 'N', 'REL']

Actual labels:
--------------
['N', 'REL', 'PUNC', 'PUNC', 'PUNC', 'N', 'N', 'ADV', 'N', 'REL']




## Accuracy Score

In [15]:
"""This is a function to determine the accuracy of the model.
The way it works is:
1) it takes a observation list from test set: test_tokens
2) it takes a list of actual tags to predict from test set: actual_tags
3) predicts tags from test_sentences using a specified model and stores them in: predicted_tags
4) it takes the ratio of the correct predicted_tags to the total tags and returns that as the accuracy score
"""

def accuracy(test_tokens, actual_tags, model):
    correct = total_predictions = 0
    most_likely_tags = simplify_decoding(test_tokens, model)
    correct += sum(p == t for p, t in zip(most_likely_tags, actual_tags))
    total_predictions += len(test_tokens)
    return correct / total_predictions

In [18]:
# model eval time = 1/240 * (train_limit + test limit) minutes
train_limit = 2000
test_limit = 600
mfc_training_acc = accuracy(train_set['TOKEN'][:train_limit], train_set['UPOS'][:train_limit], mfc_model)
print("training accuracy mfc_model: {:.2f}%".format(100 * mfc_training_acc))

mfc_testing_acc = accuracy(test_set['TOKEN'][:test_limit], test_set['UPOS'][:test_limit], mfc_model)
print("testing accuracy mfc_model: {:.2f}%".format(100 * mfc_testing_acc))

training accuracy mfc_model: 96.95%
testing accuracy mfc_model: 77.50%


# HMM Model

In [21]:
"""
This code block implements a for loop that generates sentences
from the dataset in the form -> [(w1, t1), (w2, t2), ..., (wn, tn)]
"""
sentences = []
current_sentence = []

for example in dataset:
    token = example["TOKEN"]
    tag = example["UPOS"]

    if token is None or tag is None:
        continue  # skip bad data

    current_sentence.append((token, tag))

    # sentence ends at a full stop or other terminating punctuations
    if token in [".", "!", "?"] and tag == "PUNC":
        sentences.append(current_sentence)
        current_sentence = []

# add the last sentence if the dataset doesn't end with punctuation
if current_sentence:
    sentences.append(current_sentence)

print(len(sentences))

3155


In [22]:
train_data = sentences[:2800]
test_data = sentences[2800:]

In [24]:
trainer = hmm.HiddenMarkovModelTrainer()
hmm_tagger = trainer.train(train_data)

accuracy = hmm_tagger.accuracy(test_data)
print(f"Accuracy Score: {accuracy * 100:.2f}%")

# tagging a sentence from test data
pred_sentence = test_data[0]
new_sentence = []
for i in range(len(pred_sentence)):
    new_sentence.append(pred_sentence[i][0])

tagged_sentence = hmm_tagger.tag(new_sentence)

print()
print(f"Actual Sentence (No tags):\n{new_sentence}")
print()
print(f"Actual Sentence (To Predict):\n{pred_sentence}")
print()
print(f"Tagged Sentence (Predicted):\n{tagged_sentence}")
print()

Accuracy Score: 37.55%

Actual Sentence (No tags):
['ikomidi', 'lesigceme', 'likamasipala', 'abc', 'livame', 'ukuhambela', 'imihlangano', 'yomkhandlu', 'njalo', '.']

Actual Sentence (To Predict):
[('ikomidi', 'N'), ('lesigceme', 'POSS'), ('likamasipala', 'N'), ('abc', 'ABBR'), ('livame', 'V'), ('ukuhambela', 'V'), ('imihlangano', 'N'), ('yomkhandlu', 'POSS'), ('njalo', 'ADV'), ('.', 'PUNC')]

Tagged Sentence (Predicted):
[('ikomidi', 'N'), ('lesigceme', 'CDEM'), ('likamasipala', 'N'), ('abc', 'ABBR'), ('livame', 'V'), ('ukuhambela', 'V'), ('imihlangano', 'V'), ('yomkhandlu', 'V'), ('njalo', 'V'), ('.', 'V')]



##  CRF Model

# How the CRF Model Works

A Conditional Random Field (CRF) is a machine learning model that's especially powerful for sequence labeling tasks — like predicting the part-of-speech (POS) tag for each word in a sentence.
In this project, our goal is to train a CRF to predict the correct UPOS tag for each TOKEN in isiZulu text.<br>

Here’s how it will work, step-by-step:<br><br>

**1. Feature Extraction**<br>
Before training the CRF, we need to describe each word not just by itself, but by a set of useful features.
These features capture important clues that help the model make better predictions.

For example, for each word, we might include:<br>

The word itself (e.g., "ukwengeza")

Whether the word is all lowercase

Whether the word is a digit

The first few characters (prefix) or last few characters (suffix)

If the previous or next word is punctuation

Etc.

👉 These features give the CRF "hints" about what kind of word it is dealing with.<br><br>

**2. Learning Patterns in Sequences**<br>
The CRF does not predict each tag independently.
Instead, it models the dependencies between tags in a sequence.<br>

🔵 Example:<br>
If a word is tagged as a determiner (DET), it’s very likely that the next word is a noun (N).
If a word is a verb (V), maybe the next word is likely to be a noun or a punctuation mark.<br>

The CRF learns these patterns during training:<br>

Which features suggest which POS tag?<br>

What is the likely sequence of tags?<br><br>

**3. Training the CRF Model**<br>
During training, the CRF tries to find the best set of parameters that:<br>

Assign the correct UPOS tag to each token

While also respecting the typical flow of tags across sentences

It maximizes the probability of the correct sequence of tags given the observed words and their features.<br><br>

**4. Predicting POS Tags**<br>
Once trained, the CRF can be used to predict the POS tags for new, unseen text.

For a given sequence of words:

It looks at the features of each word

It chooses the sequence of tags that is most likely, according to what it learned.<br><br>


The CRF takes into account both the individual word features and the context (the neighboring words and tags) when making predictions — which makes it smarter than just predicting each word alone.<br><br>

🎯 **Why Use a CRF for POS Tagging?**<br>
It understands the structure of language (not just word-by-word).

It can correct mistakes by looking at neighboring tags.

It works very well when you design good features.<br><br>

In this project, the CRF model will be compared against a simpler model (Most Frequent Class Tagger) and the HMM model, to see if its extra intelligence leads to better tagging performance.

**1. Feature Extraction**

In [None]:
def word2features(sentence, i):
  current_word = sentence[i]

  # The following dictionary stores features that a single word in a sequence can have.
  # Since the CRF model does not see raw words but only sees features about each word, we use a feature dict.
  # You can think of the features function as a collection of small "facts" about a particular word/token.
  features = {
      'word.lower():': current_word.lower(),
      'word.isupper()': current_word.isupper(),
      'word.istitle()': current_word.istitle(),
      'word.isdigit()': current_word.isdigit(),
      'prefix-3': current_word[:3],
      'suffix-3': current_word[-3:]
  }

  # from the sentence we got as the parameter, we look at word i, the following code is to update (or dynamically add) the features, for the previous word.
  #
  if i > 0:
    prev_word = sentence[i-1]
    features.update({
        'prev_word.lower()': prev_word.lower(),
        'prev_word.istitle()': prev_word.istitle(),
        'prev_word.isupper()': prev_word.isupper(),
    })
  else:
    features['BOS'] = True  # Dynamically add the feature for Beginning of sentence for the current_word

  if i < len(sentence) - 1:
    next_word = sentence[i+1]
    features.update({
        'next_word.lower()': next_word.lower(),
        'next_word.istitle()': next_word.istitle(),
        'next_word.isupper()': next_word.isupper()
    })
  else:
    features['EOS'] = True  # Dynamically add the feature for End of sentence for the current_word

  return features

In [None]:
def sentence2features(sentence):
  return [word2features(sentence, i) for i in range(len(sentence))]

The following is a test case for word2features() and setence2features()

In [None]:
test_sentence = ["UKhonzinkosi", "wazalwa", "ngonyaka", "oka", "1990"]

# Run feature extraction
features_of_sentence = sentence2features(test_sentence)
# Print nicely
for idx, features in enumerate(features_of_sentence):
    print(f"Word {idx}: '{test_sentence[idx]}' ➔ Features:")
    for key, value in features.items():
        print(f"   {key}: {value}")
    print("\n")