<a href="https://colab.research.google.com/github/LindaSekhoasha/POS_Tagging/blob/main/POS_Tagging.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Meet Our Group



*   Linda Sekhoasha (Student Number)
*   Wandile Ngobese (222056013)
*   Khonzinkosi Mkhize (Student Number)
*   Samukelo Mkhize (Student Number)

# Imports

In [None]:
!pip install datasets
!pip install pomegranate

In [None]:
import pandas as pd
from datasets import Dataset
from collections import Counter, defaultdict, namedtuple
# from pomegranate import HiddenMarkovModel, DiscreteDistribution

# Data Pre-processing

In [None]:
# Mount Google Drive at /content/drive (Colab-only); the dataset is read from
# a path under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Read the tab-separated gold-standard file into a DataFrame and preview it.
dataset_path = "/content/drive/MyDrive/dataset/zu.gold.seg.data"
df = pd.read_csv(dataset_path, sep='\t')
df.head(3)

Unnamed: 0,TOKEN,MORPH ANALYSIS,UPOS
0,Ukwengeza,u-kw-engez-a,V
1,kulokhu,ku-lokhu,CDEM
2,",",",",PUNC


In [None]:
# Drop the MORPH ANALYSIS column. errors='ignore' keeps this cell re-runnable:
# on a second execution the column is already gone and a plain drop() would
# raise KeyError.
df = df.drop(columns=['MORPH ANALYSIS'], errors='ignore')

# Lowercase all tokens for a consistent vocabulary. The vectorised .str
# accessor passes missing values through instead of raising AttributeError
# the way calling .lower() on each element would.
df['TOKEN'] = df['TOKEN'].str.lower()
df.head(10)

Unnamed: 0,TOKEN,UPOS
0,ukwengeza,V
1,kulokhu,CDEM
2,",",PUNC
3,imibandela,N
4,iyenziwa,V
5,ukwakha,V
6,amakomiti,N
7,amawadi,N
8,",",PUNC
9,njengezinhlangano,ADV


In [None]:
# Convert the pandas DataFrame to a HuggingFace Dataset and print its column
# schema (column name -> feature type).
dataset = Dataset.from_pandas(df)
print(dataset.features)

{'TOKEN': Value(dtype='string', id=None), 'UPOS': Value(dtype='string', id=None)}


In [None]:
# Fix the shuffle seed so the split — and every count or prediction derived
# from it — is reproducible under Restart & Run All.
SPLIT_SEED = 42
data = dataset.train_test_split(test_size=0.2, seed=SPLIT_SEED)
# NOTE(review): rows are individual tokens and the split shuffles them, so
# neighbouring rows in each split are not consecutive words of the corpus.
# Confirm this is acceptable before training a sequence model on these splits.

print(f"There are {len(data['train'])} tokens in the training set.")
print(f"There are {len(data['test'])} tokens in the testing set.\n")

print(f"First 3 training tokens:\n{data['train'][:3]}\n")
print(f"First 3 test tokens:\n{data['test'][:3]}")

There are 39277 tokens in the training set.
There are 9820 tokens in the testing set.

First 3 training tokens:
{'TOKEN': [',', 'kahulumeni', 'umbiko'], 'UPOS': ['PUNC', 'POSS', 'N']}

First 3 test tokens:
{'TOKEN': ['.', 'yomthetho', '.'], 'UPOS': ['PUNC', 'POSS', 'PUNC']}


# Most Frequent Class Tagger (Baseline)

In [None]:
def pair_counts(keys, values):
    """Count how often each value occurs together with each key.

    `keys` and `values` are parallel iterables that are walked in lockstep.
    Passing (tags, words) gives, per tag, the count of each word carrying that
    tag; passing (words, tags) gives, per word, the count of each tag it was
    seen with.

    Returns a plain dict of plain dicts: {key: {value: count}}.
    """
    counts = {}
    for key, value in zip(keys, values):
        per_key = counts.setdefault(key, {})
        per_key[value] = per_key.get(value, 0) + 1
    return counts

In [None]:
# Try pair_counts on the first 7 training rows: tag -> {word: count}.
subset = data['train'].select(range(7))
tags = list(map(lambda row: row['UPOS'], subset))
words = map(lambda row: row['TOKEN'], subset)  # lazy; consumed by pair_counts

emission_counts = pair_counts(tags, words)
print(emission_counts.keys())

dict_keys(['PUNC', 'POSS', 'N', 'REL', 'FOR'])


In [None]:
# The MFC tagger's "states" are really plain strings. FakeState wraps each one
# in an object with a single `name` field (a tag, or the "<UNK>" marker) so the
# tagger mimics the state objects returned by a full tagger such as an HMM.
FakeState = namedtuple("FakeState", "name")


class MFCTagger:
    """Most Frequent Class tagger: predicts one fixed tag per known word.

    Parameters
    ----------
    table : dict
        Maps each known word to the tag that should be predicted for it.
    """

    # State returned for any word that has no entry in the table.
    missing = FakeState(name="<UNK>")

    def __init__(self, table):
        # Kept as a defaultdict so direct indexing by callers still falls back
        # to the <UNK> state.
        self.table = defaultdict(lambda: MFCTagger.missing)
        self.table.update({word: FakeState(name=tag) for word, tag in table.items()})

    def viterbi(self, seq):
        """Tag `seq`, matching the shape of the Pomegranate viterbi() interface.

        Returns a tuple (0., path) where path is a list of (index, state)
        pairs: the string "<s>" first, one FakeState per input word, and the
        string "</s>" last.
        """
        # Use .get() rather than indexing: indexing a defaultdict inserts every
        # unseen word, so prediction would silently grow and mutate the table.
        states = [self.table.get(w, MFCTagger.missing) for w in seq]
        return 0., list(enumerate(["<s>"] + states + ["</s>"]))

In [None]:
train_set = data['train']
test_set = data['test']

# Materialise both columns as lists so they can be reused after this cell
# (a generator would be exhausted once pair_counts has consumed it).
tags = list(train_set['UPOS'])
words = list(train_set['TOKEN'])

# word -> {tag: count} over all training tokens
word_counts = pair_counts(words, tags)

# Keep each word's most frequent tag; on a tie max() returns the tag that was
# seen first. The loop variable is named tag_counts so it does not overwrite
# the `tags` list defined above.
mfc_table = {
    word: max(tag_counts, key=tag_counts.get)
    for word, tag_counts in word_counts.items()
}

mfc_model = MFCTagger(mfc_table)  # Create a Most Frequent Class tagger instance

In [None]:
def vocab(data_s):
    """Return the distinct entries of data_s['TOKEN'] as a sorted list."""
    unique_tokens = {token for token in data_s['TOKEN']}
    return sorted(unique_tokens)

# Sorted vocabulary of the training split.
vocab_list = vocab(train_set)

# Preview ten entries, starting at index 50 to skip the punctuation at the
# front of the sorted list.
preview_start, preview_stop = 50, 60
print(vocab_list[preview_start:preview_stop])

['aba-3', 'aba-4', 'ababacabangeli', 'ababamba', 'ababambe', 'ababambi', 'ababambiqhaza', 'ababambisene', 'ababandakanyeka', 'ababandakanyekayo']


In [None]:
def replace_unknown(sequence, known_words=None):
    """Return a copy of `sequence` with every unknown word replaced by 'nan'.

    Parameters
    ----------
    sequence : iterable of str
        The words to filter.
    known_words : iterable of str, optional
        The vocabulary to check against. Defaults to the training vocabulary,
        vocab(train_set).

    Returns
    -------
    list of str
        Same length and order as `sequence`; words outside the vocabulary
        become the literal string 'nan'. (The original note says Pomegranate
        ignores these values during computation — confirm for the installed
        version.)
    """
    if known_words is None:
        known_words = vocab(train_set)
    # Build the lookup once per call. Recomputing the sorted vocabulary for
    # every word, and scanning it as a list, made each call quadratic.
    known = set(known_words)
    return [w if w in known else 'nan' for w in sequence]

def simplify_decoding(observations, model):
    """Predict a label for each word in `observations` using `model`.

    Unknown words are first masked via replace_unknown(); the model's
    viterbi() path is then reduced to the `.name` of each state, with the
    first and last path entries (start/end states) left out.
    """
    masked = replace_unknown(observations)
    _score, state_path = model.viterbi(masked)

    labels = []
    for step in state_path[1:-1]:  # skip the start/end state predictions
        labels.append(step[1].name)
    return labels

In [None]:
# Slicing a Dataset returns a dict of columns, so the TOKEN column must be
# selected explicitly. Passing the slice itself makes the tagger iterate over
# the column names ('TOKEN', 'UPOS') instead of the words.
N_PREVIEW = 10  # same number of rows for predictions and gold labels
preview = test_set[:N_PREVIEW]

print("Predicted labels:\n-----------------")
print(simplify_decoding(preview['TOKEN'], mfc_model))
print()
print("Actual labels:\n--------------")
print(preview['UPOS'])
print("\n")

Predicted labels:
-----------------
['<UNK>', '<UNK>']

Actual labels:
--------------
['PUNC', 'POSS', 'PUNC', 'N', 'N', 'CDEM', 'N', 'N', 'V', 'N']


