# HMM PoS tagger

## Introduction

## HMM implementation

In [1]:
import os
from conllu import parse_incr
from typing import Dict, List, Tuple
from collections import Counter
import numpy as np

https://gucorpling.org/gum/

https://gucorpling.org/gum/gentle.html

['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ', 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X']

In [2]:
# Implementation of the HMM model
class HMM_PoS_tagger:
    """Data-loading front end for an HMM part-of-speech tagger.

    On construction the object reads the GUM train and dev splits (used
    together as training data) and the GUM and GENTLE test splits from
    CoNLL-U files located in ``path_data``, sanity-checks each split against
    hard-coded expected figures, and builds a frequency-thresholded
    vocabulary from the training words.

    Every loaded split is a dict mapping a document type (e.g.
    ``"GUM_academic"``) to a list of sentences, where each sentence is a list
    of ``(word, upos_tag)`` tuples and ``word`` is lower-cased.
    """

    # Number of distinct UPOS tags every split is expected to contain.
    _EXPECTED_N_TAGS = 17

    # Document types expected (in this order) in every GUM split.
    _GUM_DOC_TYPES = [
        "GUM_academic",
        "GUM_bio",
        "GUM_conversation",
        "GUM_court",
        "GUM_essay",
        "GUM_fiction",
        "GUM_interview",
        "GUM_letter",
        "GUM_news",
        "GUM_podcast",
        "GUM_speech",
        "GUM_textbook",
        "GUM_vlog",
        "GUM_voyage",
        "GUM_whow",
    ]

    # Document types expected (in this order) in the GENTLE test split.
    _GENTLE_DOC_TYPES = [
        "GENTLE_dictionary",
        "GENTLE_esports",
        "GENTLE_legal",
        "GENTLE_medical",
        "GENTLE_poetry",
        "GENTLE_proof",
        "GENTLE_syllabus",
        "GENTLE_threat",
    ]

    def __init__(self, path_data: str, lemmatize: bool, threshold: float):
        """Load all datasets and build the vocabulary.

        Args:
            path_data: Directory that contains the ``.conllu`` files.
            lemmatize: If true, use each token's lemma as the word; otherwise
                use its surface form.
            threshold: Fraction of the total training word occurrences that
                the vocabulary must cover (see ``create_vocab``).

        Raises:
            AssertionError: If a file is missing or a split does not match
                the expected number of tags, sentences or document types.
        """
        self.path_data = path_data
        self.lemmatize = lemmatize
        self.threshold = threshold
        # Word -> occurrence count over the training data; filled by
        # read_train_data (original author's note: 12770 unique words in train).
        self.counter = Counter()
        # Read data from train and test datasets & check that it is correct
        self.train = self.read_train_data()
        self.test_GUM = self.read_test_data(is_GUM=True)
        self.test_GENTLE = self.read_test_data(is_GUM=False)
        # Create vocabulary (kept words and their counts, most frequent first)
        self.vocab, self.frec = self.create_vocab()

    @staticmethod
    def _check_split(
        data: Dict[str, List[List[Tuple[str, str]]]],
        expected_sentences: int,
        expected_doc_types: List[str],
    ) -> None:
        """Assert that a loaded split has the expected tags, size and keys."""
        tags = {tag for sentences in data.values() for s in sentences for _, tag in s}
        n_sentences = sum(len(sentences) for sentences in data.values())

        assert len(tags) == HMM_PoS_tagger._EXPECTED_N_TAGS
        assert n_sentences == expected_sentences
        # Key order matters: it is the order in which document types first
        # appear while reading the files.
        assert list(data.keys()) == expected_doc_types

    def read_train_data(self) -> Dict[str, List[List[Tuple[str, str]]]]:
        """Read the GUM train and dev splits and count the training words.

        Both files are merged into one dataset. As a side effect, every word
        occurrence is added to ``self.counter``.

        Returns:
            Mapping from document type to its list of tagged sentences.

        Raises:
            AssertionError: If the data does not contain 17 tags,
                9520 + 1341 sentences and the expected GUM document types.
        """
        # Train and Dev datasets will be used to train the model
        # https://github.com/UniversalDependencies/UD_English-GUM/blob/master/stats.xml
        paths = [
            os.path.join(self.path_data, "en_gum-ud-train.conllu"),
            os.path.join(self.path_data, "en_gum-ud-dev.conllu"),
        ]

        data = self.read_data(paths)

        # Word frequencies are what create_vocab thresholds on.
        for sentences in data.values():
            for sentence in sentences:
                self.counter.update(word for word, _ in sentence)

        # Check that number of sentences and tags are correct
        self._check_split(data, 9520 + 1341, self._GUM_DOC_TYPES)
        return data

    def read_test_data(self, is_GUM: bool) -> Dict[str, List[List[Tuple[str, str]]]]:
        """Read one of the two test splits and sanity-check it.

        Args:
            is_GUM: If true read ``en_gum-ud-test.conllu``, otherwise read
                ``en_gentle-ud-test.conllu``.

        Returns:
            Mapping from document type to its list of tagged sentences.

        Raises:
            AssertionError: If the split does not match the expected number
                of tags (17), sentences (1285 for GUM, 1334 for GENTLE) or
                document types.
        """
        # Both GUM and GENTLE test datasets will be used to evaluate the model
        filename = "en_gum-ud-test.conllu" if is_GUM else "en_gentle-ud-test.conllu"
        data = self.read_data([os.path.join(self.path_data, filename)])

        # Check that number of sentences and tags are correct
        if is_GUM:
            self._check_split(data, 1285, self._GUM_DOC_TYPES)
        else:
            # https://github.com/UniversalDependencies/UD_English-GENTLE/blob/master/stats.xml
            self._check_split(data, 1334, self._GENTLE_DOC_TYPES)

        return data

    def read_data(self, paths: List[str]) -> Dict[str, List[List[Tuple[str, str]]]]:
        """Parse CoNLL-U files into sentences grouped by document type.

        The document type of a sentence is taken from the most recent
        ``newdoc id`` metadata entry seen in the same file, truncated to its
        first two ``_``-separated components.

        Args:
            paths: CoNLL-U files to read, in order. Sentences of the same
                document type coming from different files are merged.

        Returns:
            Mapping from document type to a list of sentences; each sentence
            is a list of ``(word, upos)`` tuples with ``word`` lower-cased.

        Raises:
            AssertionError: If one of the paths does not exist.
            ValueError: If a file contains a sentence before any
                ``newdoc id`` metadata, so its document type is unknown.
        """
        data = {}
        word_field = "lemma" if self.lemmatize else "form"
        for path in paths:
            assert os.path.exists(path), f"The {path} path does not exist"
            # Reset per file so a file without "newdoc id" cannot silently
            # inherit the document type of the previous file.
            doc_type = None
            with open(path, "r", encoding="utf-8") as file:
                for tokenlist in parse_incr(file):
                    if "newdoc id" in tokenlist.metadata:
                        # https://github.com/UniversalDependencies/UD_English-GUM/tree/master/not-to-release/file-lists
                        # https://github.com/UniversalDependencies/UD_English-GENTLE/tree/master/not-to-release/sources
                        # Get the document type and remove unnecessary additional information
                        doc_type = "_".join(
                            tokenlist.metadata["newdoc id"].split("_")[:2]
                        )
                    if doc_type is None:
                        raise ValueError(
                            f"Sentence found before any 'newdoc id' metadata in {path}"
                        )

                    # Token has the following structure:
                    # token: <class 'conllu.models.Token'> /// dict_keys(['id', 'form', 'lemma', 'upos', 'xpos', 'feats', 'head', 'deprel', 'deps', 'misc'])
                    #
                    # Tokens whose upos is "_" are skipped, as _ is not a valid
                    # PoS tag. Example (the first line is the one dropped):
                    #   12-13  workforce’s  _          _     _    ...
                    #   12     workforce    workforce  NOUN  NN   ...
                    #   13     ’s           's         PART  POS  ...
                    sentence = [
                        # Possibility to use the lemma or the form of the word
                        (token[word_field].lower(), token["upos"])
                        for token in tokenlist
                        if token["upos"] != "_"
                    ]

                    # Append to the document type, creating its entry on first use
                    data.setdefault(doc_type, []).append(sentence)
        return data

    def create_vocab(self) -> Tuple[List[str], List[int]]:
        """Select the most frequent training words up to a coverage threshold.

        Words are sorted by decreasing count and the shortest prefix whose
        cumulative count reaches ``self.threshold`` times the total number of
        counted occurrences is kept.

        Returns:
            A ``(words, counts)`` pair of equally long lists, most frequent
            word first. Both lists are empty if no word has been counted.
        """
        # Nothing counted: zip(*[]) below could not be unpacked.
        if not self.counter:
            return [], []

        # Get the most common words in the train dataset
        words, times = zip(*self.counter.most_common())

        # Calculate the index of the words that are necessary to reach the threshold
        cum_times = np.cumsum(times)
        total_words = cum_times[-1]
        # First position where the cumulative count is >= threshold * total
        idx = int(np.searchsorted(cum_times, self.threshold * total_words))

        cutoff = idx + 1
        return list(words[:cutoff]), list(times[:cutoff])

In [3]:
# Ask for the repository location. Surrounding whitespace is stripped and a
# leading "~" is expanded so that pasted or home-relative paths are accepted.
PATH_TO_REPO_FOLDER = os.path.expanduser(
    input(
        "Enter the path to the repository folder (must end in HMM_PoS_Tagger): "
    ).strip()
)

# The CoNLL-U files are expected in the "data" sub-folder of the repository.
PATH_TO_DATA_FOLDER = os.path.join(PATH_TO_REPO_FOLDER, "data")
# Explicit check instead of `assert`, which is skipped when Python runs with -O.
if not os.path.exists(PATH_TO_DATA_FOLDER):
    raise FileNotFoundError(f"The {PATH_TO_DATA_FOLDER} path does not exist")

# Use lemmas (True) or surface forms (False) as the words of the model.
LEMMATIZE = True
# Fraction of training word occurrences the vocabulary must cover.
THRESHOLD = 0.9

tagger = HMM_PoS_tagger(
    path_data=PATH_TO_DATA_FOLDER, lemmatize=LEMMATIZE, threshold=THRESHOLD
)

## Viterbi algorithm

In [None]:
# Implementation of the Viterbi algorithm

## Experiments

### In-domain

### Out-of-domain

## Conclusion