In [None]:
from dataclasses import dataclass, field
from typing import List, Any, Dict, Set, TextIO
from collections import Counter
from abc import ABC
import pprint
import re
import os
# Shared pretty-printer (4-space indent) for ad-hoc inspection of nested
# structures; it is not referenced by any of the visible cells.
pp = pprint.PrettyPrinter(indent=4)    

In [None]:
@dataclass
class Word:
    """One corpus token together with its tag string.

    Attributes:
        word_init: Form of the word that is written to the output.
        word_mod: Alternative form of the word; kept on the object but not
            included in the serialized line.
        pos_tags: Tag string written after the ``@`` separator.
    """
    word_init: str
    word_mod: str
    pos_tags: str

    def __str__(self) -> str:
        """Return the serialized record ``<word_init>@<pos_tags>`` plus a newline."""
        return f'{self.word_init}@{self.pos_tags}\n'

    def save(self, dest: TextIO) -> None:
        """Write the serialized record (see ``__str__``) to ``dest``.

        Args:
            dest: Writable text stream.
        """
        dest.write(str(self))
        

In [None]:
# Tag names used by the parser.  PARAGRAPH_TAG and DIV_TAG select the elements
# whose words CorpusParser exports; DOC_TAG and TEXT_TAG are not referenced in
# the visible cells.
DOC_TAG = 'document'
TEXT_TAG = 'text'
DIV_TAG = 'div'
PARAGRAPH_TAG = 'p'


# Full word-record pattern: '¦'-separated fields with named groups for the
# word forms, two lemma fields, the tag field and a trailing title.  It is not
# referenced in the visible cells (CorpusParser splits the line on '¦' instead).
WORD_PATTERN = re.compile(r'(?P<word_forms>.+?)¦(?P<lemma1>.*?)¦?'
                               r'.*?¦(?P<lemma2>.*?)¦.*?¦(?P<pos_tags>.+?)¦\s*(?P<title>.+)', re.DOTALL)
# Opening tag `<name attr=value ...>`.  The closing '>' must be followed by at
# least one whitespace character and at least one further character (captured
# as `title`), so a tag followed only by a single newline does not match.
START_TAG_PATTERN = re.compile(r'<(?P<tag>\w+)(?P<attributes>(\s*?\w+?=.+?)*)>\s+(?P<title>.+)', 	re.DOTALL)
# Closing tag `</name>`.  Both slashes are optional, so `<name/>` and a bare
# `<name>` are accepted as well; the same trailing whitespace-plus-content
# requirement as for START_TAG_PATTERN applies.
END_TAG_PATTERN = re.compile(r'<(/?(?P<tag>\w+)/?)>\s+(?P<title>.+)', 	re.DOTALL)

@dataclass
class CorpusParser:
    """Line-oriented converter from a tagged corpus to ``word@tag`` lines.

    Feed the corpus one line at a time through :meth:`feed`.  Open tags are
    tracked on ``tag_stack``.  A ``<START>`` / ``<END>`` marker is written to
    ``dest`` when a paragraph or div is opened / closed, and every word record
    seen while a paragraph or div is the innermost open tag is written via
    ``Word.save``.

    Attributes:
        dest: Writable text stream receiving the output.
        tag_stack: Names of the currently open tags, innermost last.
    """
    dest: TextIO
    tag_stack: List[str] = field(default_factory=list)

    @staticmethod
    def _is_content_tag(tag: str) -> bool:
        """Return True for the tags whose words are exported (paragraph, div)."""
        return tag in (PARAGRAPH_TAG, DIV_TAG)

    def handle_starttag(self, match: re.Match) -> None:
        """Push the opened tag and emit ``<START>`` for a paragraph or div.

        Args:
            match: Match object produced by ``START_TAG_PATTERN``.
        """
        tag = match.group('tag')
        self.tag_stack.append(tag)
        if self._is_content_tag(tag):
            self.dest.write('<START>\n')

    def handle_endtag(self, match: re.Match) -> None:
        """Pop the innermost open tag and emit ``<END>`` for a paragraph or div.

        The name in the closing tag is not compared with the popped tag; the
        popped tag alone decides whether ``<END>`` is written.  A closing tag
        that arrives while nothing is open is reported on stdout and ignored.

        Args:
            match: Match object produced by ``END_TAG_PATTERN``.
        """
        # NOTE(review): END_TAG_PATTERN also accepts self-closing `<tag/>`,
        # which would pop the enclosing tag here - confirm the corpus has none.
        if not self.tag_stack:
            # Unbalanced closing tag: report it and leave the output untouched.
            print(match.group('tag'), match.group())
            return
        opened = self.tag_stack.pop()
        if self._is_content_tag(opened):
            self.dest.write('<END>\n')

    def handle_parts(self, parts: List[str]) -> Word:
        """Build a ``Word`` from the '¦'-separated fields of one record.

        The first field is split on whitespace; its first half (rounded down)
        becomes ``word_init`` and the remainder ``word_mod``.  The first
        whitespace-separated token of the second-to-last field becomes
        ``pos_tags``.  All other fields are ignored.

        Args:
            parts: Fields of one record, as produced by ``line.split('¦')``.

        Returns:
            The ``Word`` built from the fields.

        Raises:
            IndexError: If there are fewer than two fields or the
                second-to-last field contains no non-whitespace token.
        """
        forms = parts[0].split()
        half = len(forms) // 2
        init_form = " ".join(forms[:half])
        ext_form = " ".join(forms[half:])
        pos_tags = parts[-2].split()[0]
        return Word(init_form, ext_form, pos_tags)

    def handle_word(self, line: str) -> None:
        """Write the word on ``line`` if a paragraph or div is innermost.

        Lines seen while no tag is open, and whitespace-only lines, are
        skipped instead of raising ``IndexError``.

        Args:
            line: One raw corpus line that matched neither tag pattern.
        """
        if not self.tag_stack or not line.strip():
            return
        if self._is_content_tag(self.tag_stack[-1]):
            word = self.handle_parts(line.split('¦'))
            word.save(self.dest)

    def feed(self, line: str) -> None:
        """Dispatch one corpus line to the start-tag, end-tag or word handler.

        The start-tag pattern is tried first; the end-tag pattern is only
        evaluated when it did not match.

        Args:
            line: One raw line of the corpus file.
        """
        open_tag_match = START_TAG_PATTERN.match(line)
        if open_tag_match is not None:
            self.handle_starttag(open_tag_match)
            return
        closed_tag_match = END_TAG_PATTERN.match(line)
        if closed_tag_match is not None:
            self.handle_endtag(closed_tag_match)
            return
        self.handle_word(line)
            

In [None]:
CORPUS_FILES_PATH = r'corpus/'
DEST_FILE_PATH = r'pos/'
# The records are split on the non-ASCII '¦' character, so decode and encode
# explicitly instead of relying on the platform-default encoding.
# NOTE(review): UTF-8 is assumed - confirm against the actual corpus file.
CORPUS_ENCODING = 'utf-8'

source = os.path.join(CORPUS_FILES_PATH, "grc-cwb")
dest = os.path.join(DEST_FILE_PATH, "texts.txt")
print(source)

# open(..., 'w') does not create missing directories, so make sure the
# destination folder exists before writing.
os.makedirs(DEST_FILE_PATH, exist_ok=True)

with open(source, "r", encoding=CORPUS_ENCODING) as f, \
        open(dest, 'w', encoding=CORPUS_ENCODING) as f2:
    parser = CorpusParser(f2)
    # `number` is the 0-based index of the current line; the loop body does
    # not use it.
    for number, line in enumerate(f):
        parser.feed(line)
    
