# Imports

In [None]:
import os
import re
from dataclasses import dataclass, field
from typing import NamedTuple, Dict, Set, List, TextIO

import pandas as pd

# Exploration

In [None]:
class Word(NamedTuple):
    """Immutable record for one corpus entry, filled positionally from a line split on '@'."""
    word: str      # first '@' field; WordsParser uses it as the vocabulary key
    word_mod: str  # second '@' field; presumably a modified form of the word -- TODO confirm
    pos: str       # third '@' field; collected per word as its POS tag

    
@dataclass
class WordsParser:
    """Collect, for every word in the corpus, the set of POS tags it occurs with.

    Lines are pushed in one at a time through :meth:`feed`; :meth:`build`
    returns the mapping accumulated so far.
    """

    # word (first '@' field) -> set of POS tags (third '@' field) seen for it
    words: dict[str, Set[str]] = field(default_factory=dict)

    def reset(self) -> None:
        """Discard everything collected so far."""
        self.words = {}

    def word(self, line: str) -> None:
        """Record the POS tag carried by one ``word@word_mod@pos`` line.

        Lines that do not split into exactly three '@'-separated fields are
        ignored.
        """
        parts = line.split('@')
        if len(parts) != 3:
            # Not a well-formed entry (blank line, missing or extra '@'): skip it.
            return
        token, _, pos = parts
        self.words.setdefault(token, set()).add(pos)

    def div(self, line: str) -> None:
        """Handle a ``<START>``/``<END>`` marker line; currently a no-op."""

    def feed(self, line: str) -> None:
        """Dispatch one already newline-stripped line to :meth:`div` or :meth:`word`."""
        if line in ('<START>', '<END>'):
            self.div(line)
        else:
            self.word(line)

    def build(self) -> dict[str, Set[str]]:
        """Return the word -> POS-tag-set mapping (the live dict, not a copy)."""
        return self.words
        

In [None]:
# Feed every line of the corpus file to a fresh parser to build the
# word -> POS-tag-set mapping.
parser = WordsParser()
with open(os.path.join('pos', 'texts.txt'), 'r') as f:
    for line in f:
        # Strip only the line terminator: slicing with [:-1] would also chop the
        # last real character of a final line that has no trailing newline.
        parser.feed(line.rstrip('\n'))
        

In [None]:
# Fetch the word -> POS-tag-set mapping accumulated by the parser.
words = parser.build()

In [None]:
# Vocabulary size: number of distinct keys in the word -> POS-tags mapping.
len(words)

In [None]:
# Mean number of distinct POS tags recorded per vocabulary word.
avg_pos_per_word = sum(map(len, words.values())) / len(words)
avg_pos_per_word

In [None]:
# Count the vocabulary entries that were seen with more than one POS tag.
words_with_multiple_pos = sum(1 for tags in words.values() if len(tags) > 1)
words_with_multiple_pos

In [None]:
# Percentage of vocabulary words that were NOT counted as having multiple POS tags.
print(f'Perc. of vocabulary with single POS: {(len(words) - words_with_multiple_pos) / len(words) * 100:.2f}%')

In [None]:
# Tags treated as the real parts of speech in the analysis.
main_pos = {
    'N', 'V', 'Adv', 'A', 'Num',
    'Pron', 'Cj', 'Interj', 'Pp', 'Punct',
}

# Tags that are discarded when the POS tags are cleaned.
junk_pos = {
    'Foreign', 'Unknown', '>XCOMP', 'Symbol', 'Guess', '<MWE>',
    'X', 'X=12', 'Zoom', 'VIDEO', 'R', 'Unrecognized',
    'B', 'D', 'Z', 'Energia', 'Food', 'Energy', 'General',
}

# Auxiliary tag -> main POS tag it is folded into.
mapping = {
    'Nom': 'N',
    'Dat': 'N',
    'Gen': 'N',
    'Erg': 'N',
    'Voc': 'N',
    'Inst': 'N',
    'Foc': 'Adv',
    'Att': 'Adv',
    'Pv': 'V',
}

In [None]:
# Recount the ambiguous words, this time looking only at tags that are in main_pos.
words_with_multiple_pos = sum(
    1
    for tags in words.values()
    if sum(1 for tag in tags if tag in main_pos) > 1
)
words_with_multiple_pos

In [None]:
# Same percentage as before, but based on the recount that only considers main_pos tags.
print(f'Real perc. of vocabulary with single POS: {(len(words) - words_with_multiple_pos) / len(words) * 100:.2f}%')

# Creating a Vocabulary 

## Clear words

In [None]:
# Character sets used to classify vocabulary entries.
# Punctuation marks accepted as tokens; note the set includes a literal
# backslash (written '\\') and a single quote (written '\'').
puncts = '…?!.,"„“\';:()`~_‘$%/\\=+*|”'
# The dash is kept apart from puncts and handled by its own rules.
dash = '-'
# Letters accepted inside a word.
georgian_alphabet = 'აბგდევზთიკლმნოპჟრსტუფქღყშჩცძწჭხჯჰ'
# Decimal digits.
numbers = '0123456789'

In [None]:
def contains_any(str1, str2):
    """Return True if at least one item of *str2* occurs in *str1*, else False."""
    for candidate in str2:
        if candidate in str1:
            return True
    return False

In [None]:
# One-column DataFrame (column label 0) holding every vocabulary word, so the
# pandas .str accessors can be applied to the words.
word_df = pd.DataFrame(words.keys())

In [None]:
# Flag vocabulary entries that are not clean words, numbers or lone punctuation
# marks. Every character set is passed through re.escape before being placed
# inside [...]: `puncts` contains a backslash which, left unescaped, is read as
# an escape for the following '=' so the backslash itself is never matched.
word_col = word_df[0]
letters_cls = re.escape(georgian_alphabet)
digits_cls = re.escape(numbers)
puncts_cls = re.escape(puncts)
dash_cls = re.escape(dash)

# 1. contains a character outside the allowed letters, digits, punctuation and dash
cond1_mal = set(word_col[word_col.str.contains(f'[^{letters_cls}{digits_cls}{puncts_cls}{dash_cls}]')])

# 2. mixes letters/digits with punctuation
cond2_mal = set(word_col[word_col.str.contains(f'[{letters_cls}{digits_cls}]') & word_col.str.contains(f'[{puncts_cls}]')])

# 3. longer than one character and starts with, ends with, or has more than one dash
cond3_mal = set(word_col[(word_col.str.startswith('-') | word_col.str.endswith('-') | (word_col.str.count('-') > 1)) & (word_col.str.len() != 1)])

# 4. mixes letters and digits without a dash
cond4_mal = set(word_col[word_col.str.contains(f'[{letters_cls}]') & word_col.str.contains(f'[{digits_cls}]') & ~word_col.str.contains(f'[{dash_cls}]')])

# 5. has no letter or digit and is not a single punctuation mark
# NOTE(review): `dash` is not part of `puncts`, so a lone '-' is flagged here
# even though rule 3 exempts one-character entries -- confirm this is intended.
cond5_mal = set(word_col[~word_col.str.contains(f'[{letters_cls}{digits_cls}]') & ~(word_col.str.contains(f'[{puncts_cls}]') & (word_col.str.len() == 1))])


In [None]:
# Merge the five rule sets into one set of words to discard.
malformed_words = set().union(cond1_mal, cond2_mal, cond3_mal, cond4_mal, cond5_mal)
print(f'Number of malformed words: {len(malformed_words)}')

In [None]:
# Vocabulary words that survived the malformed-word rules.
word_set = set(words).difference(malformed_words)
# Rebuild the mapping without the malformed entries.
words = {word: tags for word, tags in words.items() if word not in malformed_words}

## Clear POS Tags

In [None]:
# Drop junk tags, then fold each auxiliary tag into its main POS tag.
words = {
    word: {mapping.get(tag, tag) for tag in tags if tag not in junk_pos}
    for word, tags in words.items()
}
# Any tag that is still not a main POS tag is collapsed into "Other".
words = {
    word: {tag if tag in main_pos else 'Other' for tag in tags}
    for word, tags in words.items()
}

In [None]:
# "Other" is kept only when it is the word's sole tag; otherwise it is dropped.
words = {
    word: set(tags) if len(tags) == 1 else tags - {'Other'}
    for word, tags in words.items()
}

In [None]:
# Load the frequency table: per the original note, it holds words together with
# their most frequent POS tag. Only the two needed columns are kept, indexed by word.
by_freq_df = (
    pd.read_csv(os.path.join('data', 'data.csv'))[['word', 'pos_tag']]
    .set_index('word')
)
by_freq_df

In [None]:
# Any word containing a digit is tagged "Num", replacing whatever tags it had.
words = {
    word: ({'Num'} if contains_any(word, numbers) else tags)
    for word, tags in words.items()
}

In [None]:
# For words whose tag is still ambiguous (zero or several tags), take the tag
# from the frequency table when that tag is a main POS tag.
# The table is turned into a plain dict once: this avoids several DataFrame
# lookups per word, and keeping only the first row per word avoids the failure
# that `.loc[word]` causes when the index contains a duplicated word.
freq_pos = by_freq_df.loc[~by_freq_df.index.duplicated(), 'pos_tag'].to_dict()
words = {
    word: {freq_pos[word]} if len(pos) != 1 and freq_pos.get(word) in main_pos else pos
    for word, pos in words.items()
}

In [None]:
# Collapse every tag set to one tag ('Other' when the set is empty).
# Sets have no defined order, so list(pos)[0] could pick a different tag from
# run to run for words that still carry several tags; min() makes the pick
# deterministic (the alphabetically first tag).
words = {word: min(pos) if pos else 'Other' for word, pos in words.items()}

In [None]:
# Distinct POS tags that remain in the cleaned vocabulary.
POS_TAGS = {*words.values()}
print(f'POS tags: {POS_TAGS}')

In [None]:
@dataclass
class Word:
    """Mutable record for one corpus entry, filled positionally from a line split on '@'.

    Rebinds the name ``Word`` that is defined as a NamedTuple earlier in the
    file; a regular dataclass allows SentenceParser to overwrite ``pos``.
    """
    word: str      # first '@' field; compared against '.', '?' and '!' to detect sentence ends
    word_mod: str  # second '@' field; the form looked up in the vocabulary
    pos: str       # third '@' field; POS tag, may be replaced by SentenceParser

    
@dataclass
class Sentence:
    """An ordered run of Word records that can be serialised to a text stream."""

    words: List[Word] = field(default_factory=list)

    def write(self, dest: TextIO):
        """Write one tab-separated row per word to *dest*, then a blank line.

        Each row is ``index<TAB>word<TAB>word_mod<TAB>pos`` with a 0-based index.
        """
        rows = (
            f'{position}\t{entry.word}\t{entry.word_mod}\t{entry.pos}\n'
            for position, entry in enumerate(self.words)
        )
        dest.writelines(rows)
        dest.write('\n')
            
        
    
@dataclass
class SentenceParser:
    """Split a stream of ``word@word_mod@pos`` lines into sentences and save the clean ones.

    A sentence ends when a word that is not '.', '?' or '!' follows one that
    is. A finished sentence is written to ``dest`` only if every word's
    ``word_mod`` is in ``vocab``; tags outside ``pos_set`` are replaced by the
    tag stored in ``word_to_pos``.
    """

    source: TextIO  # iterated line by line by run()
    dest: TextIO  # receives the accepted sentences
    vocab: Set[str]  # word_mod values allowed in a saved sentence
    word_to_pos: Dict[str, str]  # word_mod -> tag used when a word's own tag is not in pos_set
    pos_set: Set[str]  # tags that are kept unchanged
    curr_sentence: Sentence = field(default_factory=Sentence)  # sentence being built

    # Words that terminate a sentence.
    _SENTENCE_ENDERS = ('.', '?', '!')

    def __word(self, line: str) -> None:
        """Parse one entry line and append it, closing the previous sentence first if needed."""
        try:
            word = Word(*line.split('@'))
        except TypeError:
            # The line does not have exactly three '@'-separated fields.
            # TODO: Mark sentence as malformed
            return
        collected = self.curr_sentence.words
        # A run of terminators stays attached to the sentence it closes; the
        # first non-terminator after it starts a new sentence.
        if (
            collected
            and collected[-1].word in self._SENTENCE_ENDERS
            and word.word not in self._SENTENCE_ENDERS
        ):
            self.__end_sentence()
        self.curr_sentence.words.append(word)

    def __div(self, line: str) -> None:
        """Handle a ``<START>``/``<END>`` marker line; currently a no-op.

        NOTE(review): the sentence in progress is carried across markers --
        confirm whether a marker should also close it.
        """

    def __check_sentence(self) -> bool:
        """Return True if the current sentence may be saved, normalising its tags on the way.

        Raises:
            KeyError: if a word is in ``vocab`` but missing from ``word_to_pos``.
        """
        # NOTE(review): lookups use ``word_mod`` while WordsParser keys its
        # vocabulary by the first field (``word``) -- verify this is intended.
        for word in self.curr_sentence.words:
            if word.word_mod not in self.vocab:
                return False  # we shouldn't save sentences with malformed words
            if word.pos not in self.pos_set:
                word.pos = self.word_to_pos[word.word_mod]
        return True

    def __end_sentence(self) -> None:
        """Write the current sentence if it passes the check, then start a new one."""
        if self.__check_sentence():  # we save a sentence only when it contains no malformed words
            self.curr_sentence.write(self.dest)
        self.curr_sentence = Sentence()

    def __feed(self, line: str) -> None:
        """Dispatch one stripped line to the marker handler or the word handler."""
        if line in ('<START>', '<END>'):
            return self.__div(line)
        return self.__word(line)

    def run(self) -> None:
        """Parse every line of ``source`` and write the accepted sentences to ``dest``."""
        for line in self.source:
            self.__feed(line.strip())
        # A sentence is otherwise only closed when the next one begins, so
        # without this flush the last sentence of the input would be lost.
        if self.curr_sentence.words:
            self.__end_sentence()
                

In [None]:
# Parse the corpus again, this time writing every accepted sentence to the
# sentence database file.
with open(os.path.join('pos', 'texts.txt'), 'r') as src, \
        open(os.path.join('data', 'SentenceDatabase.txt'), 'w') as dest:
    parser = SentenceParser(src, dest, set(words), words, set(POS_TAGS))
    parser.run()

In [None]:
# NOTE(review): bare list expression; nothing visible in this file uses it --
# looks like a leftover scratch cell, confirm before removing.
[1, 2, 3]