# Imports
<a id='imports_id'> </a>

In [29]:
import os

import pandas as pd
from dataclasses import dataclass, field
from typing import TextIO, List
from abc import ABC
import pyarrow as pa
from functools import cached_property

In [45]:


@dataclass
class DatabaseParser(ABC):
    """Split a word-per-line sentence database into ORC part files.

    Input format, as consumed by this class: a line whose first character
    is numeric is a word line of the form
    ``<index> <init_word> <mod_word> <pos_tag>`` (whitespace separated);
    any other line ends the current sentence. Each sentence becomes one
    row holding three parallel lists of strings.

    Output files are written to ``dest`` as ``sent_<part>.orc`` with
    ``part`` counting up from 1.
    """

    # Directory that receives the part files.
    dest: str
    # Number of sentences per part file (despite the name, the files are
    # ORC). A non-positive value never triggers a split, so all sentences
    # end up in a single file.
    seq_per_csv: int = field(default=100000)

    @cached_property
    def table_schema(self) -> pa.Schema:
        """Arrow schema of a part file: three list-of-string columns."""
        list_str_type = pa.list_(pa.string())
        return pa.schema([
            pa.field('init_words', list_str_type),
            pa.field('mod_words', list_str_type),
            pa.field('pos_tags', list_str_type),
        ])

    def __parse_word(self, line: str) -> List[str]:
        """Split a word line on whitespace and drop the leading index."""
        return line.split()[1:]  # discard word index

    def __read_sentence(self, database: TextIO) -> List[List[str]]:
        """Consume one sentence from ``database``.

        Reads word lines up to and including the next separator line (any
        line that does not start with a numeric character) or up to the
        end of input.

        Returns:
            ``[init_words, mod_words, pos_tags]`` for the sentence. A
            separator that directly follows another separator yields three
            empty lists. An empty list ``[]`` is returned when the input is
            exhausted before any word line was read; callers use it as the
            end-of-input signal.

        Raises:
            ValueError: if a word line does not carry exactly three fields
                after its index.
        """
        init_words: List[str] = []
        mod_words: List[str] = []
        pos_tags: List[str] = []

        for line in database:
            # Slice instead of index so an empty string is treated as a
            # separator rather than raising IndexError.
            if not line[:1].isnumeric():
                break
            init_word, mod_word, pos_tag = self.__parse_word(line)
            init_words.append(init_word)
            mod_words.append(mod_word)
            pos_tags.append(pos_tag)
        else:
            # The loop ran out of lines without meeting a separator, so the
            # input is exhausted. With no words collected there is no
            # sentence left to report.
            if not init_words:
                return []

        return [init_words, mod_words, pos_tags]

    def __write_part(self, batch: List[List[List[str]]], part: int) -> None:
        """Write ``batch`` (a non-empty list of sentences) as part ``part``."""
        # Transpose sentence-major data into the three schema columns.
        columns = [list(column) for column in zip(*batch)]
        table = pa.Table.from_arrays(columns, schema=self.table_schema)
        table.to_pandas(types_mapper=pd.ArrowDtype).to_orc(
            os.path.join(self.dest, f'sent_{part}.orc'))

    def parse(self, database: TextIO) -> None:
        """Read every sentence from ``database`` and write the part files.

        Sentences are written in input order, ``seq_per_csv`` per file; the
        last file holds the remainder. Reading stops at end of input, and no
        file is written for an empty remainder (including empty input).

        Args:
            database: open text stream positioned at the first line to read.
                It is consumed but not closed.

        Raises:
            ValueError: if a word line does not carry exactly three fields
                after its index.
        """
        batch: List[List[List[str]]] = []
        part = 1

        while True:
            sentence = self.__read_sentence(database)
            if not sentence:  # [] marks end of input
                break
            batch.append(sentence)
            if len(batch) == self.seq_per_csv:
                self.__write_part(batch, part)
                batch = []
                part += 1

        if batch:
            self.__write_part(batch, part)





In [None]:
# Feed the sentence database to the parser; 'data' is passed as the output
# directory and 100000 as the number of sentences per part file.
with open('data/SentenceDatabase.txt', 'r') as f:
    DatabaseParser(dest='data', seq_per_csv=100000).parse(f)


In [None]:
# Completion marker for this cell.
print("Done")