# Imports
<a id='imports_id'> </a>

In [2]:
import os
import json

import pandas as pd
from dataclasses import dataclass, field
from typing import TextIO, List
from abc import ABC
import pyarrow as pa
from functools import cached_property


In [5]:
# File-name prefix of the ORC part files (parts are named f'{PREFIX}_{n}.orc').
PREFIX = 'sent'
# Folder holding the raw database and the generated part files.
DATA_FOLDER = 'data'
# File name of the raw text database inside DATA_FOLDER.
RAW_DATA = 'SentenceDatabase.txt'

# POS Data Extraction

In [17]:
@dataclass
class DatabaseParser(ABC):
    """Split a word-per-line sentence database into ORC part files.

    Each word line of the database is expected to hold four
    whitespace-separated tokens: a numeric word index followed by the three
    values stored in the ``init_words``, ``mod_words`` and ``pos_tags``
    columns. Any line that does not start with a numeric character ends the
    current sentence.

    Attributes:
        dest: Folder the ``{PREFIX}_{part}.orc`` files are written to.
        seq_per_csv: Maximum number of sentences stored in one part file.
    """
    dest: str
    seq_per_csv: int = field(default=100000)

    @cached_property
    def table_schema(self) -> pa.Schema:
        """Arrow schema of a sentence table: three list-of-string columns."""
        list_str_type = pa.list_(pa.string())
        return pa.schema([
            pa.field('init_words', list_str_type),
            pa.field('mod_words', list_str_type),
            pa.field('pos_tags', list_str_type)])

    def __parse_word(self, line: str) -> List[str]:
        """Return the whitespace-separated tokens of *line* minus the first."""
        return line.split()[1:]  # discard word index

    def __read_sentence(self, database: TextIO) -> pa.Table:
        """Read one sentence from *database* and return it as a one-row table.

        Lines are consumed up to and including the first line that does not
        start with a numeric character. A separator met before any word line
        yields a row of empty lists.

        Raises:
            EOFError: If the end of the file is reached before any word of a
                new sentence has been read.
            ValueError: If a word line does not hold exactly three tokens
                after the word index.
        """
        init_words = []
        mod_words = []
        pos_tags = []

        while True:
            # readline() returns "" only at end of file; iterating the file
            # object never yields an empty string, so EOF would go unnoticed.
            line = database.readline()
            if line == "":
                if not init_words:
                    raise EOFError
                # Keep a final sentence that has no trailing separator; the
                # next call reports EOF.
                break
            if not line[0].isnumeric():
                break
            init_word, mod_word, pos_tag = self.__parse_word(line)
            init_words.append(init_word)
            mod_words.append(mod_word)
            pos_tags.append(pos_tag)

        return pa.Table.from_arrays([[init_words], [mod_words], [pos_tags]], schema=self.table_schema)

    def __write_part(self, sentences: List[pa.Table], part: int) -> None:
        """Concatenate *sentences* and write them as ORC part number *part*."""
        table = pa.concat_tables(sentences)
        target = os.path.join(self.dest, f'{PREFIX}_{part}.orc')
        table.to_pandas(types_mapper=pd.ArrowDtype).to_orc(target)

    def parse(self, database: TextIO) -> None:
        """Read every sentence of *database* and write numbered ORC parts.

        Parts are numbered from 1 and each holds at most ``seq_per_csv``
        sentences. No file is written for an empty trailing batch.
        """
        sentences: List[pa.Table] = []
        part: int = 1
        while True:
            try:
                sentences.append(self.__read_sentence(database))
            except EOFError:
                break
            # Flush as soon as the batch is full so every part, including
            # the first, holds the same number of sentences.
            if len(sentences) >= self.seq_per_csv:
                self.__write_part(sentences, part)
                sentences = []
                part += 1

        if sentences:
            self.__write_part(sentences, part)




In [None]:
# Parse the raw database into ORC part files written to DATA_FOLDER.
raw_path = os.path.join(DATA_FOLDER, RAW_DATA)
parser = DatabaseParser(DATA_FOLDER, 1000000)
with open(raw_path, 'r') as raw_file:
    parser.parse(raw_file)

In [6]:
# Count every file below DATA_FOLDER (recursively) whose name starts with
# PREFIX; each startswith() result contributes 0 or 1 to the total.
NUM_PARTS = sum(
    name.startswith(PREFIX)
    for _, _, names in os.walk(DATA_FOLDER)
    for name in names
)

NUM_PARTS

9

In [7]:
def get_dataset(idx: int) -> pd.DataFrame:
    """Load the ORC part file numbered *idx* from DATA_FOLDER."""
    part_name = f'{PREFIX}_{idx}.orc'
    part_path = os.path.join(DATA_FOLDER, part_name)
    return pd.read_orc(part_path)

# Frequency Data

In [8]:
from collections import Counter

# Tally how often each entry of the 'mod_words' column occurs over all parts.
freq = Counter()
for part_idx in range(1, NUM_PARTS + 1):
    freq.update(get_dataset(part_idx)['mod_words'].explode())

In [10]:
# Persist the frequency table as JSON, keeping non-ASCII characters unescaped.
freq_path = os.path.join(DATA_FOLDER, 'vocab', 'freq.json')
with open(freq_path, 'w') as out_file:
    json.dump(freq, out_file, ensure_ascii=False)

In [11]:
# Freq. Score

In [16]:
def _mean_word_freq(words):
    """Return the mean `freq` count of *words*, or 0 for an empty sentence."""
    if len(words) > 0:
        return sum([freq[word] for word in words]) / len(words)
    return 0


# Collect, for every part, the 2% quantile of the per-sentence mean frequency.
freq_scores = []
for idx in range(1, NUM_PARTS + 1):
    df = get_dataset(idx)
    sentence_scores = df['mod_words'].apply(_mean_word_freq)
    freq_scores.append(sentence_scores.quantile(0.02))
    

In [21]:
# Average the per-part 2% quantiles collected in freq_scores.
min_freq_score = sum(freq_scores) / len(freq_scores)
min_freq_score

382749.2299380481

In [None]:
# NOTE(review): unfinished cell. It loads each part but does nothing with it;
# presumably meant to filter sentences by min_freq_score. TODO confirm.
for idx in range(1, NUM_PARTS + 1):
    df = get_dataset(idx)
    

# Vocabulary Extraction

In [10]:
# Gather the distinct entries of the 'mod_words' column across all parts.
unique_words = set()
for part_idx in range(1, NUM_PARTS + 1):
    unique_words.update(get_dataset(part_idx)['mod_words'].explode())

In [11]:
# Number of distinct entries collected in unique_words.
len(unique_words)

1836994

In [18]:
# Persist the vocabulary as a JSON array, keeping non-ASCII characters unescaped.
words_path = os.path.join(DATA_FOLDER, 'vocab', 'words.json')
vocabulary = list(unique_words)
with open(words_path, 'w') as out_file:
    json.dump(vocabulary, out_file, ensure_ascii=False)