# 1. Preprocess Training Data

*Version: 2022-04-19*

Prerequisite: the necessary corpora have been downloaded and stored in `lexnlp/ml/catalog/data/`.

Algorithm:

```text
For each corpus:
    Extract all `.txt` files.
    For each text file:
        Get sentences until their combined length reaches a limit L.
        Normalize each sentence.
        Lemmatize each sentence.
        Tokenize each sentence.
        Combine token lists into one string.
        Write each string as a single line into a combined text file.
```

---

**Imports**

In [1]:
# standard library
import os
from concurrent.futures import ProcessPoolExecutor
from pathlib import Path
from tarfile import TarFile
from tempfile import TemporaryDirectory
from typing import Final, List, Set, Tuple

# third-party libraries
from psutil import Process
from tqdm.notebook import tqdm

# LexNLP
from lexnlp.extract.en.amounts import get_amount_annotations
from lexnlp.extract.en.dates import get_date_annotations
from lexnlp.extract.en.percents import get_percent_annotations
from lexnlp.extract.en.ratios import get_ratio_annotations
from lexnlp.ml.catalog import get_path_from_catalog
from lexnlp.ml.normalizers import Normalizer
from lexnlp.ml.sklearn_transformers import TransformerPreprocessor

---

## Constants, Objects, and Functions

In [2]:
# Root directory for preprocessed output; created here if it does not exist yet.
PATH_PREPROCESSED: Final[Path] = Path('./preprocessed/')
PATH_PREPROCESSED.mkdir(exist_ok=True)

# Number of CPUs this process may run on.
# `Process.cpu_affinity` is not implemented on every platform (e.g. macOS),
# so fall back to the machine-wide CPU count when it is missing.
try:
    _AVAILABLE_CPU_N: int = len(Process().cpu_affinity())
except AttributeError:
    _AVAILABLE_CPU_N = os.cpu_count() or 1

# One fewer worker than the available CPUs, but never fewer than one.
MAX_WORKERS: Final[int] = max(_AVAILABLE_CPU_N - 1, 1)

# Character limit handed to the TransformerPreprocessor; `preprocess_corpus`
# reads twice this many characters from the start of each text file.
HEAD_CHARACTER_N: Final[int] = 2000

In [3]:
# Catalog tags of the corpora to preprocess, listed alphabetically.
TAGS: Set[str] = {
    # abstracts from ArXiv that contain the word "agreement"
    'corpus/arxiv-abstracts-with-agreement/0.1',

    # contracts from Atticus CUAD v1
    'corpus/atticus-cuad-v1-plaintext/0.1',

    # municipal bonds (sample)
    'corpus/bonds/0.1',

    # Caselaw Access Project opinions: Arkansas, Illinois, North Carolina, New Mexico
    'corpus/caselaw-access-project-ark-ill-nc-nm-subset-144million-characters/0.1',

    # contract types (labeled)
    'corpus/contract-types/0.1',

    # EUR-Lex documents retrieved through api.epdb.eu
    'corpus/eurlex-sample-10000/0.1',

    # United States Federal Register for 2021
    'corpus/govinfo-fr-2021/0.1',

    # SEC EDGAR filings (assorted)
    'corpus/sec-edgar-forms-3-4-5-8k-10k-sample/0.1',

    # USPTO grant backgrounds (sample)
    'corpus/uspto-sample/0.1',
}

In [4]:
# Shared normalizer, configured with (annotation extractor, placeholder token) pairs
# for ratios, dates, percents, and amounts.
# NOTE(review): presumably each span found by an extractor is replaced with its
# placeholder, with the pairs applied in the order listed — confirm against
# `lexnlp.ml.normalizers.Normalizer` before reordering them.
NORMALIZER: Final[Normalizer] = Normalizer(
    normalizations=(
        (get_ratio_annotations, '__RATIO__'),
        (get_date_annotations, '__DATE__'),
        (get_percent_annotations, '__PERCENT__'),
        (get_amount_annotations, '__AMOUNT__'),
    ),
)

In [5]:
# Document preprocessor built from the shared NORMALIZER and the HEAD_CHARACTER_N limit.
# Its `preprocess_document` method is what `preprocess_corpus` maps over the corpus texts.
TRANSFORMER_PREPROCESSOR: Final[TransformerPreprocessor] = TransformerPreprocessor(
    normalizer=NORMALIZER,
    head_character_n=HEAD_CHARACTER_N,
)

In [6]:
def preprocess_corpus(path_corpus: Path) -> Path:
    """
    Preprocesses every `.txt` file of a corpus archive into one combined text file.

    The archive is extracted into a temporary directory, and the first
    `HEAD_CHARACTER_N * 2` characters of each `.txt` file are read as UTF-8.
    Files that cannot be decoded are reported with `print` and skipped.
    Each text is then passed to `TRANSFORMER_PREPROCESSOR.preprocess_document`
    in a pool of `MAX_WORKERS` processes, and each result is written
    as a single line of the output file.

    Note:
        The output directory is `PATH_PREPROCESSED / tag`, where `tag` is a
        notebook-level variable (set by the calling loop), not a parameter.
        It must be defined before this function is called.

    Args:
        path_corpus (Path):
            A Path to a corpus tarfile containing text files.

    Returns:
        A Path to the output file.
    """
    texts: List[str] = []
    with TemporaryDirectory() as temporary_directory:
        # 'r|*' streams the archive and detects the compression by itself, so
        # plain `.tar` and `.tgz` archives work as well as `.tar.gz` or `.tar.xz`.
        # NOTE(review): `extractall` trusts the member paths stored in the archive;
        # only run this on archives obtained from the catalog.
        with TarFile.open(name=path_corpus, mode='r|*') as tar_file:
            tar_file.extractall(path=temporary_directory)

        # The archive is closed by now; only the extracted files are needed.
        text_files: Tuple[Path, ...] = tuple(
            path
            for path in Path(temporary_directory).rglob('*.txt')
            if path.is_file()
        )
        for text_file in text_files:
            try:
                # Explicit encoding: do not depend on the platform default.
                with text_file.open('r', encoding='utf-8') as f:
                    text: str = f.read()
            except UnicodeDecodeError as unicode_decode_error:
                print(f'...{unicode_decode_error} @ {text_file}')
            else:
                # Keep only the head of the document.
                texts.append(text[:HEAD_CHARACTER_N * 2])

    with ProcessPoolExecutor(max_workers=MAX_WORKERS) as process_pool_executor:
        documents: List[str] = list(tqdm(
            process_pool_executor.map(TRANSFORMER_PREPROCESSOR.preprocess_document, texts),
            total=len(texts)
        ))

    # `tag` is the notebook-level loop variable; see the Note in the docstring.
    directory: Path = PATH_PREPROCESSED / tag
    directory.mkdir(parents=True, exist_ok=True)
    path_output_file: Path = directory / f'{path_corpus.name}.txt'
    with path_output_file.open('w', encoding='utf-8') as f:
        f.writelines(f'{document}\n' for document in documents)
    return path_output_file

---

## Preprocess Corpora

In [7]:
# Preprocess every corpus listed in TAGS, one after another.
# NOTE: the loop variable must keep the name `tag` — `preprocess_corpus` reads
# this notebook-level name to choose its output directory.
for tag in TAGS:
    # Look up the path of the corpus archive for this catalog tag.
    path_corpus: Path = get_path_from_catalog(tag=tag)
    print('Preprocessing', tag)
    preprocess_corpus(path_corpus=path_corpus)

Preprocessing corpus/sec-edgar-forms-3-4-5-8k-10k-sample/0.1


  0%|          | 0/992 [00:00<?, ?it/s]

