# Docx Helper

> Functions to read / write Docx files

In [None]:
#| hide
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
#| default_exp docx

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
from pathlib import Path
import docx
import pandas as pd

from lbpy.fs import list_dir

In [None]:
#| hide
import nbdev; nbdev.nbdev_export()

## Read Docx

### Read Single File

In [None]:
#| export
def read_docx(path: Path, sep: str = "") -> str:
    """Read a .docx file and return the text of its paragraphs as one string.

    Args:
        path: Location of the .docx file to open.
        sep: String inserted between consecutive paragraphs. The default
            ``""`` keeps the historical behaviour of concatenating the
            paragraphs directly, which drops paragraph boundaries; pass
            ``"\\n"`` to keep one paragraph per line.

    Returns:
        The text of every paragraph in ``doc.paragraphs``, joined with `sep`.
    """
    doc = docx.Document(path)
    # Generator keeps memory flat: paragraph texts are streamed into join().
    return sep.join(para.text for para in doc.paragraphs)

### Read `*.docx` in Dir

In [None]:
#| export
def read_docx_dir(directory: Path, recursive=True) -> dict:
    """Load every ``*.docx`` file found under `directory` into a dict.

    Keys are the file stems (file name without the extension) and values are
    the strings produced by `read_docx`. Files sharing the same stem collapse
    into a single entry, the last one listed winning.
    """
    docx_paths = list_dir(directory, pattern="*.docx", recursive=recursive)
    return {docx_path.stem: read_docx(docx_path) for docx_path in docx_paths}

### Read `*.docx` in Dir into a DataFrame

In [None]:
#| export
def read_docx_dir_df(directory: Path, recursive=True) -> pd.DataFrame:
    """Load every ``*.docx`` file found under `directory` into a DataFrame.

    The frame has one row per file and three columns: ``filename`` (the file
    stem), ``path`` (the entry returned by `list_dir`) and ``content`` (the
    string produced by `read_docx`).
    """
    docx_paths = list_dir(directory, pattern="*.docx", recursive=recursive)
    columns = {
        "filename": [docx_path.stem for docx_path in docx_paths],
        "path": docx_paths,
        "content": [read_docx(docx_path) for docx_path in docx_paths],
    }
    return pd.DataFrame(columns)

In [None]:
#| hide
import nbdev; nbdev.nbdev_export()