In [7]:
import sys
import json
from pathlib import Path
from zipfile import ZipFile
from pprint import pprint

import pandas as pd
import xmltodict
from bs4 import BeautifulSoup as bs
from loguru import logger

from funcs.paths import paths

In [3]:
# Raw example data: one sub-directory per example is expected here.
input_dir = paths["raw_data_dir"] / "examples"
assert input_dir.exists(), input_dir

# Scratch location for the converted JSON files. `parents=True` so the cell
# also works when the parent temporary-output directory has not been created yet.
output_dir = paths["tmp_output"] / "examples"
output_dir.mkdir(parents=True, exist_ok=True)

In [5]:
# Keep only the directories found directly under the input directory.
sub_dir_list = [entry for entry in input_dir.iterdir() if entry.is_dir()]
print(sub_dir_list)

[PosixPath('/data/ik18445/projects/biorxiv-medrxiv-tdm/data/local-source-data/examples/0c1a93e7-6c7d-1014-8263-f3e34f963ff6'), PosixPath('/data/ik18445/projects/biorxiv-medrxiv-tdm/data/local-source-data/examples/0a3e547c-6c14-1014-b23d-977b4d602bda'), PosixPath('/data/ik18445/projects/biorxiv-medrxiv-tdm/data/local-source-data/examples/1c135323-6c01-1014-9e9e-bcee29abca77'), PosixPath('/data/ik18445/projects/biorxiv-medrxiv-tdm/data/local-source-data/examples/24887b80-6c04-1014-9445-972a17b06fac'), PosixPath('/data/ik18445/projects/biorxiv-medrxiv-tdm/data/local-source-data/examples/0a2ef310-6c04-1014-8ee5-ac250845df11'), PosixPath('/data/ik18445/projects/biorxiv-medrxiv-tdm/data/local-source-data/examples/1d7e60e6-6cb7-1014-be27-8a6966e18789'), PosixPath('/data/ik18445/projects/biorxiv-medrxiv-tdm/data/local-source-data/examples/0a9a5225-6c3e-1014-b429-f3eaae354361'), PosixPath('/data/ik18445/projects/biorxiv-medrxiv-tdm/data/local-source-data/examples/0a797d56-6c79-1014-907b-e740ddf

In [6]:
# Bare names of the example sub-directories, without their parent path.
[sub_dir.stem for sub_dir in sub_dir_list]

['0c1a93e7-6c7d-1014-8263-f3e34f963ff6',
 '0a3e547c-6c14-1014-b23d-977b4d602bda',
 '1c135323-6c01-1014-9e9e-bcee29abca77',
 '24887b80-6c04-1014-9445-972a17b06fac',
 '0a2ef310-6c04-1014-8ee5-ac250845df11',
 '1d7e60e6-6cb7-1014-be27-8a6966e18789',
 '0a9a5225-6c3e-1014-b429-f3eaae354361',
 '0a797d56-6c79-1014-907b-e740ddfacd0c']

In [18]:
def convert_xml_to_json(input_path: Path, output_dir: Path) -> bool:
    """Convert the full-text XML of one example directory to a JSON file.

    The directory's ``manifest.xml`` is parsed to find the first
    ``<instance>`` element; its ``href`` attribute gives the location of the
    full-text XML relative to ``input_path``. That XML is parsed, converted to
    a dict with ``xmltodict`` and written to ``output_dir / f"{stem}.json"``.

    Args:
        input_path: Directory holding ``manifest.xml`` and the file it refers to.
        output_dir: Existing directory that receives the JSON output.

    Returns:
        Always ``True`` once the conversion has completed.

    Raises:
        FileNotFoundError: If ``manifest.xml`` is missing from ``input_path``.
        IndexError: If the manifest contains no ``<instance>`` element.
        KeyError: If the first ``<instance>`` element has no ``href`` attribute.
        AssertionError: If the referenced full-text file does not exist.
    """
    # NOTE: ``.stem`` drops anything after the last dot in the directory name.
    stem = input_path.stem
    logger.info(f"start: {stem}")

    # Locate the full text via the manifest. Read bytes rather than text so
    # Beautiful Soup detects the encoding (e.g. from the XML declaration)
    # instead of relying on the platform's locale default.
    manifest_path = input_path / "manifest.xml"
    manifest_soup = bs(manifest_path.read_bytes(), "xml")
    instances = manifest_soup.find_all("instance")
    if not instances:
        raise IndexError(f"no <instance> element found in {manifest_path}")
    # NOTE: some manifests use backslashes as path separators.
    fulltext_file = instances[0].attrs["href"].replace("\\", "/")
    fulltext_path = input_path / fulltext_file
    assert fulltext_path.exists(), fulltext_path

    # Parse the full text, then re-serialise it for xmltodict.
    fulltext_soup = bs(fulltext_path.read_bytes(), "xml")
    fulltext_dict = xmltodict.parse(str(fulltext_soup))

    # Dump the output JSON.
    output_path = output_dir / f"{stem}.json"
    with output_path.open("w", encoding="utf-8") as f:
        json.dump(fulltext_dict, f)
    logger.info(f"done: {stem}")
    return True

In [19]:
# Convert every example directory; the resulting list of flags is displayed.
[convert_xml_to_json(sub_dir, output_dir=output_dir) for sub_dir in sub_dir_list]

[32m2023-07-19 17:31:17.974[0m | [1mINFO    [0m | [36m__main__[0m:[36mconvert_xml_to_json[0m:[36m4[0m - [1mstart: 0c1a93e7-6c7d-1014-8263-f3e34f963ff6[0m
[32m2023-07-19 17:31:18.023[0m | [1mINFO    [0m | [36m__main__[0m:[36mconvert_xml_to_json[0m:[36m24[0m - [1mdone: 0c1a93e7-6c7d-1014-8263-f3e34f963ff6[0m
[32m2023-07-19 17:31:18.024[0m | [1mINFO    [0m | [36m__main__[0m:[36mconvert_xml_to_json[0m:[36m4[0m - [1mstart: 0a3e547c-6c14-1014-b23d-977b4d602bda[0m
[32m2023-07-19 17:31:18.098[0m | [1mINFO    [0m | [36m__main__[0m:[36mconvert_xml_to_json[0m:[36m24[0m - [1mdone: 0a3e547c-6c14-1014-b23d-977b4d602bda[0m
[32m2023-07-19 17:31:18.100[0m | [1mINFO    [0m | [36m__main__[0m:[36mconvert_xml_to_json[0m:[36m4[0m - [1mstart: 1c135323-6c01-1014-9e9e-bcee29abca77[0m
[32m2023-07-19 17:31:18.122[0m | [1mINFO    [0m | [36m__main__[0m:[36mconvert_xml_to_json[0m:[36m24[0m - [1mdone: 1c135323-6c01-1014-9e9e-bcee29abca77[0m
[32

[True, True, True, True, True, True, True, True]