In [71]:
import sys
import json
from pathlib import Path
from zipfile import ZipFile
from pprint import pprint

import pandas as pd
import xmltodict
import html2text
from bs4 import BeautifulSoup as bs
from pydash import py_
from lxml import etree

from funcs.paths import paths

In [2]:
input_dir = paths["raw_data_dir"] / "medrxiv" / "Current_Content" / "March_2023"
assert input_dir.exists(), input_dir

output_dir = paths["tmp_output"]

# folder scan

In [3]:
file_list = [_ for _ in input_dir.iterdir() if str(_).endswith(".meca")]
print(len(file_list))
print(file_list[:5])

1227
[PosixPath('/data/ik18445/projects/biorxiv-medrxiv-tdm/data/local-source-data/medrxiv/Current_Content/March_2023/b0e350bd-6d67-1014-ad19-c96e66a09703.meca'), PosixPath('/data/ik18445/projects/biorxiv-medrxiv-tdm/data/local-source-data/medrxiv/Current_Content/March_2023/de06d6bd-6c2f-1014-8025-a0d429fa0c93.meca'), PosixPath('/data/ik18445/projects/biorxiv-medrxiv-tdm/data/local-source-data/medrxiv/Current_Content/March_2023/19da261a-6c3c-1014-8618-af7a582bd05b.meca'), PosixPath('/data/ik18445/projects/biorxiv-medrxiv-tdm/data/local-source-data/medrxiv/Current_Content/March_2023/f0c7fa39-6c1b-1014-b484-8038e16734f9.meca'), PosixPath('/data/ik18445/projects/biorxiv-medrxiv-tdm/data/local-source-data/medrxiv/Current_Content/March_2023/035e37af-6bf4-1014-9da4-9c501f776874.meca')]


In [4]:
NUM_SAMPLE = 50

sample_file_list = file_list[:NUM_SAMPLE]

In [6]:
NUM_PREVIEW = 5
for idx, _ in enumerate(sample_file_list[:NUM_PREVIEW]):
    with ZipFile(_, "r") as zip:
        print(f"#{idx}/{len(sample_file_list)}: {str(_)}")
        zip.printdir()
        print("\n\n---\n\n")

#0/50: /data/ik18445/projects/biorxiv-medrxiv-tdm/data/local-source-data/medrxiv/Current_Content/March_2023/b0e350bd-6d67-1014-ad19-c96e66a09703.meca
File Name                                             Modified             Size
content/23286782.xml                           2023-03-08 22:47:40        99666
content/23286782.pdf                           2023-03-03 15:46:16       356007
content/                                       2023-03-08 22:39:46            0
content/23286782v1_fig1.tif                    2023-03-07 02:17:30      5273524
content/23286782v1_tbl1.tif                    2023-03-07 02:17:42     13843644
content/23286782v1_tbl1a.tif                   2023-03-07 02:17:52      6487852
content/23286782v1_tbl2.tif                    2023-03-07 02:18:04     13328436
directives.xml                                 2023-03-08 23:39:50          280
manifest.xml                                   2023-03-08 23:39:48         1500
mimetype                                       202

# zip archive scan

In [8]:
zip_file = sample_file_list[0]

### manifest

In [14]:
manifest_file = "manifest.xml"
with ZipFile(zip_file, "r") as zip:
    zip_data = zip.read(manifest_file)

bs_content = bs(zip_data, "xml")    
print(bs_content.prettify())

<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE manifest SYSTEM "http://schema.highwire.org/public/MECA/v0.9/Manifest/Manifest.dtd">
<manifest version="1.0" xmlns="http://manuscriptexchange.org">
 <item id="23286782v1" type="article">
  <title>
   Epigenetic, psychological, and EEG changes after a 1-week retreat based on mindfulness and compassion for stress reduction: Study protocol of a cross-over randomized controlled trial
  </title>
  <instance href="content/23286782.xml" media-type="application/xml"/>
  <instance href="content/23286782.pdf" media-type="application/pdf"/>
 </item>
 <item id="directives" type="x-hw-directives">
  <title>
   HWX Processing Directives
  </title>
  <instance href="directives.xml" media-type="application/vnd.hw-ingest-pi+xml"/>
 </item>
 <item id="transfer" type="transfer-details">
  <title>
   MECA Transfer Info
  </title>
  <instance href="transfer.xml" media-type="application/meca-xfer+xml"/>
 </item>
 <item id="fig1" type="figure">
  <title>
   Fi

In [15]:
find_res = bs_content.find_all("instance")
print(find_res)

[<instance href="content/23286782.xml" media-type="application/xml"/>, <instance href="content/23286782.pdf" media-type="application/pdf"/>, <instance href="directives.xml" media-type="application/vnd.hw-ingest-pi+xml"/>, <instance href="transfer.xml" media-type="application/meca-xfer+xml"/>, <instance href="content/23286782v1_fig1.tif" media-type="image/tiff"/>, <instance href="content/23286782v1_tbl1.tif" media-type="image/tiff"/>, <instance href="content/23286782v1_tbl1a.tif" media-type="image/tiff"/>, <instance href="content/23286782v1_tbl2.tif" media-type="image/tiff"/>]


In [16]:
find_res[0].attrs["href"]

'content/23286782.xml'

In [18]:
fulltext_file = find_res[0].attrs["href"]

### fulltext

In [88]:
with ZipFile(zip_file, "r") as zip:
    zip_data = zip.read(fulltext_file)

bs_content = bs(zip_data, "xml")

In [64]:
fulltext_dump_path = output_dir / "example_fulltext.xml"
with fulltext_dump_path.open("w") as f:
    f.write(bs_content.prettify())

bs_dict = xmltodict.parse(str(bs_content))
parsed_json_path = output_dir / "example_parsed_json.json"
with parsed_json_path.open("w") as f:
    json.dump(bs_dict, f)

# metadata extract

In [40]:
py_.chain(bs_dict).at(
    ["article", "front", "article-meta", "article-id"]
    ).value()

[{'@pub-id-type': 'doi', '#text': '10.1101/2023.03.03.23286782'}]

## title

In [42]:
py_.chain(bs_dict).at(
    ["article", "front", "article-meta", "title-group", "article-title"]
    ).value()[0]

'Epigenetic, psychological, and EEG changes after a 1-week retreat based on mindfulness and compassion for stress reduction: Study protocol of a cross-over randomized controlled trial'

## article version

In [45]:
py_.chain(bs_dict).at(
    ["article", "front", "article-meta", "article-version"]
    ).value()[0]

'1.1'

## category

In [46]:
py_.chain(bs_dict).at(
    ["article", "front", "article-meta", "article-categories", "subj-group", "subject"]
    ).value()[0]

'Psychiatry and Clinical Psychology'

## year-month

In [59]:
find_res = py_.chain(bs_dict).at(
    ["article", "front", "article-meta", "history", "date"]
    ).flatten().filter(lambda col: col["@date-type"] == "accepted").value()[0]
find_res

{'@date-type': 'accepted', 'day': '05', 'month': '3', 'year': '2023'}

In [60]:
pub_date = {
    "year": find_res["year"],
    "month": find_res["month"]
}
pub_date

{'year': '2023', 'month': '3'}

# fulltext fulltext extract

## abstract

## discussion