""" Normalize a arXiv dump
- copy PDF files as is
- unzip gzipped single files
- copy if it's a LaTeX file
- extract gzipped tar archives
- try to flatten contents to a single LaTeX file
- ignores non LaTeX contents (HTML, PS, TeX, ...)
"""
import chardet
import gzip
import magic
import os
import re
import shutil
import subprocess
import sys
import tarfile
import tempfile
MAIN_TEX_PATT = re.compile(r'(\\begin\s*\{\s*document\s*\})', re.I)
# ^ with capturing parentheses so that the pattern can be used for splitting
PDF_EXT_PATT = re.compile(r'^\.pdf$', re.I)
GZ_EXT_PATT = re.compile(r'^\.gz$', re.I)
TEX_EXT_PATT = re.compile(r'^\.tex$', re.I)
NON_TEXT_PATT = re.compile(r'^\.(pdf|eps|jpg|png|gif)$', re.I)
BBL_SIGN = '\\bibitem'
# natbib fix
PRE_FIX_NATBIB = True
NATBIB_PATT = re.compile((r'\\cite(t|p|alt|alp|author|year|yearpar)\s*?\*?\s*?'
                          r'(\[[^\]]*?\]\s*?)*?\s*?\*?\s*?\{([^\}]+?)\}'),
                         re.I)
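# illustrative example of the natbib pre-fix applied in normalize() below:
#   NATBIB_PATT.sub(r'\\cite{\3}', r'\citet*[see][ch.~2]{smith2001}')
#   -> '\\cite{smith2001}'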
# bibitem option fix
PRE_FIX_BIBOPT = True
BIBOPT_PATT = re.compile(r'\\bibitem\s*?\[[^]]*?\]', re.I|re.M)
# ↑ above two solve most tralics problems; except for mnras style bibitems
# (https://ctan.org/pkg/mnras)
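# illustrative example of the bibitem option pre-fix applied in normalize() below:
#   BIBOPT_PATT.sub(r'\\bibitem', r'\bibitem[Lastname(2001)]{key01}')
#   -> '\\bibitem{key01}'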
# aggressive math pre-removal
PRE_FILTER_MATH = False
FILTER_PATTS = []
for env in ['equation', 'displaymath', 'array', 'eqnarray', 'align', 'gather',
'multline', 'flalign', 'alignat']:
    s = r'\\begin\{{{0}[*]?\}}.+?\\end\{{{0}[*]?\}}'.format(env)
patt = re.compile(s, re.I | re.M | re.S)
FILTER_PATTS.append(patt)
FILTER_PATTS.append(re.compile(r'\$\$.+?\$\$', re.S))
FILTER_PATTS.append(re.compile(r'\$.+?\$', re.S))
FILTER_PATTS.append(re.compile(r'\\\(.+?\\\)', re.S))
FILTER_PATTS.append(re.compile(r'\\\[.+?\\\]', re.S))
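# illustrative example: with PRE_FILTER_MATH enabled, spans such as
# '\begin{equation}E = mc^2\end{equation}' and inline '$x^2$' are removed from
# the document body by remove_math() below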
def read_file(path):
try:
with open(path) as f:
cntnt = f.read()
except UnicodeDecodeError:
        with open(path, 'rb') as f:
            blob = f.read()
m = magic.Magic(mime_encoding=True)
encoding = m.from_buffer(blob)
try:
cntnt = blob.decode(encoding)
except (UnicodeDecodeError, LookupError) as e:
encoding = chardet.detect(blob)['encoding']
if encoding:
try:
cntnt = blob.decode(encoding, errors='replace')
except:
return ''
else:
return ''
return cntnt
def read_gzipped_file(path):
    with gzip.open(path, 'rb') as f:
        blob = f.read()
m = magic.Magic(mime_encoding=True)
encoding = m.from_buffer(blob)
try:
cntnt = blob.decode(encoding)
except (UnicodeDecodeError, LookupError) as e:
encoding = chardet.detect(blob)['encoding']
if not encoding:
return False
cntnt = blob.decode(encoding, errors='replace')
return cntnt
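# note: read_file() and read_gzipped_file() above return '' and False,
# respectively, when no usable encoding can be determined; both values are
# falsy (cf. the `if not cntnt` check in normalize() below)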
def remove_math(latex_str):
parts = re.split(MAIN_TEX_PATT, latex_str, maxsplit=1)
for patt in FILTER_PATTS:
parts[2] = re.sub(patt, '', parts[2])
return ''.join(parts)
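# remove_math() relies on the capturing group in MAIN_TEX_PATT: re.split()
# yields [preamble, '\begin{document}', body] and only the body is filtered,
# so math used in preamble macro definitions is left untouched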
def normalize(IN_DIR, OUT_DIR, write_logs=True):
def log(msg):
if write_logs:
with open(os.path.join(OUT_DIR, 'log.txt'), 'a') as f:
f.write('{}\n'.format(msg))
if not os.path.isdir(IN_DIR):
print('dump directory does not exist')
return False
if not os.path.isdir(OUT_DIR):
os.makedirs(OUT_DIR)
for fn in os.listdir(IN_DIR):
path = os.path.join(IN_DIR, fn)
aid, ext = os.path.splitext(fn)
if PDF_EXT_PATT.match(ext):
# copy over pdf file as is
dest = os.path.join(OUT_DIR, fn)
shutil.copyfile(path, dest)
elif GZ_EXT_PATT.match(ext):
if tarfile.is_tarfile(path):
with tempfile.TemporaryDirectory() as tmp_dir_path:
# extract archive contents
                    with tarfile.open(path) as tar:
                        fnames = tar.getnames()
                        tar.extractall(path=tmp_dir_path)
# identify main tex file
main_tex_path = None
ignored_names = []
# check .tex files first
for tfn in fnames:
if not TEX_EXT_PATT.match(os.path.splitext(tfn)[1]):
ignored_names.append(tfn)
continue
tmp_file_path = os.path.join(tmp_dir_path, tfn)
if os.path.isdir(tmp_file_path):
continue
try:
cntnt = read_file(tmp_file_path)
except:
continue
if re.search(MAIN_TEX_PATT, cntnt) is not None:
main_tex_path = tmp_file_path
# try other files
if main_tex_path is None:
for tfn in ignored_names:
tmp_file_path = os.path.join(tmp_dir_path, tfn)
if NON_TEXT_PATT.match(os.path.splitext(tfn)[1]):
continue
try:
cntnt = read_file(tmp_file_path)
if re.search(MAIN_TEX_PATT, cntnt) is not None:
main_tex_path = tmp_file_path
except:
continue
# give up
if main_tex_path is None:
log(('couldn\'t find main tex file in dump archive {}'
'').format(fn))
continue
# "identify" bbl file
# https://arxiv.org/help/submit_tex#bibtex
main_tex_fn = os.path.normpath(
main_tex_path).split(os.sep)[-1]
fn_base = os.path.splitext(main_tex_path)[0]
bbl_fn = '{}.bbl'.format(fn_base)
if os.path.isfile(os.path.join(tmp_dir_path, bbl_fn)):
latexpand_args = ['latexpand',
'--expand-bbl',
bbl_fn,
main_tex_fn]
else:
latexpand_args = ['latexpand',
main_tex_fn]
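                    # e.g. with main file ms.tex and a matching ms.bbl this runs
                    # "latexpand --expand-bbl ms.bbl ms.tex" (names illustrative)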
# flatten to single tex file and save
new_tex_fn = '{}.tex'.format(aid)
tmp_dest = os.path.join(tmp_dir_path, new_tex_fn)
out = open(tmp_dest, mode='w')
if write_logs:
err = open(
os.path.join(OUT_DIR, 'log_latexpand.txt'), 'a'
)
else:
err = open(os.devnull, 'w')
err.write('\n------------- {} -------------\n'.format(aid))
err.flush()
subprocess.run(latexpand_args, stdout=out, stderr=err,
cwd=tmp_dir_path)
out.close()
err.close()
# re-read and write to ensure utf-8 b/c latexpand doesn't
# behave
cntnt = read_file(tmp_dest)
if PRE_FIX_NATBIB:
cntnt = NATBIB_PATT.sub(r'\\cite{\3}', cntnt)
if PRE_FIX_BIBOPT:
cntnt = BIBOPT_PATT.sub(r'\\bibitem', cntnt)
if PRE_FILTER_MATH:
cntnt = remove_math(cntnt)
dest = os.path.join(OUT_DIR, new_tex_fn)
with open(dest, mode='w', encoding='utf-8') as f:
f.write(cntnt)
else:
                # extract gzipped tex file
cntnt = read_gzipped_file(path)
if not cntnt:
continue
if re.search(MAIN_TEX_PATT, cntnt) is None:
log('unexpected content in dump archive {}'.format(fn))
continue
new_fn = '{}.tex'.format(aid)
if PRE_FIX_NATBIB:
cntnt = NATBIB_PATT.sub(r'\\cite{\3}', cntnt)
if PRE_FIX_BIBOPT:
                    cntnt = BIBOPT_PATT.sub(r'\\bibitem', cntnt)
if PRE_FILTER_MATH:
cntnt = remove_math(cntnt)
dest = os.path.join(OUT_DIR, new_fn)
with open(dest, mode='w', encoding='utf-8') as f:
f.write(cntnt)
else:
log('unexpected file {} in dump directory'.format(fn))
return True
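# illustrative use as a library (assuming this file is saved as
# normalize_arxiv_dump.py; paths are placeholders):
#
#   from normalize_arxiv_dump import normalize
#   normalize('/path/to/dump/dir', '/path/to/out/dir', write_logs=False)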
if __name__ == '__main__':
if len(sys.argv) != 3:
        print('usage: python3 normalize_arxiv_dump.py </path/to/dump/dir>'
              ' </path/to/out/dir>')
sys.exit()
IN_DIR = sys.argv[1]
OUT_DIR = sys.argv[2]
ret = normalize(IN_DIR, OUT_DIR)
if not ret:
sys.exit()