Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cron/anomalies-cron.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
set -o pipefail

cd "$(dirname "$0")/.."
python -u -m legi.download ./tarballs
python -u -m legi.download -r ./tarballs
echo "=> Starting tar2sqlite..."
python -u -m legi.tar2sqlite legi.raw.sqlite tarballs --anomalies --anomalies-dir=anomalies --raw | tee -a legi.raw.log
echo "=> Uploading anomaly logs..."
Expand Down
144 changes: 109 additions & 35 deletions legi/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,64 +5,138 @@
import argparse
import ftplib
import os
import sys
from time import sleep
import traceback
import urllib.parse

import lxml.html
import requests


DILA_FTP_HOST = 'echanges.dila.gouv.fr'
DILA_FTP_PORT = 21
DILA_HTTP_URL = 'https://echanges.dila.gouv.fr/OPENDATA'
DILA_LEGI_DIR = '/LEGI'


def download_legi(dst_dir):
def log(*args, **kw):
kw.setdefault('file', sys.stderr)
print(*args, **kw)


def download_legi(dst_dir, retry_hours=0):
if not os.path.exists(dst_dir):
os.mkdir(dst_dir)
local_files = {filename: {} for filename in os.listdir(dst_dir)}
sleep_hours = 0
while True:
try:
try:
download_legi_via_ftp(dst_dir)
except Exception:
download_legi_via_http(dst_dir)
break
except Exception:
# Retry in an hour, unless we've reached our time limit
sleep_hours += 1
if sleep_hours > retry_hours:
raise
traceback.print_exc()
log("Waiting an hour before retrying...")
try:
sleep(3600)
except KeyboardInterrupt:
sys.exit(1)


def download_legi_via_ftp(dst_dir):
local_files = set(os.listdir(dst_dir))
ftph = ftplib.FTP()
ftph.connect(DILA_FTP_HOST, DILA_FTP_PORT)
ftph.login()
ftph.cwd(DILA_LEGI_DIR)
remote_files = [filename for filename in ftph.nlst() if '.tar.' in filename]
common_files = [f for f in remote_files if f in local_files]
missing_files = [f for f in remote_files if f not in local_files]
remote_files = {filename: {} for filename in remote_files}
for filename in common_files:
local_files[filename]['size'] = os.path.getsize(
os.path.join(dst_dir, filename)
)
ftph.voidcmd('TYPE I')
for filename in remote_files:
file_size = ftph.size(filename)
remote_files[filename]['size'] = int(file_size)
invalid_files = []
for filename in common_files:
if local_files[filename]['size'] < remote_files[filename]['size']:
invalid_files.append(filename)
print(
'{} remote files, {} common files ({} invalid), {} missing files'
.format(
len(remote_files),
len(common_files), len(invalid_files),
len(missing_files)
)
)
for filename in invalid_files:
filepath = os.path.join(dst_dir, filename)
with open(filepath, mode='a+b') as fh:
print('Continuing the download of the file {}'.format(filename))
ftph.retrbinary(
'RETR {}'.format(filename),
fh.write,
rest=local_files[filename]['size']
)
log('{} remote files, {} common files, {} missing files'.format(
len(remote_files), len(common_files), len(missing_files),
))
for filename in missing_files:
filepath = os.path.join(dst_dir, filename)
with open(filepath, mode='wb') as fh:
print('Downloading the file {}'.format(filename))
ftph.retrbinary('RETR {}'.format(filename), fh.write, rest=0)
with open(filepath + '.part', mode='a+b') as fh:
offset = fh.tell()
if offset:
log('Continuing the download of the file ' + filename)
else:
log('Downloading the file ' + filename)
ftph.retrbinary('RETR ' + filename, fh.write, rest=offset)
os.rename(filepath + '.part', filepath)
ftph.quit()


def download_legi_via_http(dst_dir):
local_files = set(os.listdir(dst_dir))
log("Downloading the index page...")
sess = requests.Session()
base_url = DILA_HTTP_URL + DILA_LEGI_DIR + '/'
r = sess.get(base_url)
assert r.status_code == 200, r
remote_files, common_files, missing_files = set(), [], []
html = lxml.html.document_fromstring(r.text)
for url in html.xpath('//a/@href'):
url = urllib.parse.urljoin(base_url, url)
if not url.startswith(base_url):
continue
url = urllib.parse.urlsplit(url)
filename = url.path.rsplit('/', 1)[-1]
if '.tar.' not in filename:
continue
if filename in remote_files:
continue
remote_files.add(filename)
if filename in local_files:
common_files.append(filename)
else:
missing_files.append(filename)
log('{} remote files, {} common files, {} missing files'.format(
len(remote_files), len(common_files), len(missing_files),
))
for filename in missing_files:
filepath = os.path.join(dst_dir, filename)
url = base_url + filename
with open(filepath + '.part', mode='a+b') as fh:
offset = fh.tell()
if offset:
log('Continuing the download of the file ' + filename)
headers = {'Range': f'bytes={offset}-'}
else:
log('Downloading the file ' + filename)
headers = None
r = sess.get(url, headers=headers, stream=True)
if offset:
if r.status_code == 200:
# It looks like the server is sending the whole file instead
# of only the requested range.
fh.seek(0)
elif r.status_code == 416:
# It looks like we had already finished downloading this file.
os.rename(filepath + '.part', filepath)
continue
else:
assert r.status_code == 206, r
else:
assert r.status_code == 200, r
for chunk in r.iter_content(chunk_size=None):
fh.write(chunk)
os.rename(filepath + '.part', filepath)


if __name__ == '__main__':
p = argparse.ArgumentParser()
p.add_argument('directory')
p.add_argument('-r', '--retry', action='store_true', default=False,
help="if the download fails, retry every hour for up to 6 hours")
args = p.parse_args()
download_legi(args.directory)
retry_hours = 6 if args.retry else 0
download_legi(args.directory, retry_hours=retry_hours)
5 changes: 4 additions & 1 deletion legi/spelling.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
import re

from appdirs import site_data_dir
import hunspell

from .roman import ROMAN_PATTERN_SIMPLE

Expand All @@ -27,6 +26,10 @@ class Spellchecker:

def __init__(self, lang, filters=(), intra_word_chars=INTRA_WORD_CHARS):
self.lang = lang
try:
import hunspell
except ImportError:
raise SpellcheckingIsUnavailable("the hunspell module is missing")
paths = self._find_hunspell_files()
if not paths:
raise SpellcheckingIsUnavailable("the hunspell files for lang %r are missing" % lang)
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,5 @@ appdirs
hunspell
libarchive-c
lxml
requests
tqdm