Write/update DSS API definition atomically#140
Conversation
Codecov Report
@@ Coverage Diff @@
## master #140 +/- ##
==========================================
+ Coverage 85.75% 86.43% +0.68%
==========================================
Files 31 32 +1
Lines 1172 1202 +30
==========================================
+ Hits 1005 1039 +34
+ Misses 167 163 -4
Continue to review full report at Codecov.
|
| def swagger_spec(self): | ||
| if not self._swagger_spec: | ||
| swagger_url = self.config[self.__class__.__name__].swagger_url | ||
| temp_swagger_filename = os.path.join(self.config.user_config_dir, "temp_swagger.json") |
There was a problem hiding this comment.
I would use with tempfile.NamedTemporaryFile
There was a problem hiding this comment.
Oof you should then use it with the option to use the user config dir as the tempdir. Otherwise the rename is not atomic either (since it could actually be a copy).
5370e69 to
f331f7b
Compare
|
update: |
| """ | ||
| with tempfile.NamedTemporaryFile('wb', dir=dir, delete=False) as tmp: | ||
| tmp.write(content) | ||
| os.rename(tmp.name, filename) |
There was a problem hiding this comment.
I believe that on Windows, it's not kosher to rename a file that's open. Consider creating the tempfile with delete=False, close it, and then renaming it. Then wrap the entire try-finally with something that cleans up the tempfile in case the rename fails.
There was a problem hiding this comment.
Also note from https://docs.python.org/2/library/os.html#os.rename:
On Windows, if dst already exists, OSError will be raised even if it is a file
Here's some possible strategies for defeating that: https://bugs.python.org/issue8828
6f23f59 to
a352bd5
Compare
|
Addressed Windows issues re: atomic writes |
| with tempfile.NamedTemporaryFile('wb', dir=dir, delete=False) as temp: | ||
| temp_filename = temp.name | ||
| temp.write(content) | ||
| while writing and retry_count < max_retries: |
There was a problem hiding this comment.
suggestion:
for _ in range(max_retries):
try:
os.rename(temp_filename, dest_filename)
break
except OSError as ex:
if ex.errno == errno.EEXIST:
# handle Windows: https://docs.python.org/2/library/os.html#os.rename
os.remove(dest_filename)
continue
raise
else:
raise Exception("Could not rename {} to {}".format(temp_filename, dest_filename))
you should be able to get rid of writing and retry_count
There was a problem hiding this comment.
Right. My Python's a little rusty, thanks!
| :param dir: Shared directory of temp and dest file | ||
| :param content: Content to write to file | ||
| """ | ||
| temp_filename = "" |
There was a problem hiding this comment.
Not necessarily, but it clears an IDE warning (and possibly linter?)
| assert "swagger" in res.json() | ||
| with open(swagger_filename, "wb") as fh: | ||
| fh.write(res.content) | ||
| self._atomic_write_to_file(self._get_path_leaf(swagger_filename), |
There was a problem hiding this comment.
os.path.basename doesn't work here?
There was a problem hiding this comment.
Oh, it would - I was covering an edge case that, actually, we don't need to worry about: https://stackoverflow.com/questions/8384737/extract-file-name-from-path-no-matter-what-the-os-path-format
-> Will replace ntpath.basename with os.path.basename
However, _get_path_leaf handles the trailing slash case (e.g. path/to/file/).
There was a problem hiding this comment.
Why would swagger_filename end with a trailing slash?
c05639c to
02be576
Compare
| is_cached = os.path.exists(swagger_filename) | ||
| if (not is_cached) or (is_cached and | ||
| self._get_days_since_last_modified(swagger_filename) >= self._spec_valid_for_days): | ||
| if (not is_cached) or \ |
There was a problem hiding this comment.
stylistically, i think:
if ((not is_cached) or
(is_cached and FSHelper.get_days_since_last_modified(swagger_filename) >= self._spec_valid_for_days)):
is better.
Furthermore, I would say that you can remove the second is_cached as that is implied if you ever reach that clause.
| assert "swagger" in res.json() | ||
| with open(swagger_filename, "wb") as fh: | ||
| fh.write(res.content) | ||
| self._atomic_write_to_file(self._get_path_leaf(swagger_filename), |
There was a problem hiding this comment.
Why would swagger_filename end with a trailing slash?
| temp.write(content) | ||
| FSHelper._atomic_rename(temp_filename, dest_filename) | ||
| finally: | ||
| if temp_filename and os.path.isfile(temp_filename): |
There was a problem hiding this comment.
if NamedTemporaryFile throws as exception, then temp_filename will not be defined. one possible strategy is:
if 'temp_filename' in locals() and os.path.isfile(temp_filename):
another strategy is to define temp_filename before the loop.
| FSHelper._atomic_rename(temp_filename, dest_filename) | ||
| finally: | ||
| if temp_filename and os.path.isfile(temp_filename): | ||
| os.remove(temp_filename) |
There was a problem hiding this comment.
you should wrap os.remove in a try-except EnvironmentError clause.
| os.remove(temp_filename) | ||
|
|
||
| @staticmethod | ||
| def _atomic_rename(src, dest): |
There was a problem hiding this comment.
suggestion:
def _atomic_rename(src, dest, max_retries=5):
| FSHelper._handle_rename_error(e, src, dest) | ||
|
|
||
| @staticmethod | ||
| def _handle_rename_error(err, src, dest): |
There was a problem hiding this comment.
I think you could just inline this method. It's not particularly large and it's not called elsewhere.
There was a problem hiding this comment.
Yea I agree, however, it's pulled out now since when it was inline in _atomic_rename, it violated our linter's Code Complexity limit (7 > 6)
| :param dest: Rename destination filename | ||
| """ | ||
| if err.errno == errno.EEXIST: | ||
| os.remove(dest) |
There was a problem hiding this comment.
os.remove(..) should be wrapped in try-except EnvironmentError
9a209ff to
ef55643
Compare
298950a to
de89aa7
Compare
de89aa7 to
1870bdc
Compare
| if "swagger_filename" in self.config: | ||
| swagger_filename = self.config.swagger_filename | ||
| if not swagger_filename.startswith("/"): | ||
| use_local_config = True |
There was a problem hiding this comment.
shouldn't use_local_config always be set if swagger_filename is in the config?
furthermore, can you not just use that clause to test rather than creating a new variable?
| is_cached = os.path.exists(swagger_filename) | ||
| if (not is_cached) or (is_cached and | ||
| self._get_days_since_last_modified(swagger_filename) >= self._spec_valid_for_days): | ||
| if ((not use_local_config) and ((not os.path.exists(swagger_filename)) or |
There was a problem hiding this comment.
I think this is the wrong set of boolean conditions, if I understand the requirements correctly. If there's a swagger_filename in the config, then we shouldn't ever refresh the config.
| :param shared_dir: Shared directory of temp and dest file | ||
| :param content: Content to write to file | ||
| """ | ||
| temp_filename = "" |
| try: | ||
| os.remove(temp_filename) | ||
| except EnvironmentError: | ||
| print("Failed to clean up temporary file {}".format(temp_filename)) |
There was a problem hiding this comment.
| os.remove(temp_filename) | ||
| except EnvironmentError: | ||
| print("Failed to clean up temporary file {}".format(temp_filename)) | ||
| raise |
There was a problem hiding this comment.
I don't think this is a case where we should fatal. I would just warn and move on.
| try: | ||
| os.remove(dest) | ||
| except EnvironmentError: | ||
| print("Failed to delete {}".format(dest)) |
There was a problem hiding this comment.
| except EnvironmentError: | ||
| print("Failed to delete {}".format(dest)) | ||
| else: | ||
| print("Failed to rename {} to {}".format(src, dest)) |
There was a problem hiding this comment.
raise Exception("Failed to rename {} to {}".format(src, dest), err)
| return (now - last_modified).days | ||
|
|
||
| @staticmethod | ||
| def get_path_leaf(path): |
There was a problem hiding this comment.
this is still unnecessary as we will never pass in a path with a trailing backslash.
| test: lint install | ||
| # https://github.com/HumanCellAtlas/dcp-cli/issues/127 | ||
| coverage run --source=hca -m unittest discover -v -t . -s test/upload | ||
| coverage run --source=hca -m unittest discover -v -t . -s test -p test_dss_*.py |
There was a problem hiding this comment.
can you make this a separate PR and land this ASAP? Seems like a pretty big deal.
1870bdc to
da2d24a
Compare
| # https://github.com/HumanCellAtlas/dcp-cli/issues/127 | ||
| coverage run --source=hca -m unittest discover -v -t . -s test/upload | ||
| coverage run --source=hca -m unittest discover -v -t . -s test -p test_dss_*.py | ||
| coverage run --source=hca -m unittest discover -v -t . -s test -p test_fs_helper.py |
| is_cached = os.path.exists(swagger_filename) | ||
| if (not is_cached) or (is_cached and | ||
| self._get_days_since_last_modified(swagger_filename) >= self._spec_valid_for_days): | ||
| if (("swagger_filename" not in self.config) and ((not os.path.exists(swagger_filename)) or |
There was a problem hiding this comment.
do you want the auto-update to happen if someone specifies swagger_filename in the config?
There was a problem hiding this comment.
No, if someone specifies swagger_filename in the config, we assume that they are using a local copy of the Swagger file. We wouldn't want to overwrite their local copy and only want to update to the latest Swagger file in the user config directory ~/hca/.config.
There was a problem hiding this comment.
I made a mistake reading your code. I think it's correct, but your indentation is confusing. I would do it like this:
if (("swagger_filename" not in self.config) and
((not os.path.exists(swagger_filename)) or
(fs.get_days_since_last_modified(swagger_filename) >= self._spec_valid_for_days))):
da2d24a to
ea138fd
Compare
| is_cached = os.path.exists(swagger_filename) | ||
| if (not is_cached) or (is_cached and | ||
| self._get_days_since_last_modified(swagger_filename) >= self._spec_valid_for_days): | ||
| if (("swagger_filename" not in self.config) and ((not os.path.exists(swagger_filename)) or |
There was a problem hiding this comment.
I made a mistake reading your code. I think it's correct, but your indentation is confusing. I would do it like this:
if (("swagger_filename" not in self.config) and
((not os.path.exists(swagger_filename)) or
(fs.get_days_since_last_modified(swagger_filename) >= self._spec_valid_for_days))):
ea138fd to
47a5835
Compare
As per @ttung's suggestion (GH-139), these changes update the cached version of the DSS API definition atomically in order to avoid a race condition. This is achieved by writing the fetched file to a temp file and renamed (
os.rename) to the desired filename, where the latter operation is atomic.07/25/18 update:
These changes also handle
os.renameWindows specific behavior by deleting the destination file if it exists before renaming. This process is attempted multiple times (5) in order to handle a potential race condition in which the destination file is recreated in between delete and rename operations. These changes also introducehca.util.fs_helper.py, a static library of functions for interacting with the local filesystem.Connected to #79