Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

pygmt.x2sys_cross: Refactor to use virtualfiles for output tables [BREAKING CHANGE: Dummy times in 3rd and 4th columns now have np.timedelta64 type] #3182

Merged
merged 43 commits into from
Jun 9, 2024
Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
Show all changes
43 commits
Select commit Hold shift + click to select a range
95fab98
pygmt.x2sys_cross: Add 'output_type' parameter for output in pandas/n…
seisman Apr 20, 2024
ce926a0
Move session-unrelated code block outside the session block
seisman Apr 20, 2024
86278cb
Refactor if-else using match statements
seisman Apr 20, 2024
58c6ea4
Fix static typing issue
seisman Apr 20, 2024
d6eeade
Fix warnings
seisman Apr 20, 2024
9d12ae1
Convert dummpy times to timedelta
seisman Apr 22, 2024
28eb1df
Let validate_output_table_type specify the supported output types
seisman Apr 22, 2024
5e926e8
Fix
seisman Apr 22, 2024
c1c756d
Merge branch 'main' into validator/valid-types
seisman Apr 25, 2024
3a3df0a
Update docstrings
seisman May 6, 2024
3aea9a6
Merge branch 'main' into validator/valid-types
seisman May 6, 2024
d869a32
Merge branch 'main' into validator/valid-types
seisman May 6, 2024
b46d21d
Remove support of numpy output type
seisman May 7, 2024
81a1ec0
Merge branch 'validator/valid-types' into refactor/x2sys_cross
seisman May 7, 2024
07fe53e
Merge branch 'main' into refactor/x2sys_cross
seisman May 7, 2024
5f04506
Remove the output_type parameter
seisman May 7, 2024
84765e4
Remove the output_type parameter from tests
seisman May 7, 2024
b9b4098
Improve tests
seisman May 7, 2024
bf59e61
Merge branch 'main' into refactor/x2sys_cross
seisman May 25, 2024
9bc063a
Deal with TIME_EPOCH
seisman May 25, 2024
b0b5099
Support TIME_UNIT for 'd' and 's'
seisman May 26, 2024
1396ee8
Fix a typo
seisman May 26, 2024
6f2671a
Update
seisman May 26, 2024
a9a4179
Fix
seisman May 26, 2024
04a1986
Typo
seisman May 26, 2024
b27212b
Fix
seisman May 27, 2024
aa3e9af
Merge branch 'main' into refactor/x2sys_cross
seisman May 27, 2024
97312fb
Improve the handling of TIME_UNIT
seisman May 27, 2024
6450ba0
Call the apply function separately
seisman May 27, 2024
29a7f9e
Merge branch 'main' into refactor/x2sys_cross
seisman May 27, 2024
d5294a4
Further simplify the logic
seisman May 28, 2024
55f7c30
Fix
seisman May 28, 2024
a44390d
Further simplification
seisman May 28, 2024
b81e292
Improve tests
seisman May 28, 2024
870d9c7
Update pygmt/src/x2sys_cross.py
seisman May 28, 2024
db94b91
Fix x2sys_cross tests on macOS M
seisman May 28, 2024
de17d5e
Update pygmt/src/x2sys_cross.py
seisman May 28, 2024
ebce56e
Update pygmt/src/x2sys_cross.py
seisman May 28, 2024
71af717
Merge branch 'main' into refactor/x2sys_cross
seisman May 29, 2024
cf2cfc7
Merge branch 'main' into refactor/x2sys_cross
seisman Jun 2, 2024
3a62fc1
Merge branch 'main' into refactor/x2sys_cross
seisman Jun 5, 2024
9fd35ce
Apply suggestions from code review
seisman Jun 9, 2024
2b3474b
Update pygmt/tests/test_x2sys_cross.py
seisman Jun 9, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
95 changes: 41 additions & 54 deletions pygmt/src/x2sys_cross.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,12 @@
import contextlib
import os
from pathlib import Path
from typing import Any

import pandas as pd
from packaging.version import Version
from pygmt.clib import Session
from pygmt.exceptions import GMTInvalidInput
from pygmt.helpers import (
GMTTempFile,
build_arg_list,
data_kind,
fmt_docstring,
Expand Down Expand Up @@ -71,7 +70,11 @@ def tempfile_from_dftrack(track, suffix):
Z="trackvalues",
)
@kwargs_to_strings(R="sequence")
def x2sys_cross(tracks=None, outfile=None, **kwargs):
def x2sys_cross(
tracks=None,
outfile: str | None = None,
**kwargs,
):
r"""
Calculate crossovers between track data files.

Expand Down Expand Up @@ -102,11 +105,7 @@ def x2sys_cross(tracks=None, outfile=None, **kwargs):
set it will default to $GMT_SHAREDIR/x2sys]. (**Note**: MGD77 files
will also be looked for via $MGD77_HOME/mgd77_paths.txt and .gmt
files will be searched for via $GMT_SHAREDIR/mgg/gmtfile_paths).

outfile : str
Optional. The file name for the output ASCII txt file to store the
table in.

{outfile}
seisman marked this conversation as resolved.
Show resolved Hide resolved
tag : str
Specify the x2sys TAG which identifies the attributes of this data
type.
Expand Down Expand Up @@ -183,68 +182,56 @@ def x2sys_cross(tracks=None, outfile=None, **kwargs):

Returns
-------
crossover_errors : :class:`pandas.DataFrame` or None
Table containing crossover error information.
Return type depends on whether the ``outfile`` parameter is set:

- :class:`pandas.DataFrame` with (x, y, ..., etc) if ``outfile`` is not
set
- None if ``outfile`` is set (track output will be stored in the set in
``outfile``)
crossover_errors
Table containing crossover error information. A :class:`pandas.DataFrame` object
is returned if ``outfile`` is not set, otherwise ``None`` is returned and output
will be stored in file set by ``outfile``.
"""
with Session() as lib:
file_contexts = []
for track in tracks:
kind = data_kind(track)
if kind == "file":
# Determine output type based on 'outfile' parameter
output_type = "pandas" if outfile is None else "file"

file_contexts: list[contextlib.AbstractContextManager[Any]] = []
for track in tracks:
match data_kind(track):
case "file":
file_contexts.append(contextlib.nullcontext(track))
elif kind == "matrix":
case "matrix":
# find suffix (-E) of trackfiles used (e.g. xyz, csv, etc) from
# $X2SYS_HOME/TAGNAME/TAGNAME.tag file
lastline = (
Path(os.environ["X2SYS_HOME"], kwargs["T"], f"{kwargs['T']}.tag")
.read_text(encoding="utf8")
.strip()
.split("\n")[-1]
) # e.g. "-Dxyz -Etsv -I1/1"
tagfile = Path(
os.environ["X2SYS_HOME"], kwargs["T"], f"{kwargs['T']}.tag"
)
# Last line is like "-Dxyz -Etsv -I1/1"
lastline = tagfile.read_text().splitlines()[-1]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is encoding="utf8" not needed anymore?

Suggested change
lastline = tagfile.read_text().splitlines()[-1]
lastline = tagfile.read_text(encoding="utf8").splitlines()[-1]

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added back.

for item in sorted(lastline.split()): # sort list alphabetically
if item.startswith(("-E", "-D")): # prefer -Etsv over -Dxyz
suffix = item[2:] # e.g. tsv (1st choice) or xyz (2nd choice)

# Save pandas.DataFrame track data to temporary file
file_contexts.append(tempfile_from_dftrack(track=track, suffix=suffix))
else:
case _:
raise GMTInvalidInput(f"Unrecognized data type: {type(track)}")

with GMTTempFile(suffix=".txt") as tmpfile:
with Session() as lib:
with lib.virtualfile_out(kind="dataset", fname=outfile) as vouttbl:
with contextlib.ExitStack() as stack:
fnames = [stack.enter_context(c) for c in file_contexts]
if outfile is None:
outfile = tmpfile.name
lib.call_module(
module="x2sys_cross",
args=build_arg_list(kwargs, infile=fnames, outfile=outfile),
)

# Read temporary csv output to a pandas table
if outfile == tmpfile.name: # if outfile isn't set, return pd.DataFrame
# Read the tab-separated ASCII table
date_format_kwarg = (
{"date_format": "ISO8601"}
if Version(pd.__version__) >= Version("2.0.0")
else {}
args=build_arg_list(kwargs, infile=fnames, outfile=vouttbl),
)
table = pd.read_csv(
tmpfile.name,
sep="\t",
header=2, # Column names are on 2nd row
comment=">", # Skip the 3rd row with a ">"
parse_dates=[2, 3], # Datetimes on 3rd and 4th column
**date_format_kwarg, # Parse dates in ISO8601 format on pandas>=2
result = lib.virtualfile_to_dataset(
vfname=vouttbl, output_type=output_type, header=2
)
Comment on lines -241 to 225
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note that x2sys_cross can output multi-segment files (each segment is separated by IO_SEGMENT_MARKER which is > by default, see https://docs.generic-mapping-tools.org/6.5/reference/file-formats.html#optional-segment-header-records). If I'm not mistaken, the current virtualfile_to_dataset method does not implement multi-segment file handling yet? To be fair though, the current implementation in x2sys_cross simply merges all segments into one, since we skip rows starting with >, but we need to check that virtualfile_to_dataset will return all segments in a multi-segment file instead of just the first one.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If I'm not mistaken, the current virtualfile_to_dataset method does not implement multi-segment file handling yet? To be fair though, the current implementation in x2sys_cross simply merges all segments into one, since we skip rows starting with >, but we need to check that virtualfile_to_dataset will return all segments in a multi-segment file instead of just the first one.

Yes. The main problem is that, as far as I know, there is no equivalent way to represent a multi-segment file in pandas. The multi-segment support was also mentioned in #2729 (comment).

If we can have a general way to represent multi-segment in pandas, then it should be straightforward to output multi-segments from _GMT_DATASET to the desired data structure.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To be fair though, the current implementation in x2sys_cross simply merges all segments into one, since we skip rows starting with >, but we need to check that virtualfile_to_dataset will return all segments in a multi-segment file instead of just the first one.

It's already tested in the _GMT_DATASET docstrings

>>> with GMTTempFile(suffix=".txt") as tmpfile:
.

For x2sys_cross, we also test the shape of the output pd.DataFrame.

# Remove the "# " from "# x" in the first column
table = table.rename(columns={table.columns[0]: table.columns[0][2:]})
elif outfile != tmpfile.name: # if outfile is set, output in outfile only
table = None

return table
# Convert 3rd and 4th columns to datetime or timedelta.
# These two columns have names "t_1"/"t_2" or "i_1"/"i_2".
# "t_1"/"t_2" means they are absolute datetimes.
# "i_1"/"i_2" means they are dummy times relative to unix epoch.
if output_type == "pandas":
seisman marked this conversation as resolved.
Show resolved Hide resolved
t_or_i = result.columns[2][0]
to_func = {"t": pd.to_datetime, "i": pd.to_timedelta}[t_or_i]
result[result.columns[2:4]] = result[result.columns[2:4]].apply(
to_func, unit="s"
)
return result
14 changes: 9 additions & 5 deletions pygmt/tests/test_x2sys_cross.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,11 @@ def test_x2sys_cross_input_file_output_file():
x2sys_init(tag=tag, fmtfile="xyz", force=True)
outfile = tmpdir_p / "tmp_coe.txt"
output = x2sys_cross(
tracks=["@tut_ship.xyz"], tag=tag, coe="i", outfile=outfile
tracks=["@tut_ship.xyz"],
tag=tag,
coe="i",
outfile=outfile,
output_type="file",
)

assert output is None # check that output is None since outfile is set
Expand Down Expand Up @@ -98,8 +102,8 @@ def test_x2sys_cross_input_dataframe_output_dataframe(tracks):
columns = list(output.columns)
assert columns[:6] == ["x", "y", "i_1", "i_2", "dist_1", "dist_2"]
assert columns[6:] == ["head_1", "head_2", "vel_1", "vel_2", "z_X", "z_M"]
assert output.dtypes["i_1"].type == np.object_
assert output.dtypes["i_2"].type == np.object_
assert output.dtypes["i_1"].type == np.timedelta64
assert output.dtypes["i_2"].type == np.timedelta64


@pytest.mark.usefixtures("mock_x2sys_home")
Expand Down Expand Up @@ -159,8 +163,8 @@ def test_x2sys_cross_input_dataframe_with_nan(tracks):
columns = list(output.columns)
assert columns[:6] == ["x", "y", "i_1", "i_2", "dist_1", "dist_2"]
assert columns[6:] == ["head_1", "head_2", "vel_1", "vel_2", "z_X", "z_M"]
assert output.dtypes["i_1"].type == np.object_
assert output.dtypes["i_2"].type == np.object_
assert output.dtypes["i_1"].type == np.timedelta64
assert output.dtypes["i_2"].type == np.timedelta64


@pytest.mark.usefixtures("mock_x2sys_home")
Expand Down