Add zpool status -s (slow I/Os) and -p (parseable)
This patch adds a new slow I/Os (-s) column to zpool status to show the
number of VDEV slow I/Os. This is the number of I/Os that didn't
complete in zio_slow_io_ms milliseconds. It also adds a new parsable
(-p) flag to display exact values.

    NAME         STATE     READ WRITE CKSUM  SLOW
    testpool     ONLINE       0     0     0     -
      mirror-0   ONLINE       0     0     0     -
        loop0    ONLINE       0     0     0    20
        loop1    ONLINE       0     0     0     0
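
An illustrative invocation for the output above; combining it with the new
-p flag would print the same counters as exact (unrounded) values:

    $ zpool status -sp testpool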

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Matthew Ahrens <mahrens@delphix.com>
Signed-off-by: Tony Hutter <hutter2@llnl.gov>
Closes openzfs#7756
Closes openzfs#6885
tonyhutter authored and Gregor Kopka committed Jan 7, 2019
1 parent cac6053 commit b9df568
Showing 16 changed files with 321 additions and 136 deletions.
53 changes: 45 additions & 8 deletions cmd/zpool/zpool_main.c
@@ -365,8 +365,8 @@ get_usage(zpool_help_t idx)
case HELP_RESILVER:
return (gettext("\tresilver <pool> ...\n"));
case HELP_STATUS:
return (gettext("\tstatus [-c [script1,script2,...]] [-gLPvxD]"
"[-T d|u] [pool] ... \n"
return (gettext("\tstatus [-c [script1,script2,...]] "
"[-gLpPsvxD] [-T d|u] [pool] ... \n"
"\t [interval [count]]\n"));
case HELP_UPGRADE:
return (gettext("\tupgrade\n"
@@ -1669,10 +1669,12 @@ typedef struct status_cbdata {
int cb_namewidth;
boolean_t cb_allpools;
boolean_t cb_verbose;
boolean_t cb_literal;
boolean_t cb_explain;
boolean_t cb_first;
boolean_t cb_dedup_stats;
boolean_t cb_print_status;
boolean_t cb_print_slow_ios;
vdev_cmd_data_list_t *vcdl;
} status_cbdata_t;

@@ -1788,10 +1790,34 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name,
name, state);

if (!isspare) {
zfs_nicenum(vs->vs_read_errors, rbuf, sizeof (rbuf));
zfs_nicenum(vs->vs_write_errors, wbuf, sizeof (wbuf));
zfs_nicenum(vs->vs_checksum_errors, cbuf, sizeof (cbuf));
(void) printf(" %5s %5s %5s", rbuf, wbuf, cbuf);
if (cb->cb_literal) {
printf(" %5llu %5llu %5llu",
(u_longlong_t)vs->vs_read_errors,
(u_longlong_t)vs->vs_write_errors,
(u_longlong_t)vs->vs_checksum_errors);
} else {
zfs_nicenum(vs->vs_read_errors, rbuf, sizeof (rbuf));
zfs_nicenum(vs->vs_write_errors, wbuf, sizeof (wbuf));
zfs_nicenum(vs->vs_checksum_errors, cbuf,
sizeof (cbuf));
printf(" %5s %5s %5s", rbuf, wbuf, cbuf);
}

if (cb->cb_print_slow_ios) {
if (children == 0) {
/* Only leaf vdevs have slow IOs */
zfs_nicenum(vs->vs_slow_ios, rbuf,
sizeof (rbuf));
} else {
snprintf(rbuf, sizeof (rbuf), "-");
}

if (cb->cb_literal)
printf(" %5llu", (u_longlong_t)vs->vs_slow_ios);
else
printf(" %5s", rbuf);
}

}

if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
@@ -7175,6 +7201,9 @@ status_callback(zpool_handle_t *zhp, void *data)
cbp->cb_namewidth, "NAME", "STATE", "READ", "WRITE",
"CKSUM");

if (cbp->cb_print_slow_ios)
(void) printf(" %5s", gettext("SLOW"));

if (cbp->vcdl != NULL)
print_cmd_columns(cbp->vcdl, 0);

@@ -7241,13 +7270,15 @@
}

/*
* zpool status [-c [script1,script2,...]] [-gLPvx] [-T d|u] [pool] ...
* zpool status [-c [script1,script2,...]] [-gLpPsvx] [-T d|u] [pool] ...
* [interval [count]]
*
* -c CMD For each vdev, run command CMD
* -g Display guid for individual vdev name.
* -L Follow links when resolving vdev path name.
* -p Display values in parsable (exact) format.
* -P Display full path for vdev name.
* -s Display slow IOs column.
* -v Display complete error logs
* -x Display only pools with potential problems
* -D Display dedup status (undocumented)
@@ -7266,7 +7297,7 @@ zpool_do_status(int argc, char **argv)
char *cmd = NULL;

/* check options */
while ((c = getopt(argc, argv, "c:gLPvxDT:")) != -1) {
while ((c = getopt(argc, argv, "c:gLpPsvxDT:")) != -1) {
switch (c) {
case 'c':
if (cmd != NULL) {
Expand Down Expand Up @@ -7298,9 +7329,15 @@ zpool_do_status(int argc, char **argv)
case 'L':
cb.cb_name_flags |= VDEV_NAME_FOLLOW_LINKS;
break;
case 'p':
cb.cb_literal = B_TRUE;
break;
case 'P':
cb.cb_name_flags |= VDEV_NAME_PATH;
break;
case 's':
cb.cb_print_slow_ios = B_TRUE;
break;
case 'v':
cb.cb_verbose = B_TRUE;
break;
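
For context, a minimal sketch (not part of this patch) of how a libzfs
consumer reads the new counter back out of a vdev's config nvlist, modeled
on the lookup print_status_config() already performs; the populated
nvlist "nv" is assumed:

    #include <stdio.h>
    #include <libzfs.h>

    /* Sketch: print a vdev's slow-I/O count from its config nvlist. */
    static void
    print_slow_ios(nvlist_t *nv)
    {
            vdev_stat_t *vs;
            uint_t c;

            /* vdev_stat_t is stored as a uint64 array under this key. */
            if (nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
                (uint64_t **)&vs, &c) == 0)
                    (void) printf("slow I/Os: %llu\n",
                        (u_longlong_t)vs->vs_slow_ios);
    }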
1 change: 1 addition & 0 deletions include/sys/fm/fs/zfs.h
@@ -74,6 +74,7 @@ extern "C" {
#define FM_EREPORT_PAYLOAD_ZFS_VDEV_READ_ERRORS "vdev_read_errors"
#define FM_EREPORT_PAYLOAD_ZFS_VDEV_WRITE_ERRORS "vdev_write_errors"
#define FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_ERRORS "vdev_cksum_errors"
#define FM_EREPORT_PAYLOAD_ZFS_VDEV_DELAYS "vdev_delays"
#define FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID "parent_guid"
#define FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE "parent_type"
#define FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH "parent_path"
4 changes: 4 additions & 0 deletions include/sys/fs/zfs.h
@@ -665,6 +665,9 @@ typedef struct zpool_load_policy {
#define ZPOOL_CONFIG_VDEV_ASYNC_AGG_W_HISTO "vdev_async_agg_w_histo"
#define ZPOOL_CONFIG_VDEV_AGG_SCRUB_HISTO "vdev_agg_scrub_histo"

/* Number of slow IOs */
#define ZPOOL_CONFIG_VDEV_SLOW_IOS "vdev_slow_ios"

/* vdev enclosure sysfs path */
#define ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH "vdev_enc_sysfs_path"

@@ -990,6 +993,7 @@ typedef struct vdev_stat {
uint64_t vs_fragmentation; /* device fragmentation */
uint64_t vs_checkpoint_space; /* checkpoint-consumed space */
uint64_t vs_resilver_deferred; /* resilver deferred */
uint64_t vs_slow_ios; /* slow IOs */
} vdev_stat_t;

/*
4 changes: 3 additions & 1 deletion include/sys/spa.h
@@ -1076,9 +1076,11 @@ extern const char *spa_state_to_name(spa_t *spa);
/* error handling */
struct zbookmark_phys;
extern void spa_log_error(spa_t *spa, const zbookmark_phys_t *zb);
extern void zfs_ereport_post(const char *class, spa_t *spa, vdev_t *vd,
extern int zfs_ereport_post(const char *class, spa_t *spa, vdev_t *vd,
const zbookmark_phys_t *zb, zio_t *zio, uint64_t stateoroffset,
uint64_t length);
extern boolean_t zfs_ereport_is_valid(const char *class, spa_t *spa, vdev_t *vd,
zio_t *zio);
extern nvlist_t *zfs_event_create(spa_t *spa, vdev_t *vd, const char *type,
const char *name, nvlist_t *aux);
extern void zfs_post_remove(spa_t *spa, vdev_t *vd);
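
With zfs_ereport_post() now returning an int, callers can tell whether a
delay ereport was actually accepted. A hedged sketch of the call pattern
(the FM_EREPORT_ZFS_DELAY class comes from fm/fs/zfs.h; the zero-on-success
return convention and the accounting comment are assumptions, not code from
this patch):

    /*
     * Sketch only: post a delay ereport and account for it if accepted.
     * Assumes a 0 return means "posted" rather than dropped/rate-limited.
     */
    if (zfs_ereport_is_valid(FM_EREPORT_ZFS_DELAY, spa, vd, zio) &&
        zfs_ereport_post(FM_EREPORT_ZFS_DELAY, spa, vd, &zio->io_bookmark,
        zio, 0, 0) == 0) {
            /* e.g. bump the per-vdev vs_slow_ios counter here */
    }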
7 changes: 1 addition & 6 deletions include/sys/zio.h
@@ -158,11 +158,6 @@ enum zio_encrypt {
(compress) == ZIO_COMPRESS_ON || \
(compress) == ZIO_COMPRESS_OFF)

/*
* Default Linux timeout for a sd device.
*/
#define ZIO_DELAY_MAX (30 * MILLISEC)

#define ZIO_FAILURE_MODE_WAIT 0
#define ZIO_FAILURE_MODE_CONTINUE 1
#define ZIO_FAILURE_MODE_PANIC 2
@@ -664,7 +659,7 @@ extern void zfs_ereport_finish_checksum(zio_cksum_report_t *report,
extern void zfs_ereport_free_checksum(zio_cksum_report_t *report);

/* If we have the good data in hand, this function can be used */
extern void zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd,
extern int zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd,
const zbookmark_phys_t *zb, struct zio *zio, uint64_t offset,
uint64_t length, const abd_t *good_data, const abd_t *bad_data,
struct zio_bad_cksum *info);
12 changes: 6 additions & 6 deletions man/man5/zfs-events.5
@@ -96,8 +96,9 @@ information regarding "hung" I/O detection and configuration.
.ad
.RS 12n
Issued when a completed I/O exceeds the maximum allowed time specified
by the \fBzio_delay_max\fR module option. This can be an indicator of
problems with the underlying storage device.
by the \fBzio_slow_io_ms\fR module option. This can be an indicator of
problems with the underlying storage device. The number of delay events is
rate-limited by the \fBzfs_slow_io_events_per_second\fR module parameter.
.RE

.sp
@@ -697,10 +698,9 @@ full list of all the I/O stages.
\fBzio_delay\fR
.ad
.RS 12n
The time in ticks (HZ) required for the block layer to service the I/O. Unlike
\fBzio_delta\fR this does not include any vdev queuing time and is therefore
solely a measure of the block layer performance. On most modern Linux systems
HZ is defined as 1000 making a tick equivalent to 1 millisecond.
The time elapsed (in nanoseconds) waiting for the block layer to complete the
I/O. Unlike \fBzio_delta\fR this does not include any vdev queuing time and is
therefore solely a measure of the block layer performance.
.RE

.sp
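
Since zio_delay is reported in nanoseconds while the zio_slow_io_ms
threshold is in milliseconds, comparing the two needs a unit conversion.
A minimal sketch, assuming the MSEC2NSEC() macro from sys/time.h (this
helper is illustrative, not the patch's actual call site):

    /* Sketch: an I/O is "slow" once its block-layer service time (ns)
     * exceeds the configured threshold (ms). */
    boolean_t
    zio_is_slow(uint64_t zio_delay, uint64_t slow_io_ms)
    {
            return (zio_delay >= MSEC2NSEC(slow_io_ms));
    }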
21 changes: 11 additions & 10 deletions man/man5/zfs-module-parameters.5
@@ -1086,7 +1086,7 @@ Default value: \fB600,000\fR.
.ad
.RS 12n
Interval in milliseconds after which the deadman is triggered and an
individual IO operation is considered to be "hung". As long as the I/O
individual I/O operation is considered to be "hung". As long as the I/O
remains "hung" the deadman will be invoked every \fBzfs_deadman_checktime_ms\fR
milliseconds until the I/O completes.
.sp
@@ -1141,10 +1141,10 @@ Default value: \fB500,000\fR.
.sp
.ne 2
.na
\fBzfs_delays_per_second\fR (int)
\fBzfs_slow_io_events_per_second\fR (int)
.ad
.RS 12n
Rate limit IO delay events to this many per second.
Rate limit delay zevents (which report slow I/Os) to this many per second.
.sp
Default value: 20
.RE
@@ -1655,7 +1655,7 @@ Default value: \fB50\fR.
.ad
.RS 12n
We currently support block sizes from 512 bytes to 16MB. The benefits of
larger blocks, and thus larger IO, need to be weighed against the cost of
larger blocks, and thus larger I/O, need to be weighed against the cost of
COWing a giant block to modify one byte. Additionally, very large blocks
can have an impact on i/o latency, and also potentially on the memory
allocator. Therefore, we do not allow the recordsize to be set larger than
@@ -2552,12 +2552,13 @@ Default value: \fB0\fR.
.sp
.ne 2
.na
\fBzio_delay_max\fR (int)
\fBzio_slow_io_ms\fR (int)
.ad
.RS 12n
A zevent will be logged if a ZIO operation takes more than N milliseconds to
complete. Note that this is only a logging facility, not a timeout on
operations.
When an I/O operation takes more than \fBzio_slow_io_ms\fR milliseconds to
complete, it is marked as a slow I/O. Each slow I/O causes a delay zevent.
Slow I/O counters can be seen with "zpool status -s".

.sp
Default value: \fB30,000\fR.
.RE
@@ -2568,7 +2569,7 @@ Default value: \fB30,000\fR.
\fBzio_dva_throttle_enabled\fR (int)
.ad
.RS 12n
Throttle block allocations in the ZIO pipeline. This allows for
Throttle block allocations in the I/O pipeline. This allows for
dynamic allocation distribution when devices are imbalanced.
When enabled, the maximum number of pending allocations per top-level vdev
is limited by \fBzfs_vdev_queue_depth_pct\fR.
@@ -2594,7 +2595,7 @@ Default value: \fB0\fR.
.ad
.RS 12n
Percentage of online CPUs (or CPU cores, etc) which will run a worker thread
for IO. These workers are responsible for IO work such as compression and
for I/O. These workers are responsible for I/O work such as compression and
checksum calculations. Fractional number of CPUs will be rounded down.
.sp
The default value of 75 was chosen to avoid using all CPUs which can result in
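
Both tunables are ordinary module parameters, so on Linux they can be read
and changed at runtime through the standard /sys/module paths (the values
below are illustrative):

    # cat /sys/module/zfs/parameters/zio_slow_io_ms
    30000
    # echo 10000 > /sys/module/zfs/parameters/zio_slow_io_ms
    # echo 10 > /sys/module/zfs/parameters/zfs_slow_io_events_per_second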
12 changes: 10 additions & 2 deletions man/man8/zpool.8
@@ -182,7 +182,7 @@
.Nm
.Cm status
.Oo Fl c Ar SCRIPT Oc
.Op Fl gLPvxD
.Op Fl DgLpPsvx
.Op Fl T Sy u Ns | Ns Sy d
.Oo Ar pool Oc Ns ...
.Op Ar interval Op Ar count
@@ -2167,7 +2167,7 @@ and automatically import it.
.Nm
.Cm status
.Op Fl c Op Ar SCRIPT1 Ns Oo , Ns Ar SCRIPT2 Oc Ns ...
.Op Fl gLPvxD
.Op Fl DgLpPsvx
.Op Fl T Sy u Ns | Ns Sy d
.Oo Ar pool Oc Ns ...
.Op Ar interval Op Ar count
@@ -2203,6 +2203,8 @@ Display real paths for vdevs resolving all symbolic links. This can
be used to look up the current block device name regardless of the
.Pa /dev/disk/
path used to open it.
.It Fl p
Display numbers in parsable (exact) values.
.It Fl P
Display full paths for vdevs instead of only the last component of
the path. This can be used in conjunction with the
@@ -2214,6 +2216,12 @@ Display a histogram of deduplication statistics, showing the allocated
and referenced
.Pq logically referenced in the pool
block counts and sizes by reference count.
.It Fl s
Display the number of leaf VDEV slow IOs. This is the number of IOs that
didn't complete in \fBzio_slow_io_ms\fR milliseconds (default 30 seconds).
This does not necessarily mean the IOs failed to complete, only that they
took an unreasonably long time. This may indicate a problem with the
underlying storage.
.It Fl T Sy u Ns | Ns Sy d
Display a time stamp.
Specify
23 changes: 13 additions & 10 deletions module/zfs/vdev.c
@@ -77,14 +77,14 @@ int vdev_validate_skip = B_FALSE;
int vdev_dtl_sm_blksz = (1 << 12);

/*
* Rate limit delay events to this many IO delays per second.
* Rate limit slow IO (delay) events to this many per second.
*/
unsigned int zfs_delays_per_second = 20;
unsigned int zfs_slow_io_events_per_second = 20;

/*
* Rate limit checksum events after this many checksum errors per second.
*/
unsigned int zfs_checksums_per_second = 20;
unsigned int zfs_checksum_events_per_second = 20;

/*
* Ignore errors during scrub/resilver. Allows to work around resilver
@@ -507,8 +507,10 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
* and checksum events so that we don't overwhelm ZED with thousands
* of events when a disk is acting up.
*/
zfs_ratelimit_init(&vd->vdev_delay_rl, &zfs_delays_per_second, 1);
zfs_ratelimit_init(&vd->vdev_checksum_rl, &zfs_checksums_per_second, 1);
zfs_ratelimit_init(&vd->vdev_delay_rl, &zfs_slow_io_events_per_second,
1);
zfs_ratelimit_init(&vd->vdev_checksum_rl,
&zfs_checksum_events_per_second, 1);

list_link_init(&vd->vdev_config_dirty_node);
list_link_init(&vd->vdev_state_dirty_node);
@@ -3591,6 +3593,7 @@ vdev_clear(spa_t *spa, vdev_t *vd)
vd->vdev_stat.vs_read_errors = 0;
vd->vdev_stat.vs_write_errors = 0;
vd->vdev_stat.vs_checksum_errors = 0;
vd->vdev_stat.vs_slow_ios = 0;

for (int c = 0; c < vd->vdev_children; c++)
vdev_clear(spa, vd->vdev_child[c]);
@@ -4630,12 +4633,12 @@ module_param(vdev_ms_count_limit, int, 0644);
MODULE_PARM_DESC(vdev_ms_count_limit,
"Practical upper limit of total metaslabs per top-level vdev");

module_param(zfs_delays_per_second, uint, 0644);
MODULE_PARM_DESC(zfs_delays_per_second, "Rate limit delay events to this many "
"IO delays per second");
module_param(zfs_slow_io_events_per_second, uint, 0644);
MODULE_PARM_DESC(zfs_slow_io_events_per_second,
"Rate limit slow IO (delay) events to this many per second");

module_param(zfs_checksums_per_second, uint, 0644);
MODULE_PARM_DESC(zfs_checksums_per_second, "Rate limit checksum events "
module_param(zfs_checksum_events_per_second, uint, 0644);
MODULE_PARM_DESC(zfs_checksum_events_per_second, "Rate limit checksum events "
"to this many checksum errors per second (do not set below zed"
"threshold).");

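
The limiters initialized above gate how often the corresponding zevents
fire. A sketch of how the delay limiter would be consulted; the convention
that zfs_ratelimit() returns nonzero when the event may proceed is an
assumption about the existing ratelimit helper, and the call site is
illustrative:

    /* Sketch: post a delay zevent only when the limiter allows it. */
    if (zfs_ratelimit(&vd->vdev_delay_rl) != 0) {
            (void) zfs_ereport_post(FM_EREPORT_ZFS_DELAY, spa, vd,
                &zio->io_bookmark, zio, 0, 0);
    }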
3 changes: 3 additions & 0 deletions module/zfs/vdev_label.c
@@ -347,6 +347,9 @@ vdev_config_generate_stats(vdev_t *vd, nvlist_t *nv)
vsx->vsx_agg_histo[ZIO_PRIORITY_SCRUB],
ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_SCRUB]));

/* Slow IOs */
fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SLOW_IOS, vs->vs_slow_ios);

/* Add extended stats nvlist to main nvlist */
fnvlist_add_nvlist(nv, ZPOOL_CONFIG_VDEV_STATS_EX, nvx);

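
Reading the counter back mirrors the two nvlist levels used above; a
minimal sketch, assuming a populated vdev config nvlist "nv":

    #include <stdio.h>
    #include <libzfs.h>

    /* Sketch: read the slow-I/O count from the extended stats nvlist. */
    static void
    print_slow_ios_ex(nvlist_t *nv)
    {
            nvlist_t *nvx;
            uint64_t slow_ios;

            if (nvlist_lookup_nvlist(nv, ZPOOL_CONFIG_VDEV_STATS_EX,
                &nvx) == 0 &&
                nvlist_lookup_uint64(nvx, ZPOOL_CONFIG_VDEV_SLOW_IOS,
                &slow_ios) == 0)
                    (void) printf("slow I/Os: %llu\n",
                        (u_longlong_t)slow_ios);
    }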