From 863728b45c8709512aed83800cb0a279aacaac08 Mon Sep 17 00:00:00 2001
From: Mariusz Zaborski
Date: Fri, 21 Apr 2023 09:28:23 +0000
Subject: [PATCH] zfs: support force exporting pools

This is primarily of use when a pool has lost its disk and the user
doesn't care about any pending (or otherwise) transactions.

Implement various control methods to make this feasible:
- txg_wait can now take a NOSUSPEND flag, in which case the caller will
  be alerted if their txg can't be committed.  This is primarily of
  interest for callers that would normally pass TXG_WAIT, but don't
  want to wait if the pool becomes suspended, which allows unwinding in
  some cases, specifically when one is attempting a non-forced export.
  Without this, the non-forced export would preclude a forced export by
  virtue of holding the namespace lock indefinitely.
- txg_wait also returns failure for TXG_WAIT users if a pool is
  actually being force exported.  Adjust most callers to tolerate this.
- spa_config_enter_flags now takes a NOSUSPEND flag to the same effect.
- DMU objset initiator, which may be set on an objset being forcibly
  exported / unmounted.
- SPA export initiator may be set on a pool being forcibly exported.
- DMU send/recv now use an interruption mechanism which relies on the
  SPA export initiator being able to enumerate datasets and close any
  send/recv streams, causing their EINTR paths to be invoked.
- ZIO now has a cancel entry point, which tells all suspended zios to
  fail, and which suppresses the failures for non-CANFAIL users.
- metaslab, etc. cleanup, which consists of simply throwing away any
  changes that were not able to be synced out.
- Linux specific: introduce a new tunable,
  zfs_forced_export_unmount_enabled, which allows the filesystem to
  remain in a modified 'unmounted' state upon exiting zpl_umount_begin,
  to achieve parity with FreeBSD and illumos, which have VFS-level
  support for yanking filesystems out from under users.  However, this
  only helps when the user is actively performing I/O, rather than
  merely sitting on the filesystem.  In particular, this allows test #3
  below to pass on Linux.
- Add basic logic to zpool to indicate a force-exporting pool, instead
  of crashing due to lack of config, etc.

Add tests which cover the basic use cases:
- Force export while a send is in progress
- Force export while a recv is in progress
- Force export while POSIX I/O is in progress

This change modifies the libzfs ABI:
- New ZPOOL_STATUS_FORCE_EXPORTING zpool_status_t enum value.
- New field libzfs_force_export for libzfs_handle.

Co-Authored-by: Will Andrews
Co-Authored-by: Allan Jude
Sponsored-by: Klara, Inc.
Sponsored-by: Catalogics, Inc.
Sponsored-by: Wasabi Technology, Inc.
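Reviewer's note: the sketch below is not part of the patch (the helper name
is made up for illustration, and it assumes the usual ZFS kernel headers
such as sys/txg.h and sys/dsl_pool.h); it only shows how a former
TXG_WAIT-style caller is expected to use the txg_wait_synced_tx() /
TXG_WAIT_F_NOSUSPEND interface introduced above to unwind instead of
blocking when the pool suspends or is force exported.

static int
example_sync_or_unwind(dsl_pool_t *dp, uint64_t txg)
{
	int error;

	/*
	 * Fail on suspension or force export instead of blocking
	 * forever, so the caller can drop its locks and unwind.
	 */
	error = txg_wait_synced_tx(dp, txg, NULL, TXG_WAIT_F_NOSUSPEND);
	if (error != 0)
		return (error);

	return (0);
}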
Closes #3461 Signed-off-by: Will Andrews Signed-off-by: Allan Jude Signed-off-by: Mariusz Zaborski --- cmd/zpool/zpool_main.c | 11 +- include/libzfs.h | 9 +- include/os/freebsd/spl/sys/thread.h | 3 + include/os/freebsd/zfs/sys/zfs_znode_impl.h | 2 + include/os/linux/spl/sys/thread.h | 2 + include/os/linux/zfs/sys/zfs_vfsops_os.h | 3 +- include/os/linux/zfs/sys/zfs_znode_impl.h | 27 +- include/sys/arc.h | 1 + include/sys/dmu.h | 1 + include/sys/dmu_impl.h | 1 + include/sys/dmu_objset.h | 5 + include/sys/dmu_recv.h | 4 + include/sys/dmu_send.h | 1 + include/sys/dsl_dataset.h | 7 +- include/sys/dsl_scan.h | 2 +- include/sys/fs/zfs.h | 2 + include/sys/metaslab.h | 1 + include/sys/spa.h | 24 +- include/sys/spa_impl.h | 2 + include/sys/txg.h | 37 ++- include/sys/zfs_context.h | 1 + include/sys/zfs_ioctl_impl.h | 4 + include/sys/zfs_refcount.h | 2 +- include/sys/zfs_znode.h | 14 + include/sys/zio.h | 3 +- lib/libzfs/libzfs.abi | 8 +- lib/libzfs/libzfs_dataset.c | 35 +- lib/libzfs/libzfs_impl.h | 1 + lib/libzfs/libzfs_mount.c | 25 +- lib/libzfs/libzfs_pool.c | 8 +- lib/libzfs/libzfs_status.c | 3 + lib/libzfs/os/freebsd/libzfs_zmount.c | 12 + lib/libzfs/os/linux/libzfs_mount_os.c | 20 ++ man/man4/zfs.4 | 12 + man/man8/zpool-export.8 | 12 +- module/os/freebsd/spl/spl_misc.c | 10 + module/os/linux/spl/spl-thread.c | 12 + module/os/linux/zfs/zfs_dir.c | 1 - module/os/linux/zfs/zfs_ioctl_os.c | 19 ++ module/os/linux/zfs/zfs_vfsops.c | 25 +- module/os/linux/zfs/zfs_vnops_os.c | 4 +- module/os/linux/zfs/zpl_super.c | 36 +- module/zfs/arc.c | 39 ++- module/zfs/dbuf.c | 25 +- module/zfs/dmu.c | 24 +- module/zfs/dmu_objset.c | 87 ++++- module/zfs/dmu_recv.c | 176 ++++++++-- module/zfs/dmu_redact.c | 9 +- module/zfs/dmu_send.c | 37 ++- module/zfs/dmu_tx.c | 46 ++- module/zfs/dsl_dataset.c | 150 +++++++-- module/zfs/dsl_pool.c | 4 +- module/zfs/dsl_scan.c | 23 +- module/zfs/dsl_synctask.c | 7 +- module/zfs/metaslab.c | 56 +++- module/zfs/spa.c | 309 +++++++++++++++--- module/zfs/spa_checkpoint.c | 11 +- module/zfs/spa_config.c | 141 +++++--- module/zfs/spa_errlog.c | 9 +- module/zfs/spa_history.c | 7 +- module/zfs/spa_log_spacemap.c | 8 +- module/zfs/spa_misc.c | 156 ++++++--- module/zfs/space_map.c | 2 +- module/zfs/txg.c | 153 +++++++-- module/zfs/vdev.c | 33 ++ module/zfs/vdev_indirect.c | 18 +- module/zfs/vdev_initialize.c | 8 + module/zfs/vdev_label.c | 2 +- module/zfs/vdev_rebuild.c | 26 +- module/zfs/vdev_removal.c | 11 +- module/zfs/vdev_trim.c | 19 +- module/zfs/zap.c | 9 +- module/zfs/zfs_ioctl.c | 14 +- module/zfs/zil.c | 60 +++- module/zfs/zio.c | 125 +++++-- tests/runfiles/common.run | 3 +- tests/zfs-tests/include/commands.cfg | 2 + tests/zfs-tests/include/tunables.cfg | 1 + tests/zfs-tests/tests/Makefile.am | 3 + .../zpool_events/zpool_events_follow.ksh | 8 +- .../zpool_expand/zpool_expand_005_pos.ksh | 1 + .../cli_root/zpool_export/zpool_export.kshlib | 6 + .../zpool_export/zpool_export_005_pos.ksh | 93 ++++++ .../zpool_export/zpool_export_006_pos.ksh | 112 +++++++ .../zpool_export/zpool_export_007_pos.ksh | 107 ++++++ .../functional/fault/auto_online_001_pos.ksh | 2 +- tests/zfs-tests/tests/functional/mmp/mmp.cfg | 1 + .../functional/mmp/mmp_inactive_import.ksh | 12 +- .../functional/mmp/mmp_reset_interval.ksh | 6 +- .../pool_checkpoint/checkpoint_lun_expsz.ksh | 1 + 90 files changed, 2155 insertions(+), 419 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export_005_pos.ksh create mode 100755 
tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export_006_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export_007_pos.ksh diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 4965cba52692..48d2191887be 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -363,7 +363,7 @@ get_usage(zpool_help_t idx) case HELP_DETACH: return (gettext("\tdetach \n")); case HELP_EXPORT: - return (gettext("\texport [-af] ...\n")); + return (gettext("\texport [-afF] ...\n")); case HELP_HISTORY: return (gettext("\thistory [-il] [] ...\n")); case HELP_IMPORT: @@ -1877,7 +1877,7 @@ zpool_do_destroy(int argc, char **argv) return (1); } - if (zpool_disable_datasets(zhp, force) != 0) { + if (zpool_disable_datasets(zhp, force, FALSE) != 0) { (void) fprintf(stderr, gettext("could not destroy '%s': " "could not unmount datasets\n"), zpool_get_name(zhp)); zpool_close(zhp); @@ -1907,7 +1907,7 @@ zpool_export_one(zpool_handle_t *zhp, void *data) { export_cbdata_t *cb = data; - if (zpool_disable_datasets(zhp, cb->force) != 0) + if (zpool_disable_datasets(zhp, cb->force, cb->hardforce) != 0) return (1); /* The history must be logged as part of the export */ @@ -1928,10 +1928,13 @@ zpool_export_one(zpool_handle_t *zhp, void *data) * * -a Export all pools * -f Forcefully unmount datasets + * -F Forcefully export, dropping all outstanding dirty data * * Export the given pools. By default, the command will attempt to cleanly * unmount any active datasets within the pool. If the '-f' flag is specified, - * then the datasets will be forcefully unmounted. + * then the datasets will be forcefully unmounted. If the '-F' flag is + * specified, the pool's dirty data, if any, will simply be dropped after a + * best-effort attempt to forcibly stop all activity. */ int zpool_do_export(int argc, char **argv) diff --git a/include/libzfs.h b/include/libzfs.h index 7ec9768d8e93..a68f4ef65ce2 100644 --- a/include/libzfs.h +++ b/include/libzfs.h @@ -419,6 +419,7 @@ typedef enum { ZPOOL_STATUS_NON_NATIVE_ASHIFT, /* (e.g. 512e dev with ashift of 9) */ ZPOOL_STATUS_COMPATIBILITY_ERR, /* bad 'compatibility' property */ ZPOOL_STATUS_INCOMPATIBLE_FEAT, /* feature set outside compatibility */ + ZPOOL_STATUS_FORCE_EXPORTING, /* pool is being force exported */ /* * Finally, the following indicates a healthy pool. @@ -977,10 +978,16 @@ _LIBZFS_H int zfs_smb_acl_rename(libzfs_handle_t *, char *, char *, char *, * sharing/unsharing them. */ _LIBZFS_H int zpool_enable_datasets(zpool_handle_t *, const char *, int); -_LIBZFS_H int zpool_disable_datasets(zpool_handle_t *, boolean_t); +_LIBZFS_H int zpool_disable_datasets(zpool_handle_t *, boolean_t, boolean_t); _LIBZFS_H void zpool_disable_datasets_os(zpool_handle_t *, boolean_t); _LIBZFS_H void zpool_disable_volume_os(const char *); +/* + * Procedure to inform os that we have started force unmount (linux specific). 
+ */ +_LIBZFS_H void zpool_unmount_mark_hard_force_begin(zpool_handle_t *zhp); +_LIBZFS_H void zpool_unmount_mark_hard_force_end(zpool_handle_t *zhp); + /* * Parse a features file for -o compatibility */ diff --git a/include/os/freebsd/spl/sys/thread.h b/include/os/freebsd/spl/sys/thread.h index 4fb1a542f55f..9fa900d37d3c 100644 --- a/include/os/freebsd/spl/sys/thread.h +++ b/include/os/freebsd/spl/sys/thread.h @@ -31,4 +31,7 @@ #define getcomm() curthread->td_name #define getpid() curthread->td_tid +#define thread_signal spl_kthread_signal +extern int spl_kthread_signal(kthread_t *tsk, int sig); + #endif diff --git a/include/os/freebsd/zfs/sys/zfs_znode_impl.h b/include/os/freebsd/zfs/sys/zfs_znode_impl.h index 050fc3036f87..e54547a51ff4 100644 --- a/include/os/freebsd/zfs/sys/zfs_znode_impl.h +++ b/include/os/freebsd/zfs/sys/zfs_znode_impl.h @@ -135,6 +135,8 @@ zfs_enter(zfsvfs_t *zfsvfs, const char *tag) return (0); } +#define zfs_enter_unmountok zfs_enter + /* Must be called before exiting the vop */ static inline void zfs_exit(zfsvfs_t *zfsvfs, const char *tag) diff --git a/include/os/linux/spl/sys/thread.h b/include/os/linux/spl/sys/thread.h index bc88ff4efb67..a5b44ba39daa 100644 --- a/include/os/linux/spl/sys/thread.h +++ b/include/os/linux/spl/sys/thread.h @@ -53,6 +53,7 @@ typedef void (*thread_func_t)(void *); __thread_create(stk, stksize, (thread_func_t)func, #func, \ arg, len, pp, state, pri) +#define thread_signal(t, s) spl_kthread_signal(t, s) #define thread_exit() spl_thread_exit() #define thread_join(t) VERIFY(0) #define curthread current @@ -64,6 +65,7 @@ extern kthread_t *__thread_create(caddr_t stk, size_t stksize, int state, pri_t pri); extern struct task_struct *spl_kthread_create(int (*func)(void *), void *data, const char namefmt[], ...); +extern int spl_kthread_signal(kthread_t *tsk, int sig); static inline __attribute__((noreturn)) void spl_thread_exit(void) diff --git a/include/os/linux/zfs/sys/zfs_vfsops_os.h b/include/os/linux/zfs/sys/zfs_vfsops_os.h index e320b8de4222..9e056817699d 100644 --- a/include/os/linux/zfs/sys/zfs_vfsops_os.h +++ b/include/os/linux/zfs/sys/zfs_vfsops_os.h @@ -101,7 +101,8 @@ struct zfsvfs { boolean_t z_utf8; /* utf8-only */ int z_norm; /* normalization flags */ boolean_t z_relatime; /* enable relatime mount option */ - boolean_t z_unmounted; /* unmounted */ + boolean_t z_unmounted; /* mount status */ + boolean_t z_force_unmounted; /* force-unmounted status */ rrmlock_t z_teardown_lock; krwlock_t z_teardown_inactive_lock; list_t z_all_znodes; /* all znodes in the fs */ diff --git a/include/os/linux/zfs/sys/zfs_znode_impl.h b/include/os/linux/zfs/sys/zfs_znode_impl.h index 0be2c445ab76..48a32b265a64 100644 --- a/include/os/linux/zfs/sys/zfs_znode_impl.h +++ b/include/os/linux/zfs/sys/zfs_znode_impl.h @@ -98,24 +98,39 @@ extern "C" { #define zhold(zp) VERIFY3P(igrab(ZTOI((zp))), !=, NULL) #define zrele(zp) iput(ZTOI((zp))) +#define zfsvfs_is_unmounted(zfsvfs) \ + ((zfsvfs)->z_unmounted || (zfsvfs)->z_force_unmounted) + +/* Must be called before exiting the operation. */ +static inline void +zfs_exit(zfsvfs_t *zfsvfs, const char *tag) +{ + zfs_exit_fs(zfsvfs); + ZFS_TEARDOWN_EXIT_READ(zfsvfs, tag); +} + /* Called on entry to each ZFS inode and vfs operation. 
*/ static inline int zfs_enter(zfsvfs_t *zfsvfs, const char *tag) { ZFS_TEARDOWN_ENTER_READ(zfsvfs, tag); - if (unlikely(zfsvfs->z_unmounted)) { + if (unlikely(zfsvfs_is_unmounted(zfsvfs))) { ZFS_TEARDOWN_EXIT_READ(zfsvfs, tag); return (SET_ERROR(EIO)); } return (0); } -/* Must be called before exiting the operation. */ -static inline void -zfs_exit(zfsvfs_t *zfsvfs, const char *tag) +/* zfs_enter() but ok with forced unmount having begun */ +static inline int +zfs_enter_unmountok(zfsvfs_t *zfsvfs, const char *tag) { - zfs_exit_fs(zfsvfs); - ZFS_TEARDOWN_EXIT_READ(zfsvfs, tag); + ZFS_TEARDOWN_ENTER_READ(zfsvfs, tag); + if (unlikely((zfsvfs)->z_unmounted == B_TRUE)) { + zfs_exit(zfsvfs, tag); + return (SET_ERROR(EIO)); + } + return (0); } static inline int diff --git a/include/sys/arc.h b/include/sys/arc.h index 836ed679dbac..2f8b15a86ce3 100644 --- a/include/sys/arc.h +++ b/include/sys/arc.h @@ -338,6 +338,7 @@ void l2arc_fini(void); void l2arc_start(void); void l2arc_stop(void); void l2arc_spa_rebuild_start(spa_t *spa); +void l2arc_spa_rebuild_stop(spa_t *spa); #ifndef _KERNEL extern boolean_t arc_watch; diff --git a/include/sys/dmu.h b/include/sys/dmu.h index 1b82ff620f27..fa8d13bea6f7 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -283,6 +283,7 @@ typedef enum dmu_object_type { #define TXG_NOWAIT (0ULL) #define TXG_WAIT (1ULL<<0) #define TXG_NOTHROTTLE (1ULL<<1) +#define TXG_NOSUSPEND (1ULL<<2) void byteswap_uint64_array(void *buf, size_t size); void byteswap_uint32_array(void *buf, size_t size); diff --git a/include/sys/dmu_impl.h b/include/sys/dmu_impl.h index ce6ae3c665ac..0ae83fdabb72 100644 --- a/include/sys/dmu_impl.h +++ b/include/sys/dmu_impl.h @@ -241,6 +241,7 @@ typedef struct dmu_sendstatus { list_node_t dss_link; int dss_outfd; proc_t *dss_proc; + kthread_t *dss_thread; offset_t *dss_off; uint64_t dss_blocks; /* blocks visited during the sending process */ } dmu_sendstatus_t; diff --git a/include/sys/dmu_objset.h b/include/sys/dmu_objset.h index 9f6e0fdd601b..4521eda89571 100644 --- a/include/sys/dmu_objset.h +++ b/include/sys/dmu_objset.h @@ -179,6 +179,7 @@ struct objset { /* Protected by os_lock */ kmutex_t os_lock; + kthread_t *os_shutdown_initiator; multilist_t os_dirty_dnodes[TXG_SIZE]; list_t os_dnodes; list_t os_downgraded_dbufs; @@ -266,6 +267,10 @@ int dmu_fsname(const char *snapname, char *buf); void dmu_objset_evict_done(objset_t *os); void dmu_objset_willuse_space(objset_t *os, int64_t space, dmu_tx_t *tx); +int dmu_objset_shutdown_register(objset_t *os); +boolean_t dmu_objset_exiting(objset_t *os); +void dmu_objset_shutdown_unregister(objset_t *os); + void dmu_objset_init(void); void dmu_objset_fini(void); diff --git a/include/sys/dmu_recv.h b/include/sys/dmu_recv.h index 3390ca1089f8..59ddd8347800 100644 --- a/include/sys/dmu_recv.h +++ b/include/sys/dmu_recv.h @@ -40,6 +40,7 @@ extern const char *const recv_clone_name; typedef struct dmu_recv_cookie { struct dsl_dataset *drc_ds; + kthread_t *drc_initiator; struct dmu_replay_record *drc_drr_begin; struct drr_begin *drc_drrb; const char *drc_tofs; @@ -57,6 +58,8 @@ typedef struct dmu_recv_cookie { nvlist_t *drc_keynvl; uint64_t drc_fromsnapobj; uint64_t drc_ivset_guid; + unsigned int drc_flags; + void *drc_rwa; void *drc_owner; cred_t *drc_cred; proc_t *drc_proc; @@ -83,6 +86,7 @@ int dmu_recv_begin(const char *, const char *, dmu_replay_record_t *, boolean_t, boolean_t, boolean_t, nvlist_t *, nvlist_t *, const char *, dmu_recv_cookie_t *, zfs_file_t *, offset_t *); int 
dmu_recv_stream(dmu_recv_cookie_t *, offset_t *); +int dmu_recv_close(dsl_dataset_t *ds); int dmu_recv_end(dmu_recv_cookie_t *, void *); boolean_t dmu_objset_is_receiving(objset_t *); diff --git a/include/sys/dmu_send.h b/include/sys/dmu_send.h index 061b81532fb1..7c8666f61590 100644 --- a/include/sys/dmu_send.h +++ b/include/sys/dmu_send.h @@ -60,6 +60,7 @@ int dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap, boolean_t embedok, boolean_t large_block_ok, boolean_t compressok, boolean_t rawok, boolean_t savedok, int outfd, offset_t *off, struct dmu_send_outparams *dso); +int dmu_send_close(struct dsl_dataset *ds); typedef int (*dmu_send_outfunc_t)(objset_t *os, void *buf, int len, void *arg); typedef struct dmu_send_outparams { diff --git a/include/sys/dsl_dataset.h b/include/sys/dsl_dataset.h index 3450527af7e0..9abe3fc63301 100644 --- a/include/sys/dsl_dataset.h +++ b/include/sys/dsl_dataset.h @@ -242,6 +242,8 @@ typedef struct dsl_dataset { kmutex_t ds_sendstream_lock; list_t ds_sendstreams; + struct dmu_recv_cookie *ds_receiver; + /* * When in the middle of a resumable receive, tracks how much * progress we have made. @@ -324,7 +326,8 @@ typedef struct dsl_dataset_rename_snapshot_arg { /* flags for holding the dataset */ typedef enum ds_hold_flags { DS_HOLD_FLAG_NONE = 0 << 0, - DS_HOLD_FLAG_DECRYPT = 1 << 0 /* needs access to encrypted data */ + DS_HOLD_FLAG_DECRYPT = 1 << 0, /* needs access to encrypted data */ + DS_HOLD_FLAG_MUST_BE_OPEN = 1 << 1, /* dataset must already be open */ } ds_hold_flags_t; int dsl_dataset_hold(struct dsl_pool *dp, const char *name, const void *tag, @@ -453,6 +456,8 @@ void dsl_dataset_long_hold(dsl_dataset_t *ds, const void *tag); void dsl_dataset_long_rele(dsl_dataset_t *ds, const void *tag); boolean_t dsl_dataset_long_held(dsl_dataset_t *ds); +int dsl_dataset_sendrecv_cancel_all(spa_t *spa); + int dsl_dataset_clone_swap_check_impl(dsl_dataset_t *clone, dsl_dataset_t *origin_head, boolean_t force, void *owner, dmu_tx_t *tx); void dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone, diff --git a/include/sys/dsl_scan.h b/include/sys/dsl_scan.h index 8925b5815a37..5b94d985b522 100644 --- a/include/sys/dsl_scan.h +++ b/include/sys/dsl_scan.h @@ -172,7 +172,7 @@ int dsl_scan(struct dsl_pool *, pool_scan_func_t); void dsl_scan_assess_vdev(struct dsl_pool *dp, vdev_t *vd); boolean_t dsl_scan_scrubbing(const struct dsl_pool *dp); int dsl_scrub_set_pause_resume(const struct dsl_pool *dp, pool_scrub_cmd_t cmd); -void dsl_scan_restart_resilver(struct dsl_pool *, uint64_t txg); +int dsl_scan_restart_resilver(struct dsl_pool *, uint64_t txg); boolean_t dsl_scan_resilvering(struct dsl_pool *dp); boolean_t dsl_scan_resilver_scheduled(struct dsl_pool *dp); boolean_t dsl_dataset_unstable(struct dsl_dataset *ds); diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index 0734ff12280e..9538d744f930 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -1469,6 +1469,8 @@ typedef enum zfs_ioc { ZFS_IOC_USERNS_DETACH = ZFS_IOC_UNJAIL, /* 0x86 (Linux) */ ZFS_IOC_SET_BOOTENV, /* 0x87 */ ZFS_IOC_GET_BOOTENV, /* 0x88 */ + ZFS_IOC_HARD_FORCE_UNMOUNT_BEGIN, /* 0x89 (Linux) */ + ZFS_IOC_HARD_FORCE_UNMOUNT_END, /* 0x90 (Linux) */ ZFS_IOC_LAST } zfs_ioc_t; diff --git a/include/sys/metaslab.h b/include/sys/metaslab.h index fec080139a2b..c913111d5bd9 100644 --- a/include/sys/metaslab.h +++ b/include/sys/metaslab.h @@ -114,6 +114,7 @@ boolean_t metaslab_class_throttle_reserve(metaslab_class_t *, int, int, zio_t *, int); void 
metaslab_class_throttle_unreserve(metaslab_class_t *, int, int, zio_t *); void metaslab_class_evict_old(metaslab_class_t *, uint64_t); +void metaslab_class_force_discard(metaslab_class_t *); uint64_t metaslab_class_get_alloc(metaslab_class_t *); uint64_t metaslab_class_get_space(metaslab_class_t *); uint64_t metaslab_class_get_dspace(metaslab_class_t *); diff --git a/include/sys/spa.h b/include/sys/spa.h index b96a9ef1d42f..f92cadd89072 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -755,6 +755,7 @@ extern int spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, extern int spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags); extern nvlist_t *spa_tryimport(nvlist_t *tryconfig); +extern int spa_set_pre_export_status(const char *pool, boolean_t status); extern int spa_destroy(const char *pool); extern int spa_checkpoint(const char *pool); extern int spa_checkpoint_discard(const char *pool); @@ -838,16 +839,13 @@ extern kmutex_t spa_namespace_lock; * SPA configuration functions in spa_config.c */ -#define SPA_CONFIG_UPDATE_POOL 0 -#define SPA_CONFIG_UPDATE_VDEVS 1 - extern void spa_write_cachefile(spa_t *, boolean_t, boolean_t, boolean_t); extern void spa_config_load(void); extern nvlist_t *spa_all_configs(uint64_t *); extern void spa_config_set(spa_t *spa, nvlist_t *config); extern nvlist_t *spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats); -extern void spa_config_update(spa_t *spa, int what); +extern int spa_config_update_pool(spa_t *spa); extern int spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, int atype); @@ -964,6 +962,15 @@ extern void spa_iostats_trim_add(spa_t *spa, trim_type_t type, uint64_t extents_written, uint64_t bytes_written, uint64_t extents_skipped, uint64_t bytes_skipped, uint64_t extents_failed, uint64_t bytes_failed); + +typedef enum { + /* Config lock handling flags */ + SCL_FLAG_TRYENTER = 1U << 0, + SCL_FLAG_NOSUSPEND = 1U << 1, + /* MMP flag */ + SCL_FLAG_MMP = 1U << 2, +} spa_config_flag_t; + extern void spa_import_progress_add(spa_t *spa); extern void spa_import_progress_remove(uint64_t spa_guid); extern int spa_import_progress_set_mmp_check(uint64_t pool_guid, @@ -976,6 +983,8 @@ extern int spa_import_progress_set_state(uint64_t pool_guid, /* Pool configuration locks */ extern int spa_config_tryenter(spa_t *spa, int locks, const void *tag, krw_t rw); +extern int spa_config_enter_flags(spa_t *spa, int locks, const void *tag, + krw_t rw, spa_config_flag_t flags); extern void spa_config_enter(spa_t *spa, int locks, const void *tag, krw_t rw); extern void spa_config_enter_mmp(spa_t *spa, int locks, const void *tag, krw_t rw); @@ -1026,6 +1035,7 @@ extern uint64_t spa_last_synced_txg(spa_t *spa); extern uint64_t spa_first_txg(spa_t *spa); extern uint64_t spa_syncing_txg(spa_t *spa); extern uint64_t spa_final_dirty_txg(spa_t *spa); +extern void spa_verify_dirty_txg(spa_t *spa, uint64_t txg); extern uint64_t spa_version(spa_t *spa); extern pool_state_t spa_state(spa_t *spa); extern spa_load_state_t spa_load_state(spa_t *spa); @@ -1045,6 +1055,8 @@ extern metaslab_class_t *spa_dedup_class(spa_t *spa); extern metaslab_class_t *spa_preferred_class(spa_t *spa, uint64_t size, dmu_object_type_t objtype, uint_t level, uint_t special_smallblk); +extern void spa_evicting_os_lock(spa_t *); +extern void spa_evicting_os_unlock(spa_t *); extern void spa_evicting_os_register(spa_t *, objset_t *os); extern void spa_evicting_os_deregister(spa_t *, objset_t *os); extern void 
spa_evicting_os_wait(spa_t *spa); @@ -1136,6 +1148,10 @@ extern void spa_history_log_internal_dd(dsl_dir_t *dd, const char *operation, extern const char *spa_state_to_name(spa_t *spa); +extern boolean_t spa_exiting_any(spa_t *spa); +extern boolean_t spa_exiting(spa_t *spa); +extern int spa_operation_interrupted(spa_t *spa); + /* error handling */ struct zbookmark_phys; extern void spa_log_error(spa_t *spa, const zbookmark_phys_t *zb, diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index 5782c54bd78f..6e04643e4f4b 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -245,6 +245,8 @@ struct spa { kmutex_t spa_evicting_os_lock; /* Evicting objset list lock */ list_t spa_evicting_os_list; /* Objsets being evicted. */ kcondvar_t spa_evicting_os_cv; /* Objset Eviction Completion */ + kthread_t *spa_export_initiator; /* thread exporting the pool */ + boolean_t spa_pre_exporting; /* allow fails before export */ txg_list_t spa_vdev_txg_list; /* per-txg dirty vdev list */ vdev_t *spa_root_vdev; /* top-level vdev container */ uint64_t spa_min_ashift; /* of vdevs in normal class */ diff --git a/include/sys/txg.h b/include/sys/txg.h index 46945210cdb5..725694951d9d 100644 --- a/include/sys/txg.h +++ b/include/sys/txg.h @@ -66,11 +66,27 @@ typedef struct txg_list { } txg_list_t; struct dsl_pool; +struct dmu_tx; + +/* + * TXG wait flags, used by txg_wait_synced_tx and callers to indicate + * modifications to how they wish to wait for a txg. + */ +typedef enum { + /* No special wait flags. */ + TXG_WAIT_F_NONE = 0, + /* Reject the call with EINTR upon receiving a signal. */ + TXG_WAIT_F_SIGNAL = (1U << 0), + /* Reject the call with EAGAIN upon suspension. */ + TXG_WAIT_F_NOSUSPEND = (1U << 1), + /* Ignore errors and export anyway. */ + TXG_WAIT_F_FORCE_EXPORT = (1U << 2), +} txg_wait_flag_t; extern void txg_init(struct dsl_pool *dp, uint64_t txg); extern void txg_fini(struct dsl_pool *dp); extern void txg_sync_start(struct dsl_pool *dp); -extern void txg_sync_stop(struct dsl_pool *dp); +extern int txg_sync_stop(struct dsl_pool *dp, txg_wait_flag_t txg_how); extern uint64_t txg_hold_open(struct dsl_pool *dp, txg_handle_t *txghp); extern void txg_rele_to_quiesce(txg_handle_t *txghp); extern void txg_rele_to_sync(txg_handle_t *txghp); @@ -84,14 +100,23 @@ extern void txg_kick(struct dsl_pool *dp, uint64_t txg); * Wait until the given transaction group has finished syncing. * Try to make this happen as soon as possible (eg. kick off any * necessary syncs immediately). If txg==0, wait for the currently open - * txg to finish syncing. + * txg to finish syncing. This may be interrupted due to an exiting pool. + * + * If desired, flags can be specified using txg_wait_synced_tx(), in case + * the caller wants to be interruptible. */ extern void txg_wait_synced(struct dsl_pool *dp, uint64_t txg); +extern int txg_wait_synced_tx(struct dsl_pool *dp, uint64_t txg, + struct dmu_tx *tx, txg_wait_flag_t flags); +extern int txg_wait_synced_flags(struct dsl_pool *dp, uint64_t txg, + txg_wait_flag_t flags); /* - * Wait as above. Returns true if the thread was signaled while waiting. + * Similar to a txg_wait_synced but it can be interrupted from a signal. + * Returns B_TRUE if the thread was signaled while waiting. 
*/ -extern boolean_t txg_wait_synced_sig(struct dsl_pool *dp, uint64_t txg); +#define txg_wait_synced_sig(dp, txg) \ + (txg_wait_synced_tx(dp, txg, NULL, TXG_WAIT_F_SIGNAL) == EINTR) /* * Wait until the given transaction group, or one after it, is @@ -102,6 +127,8 @@ extern boolean_t txg_wait_synced_sig(struct dsl_pool *dp, uint64_t txg); extern void txg_wait_open(struct dsl_pool *dp, uint64_t txg, boolean_t should_quiesce); +void txg_force_export(spa_t *spa); + /* * Returns TRUE if we are "backed up" waiting for the syncing * transaction to complete; otherwise returns FALSE. @@ -113,6 +140,8 @@ extern boolean_t txg_sync_waiting(struct dsl_pool *dp); extern void txg_verify(spa_t *spa, uint64_t txg); +extern void txg_completion_notify(struct dsl_pool *dp); + /* * Wait for pending commit callbacks of already-synced transactions to finish * processing. diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h index 0d31195447d1..95bf0ee75c8c 100644 --- a/include/sys/zfs_context.h +++ b/include/sys/zfs_context.h @@ -231,6 +231,7 @@ typedef pthread_t kthread_t; zk_thread_create(func, arg, stksize, state) #define thread_create(stk, stksize, func, arg, len, pp, state, pri) \ zk_thread_create(func, arg, stksize, state) +#define thread_signal(t, s) pthread_kill((pthread_t)(t), s) #define thread_exit() pthread_exit(NULL) #define thread_join(t) pthread_join((pthread_t)(t), NULL) diff --git a/include/sys/zfs_ioctl_impl.h b/include/sys/zfs_ioctl_impl.h index cb852c5577fd..84200829785b 100644 --- a/include/sys/zfs_ioctl_impl.h +++ b/include/sys/zfs_ioctl_impl.h @@ -73,6 +73,10 @@ typedef struct zfs_ioc_key { int zfs_secpolicy_config(zfs_cmd_t *, nvlist_t *, cred_t *); +void zfs_ioctl_register_pool(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, + zfs_secpolicy_func_t *secpolicy, boolean_t log_history, + zfs_ioc_poolcheck_t pool_check); + void zfs_ioctl_register_dataset_nolog(zfs_ioc_t, zfs_ioc_legacy_func_t *, zfs_secpolicy_func_t *, zfs_ioc_poolcheck_t); diff --git a/include/sys/zfs_refcount.h b/include/sys/zfs_refcount.h index 42f846b8920a..d7468d74c11d 100644 --- a/include/sys/zfs_refcount.h +++ b/include/sys/zfs_refcount.h @@ -60,7 +60,7 @@ typedef struct refcount { /* * Note: zfs_refcount_t must be initialized with - * refcount_create[_untracked]() + * zfs_refcount_create[_untracked]() */ void zfs_refcount_create(zfs_refcount_t *); diff --git a/include/sys/zfs_znode.h b/include/sys/zfs_znode.h index 012e7403e2a6..7a52bcbc7703 100644 --- a/include/sys/zfs_znode.h +++ b/include/sys/zfs_znode.h @@ -239,6 +239,20 @@ zfs_enter_verify_zp(zfsvfs_t *zfsvfs, znode_t *zp, const char *tag) return (0); } +/* zfs_enter_unmountok and zfs_verify_zp together */ +static inline int +zfs_enter_unmountok_verify_zp(zfsvfs_t *zfsvfs, znode_t *zp, const char *tag) +{ + int error; + if ((error = zfs_enter_unmountok(zfsvfs, tag)) != 0) + return (error); + if ((error = zfs_verify_zp(zp)) != 0) { + zfs_exit(zfsvfs, tag); + return (error); + } + return (0); +} + typedef struct znode_hold { uint64_t zh_obj; /* object id */ avl_node_t zh_node; /* avl tree linkage */ diff --git a/include/sys/zio.h b/include/sys/zio.h index 3463682a1065..246d640614fe 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -416,6 +416,7 @@ typedef zio_t *zio_pipe_stage_t(zio_t *zio); */ #define ZIO_REEXECUTE_NOW 0x01 #define ZIO_REEXECUTE_SUSPEND 0x02 +#define ZIO_REEXECUTE_CANCELLED 0x04 /* * The io_trim flags are used to specify the type of TRIM to perform. 
They @@ -643,7 +644,7 @@ extern uint8_t zio_complevel_select(spa_t *spa, enum zio_compress compress, extern void zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t); extern int zio_resume(spa_t *spa); -extern void zio_resume_wait(spa_t *spa); +extern void zio_cancel(spa_t *spa); extern boolean_t zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, boolean_t config_held, enum blk_verify_flag blk_verify); diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi index f9aed4e0d57e..6f7022f19268 100644 --- a/lib/libzfs/libzfs.abi +++ b/lib/libzfs/libzfs.abi @@ -2013,7 +2013,7 @@ - + @@ -2071,6 +2071,9 @@ + + + @@ -5597,7 +5600,8 @@ - + + diff --git a/lib/libzfs/libzfs_dataset.c b/lib/libzfs/libzfs_dataset.c index 138eca19acc3..5fb5ed3fabee 100644 --- a/lib/libzfs/libzfs_dataset.c +++ b/lib/libzfs/libzfs_dataset.c @@ -473,8 +473,6 @@ make_dataset_handle_common(zfs_handle_t *zhp, zfs_cmd_t *zc) zfs_handle_t * make_dataset_handle(libzfs_handle_t *hdl, const char *path) { - zfs_cmd_t zc = {"\0"}; - zfs_handle_t *zhp = calloc(1, sizeof (zfs_handle_t)); if (zhp == NULL) @@ -482,18 +480,33 @@ make_dataset_handle(libzfs_handle_t *hdl, const char *path) zhp->zfs_hdl = hdl; (void) strlcpy(zhp->zfs_name, path, sizeof (zhp->zfs_name)); - zcmd_alloc_dst_nvlist(hdl, &zc, 0); + if (!hdl->libzfs_force_export) { + zfs_cmd_t zc = {"\0"}; - if (get_stats_ioctl(zhp, &zc) == -1) { + zcmd_alloc_dst_nvlist(hdl, &zc, 0); + if (get_stats_ioctl(zhp, &zc) == -1) { + zcmd_free_nvlists(&zc); + free(zhp); + return (NULL); + } + if (make_dataset_handle_common(zhp, &zc) == -1) { + free(zhp); + zhp = NULL; + } zcmd_free_nvlists(&zc); - free(zhp); - return (NULL); - } - if (make_dataset_handle_common(zhp, &zc) == -1) { - free(zhp); - zhp = NULL; + } else { + /* + * Called from zpool_disable_datasets() which sets force + * export and uses mount entries, so de facto the dataset + * is a ZFS filesystem. Furthermore, we need to avoid + * calling get_stats_ioctl() here since it results in + * zfs_ioc_objset_stats()->dmu_objset_hold() being called by + * the kernel which can potentially cause IO to be issued + * depending on what's currently cached in ARC. + */ + zhp->zfs_dmustats.dds_type = DMU_OST_ZFS; + zhp->zfs_type = ZFS_TYPE_FILESYSTEM; } - zcmd_free_nvlists(&zc); return (zhp); } diff --git a/lib/libzfs/libzfs_impl.h b/lib/libzfs/libzfs_impl.h index ef0359f45ea0..45a79c264387 100644 --- a/lib/libzfs/libzfs_impl.h +++ b/lib/libzfs/libzfs_impl.h @@ -72,6 +72,7 @@ struct libzfs_handle { uint64_t libzfs_max_nvlist; void *libfetch; char *libfetch_load_error; + boolean_t libzfs_force_export; }; struct zfs_handle { diff --git a/lib/libzfs/libzfs_mount.c b/lib/libzfs/libzfs_mount.c index 5d1fe651c97e..85ed36e0284a 100644 --- a/lib/libzfs/libzfs_mount.c +++ b/lib/libzfs/libzfs_mount.c @@ -1345,7 +1345,8 @@ mountpoint_compare(const void *a, const void *b) * and gather all the filesystems that are currently mounted. */ int -zpool_disable_datasets(zpool_handle_t *zhp, boolean_t force) +zpool_disable_datasets(zpool_handle_t *zhp, boolean_t force, + boolean_t hardforce) { int used, alloc; FILE *mnttab; @@ -1355,8 +1356,9 @@ zpool_disable_datasets(zpool_handle_t *zhp, boolean_t force) libzfs_handle_t *hdl = zhp->zpool_hdl; int i; int ret = -1; - int flags = (force ? MS_FORCE : 0); + int flags = ((hardforce || force) ? 
MS_FORCE : 0); + hdl->libzfs_force_export = flags & MS_FORCE; namelen = strlen(zhp->zpool_name); if ((mnttab = fopen(MNTTAB, "re")) == NULL) @@ -1418,6 +1420,10 @@ zpool_disable_datasets(zpool_handle_t *zhp, boolean_t force) if (used != 0) qsort(sets, used, sizeof (struct sets_s), mountpoint_compare); + if (hardforce) { + zpool_unmount_mark_hard_force_begin(zhp); + } + /* * Walk through and first unshare everything. */ @@ -1441,9 +1447,15 @@ zpool_disable_datasets(zpool_handle_t *zhp, boolean_t force) goto out; } - for (i = 0; i < used; i++) { - if (sets[i].dataset) - remove_mountpoint(sets[i].dataset); + /* + * Remove mountpoints, unless the pool is being forcibly exported. + * In the latter case, avoid potentially initiating I/O on the pool. + */ + if (!hdl->libzfs_force_export) { + for (i = 0; i < used; i++) { + if (sets[i].dataset) + remove_mountpoint(sets[i].dataset); + } } zpool_disable_datasets_os(zhp, force); @@ -1457,6 +1469,9 @@ zpool_disable_datasets(zpool_handle_t *zhp, boolean_t force) free(sets[i].mountpoint); } free(sets); + if (hardforce) { + zpool_unmount_mark_hard_force_end(zhp); + } return (ret); } diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index ae4c861590fd..f0fbec2d7f19 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -252,7 +252,9 @@ zpool_get_state_str(zpool_handle_t *zhp) status = zpool_get_status(zhp, NULL, &errata); - if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) { + if (status == ZPOOL_STATUS_FORCE_EXPORTING) { + str = gettext("FORCE-EXPORTING"); + } else if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) { str = gettext("FAULTED"); } else if (status == ZPOOL_STATUS_IO_FAILURE_WAIT || status == ZPOOL_STATUS_IO_FAILURE_MMP) { @@ -1574,7 +1576,9 @@ zpool_destroy(zpool_handle_t *zhp, const char *log_str) } if (zfp) { - remove_mountpoint(zfp); + /* Avoid initiating I/O during a forced export. 
*/ + if (!hdl->libzfs_force_export) + remove_mountpoint(zfp); zfs_close(zfp); } diff --git a/lib/libzfs/libzfs_status.c b/lib/libzfs/libzfs_status.c index a2259eee91ca..d63c1f230a24 100644 --- a/lib/libzfs/libzfs_status.c +++ b/lib/libzfs/libzfs_status.c @@ -228,6 +228,9 @@ check_status(nvlist_t *config, boolean_t isimport, uint64_t errata = 0; unsigned long system_hostid = get_system_hostid(); + if (config == NULL) + return (ZPOOL_STATUS_FORCE_EXPORTING); + uint64_t version = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION); nvlist_t *nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); diff --git a/lib/libzfs/os/freebsd/libzfs_zmount.c b/lib/libzfs/os/freebsd/libzfs_zmount.c index 34976f7bbf46..fe3922faf1c8 100644 --- a/lib/libzfs/os/freebsd/libzfs_zmount.c +++ b/lib/libzfs/os/freebsd/libzfs_zmount.c @@ -137,3 +137,15 @@ zpool_disable_volume_os(const char *name) { (void) name; } + +void +zpool_unmount_mark_hard_force_begin(zpool_handle_t *zhp) +{ + (void) zhp; +} + +void +zpool_unmount_mark_hard_force_end(zpool_handle_t *zhp) +{ + (void) zhp; +} diff --git a/lib/libzfs/os/linux/libzfs_mount_os.c b/lib/libzfs/os/linux/libzfs_mount_os.c index f0bf3dcc6c6b..af522024807a 100644 --- a/lib/libzfs/os/linux/libzfs_mount_os.c +++ b/lib/libzfs/os/linux/libzfs_mount_os.c @@ -428,3 +428,23 @@ zpool_disable_volume_os(const char *name) { (void) name; } + +void +zpool_unmount_mark_hard_force_begin(zpool_handle_t *zhp) +{ + zfs_cmd_t zc = {"\0"}; + libzfs_handle_t *hdl = zhp->zpool_hdl; + + (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); + (void) zfs_ioctl(hdl, ZFS_IOC_HARD_FORCE_UNMOUNT_BEGIN, &zc); +} + +void +zpool_unmount_mark_hard_force_end(zpool_handle_t *zhp) +{ + zfs_cmd_t zc = {"\0"}; + libzfs_handle_t *hdl = zhp->zpool_hdl; + + (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); + (void) zfs_ioctl(hdl, ZFS_IOC_HARD_FORCE_UNMOUNT_END, &zc); +} diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index d529147464fe..1e1138fa8159 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -922,6 +922,18 @@ receive of encrypted datasets. Intended for users whose pools were created with OpenZFS pre-release versions and now have compatibility issues. . +.It Sy zfs_forced_export_unmount_enabled Ns = Ns Sy 0 Ns | Ns 1 Pq int +During forced unmount, leave the filesystem in a disabled mode of operation, +in which all new I/Os fail, except for those required to unmount it. +Intended for users trying to forcibly export a pool even when I/Os are in +progress, without the need to find and stop them. +This option does not affect processes that are merely sitting on the +filesystem, only those performing active I/O. +.Pp +This parameter can be set to 1 to enable this behavior. +.Pp +This parameter only applies on Linux. +. .It Sy zfs_key_max_salt_uses Ns = Ns Sy 400000000 Po 4*10^8 Pc Pq ulong Maximum number of uses of a single salt value before generating a new one for encrypted datasets. diff --git a/man/man8/zpool-export.8 b/man/man8/zpool-export.8 index 4fed98cfe278..ce41c87e78be 100644 --- a/man/man8/zpool-export.8 +++ b/man/man8/zpool-export.8 @@ -26,7 +26,7 @@ .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. .\" -.Dd March 16, 2022 +.Dd November 1, 2022 .Dt ZPOOL-EXPORT 8 .Os . @@ -37,6 +37,7 @@ .Nm zpool .Cm export .Op Fl f +.Op Fl F .Fl a Ns | Ns Ar pool Ns … . .Sh DESCRIPTION @@ -67,6 +68,15 @@ spares. This command will forcefully export the pool even if it has a shared spare that is currently being used. 
This may lead to potential data corruption. +.It Fl F +Forcibly export the pool. +.Pp +This option allows a pool to be exported even when the underlying disks are +offline and the pool is unavailable. +When force exporting a pool, any outstanding dirty data will be discarded. +This option implies the +.Fl f +option. .El . .Sh EXAMPLES diff --git a/module/os/freebsd/spl/spl_misc.c b/module/os/freebsd/spl/spl_misc.c index e3653167323b..e06288cf647b 100644 --- a/module/os/freebsd/spl/spl_misc.c +++ b/module/os/freebsd/spl/spl_misc.c @@ -104,6 +104,16 @@ spl_panic(const char *file, const char *func, int line, const char *fmt, ...) va_end(ap); } +int +spl_kthread_signal(kthread_t *td, int sig) +{ + + PROC_LOCK(td->td_proc); + tdsignal(td, sig); + PROC_UNLOCK(td->td_proc); + return (0); +} + SYSINIT(opensolaris_utsname_init, SI_SUB_TUNABLES, SI_ORDER_ANY, opensolaris_utsname_init, NULL); diff --git a/module/os/linux/spl/spl-thread.c b/module/os/linux/spl/spl-thread.c index b4ef86a5e4a6..2e053287c9e6 100644 --- a/module/os/linux/spl/spl-thread.c +++ b/module/os/linux/spl/spl-thread.c @@ -204,3 +204,15 @@ issig(int why) } EXPORT_SYMBOL(issig); + +/* + * spl_kthread_signal - Wrapper for sending signals to a thread. + */ +int +spl_kthread_signal(kthread_t *tsk, int sig) +{ + + return (send_sig(sig, tsk, 0)); +} + +EXPORT_SYMBOL(spl_kthread_signal); diff --git a/module/os/linux/zfs/zfs_dir.c b/module/os/linux/zfs/zfs_dir.c index 1eeabe53d23c..c9652f173c84 100644 --- a/module/os/linux/zfs/zfs_dir.c +++ b/module/os/linux/zfs/zfs_dir.c @@ -563,7 +563,6 @@ zfs_unlinked_drain(zfsvfs_t *zfsvfs) void zfs_unlinked_drain_stop_wait(zfsvfs_t *zfsvfs) { - ASSERT3B(zfsvfs->z_unmounted, ==, B_FALSE); if (zfsvfs->z_draining) { zfsvfs->z_drain_cancel = B_TRUE; diff --git a/module/os/linux/zfs/zfs_ioctl_os.c b/module/os/linux/zfs/zfs_ioctl_os.c index 663474ea49ab..976e2720f55b 100644 --- a/module/os/linux/zfs/zfs_ioctl_os.c +++ b/module/os/linux/zfs/zfs_ioctl_os.c @@ -208,6 +208,18 @@ zfs_ioctl_update_mount_cache(const char *dsname) { } +static int +zfs_ioc_pool_unmount_begin(zfs_cmd_t *zc) +{ + return (spa_set_pre_export_status(zc->zc_name, true)); +} + +static int +zfs_ioc_pool_unmount_end(zfs_cmd_t *zc) +{ + return (spa_set_pre_export_status(zc->zc_name, false)); +} + void zfs_ioctl_init_os(void) { @@ -215,6 +227,13 @@ zfs_ioctl_init_os(void) zfs_ioc_userns_attach, zfs_secpolicy_config, POOL_CHECK_NONE); zfs_ioctl_register_dataset_nolog(ZFS_IOC_USERNS_DETACH, zfs_ioc_userns_detach, zfs_secpolicy_config, POOL_CHECK_NONE); + + zfs_ioctl_register_pool(ZFS_IOC_HARD_FORCE_UNMOUNT_BEGIN, + zfs_ioc_pool_unmount_begin, zfs_secpolicy_config, B_FALSE, + POOL_CHECK_NONE); + zfs_ioctl_register_pool(ZFS_IOC_HARD_FORCE_UNMOUNT_END, + zfs_ioc_pool_unmount_end, zfs_secpolicy_config, B_FALSE, + POOL_CHECK_NONE); } #ifdef CONFIG_COMPAT diff --git a/module/os/linux/zfs/zfs_vfsops.c b/module/os/linux/zfs/zfs_vfsops.c index 48945b8af8c1..0d0f1a8a1832 100644 --- a/module/os/linux/zfs/zfs_vfsops.c +++ b/module/os/linux/zfs/zfs_vfsops.c @@ -1094,7 +1094,7 @@ zfs_statvfs(struct inode *ip, struct kstatfs *statp) uint64_t refdbytes, availbytes, usedobjs, availobjs; int err = 0; - if ((err = zfs_enter(zfsvfs, FTAG)) != 0) + if ((err = zfs_enter_unmountok(zfsvfs, FTAG)) != 0) return (err); dmu_objset_space(zfsvfs->z_os, @@ -1166,7 +1166,7 @@ zfs_root(zfsvfs_t *zfsvfs, struct inode **ipp) znode_t *rootzp; int error; - if ((error = zfs_enter(zfsvfs, FTAG)) != 0) + if ((error = zfs_enter_unmountok(zfsvfs, FTAG)) != 0) return (error); error = 
zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp); @@ -1312,6 +1312,8 @@ static int zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting) { znode_t *zp; + kthread_t *initiator = NULL; + uint64_t wait_flags = 0; zfs_unlinked_drain_stop_wait(zfsvfs); @@ -1341,6 +1343,15 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting) if (++round > 1 && !unmounting) break; } + initiator = zfsvfs->z_os->os_shutdown_initiator; + /* + * Although it could be argued that a force unmount in + * another thread shouldn't have this apply, once a force + * unmount is in effect, it's pointless for the non-forced + * unmount to not use this flag. + */ + if (initiator != NULL) + wait_flags |= TXG_WAIT_F_NOSUSPEND; } ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG); @@ -1373,6 +1384,10 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting) * or a reopen of z_os failed then just bail out now. */ if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) { + if (initiator == curthread) { + zfsvfs->z_unmounted = B_FALSE; + dmu_objset_shutdown_unregister(zfsvfs->z_os); + } rw_exit(&zfsvfs->z_teardown_inactive_lock); ZFS_TEARDOWN_EXIT(zfsvfs, FTAG); return (SET_ERROR(EIO)); @@ -1441,12 +1456,16 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting) } } if (!zfs_is_readonly(zfsvfs) && os_dirty) { - txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); + (void) txg_wait_synced_tx(dmu_objset_pool(zfsvfs->z_os), 0, + NULL, wait_flags); } dmu_objset_evict_dbufs(zfsvfs->z_os); dsl_dir_t *dd = os->os_dsl_dataset->ds_dir; dsl_dir_cancel_waiters(dd); + if (initiator == curthread) + dmu_objset_shutdown_unregister(zfsvfs->z_os); + return (0); } diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c index 234c4d5ef0e0..51aa2c715e8f 100644 --- a/module/os/linux/zfs/zfs_vnops_os.c +++ b/module/os/linux/zfs/zfs_vnops_os.c @@ -208,7 +208,7 @@ zfs_close(struct inode *ip, int flag, cred_t *cr) zfsvfs_t *zfsvfs = ITOZSB(ip); int error; - if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) + if ((error = zfs_enter_unmountok_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); /* Decrement the synchronous opens in the znode */ @@ -1660,7 +1660,7 @@ zfs_getattr_fast(zidmap_t *user_ns, struct inode *ip, struct kstat *sp) u_longlong_t nblocks; int error; - if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) + if ((error = zfs_enter_unmountok_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); mutex_enter(&zp->z_lock); diff --git a/module/os/linux/zfs/zpl_super.c b/module/os/linux/zfs/zpl_super.c index c5c230bee144..21d55a1d5d6b 100644 --- a/module/os/linux/zfs/zpl_super.c +++ b/module/os/linux/zfs/zpl_super.c @@ -29,7 +29,10 @@ #include #include #include +#include +#include +int zfs_forced_export_unmount_enabled = 0; static struct inode * zpl_inode_alloc(struct super_block *sb) @@ -103,6 +106,31 @@ zpl_evict_inode(struct inode *ip) spl_fstrans_unmark(cookie); } +static void +zpl_umount_begin(struct super_block *sb) +{ + zfsvfs_t *zfsvfs = sb->s_fs_info; + + if (zfsvfs) { + /* + * Flush out all POSIX I/Os. Notify all waiters that they + * must end, then wait for all users to drop their holds on + * z_teardown_*_lock, and evict buffers. 
+ */ + if (zfs_forced_export_unmount_enabled) + zfsvfs->z_force_unmounted = B_TRUE; + (void) dmu_objset_shutdown_register(zfsvfs->z_os); + rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG); + rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_WRITER); + rw_exit(&zfsvfs->z_teardown_inactive_lock); + rrm_exit(&zfsvfs->z_teardown_lock, FTAG); + dmu_objset_evict_dbufs(zfsvfs->z_os); + + dsl_dir_cancel_waiters(zfsvfs->z_os->os_dsl_dataset->ds_dir); + dmu_objset_shutdown_unregister(zfsvfs->z_os); + } +} + static void zpl_put_super(struct super_block *sb) { @@ -187,7 +215,7 @@ static int __zpl_show_devname(struct seq_file *seq, zfsvfs_t *zfsvfs) { int error; - if ((error = zpl_enter(zfsvfs, FTAG)) != 0) + if ((error = zfs_enter_unmountok(zfsvfs, FTAG)) != 0) return (error); char *fsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); @@ -381,6 +409,7 @@ const struct super_operations zpl_super_operations = { .write_inode = NULL, .evict_inode = zpl_evict_inode, .put_super = zpl_put_super, + .umount_begin = zpl_umount_begin, .sync_fs = zpl_sync_fs, .statfs = zpl_statfs, .remount_fs = zpl_remount_fs, @@ -400,3 +429,8 @@ struct file_system_type zpl_fs_type = { .mount = zpl_mount, .kill_sb = zpl_kill_sb, }; + +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs, zfs_, forced_export_unmount_enabled, INT, ZMOD_RW, + "Enable forced export unmount to keep POSIX I/O users off"); +/* END CSTYLED */ diff --git a/module/zfs/arc.c b/module/zfs/arc.c index c50228a2682f..e3defb7bf68f 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -9778,6 +9778,18 @@ l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen) l2arc_rebuild_dev(dev, reopen); } +static void +l2arc_dev_rebuild_stop(l2arc_dev_t *l2ad) +{ + mutex_enter(&l2arc_rebuild_thr_lock); + if (l2ad->l2ad_rebuild_began == B_TRUE) { + l2ad->l2ad_rebuild_cancel = B_TRUE; + while (l2ad->l2ad_rebuild == B_TRUE) + cv_wait(&l2arc_rebuild_thr_cv, &l2arc_rebuild_thr_lock); + } + mutex_exit(&l2arc_rebuild_thr_lock); +} + /* * Remove a vdev from the L2ARC. */ @@ -9795,13 +9807,7 @@ l2arc_remove_vdev(vdev_t *vd) /* * Cancel any ongoing or scheduled rebuild. */ - mutex_enter(&l2arc_rebuild_thr_lock); - if (remdev->l2ad_rebuild_began == B_TRUE) { - remdev->l2ad_rebuild_cancel = B_TRUE; - while (remdev->l2ad_rebuild == B_TRUE) - cv_wait(&l2arc_rebuild_thr_cv, &l2arc_rebuild_thr_lock); - } - mutex_exit(&l2arc_rebuild_thr_lock); + l2arc_dev_rebuild_stop(remdev); /* * Remove device from global list @@ -9917,6 +9923,25 @@ l2arc_spa_rebuild_start(spa_t *spa) } } +void +l2arc_spa_rebuild_stop(spa_t *spa) +{ + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + /* + * Locate the spa's l2arc devices and kick off rebuild threads. + */ + for (int i = 0; i < spa->spa_l2cache.sav_count; i++) { + l2arc_dev_t *dev = + l2arc_vdev_get(spa->spa_l2cache.sav_vdevs[i]); + if (dev == NULL) { + /* Don't attempt a rebuild if the vdev is UNAVAIL */ + continue; + } + l2arc_dev_rebuild_stop(dev); + } +} + /* * Main entry point for L2ARC rebuilding. 
*/ diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index c7f76e8d96f8..5b8a8498d736 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -1068,11 +1068,12 @@ dbuf_verify(dmu_buf_impl_t *db) uint32_t txg_prev; ASSERT(MUTEX_HELD(&db->db_mtx)); + ASSERT(db->db_objset != NULL); - if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY)) + if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY) || + dmu_objset_exiting(db->db_objset)) return; - ASSERT(db->db_objset != NULL); DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (dn == NULL) { @@ -1156,7 +1157,8 @@ dbuf_verify(dmu_buf_impl_t *db) if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) && (db->db_buf == NULL || db->db_buf->b_data) && db->db.db_data && db->db_blkid != DMU_BONUS_BLKID && - db->db_state != DB_FILL && (dn == NULL || !dn->dn_free_txg)) { + db->db_state != DB_FILL && (dn == NULL || !dn->dn_free_txg) && + !dmu_objset_exiting(db->db_objset)) { /* * If the blkptr isn't set but they have nonzero data, * it had better be dirty, otherwise we'll lose that @@ -2304,7 +2306,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) * this assertion only if we're not already dirty. */ os = dn->dn_objset; - VERIFY3U(tx->tx_txg, <=, spa_final_dirty_txg(os->os_spa)); + spa_verify_dirty_txg(os->os_spa, dmu_tx_get_txg(tx)); #ifdef ZFS_DEBUG if (dn->dn_objset->os_dsl_dataset != NULL) rrw_enter(&os->os_dsl_dataset->ds_bp_rwlock, RW_READER, FTAG); @@ -4331,7 +4333,11 @@ dbuf_lightweight_done(zio_t *zio) { dbuf_dirty_record_t *dr = zio->io_private; - VERIFY0(zio->io_error); + if (zio->io_error != 0) { + /* If the pool is exiting, only cleanup in-core state. */ + ASSERT(spa_exiting_any(zio->io_spa)); + goto out; + } objset_t *os = dr->dr_dnode->dn_objset; dmu_tx_t *tx = os->os_synctx; @@ -4355,6 +4361,7 @@ dbuf_lightweight_done(zio_t *zio) dr->dr_accounted % zio->io_phys_children, zio->io_txg); } +out: abd_free(dr->dt.dll.dr_abd); kmem_free(dr, sizeof (*dr)); } @@ -4755,9 +4762,14 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) objset_t *os = db->db_objset; dmu_tx_t *tx = os->os_synctx; - ASSERT0(zio->io_error); ASSERT(db->db_blkptr == bp); + if (zio->io_error != 0) { + /* If the pool is exiting, only cleanup in-core state. */ + ASSERT(spa_exiting_any(zio->io_spa)); + goto cleanup; + } + /* * For nopwrites and rewrites we ensure that the bp matches our * original and bypass all the accounting. 
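For context between the two dbuf.c hunks: both write-done paths now tolerate
zio errors only while the pool is exiting.  A condensed, illustrative sketch
of that shared pattern follows (the function name is hypothetical and this
is not code from the patch):

static void
example_write_done(zio_t *zio, void *arg)
{
	(void) arg;

	if (zio->io_error != 0) {
		/* Errors are only tolerated while the pool is exiting. */
		ASSERT(spa_exiting_any(zio->io_spa));
		goto cleanup;
	}

	/* ...normal block-born / space-accounting work happens here... */

cleanup:
	/* Free per-I/O in-core state regardless of the outcome. */
	return;
}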
@@ -4770,6 +4782,7 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) dsl_dataset_block_born(ds, bp, tx); } +cleanup: mutex_enter(&db->db_mtx); DBUF_VERIFY(db); diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index cda1472a77aa..c2a09e858d82 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -1132,12 +1132,16 @@ dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, { dmu_buf_t **dbp; int numbufs; + int error; if (size == 0) return; - VERIFY0(dmu_buf_hold_array(os, object, offset, size, - FALSE, FTAG, &numbufs, &dbp)); + error = dmu_buf_hold_array(os, object, offset, size, + FALSE, FTAG, &numbufs, &dbp); + VERIFY(error == 0 || spa_exiting_any(os->os_spa)); + if (error != 0) + return; dmu_write_impl(dbp, numbufs, offset, size, buf, tx); dmu_buf_rele_array(dbp, numbufs, FTAG); } @@ -1151,12 +1155,16 @@ dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, { dmu_buf_t **dbp; int numbufs; + int error; if (size == 0) return; - VERIFY0(dmu_buf_hold_array_by_dnode(dn, offset, size, - FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH)); + error = dmu_buf_hold_array_by_dnode(dn, offset, size, + FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH); + VERIFY(error == 0 || spa_exiting_any(dn->dn_objset->os_spa)); + if (error != 0) + return; dmu_write_impl(dbp, numbufs, offset, size, buf, tx); dmu_buf_rele_array(dbp, numbufs, FTAG); } @@ -1188,11 +1196,15 @@ dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset, int compressed_size, int byteorder, dmu_tx_t *tx) { dmu_buf_t *db; + int error; ASSERT3U(etype, <, NUM_BP_EMBEDDED_TYPES); ASSERT3U(comp, <, ZIO_COMPRESS_FUNCTIONS); - VERIFY0(dmu_buf_hold_noread(os, object, offset, - FTAG, &db)); + error = dmu_buf_hold_noread(os, object, offset, + FTAG, &db); + VERIFY(error == 0 || spa_exiting_any(os->os_spa)); + if (error != 0) + return; dmu_buf_write_embedded(db, data, (bp_embedded_type_t)etype, (enum zio_compress)comp, diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c index c19ebf424953..689589c36edf 100644 --- a/module/zfs/dmu_objset.c +++ b/module/zfs/dmu_objset.c @@ -1608,6 +1608,11 @@ dmu_objset_write_done(zio_t *zio, arc_buf_t *abuf, void *arg) blkptr_t *bp_orig = &zio->io_bp_orig; objset_t *os = arg; + if (zio->io_error != 0) { + ASSERT(spa_exiting_any(zio->io_spa)); + goto done; + } + if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { ASSERT(BP_EQUAL(bp, bp_orig)); } else { @@ -1617,6 +1622,8 @@ dmu_objset_write_done(zio_t *zio, arc_buf_t *abuf, void *arg) (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE); dsl_dataset_block_born(ds, bp, tx); } + +done: kmem_free(bp, sizeof (*bp)); } @@ -1856,6 +1863,7 @@ do_userquota_cacheflush(objset_t *os, userquota_cache_t *cache, dmu_tx_t *tx) { void *cookie; userquota_node_t *uqn; + int error; ASSERT(dmu_tx_is_syncing(tx)); @@ -1867,10 +1875,13 @@ do_userquota_cacheflush(objset_t *os, userquota_cache_t *cache, dmu_tx_t *tx) * zap_increment_int(). It's needed because zap_increment_int() * is not thread-safe (i.e. not atomic). 
*/ - mutex_enter(&os->os_userused_lock); - VERIFY0(zap_increment(os, DMU_USERUSED_OBJECT, - uqn->uqn_id, uqn->uqn_delta, tx)); - mutex_exit(&os->os_userused_lock); + if (!dmu_objset_exiting(os)) { + mutex_enter(&os->os_userused_lock); + error = zap_increment(os, DMU_USERUSED_OBJECT, + uqn->uqn_id, uqn->uqn_delta, tx); + VERIFY(error == 0 || dmu_objset_exiting(os)); + mutex_exit(&os->os_userused_lock); + } kmem_free(uqn, sizeof (*uqn)); } avl_destroy(&cache->uqc_user_deltas); @@ -1878,10 +1889,13 @@ do_userquota_cacheflush(objset_t *os, userquota_cache_t *cache, dmu_tx_t *tx) cookie = NULL; while ((uqn = avl_destroy_nodes(&cache->uqc_group_deltas, &cookie)) != NULL) { - mutex_enter(&os->os_userused_lock); - VERIFY0(zap_increment(os, DMU_GROUPUSED_OBJECT, - uqn->uqn_id, uqn->uqn_delta, tx)); - mutex_exit(&os->os_userused_lock); + if (!dmu_objset_exiting(os)) { + mutex_enter(&os->os_userused_lock); + error = zap_increment(os, DMU_GROUPUSED_OBJECT, + uqn->uqn_id, uqn->uqn_delta, tx); + VERIFY(error == 0 || dmu_objset_exiting(os)); + mutex_exit(&os->os_userused_lock); + } kmem_free(uqn, sizeof (*uqn)); } avl_destroy(&cache->uqc_group_deltas); @@ -1891,8 +1905,9 @@ do_userquota_cacheflush(objset_t *os, userquota_cache_t *cache, dmu_tx_t *tx) while ((uqn = avl_destroy_nodes(&cache->uqc_project_deltas, &cookie)) != NULL) { mutex_enter(&os->os_userused_lock); - VERIFY0(zap_increment(os, DMU_PROJECTUSED_OBJECT, - uqn->uqn_id, uqn->uqn_delta, tx)); + error = zap_increment(os, DMU_PROJECTUSED_OBJECT, + uqn->uqn_id, uqn->uqn_delta, tx); + VERIFY(error == 0 || dmu_objset_exiting(os)); mutex_exit(&os->os_userused_lock); kmem_free(uqn, sizeof (*uqn)); } @@ -2011,6 +2026,7 @@ userquota_updates_task(void *arg) flags = dn->dn_id_flags; ASSERT(flags); + if (flags & DN_ID_OLD_EXIST) { do_userquota_update(os, &cache, dn->dn_oldused, dn->dn_oldflags, dn->dn_olduid, dn->dn_oldgid, @@ -2347,8 +2363,9 @@ dmu_objset_space_upgrade(objset_t *os) if (err != 0) return (err); - if (issig(JUSTLOOKING) && issig(FORREAL)) - return (SET_ERROR(EINTR)); + err = spa_operation_interrupted(os->os_spa); + if (err != 0) + return (err); objerr = dmu_bonus_hold(os, obj, FTAG, &db); if (objerr != 0) @@ -3036,6 +3053,52 @@ dmu_objset_willuse_space(objset_t *os, int64_t space, dmu_tx_t *tx) dsl_pool_dirty_space(dmu_tx_pool(tx), space, tx); } +/* + * Notify the objset that it's being shutdown. This is primarily useful + * when attempting to dislodge any references that might be waiting on a txg + * or similar. + */ +int +dmu_objset_shutdown_register(objset_t *os) +{ + int ret = 0; + + mutex_enter(&os->os_lock); + if (os->os_shutdown_initiator == NULL) { + os->os_shutdown_initiator = curthread; + } else { + ret = SET_ERROR(EBUSY); + } + mutex_exit(&os->os_lock); + + /* + * Signal things that will check for objset force export. The calling + * thread must use a secondary mechanism to check for ref drops, + * before calling dmu_objset_shutdown_unregister(). 
+ */ + if (ret == 0) { + txg_completion_notify(spa_get_dsl(dmu_objset_spa(os))); + } + + return (ret); +} + +boolean_t +dmu_objset_exiting(objset_t *os) +{ + + return (os->os_shutdown_initiator != NULL || + spa_exiting_any(os->os_spa)); +} + +void +dmu_objset_shutdown_unregister(objset_t *os) +{ + + ASSERT3P(os->os_shutdown_initiator, ==, curthread); + os->os_shutdown_initiator = NULL; +} + #if defined(_KERNEL) EXPORT_SYMBOL(dmu_objset_zil); EXPORT_SYMBOL(dmu_objset_pool); diff --git a/module/zfs/dmu_recv.c b/module/zfs/dmu_recv.c index c2ce5ce000ac..c86ab81f71f9 100644 --- a/module/zfs/dmu_recv.c +++ b/module/zfs/dmu_recv.c @@ -73,7 +73,6 @@ static uint_t zfs_recv_queue_ff = 20; static uint_t zfs_recv_write_batch_size = 1024 * 1024; static int zfs_recv_best_effort_corrective = 0; -static const void *const dmu_recv_tag = "dmu_recv_tag"; const char *const recv_clone_name = "%recv"; typedef enum { @@ -82,6 +81,9 @@ typedef enum { ORNS_MAYBE } or_need_sync_t; +/* The receive was closed by an external call. */ +#define DRC_CLOSED (1U << 0) + static int receive_read_payload_and_next_header(dmu_recv_cookie_t *ra, int len, void *buf); @@ -356,6 +358,34 @@ recv_check_large_blocks(dsl_dataset_t *ds, uint64_t featureflags) return (0); } +static void +recv_own(dsl_pool_t *dp, dmu_tx_t *tx, uint64_t dsobj, ds_hold_flags_t dsflags, + dmu_recv_cookie_t *drc, dsl_dataset_t **dsp, objset_t **osp) +{ + dsl_dataset_t *ds; + + /* + * The dataset must be marked inconsistent before exit in any event, + * so dirty it now. This ensures it's cleaned up if interrupted. + */ + VERIFY0(dsl_dataset_own_obj_force(dp, dsobj, dsflags, drc, &ds)); + dmu_buf_will_dirty(ds->ds_dbuf, tx); + dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_INCONSISTENT; + ds->ds_receiver = drc; + *dsp = ds; + VERIFY0(dmu_objset_from_ds(ds, osp)); +} + +static void +recv_disown(dsl_dataset_t *ds, dmu_recv_cookie_t *drc) +{ + ds_hold_flags_t dsflags = (drc->drc_raw) ? 0 : DS_HOLD_FLAG_DECRYPT; + + ASSERT3P(ds->ds_receiver, ==, drc); + ds->ds_receiver = NULL; + dsl_dataset_disown(ds, dsflags, drc); +} + static int recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds, uint64_t fromguid, uint64_t featureflags) @@ -906,8 +936,8 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx) dsl_dir_rele(dd, FTAG); drc->drc_newfs = B_TRUE; } - VERIFY0(dsl_dataset_own_obj_force(dp, dsobj, dsflags, dmu_recv_tag, - &newds)); + recv_own(dp, tx, dsobj, dsflags, drba->drba_cookie, &newds, &os); + if (dsl_dataset_feature_is_active(newds, SPA_FEATURE_REDACTED_DATASETS)) { /* @@ -987,9 +1017,6 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx) numredactsnaps, tx); } - dmu_buf_will_dirty(newds->ds_dbuf, tx); - dsl_dataset_phys(newds)->ds_flags |= DS_FLAG_INCONSISTENT; - /* * If we actually created a non-clone, we need to create the objset * in our new dataset. 
If this is a raw send we postpone this until @@ -1175,8 +1202,9 @@ dmu_recv_resume_begin_sync(void *arg, dmu_tx_t *tx) { dmu_recv_begin_arg_t *drba = arg; dsl_pool_t *dp = dmu_tx_pool(tx); - const char *tofs = drba->drba_cookie->drc_tofs; - uint64_t featureflags = drba->drba_cookie->drc_featureflags; + dmu_recv_cookie_t *drc = drba->drba_cookie; + const char *tofs = drc->drc_tofs; + uint64_t featureflags = drc->drc_featureflags; dsl_dataset_t *ds; ds_hold_flags_t dsflags = DS_HOLD_FLAG_NONE; /* 6 extra bytes for /%recv */ @@ -1186,28 +1214,26 @@ dmu_recv_resume_begin_sync(void *arg, dmu_tx_t *tx) recv_clone_name); if (featureflags & DMU_BACKUP_FEATURE_RAW) { - drba->drba_cookie->drc_raw = B_TRUE; + drc->drc_raw = B_TRUE; } else { dsflags |= DS_HOLD_FLAG_DECRYPT; } - if (dsl_dataset_own_force(dp, recvname, dsflags, dmu_recv_tag, &ds) - != 0) { + if (dsl_dataset_own_force(dp, recvname, dsflags, drc, &ds) != 0) { /* %recv does not exist; continue in tofs */ - VERIFY0(dsl_dataset_own_force(dp, tofs, dsflags, dmu_recv_tag, - &ds)); - drba->drba_cookie->drc_newfs = B_TRUE; + VERIFY0(dsl_dataset_own_force(dp, tofs, dsflags, drc, &ds)); + drc->drc_newfs = B_TRUE; } + ds->ds_receiver = drc; ASSERT(DS_IS_INCONSISTENT(ds)); rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); - ASSERT(!BP_IS_HOLE(dsl_dataset_get_blkptr(ds)) || - drba->drba_cookie->drc_raw); + ASSERT(!BP_IS_HOLE(dsl_dataset_get_blkptr(ds)) || drc->drc_raw); rrw_exit(&ds->ds_bp_rwlock, FTAG); - drba->drba_cookie->drc_ds = ds; - VERIFY0(dmu_objset_from_ds(ds, &drba->drba_cookie->drc_os)); - drba->drba_cookie->drc_should_save = B_TRUE; + drc->drc_ds = ds; + VERIFY0(dmu_objset_from_ds(ds, &drc->drc_os)); + drc->drc_should_save = B_TRUE; spa_history_log_internal_ds(ds, "resume receive", tx, " "); } @@ -1227,6 +1253,7 @@ dmu_recv_begin(const char *tofs, const char *tosnap, int err = 0; memset(drc, 0, sizeof (dmu_recv_cookie_t)); + drc->drc_initiator = curthread; drc->drc_drr_begin = drr_begin; drc->drc_drrb = &drr_begin->drr_u.drr_begin; drc->drc_tosnap = tosnap; @@ -1331,6 +1358,16 @@ dmu_recv_begin(const char *tofs, const char *tosnap, } } + if (err == 0 && drc->drc_ds == NULL) { + /* + * Make sure the dataset is destroyed before returning. We + * can't do this in the sync task because a dataset can't be + * synced and destroyed in the same txg. In this scenario, + * it should be flagged as inconsistent so we're ok anyway. + */ + (void) dsl_destroy_head(tofs); + return (SET_ERROR(ENXIO)); + } if (err != 0) { kmem_free(drc->drc_next_rrd, sizeof (*drc->drc_next_rrd)); nvlist_free(drc->drc_begin_nvl); @@ -2661,29 +2698,37 @@ static void dmu_recv_cleanup_ds(dmu_recv_cookie_t *drc) { dsl_dataset_t *ds = drc->drc_ds; - ds_hold_flags_t dsflags; + objset_t *os = ds->ds_objset; + int error = 0; - dsflags = (drc->drc_raw) ? DS_HOLD_FLAG_NONE : DS_HOLD_FLAG_DECRYPT; /* * Wait for the txg sync before cleaning up the receive. For * resumable receives, this ensures that our resume state has * been written out to disk. For raw receives, this ensures * that the user accounting code will not attempt to do anything * after we stopped receiving the dataset. + * + * If this is interrupted due to suspension and the pool is being + * force exported, just exit and cleanup. 
*/ - txg_wait_synced(ds->ds_dir->dd_pool, 0); + for (;;) { + error = txg_wait_synced_tx(ds->ds_dir->dd_pool, 0, + NULL, TXG_WAIT_F_NOSUSPEND); + if (error == 0 || spa_exiting_any(os->os_spa)) + break; + } ds->ds_objset->os_raw_receive = B_FALSE; rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); if (drc->drc_resumable && drc->drc_should_save && !BP_IS_HOLE(dsl_dataset_get_blkptr(ds))) { rrw_exit(&ds->ds_bp_rwlock, FTAG); - dsl_dataset_disown(ds, dsflags, dmu_recv_tag); + recv_disown(ds, drc); } else { char name[ZFS_MAX_DATASET_NAME_LEN]; rrw_exit(&ds->ds_bp_rwlock, FTAG); dsl_dataset_name(ds, name); - dsl_dataset_disown(ds, dsflags, dmu_recv_tag); + recv_disown(ds, drc); if (!drc->drc_heal) (void) dsl_destroy_head(name); } @@ -3244,6 +3289,35 @@ resume_check(dmu_recv_cookie_t *drc, nvlist_t *begin_nvl) return (0); } +/* + * Cancel the receive stream for the dataset, if there is one. + */ +int +dmu_recv_close(dsl_dataset_t *ds) +{ + int err = 0; + dmu_recv_cookie_t *drc; + + /* + * This lock isn't technically for recv, but it's not worth + * adding a dedicated one for this purpose. + */ + mutex_enter(&ds->ds_sendstream_lock); + drc = ds->ds_receiver; + if (drc != NULL) { + drc->drc_flags |= DRC_CLOSED; + /* + * Send an interrupt to the initiator thread, which will + * cause it to end the stream and clean up. + */ + if (drc->drc_initiator != curthread) + thread_signal(drc->drc_initiator, SIGINT); + } + mutex_exit(&ds->ds_sendstream_lock); + + return (err); +} + /* * Read in the stream's records, one by one, and apply them to the pool. There * are two threads involved; the thread that calls this function will spin up a @@ -3260,6 +3334,7 @@ int dmu_recv_stream(dmu_recv_cookie_t *drc, offset_t *voffp) { int err = 0; + spa_t *spa = dsl_dataset_get_spa(drc->drc_ds); struct receive_writer_arg *rwa = kmem_zalloc(sizeof (*rwa), KM_SLEEP); if (dsl_dataset_has_resume_receive_state(drc->drc_ds)) { @@ -3300,7 +3375,7 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, offset_t *voffp) * are sure the rest of the receive succeeded so we * stash the keynvl away until then. */ - err = dsl_crypto_recv_raw(spa_name(drc->drc_os->os_spa), + err = dsl_crypto_recv_raw(spa_name(spa), drc->drc_ds->ds_object, drc->drc_fromsnapobj, drc->drc_drrb->drr_type, keynvl, drc->drc_newfs); if (err != 0) @@ -3340,6 +3415,12 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, offset_t *voffp) */ drc->drc_should_save = B_TRUE; + /* Last chance before kicking off. */ + if (drc->drc_flags & DRC_CLOSED) { + err = SET_ERROR(EINTR); + goto out; + } + (void) bqueue_init(&rwa->q, zfs_recv_queue_ff, MAX(zfs_recv_queue_length, 2 * zfs_max_recordsize), offsetof(struct receive_record_arg, node)); @@ -3361,8 +3442,17 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, offset_t *voffp) list_create(&rwa->write_batch, sizeof (struct receive_record_arg), offsetof(struct receive_record_arg, node.bqn_node)); - (void) thread_create(NULL, 0, receive_writer_thread, rwa, 0, curproc, - TS_RUN, minclsyspri); + /* + * Register the rwa with the drc so it can be interrupted. This + * requires a mutex handshake to ensure validity. + */ + mutex_enter(&drc->drc_ds->ds_sendstream_lock); + drc->drc_rwa = rwa; + mutex_exit(&drc->drc_ds->ds_sendstream_lock); + + kthread_t *rw_td = thread_create(NULL, 0, receive_writer_thread, + rwa, 0, curproc, TS_RUN, minclsyspri); + /* * We're reading rwa->err without locks, which is safe since we are the * only reader, and the worker thread is the only writer. 
It's ok if we @@ -3378,11 +3468,10 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, offset_t *voffp) * it. Finally, if receive_read_record fails or we're at the end of the * stream, then we free drc->drc_rrd and exit. */ - while (rwa->err == 0) { - if (issig(JUSTLOOKING) && issig(FORREAL)) { - err = SET_ERROR(EINTR); + while (rwa->err == 0 && err == 0) { + err = spa_operation_interrupted(dmu_objset_spa(rwa->os)); + if (err) break; - } ASSERT3P(drc->drc_rrd, ==, NULL); drc->drc_rrd = drc->drc_next_rrd; @@ -3409,9 +3498,22 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, offset_t *voffp) mutex_enter(&rwa->mutex); while (!rwa->done) { + boolean_t closed = drc->drc_flags & DRC_CLOSED; + + if (!closed) { + if (err == 0) + err = spa_operation_interrupted(spa); + if (err != 0) { + drc->drc_flags |= DRC_CLOSED; + thread_signal(rw_td, SIGINT); + closed = B_TRUE; + } + } + /* * We need to use cv_wait_sig() so that any process that may - * be sleeping here can still fork. + * be sleeping here can still fork. Also, it allows + * dmu_recv_close to cause an eos marker to be injected. */ (void) cv_wait_sig(&rwa->cv, &rwa->mutex); } @@ -3443,6 +3545,10 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, offset_t *voffp) } } + mutex_enter(&drc->drc_ds->ds_sendstream_lock); + drc->drc_rwa = NULL; + mutex_exit(&drc->drc_ds->ds_sendstream_lock); + cv_destroy(&rwa->cv); mutex_destroy(&rwa->mutex); bqueue_destroy(&rwa->q); @@ -3491,7 +3597,7 @@ dmu_recv_end_check(void *arg, dmu_tx_t *tx) dsl_pool_t *dp = dmu_tx_pool(tx); int error; - ASSERT3P(drc->drc_ds->ds_owner, ==, dmu_recv_tag); + ASSERT3P(drc->drc_ds->ds_receiver, ==, drc); if (drc->drc_heal) { error = 0; @@ -3716,7 +3822,7 @@ dmu_recv_end_sync(void *arg, dmu_tx_t *tx) (void) spa_keystore_remove_mapping(dmu_tx_pool(tx)->dp_spa, drc->drc_ds->ds_object, drc->drc_ds); } - dsl_dataset_disown(drc->drc_ds, 0, dmu_recv_tag); + recv_disown(drc->drc_ds, drc); drc->drc_ds = NULL; } @@ -3782,7 +3888,7 @@ boolean_t dmu_objset_is_receiving(objset_t *os) { return (os->os_dsl_dataset != NULL && - os->os_dsl_dataset->ds_owner == dmu_recv_tag); + os->os_dsl_dataset->ds_receiver != NULL); } ZFS_MODULE_PARAM(zfs_recv, zfs_recv_, queue_length, UINT, ZMOD_RW, diff --git a/module/zfs/dmu_redact.c b/module/zfs/dmu_redact.c index 6bd35713ff18..897e8d9ed029 100644 --- a/module/zfs/dmu_redact.c +++ b/module/zfs/dmu_redact.c @@ -565,7 +565,14 @@ commit_rl_updates(objset_t *os, struct merge_data *md, uint64_t object, { dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(os->os_spa)->dp_mos_dir); dmu_tx_hold_space(tx, sizeof (struct redact_block_list_node)); - VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + + int err = dmu_tx_assign(tx, TXG_WAIT); + if (err != 0) { + ASSERT(spa_exiting_any(os->os_spa)); + dmu_tx_abort(tx); + return; + } + uint64_t txg = dmu_tx_get_txg(tx); if (!md->md_synctask_txg[txg & TXG_MASK]) { dsl_sync_task_nowait(dmu_tx_pool(tx), diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c index 5b7f5543ad09..d7591550034a 100644 --- a/module/zfs/dmu_send.c +++ b/module/zfs/dmu_send.c @@ -2220,6 +2220,7 @@ setup_send_progress(struct dmu_send_params *dspp) dssp->dss_outfd = dspp->outfd; dssp->dss_off = dspp->off; dssp->dss_proc = curproc; + dssp->dss_thread = curthread; mutex_enter(&dspp->to_ds->ds_sendstream_lock); list_insert_head(&dspp->to_ds->ds_sendstreams, dssp); mutex_exit(&dspp->to_ds->ds_sendstream_lock); @@ -2509,6 +2510,14 @@ dmu_send_impl(struct dmu_send_params *dspp) } } + /* + * Last chance, bail if possible at this point, now that the send is + * registered and can be cancelled by 
signalling this thread. + */ + err = spa_operation_interrupted(os->os_spa); + if (err != 0) + goto out; + if (resuming || book_resuming) { err = setup_resume_points(dspp, to_arg, from_arg, rlt_arg, smt_arg, resuming, os, redact_rl, nvl); @@ -2555,8 +2564,8 @@ dmu_send_impl(struct dmu_send_params *dspp) while (err == 0 && !range->eos_marker) { err = do_dump(&dsc, range); range = get_next_range(&srt_arg->q, range); - if (issig(JUSTLOOKING) && issig(FORREAL)) - err = SET_ERROR(EINTR); + if (err == 0) + err = spa_operation_interrupted(os->os_spa); } /* @@ -3099,6 +3108,30 @@ dmu_send_estimate_fast(dsl_dataset_t *origds, dsl_dataset_t *fromds, return (err); } +/* Close all send streams on the dataset. */ +int +dmu_send_close(dsl_dataset_t *ds) +{ + int err = 0; + dmu_sendstatus_t *dss; + + mutex_enter(&ds->ds_sendstream_lock); + dss = list_head(&ds->ds_sendstreams); + while (err == 0 && dss != NULL) { + /* + * Interrupt the initiator thread, which will cause it + * to initiate a cleanup error exit. Also send SIGPIPE + * because this interrupts pipe writes. + */ + thread_signal(dss->dss_thread, SIGINT); + thread_signal(dss->dss_thread, SIGPIPE); + dss = list_next(&ds->ds_sendstreams, dss); + } + mutex_exit(&ds->ds_sendstream_lock); + + return (0); +} + ZFS_MODULE_PARAM(zfs_send, zfs_send_, corrupt_data, INT, ZMOD_RW, "Allow sending corrupt data"); diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c index 1c5608c4541b..0235f2198b94 100644 --- a/module/zfs/dmu_tx.c +++ b/module/zfs/dmu_tx.c @@ -917,6 +917,19 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how) if (spa_suspended(spa)) { DMU_TX_STAT_BUMP(dmu_tx_suspended); + if (txg_how & TXG_NOSUSPEND) + return (SET_ERROR(EAGAIN)); + + /* + * If the user is forcibly exporting the pool or the objset, + * indicate to the caller that they need to give up. + */ + if (spa_exiting_any(spa)) + return (SET_ERROR(EIO)); + + if (tx->tx_objset != NULL && dmu_objset_exiting(tx->tx_objset)) + return (SET_ERROR(EIO)); + /* * If the user has indicated a blocking failure mode * then return ERESTART which will block in dmu_tx_wait(). @@ -1050,6 +1063,8 @@ dmu_tx_unassign(dmu_tx_t *tx) tx->tx_txg = 0; } +static void dmu_tx_wait_flags(dmu_tx_t *, txg_wait_flag_t); + /* * Assign tx to a transaction group; txg_how is a bitmask: * @@ -1070,6 +1085,11 @@ dmu_tx_unassign(dmu_tx_t *tx) * they have already called dmu_tx_wait() (though most likely on a * different tx). * + * If TXG_NOSUSPEND is set, this indicates that this request must return + * EAGAIN if the pool becomes suspended while it is in progress. This + * ensures that the request does not inadvertently cause conditions that + * cannot be unwound. + * * It is guaranteed that subsequent successful calls to dmu_tx_assign() * will assign the tx to monotonically increasing txgs. Of course this is * not strong monotonicity, because the same txg can be returned multiple @@ -1092,7 +1112,7 @@ dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how) int err; ASSERT(tx->tx_txg == 0); - ASSERT0(txg_how & ~(TXG_WAIT | TXG_NOTHROTTLE)); + ASSERT0(txg_how & ~(TXG_NOSUSPEND | TXG_WAIT | TXG_NOTHROTTLE)); ASSERT(!dsl_pool_sync_context(tx->tx_pool)); /* If we might wait, we must not hold the config lock. */ @@ -1107,7 +1127,8 @@ dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how) if (err != ERESTART || !(txg_how & TXG_WAIT)) return (err); - dmu_tx_wait(tx); + dmu_tx_wait_flags(tx, + (txg_how & TXG_NOSUSPEND) ? 
TXG_WAIT_F_NOSUSPEND : 0); } txg_rele_to_quiesce(&tx->tx_txgh); @@ -1115,8 +1136,8 @@ dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how) return (0); } -void -dmu_tx_wait(dmu_tx_t *tx) +static void +dmu_tx_wait_flags(dmu_tx_t *tx, txg_wait_flag_t how) { spa_t *spa = tx->tx_pool->dp_spa; dsl_pool_t *dp = tx->tx_pool; @@ -1161,8 +1182,11 @@ dmu_tx_wait(dmu_tx_t *tx) * has become active after this thread has tried to * obtain a tx. If that's the case then tx_lasttried_txg * would not have been set. + * + * It's also possible the pool will be force exported, in + * which case we'll try again and notice this fact, and exit. */ - txg_wait_synced(dp, spa_last_synced_txg(spa) + 1); + txg_wait_synced_tx(dp, spa_last_synced_txg(spa) + 1, tx, how); } else if (tx->tx_needassign_txh) { dnode_t *dn = tx->tx_needassign_txh->txh_dnode; @@ -1176,13 +1200,23 @@ dmu_tx_wait(dmu_tx_t *tx) * If we have a lot of dirty data just wait until we sync * out a TXG at which point we'll hopefully have synced * a portion of the changes. + * + * It's also possible the pool will be force exported, in + * which case we'll try again and notice this fact, and exit. */ - txg_wait_synced(dp, spa_last_synced_txg(spa) + 1); + txg_wait_synced_tx(dp, spa_last_synced_txg(spa) + 1, tx, how); } spa_tx_assign_add_nsecs(spa, gethrtime() - before); } +void +dmu_tx_wait(dmu_tx_t *tx) +{ + + return (dmu_tx_wait_flags(tx, TXG_WAIT_F_NONE)); +} + static void dmu_tx_destroy(dmu_tx_t *tx) { diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c index 14e7ced4007c..f8a91073614e 100644 --- a/module/zfs/dsl_dataset.c +++ b/module/zfs/dsl_dataset.c @@ -44,6 +44,9 @@ #include #include #include +#include +#include +#include #include #include #include @@ -566,8 +569,8 @@ dsl_dataset_try_add_ref(dsl_pool_t *dp, dsl_dataset_t *ds, const void *tag) } int -dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, const void *tag, - dsl_dataset_t **dsp) +dsl_dataset_hold_obj_flags(dsl_pool_t *dp, uint64_t dsobj, + ds_hold_flags_t flags, const void *tag, dsl_dataset_t **dsp) { objset_t *mos = dp->dp_meta_objset; dmu_buf_t *dbuf; @@ -590,6 +593,11 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, const void *tag, ds = dmu_buf_get_user(dbuf); if (ds == NULL) { + if (flags & DS_HOLD_FLAG_MUST_BE_OPEN) { + dmu_buf_rele(dbuf, tag); + return (SET_ERROR(ENXIO)); + } + dsl_dataset_t *winner = NULL; ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP); @@ -734,6 +742,15 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, const void *tag, } } + if (err == 0 && (flags & DS_HOLD_FLAG_DECRYPT)) { + err = dsl_dataset_create_key_mapping(ds); + if (err != 0) + dsl_dataset_rele(ds, tag); + } + + if (err != 0) + return (err); + ASSERT3P(ds->ds_dbuf, ==, dbuf); ASSERT3P(dsl_dataset_phys(ds), ==, dbuf->db_data); ASSERT(dsl_dataset_phys(ds)->ds_prev_snap_obj != 0 || @@ -757,24 +774,10 @@ dsl_dataset_create_key_mapping(dsl_dataset_t *ds) } int -dsl_dataset_hold_obj_flags(dsl_pool_t *dp, uint64_t dsobj, - ds_hold_flags_t flags, const void *tag, dsl_dataset_t **dsp) +dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, const void *tag, + dsl_dataset_t **dsp) { - int err; - - err = dsl_dataset_hold_obj(dp, dsobj, tag, dsp); - if (err != 0) - return (err); - - ASSERT3P(*dsp, !=, NULL); - - if (flags & DS_HOLD_FLAG_DECRYPT) { - err = dsl_dataset_create_key_mapping(*dsp); - if (err != 0) - dsl_dataset_rele(*dsp, tag); - } - - return (err); + return (dsl_dataset_hold_obj_flags(dp, dsobj, 0, tag, dsp)); } int @@ -925,6 +928,115 @@ dsl_dataset_long_held(dsl_dataset_t *ds) return 
(!zfs_refcount_is_zero(&ds->ds_longholds)); } +/* + * Enumerate active datasets. This function is intended for use cases that + * want to avoid I/O, and only operate on those that have been loaded in + * memory. This works by enumerating the objects in the MOS that are known, + * and calling back with each dataset's MOS object IDs. It would be nice if + * the objset_t's were registered in a spa_t global list, but they're not, + * so this implementation is a bit more complex... + */ +static int +dsl_dataset_active_foreach(spa_t *spa, int func(dsl_dataset_t *, void *), + void *cl) +{ + dsl_pool_t *dp = spa_get_dsl(spa); + objset_t *mos = dp->dp_meta_objset; + dnode_t *mdn = DMU_META_DNODE(mos); + dmu_buf_impl_t *db; + uint64_t blkid, dsobj, i; + dnode_children_t *children_dnodes; + dnode_handle_t *dnh; + dsl_dataset_t *ds; + int epb, error; + int ret = 0; + + /* + * For each block of the MOS's meta-dnode's full size: + * - If the block is not cached, skip. + * - If the block has no user, skip. + * - For each dnode child of the meta-dnode block: + * - If not loaded (no dnode pointer), skip. + * - Attempt to hold the dataset, skip on failure. + * - Call the callback, quit if returns non zero, + * - Rele the dataset either way. + */ + rrw_enter(&dp->dp_config_rwlock, RW_READER, FTAG); + rw_enter(&mdn->dn_struct_rwlock, RW_READER); + for (blkid = dsobj = 0; + ret == 0 && blkid <= mdn->dn_maxblkid; + blkid++, dsobj += epb) { + epb = DNODES_PER_BLOCK; + error = dbuf_hold_impl(mdn, 0, blkid, TRUE, TRUE, FTAG, &db); + if (error != 0) { + continue; + } + + epb = db->db.db_size >> DNODE_SHIFT; + children_dnodes = dmu_buf_get_user(&db->db); + if (children_dnodes == NULL) { + goto skip; + } + + for (i = 0; ret == 0 && i < epb; i++) { + dnh = &children_dnodes->dnc_children[i]; + if (!DN_SLOT_IS_PTR(dnh->dnh_dnode)) + continue; + + error = dsl_dataset_hold_obj_flags(dp, dsobj + i, + DS_HOLD_FLAG_MUST_BE_OPEN, FTAG, &ds); + if (error != 0) + continue; + + ret = func(ds, cl); + dsl_dataset_rele(ds, FTAG); + } + +skip: + dbuf_rele(db, FTAG); + } + rw_exit(&mdn->dn_struct_rwlock); + rrw_exit(&dp->dp_config_rwlock, FTAG); + + return (ret); +} + +/* + * Cancellation interfaces for send/receive streams. + * + * If a send/recv wins the race with a forced destroy, their pipes will be + * interrupted, and the destroy will wait for all ioctl references to drop. + * + * If a forced destroy wins the race, the send/receive will fail to start. + */ + +/* dsl_dataset_sendrecv_cancel_all callback for dsl_dataset_active_foreach. */ +static int +dsl_dataset_sendrecv_cancel_cb(dsl_dataset_t *ds, __maybe_unused void *arg) +{ + (void) arg; + int err; + + err = dmu_send_close(ds); + if (err == 0) + err = dmu_recv_close(ds); + + return (err); +} + +/* + * Cancel all outstanding sends/receives. Used when the pool is trying to + * forcibly exit. Iterates on all datasets in the MOS and cancels any + * running sends/receives by interrupting them. 
+ */ +int +dsl_dataset_sendrecv_cancel_all(spa_t *spa) +{ + + return (dsl_dataset_active_foreach(spa, + dsl_dataset_sendrecv_cancel_cb, NULL)); +} + void dsl_dataset_name(dsl_dataset_t *ds, char *name) { diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c index 9120fef93c74..b4e750553ea9 100644 --- a/module/zfs/dsl_pool.c +++ b/module/zfs/dsl_pool.c @@ -680,6 +680,7 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) dsl_dataset_t *ds; objset_t *mos = dp->dp_meta_objset; list_t synced_datasets; + int error; list_create(&synced_datasets, sizeof (dsl_dataset_t), offsetof(dsl_dataset_t, ds_synced_link)); @@ -717,7 +718,8 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) list_insert_tail(&synced_datasets, ds); dsl_dataset_sync(ds, zio, tx); } - VERIFY0(zio_wait(zio)); + error = zio_wait(zio); + VERIFY(error == 0 || (spa_exiting_any(zio->io_spa) && error == EIO)); /* * Update the long range free counter after diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index d6a9365df120..a1479bf749b6 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -879,8 +879,7 @@ dsl_scan(dsl_pool_t *dp, pool_scan_func_t func) (void) spa_vdev_state_exit(spa, NULL, 0); if (func == POOL_SCAN_RESILVER) { - dsl_scan_restart_resilver(spa->spa_dsl_pool, 0); - return (0); + return (dsl_scan_restart_resilver(spa->spa_dsl_pool, 0)); } if (func == POOL_SCAN_SCRUB && dsl_scan_is_paused_scrub(scn)) { @@ -1143,13 +1142,20 @@ dsl_scrub_set_pause_resume(const dsl_pool_t *dp, pool_scrub_cmd_t cmd) /* start a new scan, or restart an existing one. */ -void +int dsl_scan_restart_resilver(dsl_pool_t *dp, uint64_t txg) { + int error; + if (txg == 0) { dmu_tx_t *tx; tx = dmu_tx_create_dd(dp->dp_mos_dir); - VERIFY(0 == dmu_tx_assign(tx, TXG_WAIT)); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error != 0) { + ASSERT(spa_exiting_any(dp->dp_spa)); + dmu_tx_abort(tx); + return (error); + } txg = dmu_tx_get_txg(tx); dp->dp_scan->scn_restart_txg = txg; @@ -1159,6 +1165,8 @@ dsl_scan_restart_resilver(dsl_pool_t *dp, uint64_t txg) } zfs_dbgmsg("restarting resilver for %s at txg=%llu", dp->dp_spa->spa_name, (longlong_t)txg); + + return (0); } void @@ -2812,13 +2820,18 @@ dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx) dsl_dataset_t *ds; uint64_t dsobj = sds->sds_dsobj; uint64_t txg = sds->sds_txg; + int error; /* dequeue and free the ds from the queue */ scan_ds_queue_remove(scn, dsobj); sds = NULL; /* set up min / max txg */ - VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); + error = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds); + VERIFY(error == 0 || + (spa_exiting_any(dp->dp_spa) && error == EIO)); + if (error != 0) + return; if (txg != 0) { scn->scn_phys.scn_cur_min_txg = MAX(scn->scn_phys.scn_min_txg, txg); diff --git a/module/zfs/dsl_synctask.c b/module/zfs/dsl_synctask.c index 409e12884d91..af11c8bc74fe 100644 --- a/module/zfs/dsl_synctask.c +++ b/module/zfs/dsl_synctask.c @@ -57,7 +57,12 @@ dsl_sync_task_common(const char *pool, dsl_checkfunc_t *checkfunc, top: tx = dmu_tx_create_dd(dp->dp_mos_dir); - VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err != 0) { + ASSERT(spa_exiting_any(spa)); + dmu_tx_abort(tx); + return (err); + } dst.dst_pool = dp; dst.dst_txg = dmu_tx_get_txg(tx); diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index 24d52a74933f..d4d549699183 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -508,6 +508,16 @@ metaslab_class_get_dspace(metaslab_class_t *mc) return (spa_deflate(mc->mc_spa) ? 
mc->mc_dspace : mc->mc_space); } +void +metaslab_class_force_discard(metaslab_class_t *mc) +{ + + mc->mc_alloc = 0; + mc->mc_deferred = 0; + mc->mc_space = 0; + mc->mc_dspace = 0; +} + void metaslab_class_histogram_verify(metaslab_class_t *mc) { @@ -2784,6 +2794,19 @@ metaslab_fini(metaslab_t *msp) metaslab_group_remove(mg, msp); mutex_enter(&msp->ms_lock); + if (spa_exiting_any(mg->mg_vd->vdev_spa)) { + /* Catch-all cleanup as required for force export. */ + range_tree_vacate(msp->ms_allocatable, NULL, NULL); + range_tree_vacate(msp->ms_freeing, NULL, NULL); + range_tree_vacate(msp->ms_freed, NULL, NULL); + range_tree_vacate(msp->ms_checkpointing, NULL, NULL); + for (int t = 0; t < TXG_SIZE; t++) + range_tree_vacate(msp->ms_allocating[t], NULL, NULL); + for (int t = 0; t < TXG_DEFER_SIZE; t++) + range_tree_vacate(msp->ms_defer[t], NULL, NULL); + msp->ms_deferspace = 0; + } + VERIFY(msp->ms_group == NULL); /* @@ -3981,6 +4004,31 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) return; } + /* + * The pool is being forcibly exported. Just discard everything. + */ + if (spa_exiting_any(spa)) { + mutex_enter(&msp->ms_sync_lock); + mutex_enter(&msp->ms_lock); + range_tree_vacate(alloctree, NULL, NULL); + range_tree_vacate(msp->ms_allocatable, NULL, NULL); + range_tree_vacate(msp->ms_freeing, NULL, NULL); + range_tree_vacate(msp->ms_freed, NULL, NULL); + range_tree_vacate(msp->ms_trim, NULL, NULL); + range_tree_vacate(msp->ms_checkpointing, NULL, NULL); + range_tree_vacate(msp->ms_allocating[txg & TXG_MASK], + NULL, NULL); + range_tree_vacate(msp->ms_allocating[TXG_CLEAN(txg) & TXG_MASK], + NULL, NULL); + for (int t = 0; t < TXG_DEFER_SIZE; t++) { + range_tree_vacate(msp->ms_defer[t], NULL, NULL); + } + msp->ms_deferspace = 0; + mutex_exit(&msp->ms_lock); + mutex_exit(&msp->ms_sync_lock); + return; + } + /* * Normally, we don't want to process a metaslab if there are no * allocations or frees to perform. However, if the metaslab is being @@ -4000,7 +4048,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) return; - VERIFY3U(txg, <=, spa_final_dirty_txg(spa)); + spa_verify_dirty_txg(spa, txg); /* * The only state that can actually be changing concurrently @@ -4356,7 +4404,7 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) msp->ms_deferspace += defer_delta; ASSERT3S(msp->ms_deferspace, >=, 0); ASSERT3S(msp->ms_deferspace, <=, msp->ms_size); - if (msp->ms_deferspace != 0) { + if (msp->ms_deferspace != 0 && !spa_exiting_any(spa)) { /* * Keep syncing this metaslab until all deferred frees * are back in circulation. 
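The metaslab and dsl_synctask hunks around this point repeat one calling convention: sync-side code that previously did VERIFY0(dmu_tx_assign(tx, TXG_WAIT)) now tolerates a failure, asserts that the only legitimate cause is a pool being force exported, and aborts the tx. A minimal sketch of that convention follows; example_mos_update() is a hypothetical name, and only calls that already appear in this patch are assumed.

static void
example_mos_update(dsl_pool_t *dp)
{
	dmu_tx_t *tx = dmu_tx_create_dd(dp->dp_mos_dir);

	int error = dmu_tx_assign(tx, TXG_WAIT);
	if (error != 0) {
		/* Only a forced export should make TXG_WAIT fail. */
		ASSERT(spa_exiting_any(dp->dp_spa));
		dmu_tx_abort(tx);
		return;
	}

	/* ... dirty MOS state under tx ... */

	dmu_tx_commit(tx);
}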
@@ -6177,7 +6225,9 @@ metaslab_update_ondisk_flush_data(metaslab_t *ms, dmu_tx_t *tx) int err = zap_lookup(mos, vd->vdev_top_zap, VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1, &object); - if (err == ENOENT) { + if (err != 0 && spa_exiting_any(spa)) { + return; + } else if (err == ENOENT) { object = dmu_object_alloc(mos, DMU_OTN_UINT64_METADATA, SPA_OLD_MAXBLOCKSIZE, DMU_OT_NONE, 0, tx); VERIFY0(zap_add(mos, vd->vdev_top_zap, diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 67601211d6c2..9469ae88e939 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -1378,6 +1378,8 @@ spa_activate(spa_t *spa, spa_mode_t mode) static void spa_deactivate(spa_t *spa) { + int error; + ASSERT(spa->spa_sync_on == B_FALSE); ASSERT(spa->spa_dsl_pool == NULL); ASSERT(spa->spa_root_vdev == NULL); @@ -1417,10 +1419,19 @@ spa_deactivate(spa_t *spa) for (size_t i = 0; i < TXG_SIZE; i++) { ASSERT3P(spa->spa_txg_zio[i], !=, NULL); - VERIFY0(zio_wait(spa->spa_txg_zio[i])); + error = zio_wait(spa->spa_txg_zio[i]); + VERIFY(error == 0 || (spa_exiting_any(spa) && error == EIO)); spa->spa_txg_zio[i] = NULL; } + if (spa_exiting_any(spa)) { + metaslab_class_force_discard(spa->spa_normal_class); + metaslab_class_force_discard(spa->spa_log_class); + metaslab_class_force_discard(spa->spa_embedded_log_class); + metaslab_class_force_discard(spa->spa_special_class); + metaslab_class_force_discard(spa->spa_dedup_class); + } + metaslab_class_destroy(spa->spa_normal_class); spa->spa_normal_class = NULL; @@ -1554,7 +1565,12 @@ static void spa_unload_log_sm_flush_all(spa_t *spa) { dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); - VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + int txerr = dmu_tx_assign(tx, TXG_WAIT); + if (txerr != 0) { + ASSERT(spa_exiting_any(spa)); + dmu_tx_abort(tx); + return; + } ASSERT3U(spa->spa_log_flushall_txg, ==, 0); spa->spa_log_flushall_txg = dmu_tx_get_txg(tx); @@ -1610,9 +1626,13 @@ spa_destroy_aux_threads(spa_t *spa) /* * Opposite of spa_load(). */ -static void -spa_unload(spa_t *spa) +static int +spa_unload(spa_t *spa, txg_wait_flag_t txg_how) { + int err; + vdev_t *vd; + uint64_t t, txg; + ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa_state(spa) != POOL_STATE_UNINITIALIZED); @@ -1655,10 +1675,45 @@ spa_unload(spa_t *spa) * Stop syncing. */ if (spa->spa_sync_on) { - txg_sync_stop(spa->spa_dsl_pool); + err = txg_sync_stop(spa->spa_dsl_pool, txg_how); + if (err != 0) { + spa_async_resume(spa); + return (err); + } spa->spa_sync_on = B_FALSE; } + /* + * If the pool is being forcibly exported, it may be necessary to + * cleanup again. This normally would be handled by spa_sync(), + * except it's possible that followup txg's were skipped, and + * thus the opportunity to have performed these operations. + * + * This is the correct place to perform these operations, as just + * now, spa_sync() and vdev activity has been stopped, and after + * here, the metaslabs are destroyed. + */ + if (spa_exiting_any(spa)) { + spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); + while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL) + vdev_config_clean(vd); + while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) + vdev_state_clean(vd); + /* The only dirty entries should be for spa_syncing_txg + 1. 
*/ + t = 0; + txg = spa_syncing_txg(spa) + 1; + while (t < TXG_SIZE) { + vd = txg_list_remove(&spa->spa_vdev_txg_list, t); + if (vd == NULL) { + t++; + continue; + } + VERIFY3U(t, ==, txg & TXG_MASK); + vdev_sync_done(vd, txg); + } + spa_config_exit(spa, SCL_ALL, spa); + } + /* * This ensures that there is no async metaslab prefetching * while we attempt to unload the spa. @@ -1764,6 +1819,7 @@ spa_unload(spa_t *spa) } spa_config_exit(spa, SCL_ALL, spa); + return (0); } /* @@ -2651,9 +2707,10 @@ spa_livelist_delete_cb(void *arg, zthr_t *z) " livelist %llu, %lld remaining", (u_longlong_t)dle->dle_bpobj.bpo_object, (u_longlong_t)ll_obj, (longlong_t)count - 1); - VERIFY0(dsl_sync_task(spa_name(spa), NULL, + err = dsl_sync_task(spa_name(spa), NULL, sublist_delete_sync, &sync_arg, 0, - ZFS_SPACE_CHECK_DESTROY)); + ZFS_SPACE_CHECK_DESTROY); + VERIFY(err == 0 || spa_exiting_any(spa)); } else { VERIFY3U(err, ==, EINTR); } @@ -2669,8 +2726,10 @@ spa_livelist_delete_cb(void *arg, zthr_t *z) }; zfs_dbgmsg("deletion of livelist %llu completed", (u_longlong_t)ll_obj); - VERIFY0(dsl_sync_task(spa_name(spa), NULL, livelist_delete_sync, - &sync_arg, 0, ZFS_SPACE_CHECK_DESTROY)); + int err = dsl_sync_task(spa_name(spa), NULL, + livelist_delete_sync, &sync_arg, 0, + ZFS_SPACE_CHECK_DESTROY); + VERIFY(err == 0 || spa_exiting_any(spa)); } } @@ -4552,7 +4611,7 @@ spa_ld_prepare_for_reload(spa_t *spa) spa_mode_t mode = spa->spa_mode; int async_suspended = spa->spa_async_suspended; - spa_unload(spa); + VERIFY0(spa_unload(spa, TXG_WAIT_F_NONE)); spa_deactivate(spa); spa_activate(spa, mode); @@ -5072,7 +5131,7 @@ spa_load_retry(spa_t *spa, spa_load_state_t state) { spa_mode_t mode = spa->spa_mode; - spa_unload(spa); + VERIFY0(spa_unload(spa, TXG_WAIT_F_NONE)); spa_deactivate(spa); spa->spa_load_max_txg = spa->spa_uberblock.ub_txg - 1; @@ -5234,6 +5293,16 @@ spa_open_common(const char *pool, spa_t **spapp, const void *tag, return (SET_ERROR(ENOENT)); } + /* + * If the pool is exiting, only the thread forcing it to exit may + * open new references to it. + */ + if (spa_exiting(spa)) { + if (locked) + mutex_exit(&spa_namespace_lock); + return (SET_ERROR(ENXIO)); + } + if (spa->spa_state == POOL_STATE_UNINITIALIZED) { zpool_load_policy_t policy; @@ -5262,7 +5331,7 @@ spa_open_common(const char *pool, spa_t **spapp, const void *tag, * this is the case, the config cache is out of sync and * we should remove the pool from the namespace. 
*/ - spa_unload(spa); + VERIFY0(spa_unload(spa, TXG_WAIT_F_NONE)); spa_deactivate(spa); spa_write_cachefile(spa, B_TRUE, B_TRUE, B_FALSE); spa_remove(spa); @@ -5283,7 +5352,7 @@ spa_open_common(const char *pool, spa_t **spapp, const void *tag, ZPOOL_CONFIG_LOAD_INFO, spa->spa_load_info); } - spa_unload(spa); + VERIFY0(spa_unload(spa, TXG_WAIT_F_NONE)); spa_deactivate(spa); spa->spa_last_open_failed = error; if (locked) @@ -5952,7 +6021,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, spa_config_exit(spa, SCL_ALL, FTAG); if (error != 0) { - spa_unload(spa); + VERIFY0(spa_unload(spa, TXG_WAIT_F_NONE)); spa_deactivate(spa); spa_remove(spa); mutex_exit(&spa_namespace_lock); @@ -6216,15 +6285,13 @@ spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) if (error != 0 || (props && spa_writeable(spa) && (error = spa_prop_set(spa, props)))) { - spa_unload(spa); + VERIFY0(spa_unload(spa, TXG_WAIT_F_NONE)); spa_deactivate(spa); spa_remove(spa); mutex_exit(&spa_namespace_lock); return (error); } - spa_async_resume(spa); - /* * Override any spares and level 2 cache devices as specified by * the user, as these may have correct device names/devids, etc. @@ -6272,9 +6339,21 @@ spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) /* * Update the config cache to include the newly-imported pool. */ - spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); + mutex_exit(&spa_namespace_lock); + error = spa_config_update_pool(spa); + if (error != 0) { + mutex_enter(&spa_namespace_lock); + VERIFY0(spa_unload(spa, TXG_WAIT_F_NONE)); + spa_deactivate(spa); + spa_remove(spa); + mutex_exit(&spa_namespace_lock); + return (error); + } + mutex_enter(&spa_namespace_lock); } + spa_async_resume(spa); + /* * It's possible that the pool was expanded while it was exported. * We kick off an async task to handle this for us. 
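spa_open_common() above now refuses to hand out new references while a force export is in flight, so in-kernel consumers see an error instead of blocking behind the exporting thread. A caller-side sketch of that behaviour, using a hypothetical helper example_pool_op(); spa_open()/spa_close() are the existing interfaces.

static int
example_pool_op(const char *pool)
{
	spa_t *spa;
	int error;

	/* Fails (e.g. ENXIO) if another thread is force exporting the pool. */
	error = spa_open(pool, &spa, FTAG);
	if (error != 0)
		return (error);

	/* ... operate on the opened pool ... */

	spa_close(spa, FTAG);
	return (0);
}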
@@ -6397,7 +6476,7 @@ spa_tryimport(nvlist_t *tryconfig) spa_config_exit(spa, SCL_CONFIG, FTAG); } - spa_unload(spa); + VERIFY0(spa_unload(spa, TXG_WAIT_F_NONE)); spa_deactivate(spa); spa_remove(spa); mutex_exit(&spa_namespace_lock); @@ -6405,6 +6484,39 @@ spa_tryimport(nvlist_t *tryconfig) return (config); } +int +spa_set_pre_export_status(const char *pool, boolean_t status) +{ + spa_t *spa; + + mutex_enter(&spa_namespace_lock); + if ((spa = spa_lookup(pool)) == NULL) { + mutex_exit(&spa_namespace_lock); + return (SET_ERROR(ENOENT)); + } + + mutex_enter(&spa->spa_evicting_os_lock); + spa->spa_pre_exporting = status; + if (status) + txg_completion_notify(spa_get_dsl(spa)); + mutex_exit(&spa->spa_evicting_os_lock); + + mutex_exit(&spa_namespace_lock); + return (0); +} + + +static void +spa_set_export_initiator(spa_t *spa, void *initiator) +{ + + mutex_enter(&spa->spa_evicting_os_lock); + spa->spa_export_initiator = initiator; + if (initiator != NULL) + txg_completion_notify(spa_get_dsl(spa)); + mutex_exit(&spa->spa_evicting_os_lock); +} + /* * Pool export/destroy * @@ -6420,6 +6532,7 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, { int error; spa_t *spa; + boolean_t force_removal, modifying; if (oldconfig) *oldconfig = NULL; @@ -6440,13 +6553,48 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, } spa->spa_is_exporting = B_TRUE; + if (spa_exiting(spa)) { + mutex_exit(&spa_namespace_lock); + return (SET_ERROR(EBUSY)); + } + + modifying = spa->spa_sync_on && (new_state == POOL_STATE_DESTROYED || + new_state == POOL_STATE_EXPORTED); + /* * Put a hold on the pool, drop the namespace lock, stop async tasks, * reacquire the namespace lock, and see if we can export. */ spa_open_ref(spa, FTAG); + + /* + * Mark the pool as facing impending exit if this is a forced + * destroy or export. + */ + force_removal = hardforce && modifying; + if (force_removal) { + /* Ensure that references see this change after this. */ + spa_set_export_initiator(spa, curthread); + } mutex_exit(&spa_namespace_lock); spa_async_suspend(spa); + + /* + * Cancel all sends/receives if necessary, and wait for their holds + * to expire. This is done without the namespace lock, since some + * operations may require acquiring it (although they will fail). + */ + if (force_removal && spa->spa_sync_on) { + error = dsl_dataset_sendrecv_cancel_all(spa); + if (error != 0) { + spa_set_export_initiator(spa, NULL); + spa_async_resume(spa); + return (error); + } + txg_force_export(spa); + spa_evicting_os_wait(spa); + } + if (spa->spa_zvol_taskq) { zvol_remove_minors(spa, spa_name(spa), B_TRUE); taskq_wait(spa->spa_zvol_taskq); @@ -6456,22 +6604,45 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, if (spa->spa_state == POOL_STATE_UNINITIALIZED) goto export_spa; + /* * The pool will be in core if it's openable, in which case we can * modify its state. Objsets may be open only because they're dirty, * so we have to force it to sync before checking spa_refcnt. */ - if (spa->spa_sync_on) { - txg_wait_synced(spa->spa_dsl_pool, 0); + if (!force_removal && spa->spa_sync_on) { + error = txg_wait_synced_tx(spa->spa_dsl_pool, 0, + NULL, TXG_WAIT_F_NOSUSPEND); + if (error != 0) + goto fail; spa_evicting_os_wait(spa); } + /* + * For forced removal, wait for refcount to drop to minref. At this + * point, all ioctls should be on their way out or getting rejected + * at the front door. 
+ */ + if (force_removal) { + mutex_exit(&spa_namespace_lock); + mutex_enter(&spa->spa_evicting_os_lock); + while (zfs_refcount_count(&spa->spa_refcount) > + spa->spa_minref) { + zio_cancel(spa); + cv_wait(&spa->spa_evicting_os_cv, + &spa->spa_evicting_os_lock); + } + mutex_exit(&spa->spa_evicting_os_lock); + mutex_enter(&spa_namespace_lock); + } + /* * A pool cannot be exported or destroyed if there are active * references. If we are resetting a pool, allow references by * fault injection handlers. */ if (!spa_refcount_zero(spa) || (spa->spa_inject_ref != 0)) { + VERIFY(!force_removal); error = SET_ERROR(EBUSY); goto fail; } @@ -6501,6 +6672,7 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, vdev_trim_stop_all(rvd, VDEV_TRIM_ACTIVE); vdev_autotrim_stop_all(spa); vdev_rebuild_stop_all(spa); + l2arc_spa_rebuild_stop(spa); /* * We want this to be reflected on every label, @@ -6544,7 +6716,14 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_EXPORT); if (spa->spa_state != POOL_STATE_UNINITIALIZED) { - spa_unload(spa); + /* + * If the pool is not being hard forced, throw an error upon + * suspension and abort. + */ + error = spa_unload(spa, hardforce ? + TXG_WAIT_F_FORCE_EXPORT : TXG_WAIT_F_NOSUSPEND); + if (error != 0) + goto fail; spa_deactivate(spa); } @@ -6552,7 +6731,7 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, *oldconfig = fnvlist_dup(spa->spa_config); if (new_state != POOL_STATE_UNINITIALIZED) { - if (!hardforce) + if (!force_removal) spa_write_cachefile(spa, B_TRUE, B_TRUE, B_FALSE); spa_remove(spa); } else { @@ -6568,6 +6747,8 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, return (0); fail: + if (force_removal) + spa_set_export_initiator(spa, NULL); spa->spa_is_exporting = B_FALSE; spa_async_resume(spa); mutex_exit(&spa_namespace_lock); @@ -6770,10 +6951,8 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) */ (void) spa_vdev_exit(spa, vd, txg, 0); - mutex_enter(&spa_namespace_lock); - spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); + spa_config_update_pool(spa); spa_event_notify(spa, NULL, NULL, ESC_ZFS_VDEV_ADD); - mutex_exit(&spa_namespace_lock); return (0); } @@ -6820,6 +6999,9 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, return (spa_vdev_exit(spa, NULL, txg, error)); } + /* If the pool is being force-exported, no vdev changes may occur. 
*/ + ASSERT(!spa_exiting_any(spa)); + if (rebuild) { if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD)) return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); @@ -6988,8 +7170,8 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, vdev_propagate_state(pvd); tvd = newvd->vdev_top; - ASSERT(pvd->vdev_top == tvd); - ASSERT(tvd->vdev_parent == rvd); + ASSERT3P(pvd->vdev_top, ==, tvd); + ASSERT3P(tvd->vdev_parent, ==, rvd); vdev_config_dirty(tvd); @@ -7033,8 +7215,8 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER)) { vdev_defer_resilver(newvd); } else { - dsl_scan_restart_resilver(spa->spa_dsl_pool, - dtl_max_txg); + VERIFY0(dsl_scan_restart_resilver(spa->spa_dsl_pool, + dtl_max_txg)); } } @@ -7889,7 +8071,7 @@ spa_vdev_split_mirror(spa_t *spa, const char *newname, nvlist_t *config, return (error); out: - spa_unload(newspa); + VERIFY0(spa_unload(newspa, TXG_WAIT_F_NONE)); spa_deactivate(newspa); spa_remove(newspa); @@ -8207,6 +8389,19 @@ spa_async_autoexpand(spa_t *spa, vdev_t *vd) spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_AUTOEXPAND); } +static uint64_t +spa_pool_space(spa_t *spa) +{ + uint64_t space; + + space = metaslab_class_get_space(spa_normal_class(spa)); + space += metaslab_class_get_space(spa_special_class(spa)); + space += metaslab_class_get_space(spa_dedup_class(spa)); + space += metaslab_class_get_space(spa_embedded_log_class(spa)); + + return (space); +} + static __attribute__((noreturn)) void spa_async_thread(void *arg) { @@ -8227,21 +8422,9 @@ spa_async_thread(void *arg) if (tasks & SPA_ASYNC_CONFIG_UPDATE) { uint64_t old_space, new_space; - mutex_enter(&spa_namespace_lock); - old_space = metaslab_class_get_space(spa_normal_class(spa)); - old_space += metaslab_class_get_space(spa_special_class(spa)); - old_space += metaslab_class_get_space(spa_dedup_class(spa)); - old_space += metaslab_class_get_space( - spa_embedded_log_class(spa)); - - spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); - - new_space = metaslab_class_get_space(spa_normal_class(spa)); - new_space += metaslab_class_get_space(spa_special_class(spa)); - new_space += metaslab_class_get_space(spa_dedup_class(spa)); - new_space += metaslab_class_get_space( - spa_embedded_log_class(spa)); - mutex_exit(&spa_namespace_lock); + new_space = old_space = spa_pool_space(spa); + if (spa_config_update_pool(spa) == 0) + new_space = spa_pool_space(spa); /* * If the pool grew as a result of the config update, @@ -8299,7 +8482,7 @@ spa_async_thread(void *arg) !vdev_rebuild_active(spa->spa_root_vdev) && (!dsl_scan_resilvering(dp) || !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_RESILVER_DEFER))) - dsl_scan_restart_resilver(dp, 0); + (void) dsl_scan_restart_resilver(dp, 0); if (tasks & SPA_ASYNC_INITIALIZE_RESTART) { mutex_enter(&spa_namespace_lock); @@ -8552,6 +8735,9 @@ spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) size_t nvsize = 0; dmu_buf_t *db; + if (spa_exiting_any(spa)) + return; + VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); /* @@ -9034,6 +9220,9 @@ vdev_indirect_state_sync_verify(vdev_t *vd) vdev_indirect_mapping_t *vim __maybe_unused = vd->vdev_indirect_mapping; vdev_indirect_births_t *vib __maybe_unused = vd->vdev_indirect_births; + if (spa_exiting_any(vd->vdev_spa)) + return; + if (vd->vdev_ops == &vdev_indirect_ops) { ASSERT(vim != NULL); ASSERT(vib != NULL); @@ -9238,10 +9427,10 @@ spa_sync_rewrite_vdev_config(spa_t *spa, dmu_tx_t *tx) { vdev_t *rvd = 
spa->spa_root_vdev; uint64_t txg = tx->tx_txg; + boolean_t exiting = B_FALSE; - for (;;) { + while (exiting == B_FALSE) { int error = 0; - /* * We hold SCL_STATE to prevent vdev open/close/etc. * while we're attempting to write the vdev labels. @@ -9285,7 +9474,18 @@ spa_sync_rewrite_vdev_config(spa_t *spa, dmu_tx_t *tx) if (error == 0) break; zio_suspend(spa, NULL, ZIO_SUSPEND_IOERR); - zio_resume_wait(spa); + + mutex_enter(&spa->spa_suspend_lock); + for (;;) { + exiting = spa_exiting(spa); + if (exiting || spa_suspended(spa) == B_FALSE) + break; + cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock); + } + mutex_exit(&spa->spa_suspend_lock); + + if (exiting) + zio_cancel(spa); } } @@ -9397,7 +9597,8 @@ spa_sync(spa_t *spa, uint64_t txg) spa_sync_iterate_to_convergence(spa, tx); #ifdef ZFS_DEBUG - if (!list_is_empty(&spa->spa_config_dirty_list)) { + if (!list_is_empty(&spa->spa_config_dirty_list) && + !spa_exiting_any(spa)) { /* * Make sure that the number of ZAPs for all the vdevs matches * the number of ZAPs in the per-vdev ZAP list. This only gets @@ -9510,7 +9711,8 @@ spa_sync_allpools(void) continue; spa_open_ref(spa, FTAG); mutex_exit(&spa_namespace_lock); - txg_wait_synced(spa_get_dsl(spa), 0); + txg_wait_synced_flags(spa_get_dsl(spa), 0, + TXG_WAIT_F_NOSUSPEND); mutex_enter(&spa_namespace_lock); spa_close(spa, FTAG); } @@ -9549,7 +9751,7 @@ spa_evict_all(void) spa_close(spa, FTAG); if (spa->spa_state != POOL_STATE_UNINITIALIZED) { - spa_unload(spa); + VERIFY0(spa_unload(spa, TXG_WAIT_F_NONE)); spa_deactivate(spa); } spa_remove(spa); @@ -10022,6 +10224,7 @@ EXPORT_SYMBOL(spa_inject_addref); EXPORT_SYMBOL(spa_inject_delref); EXPORT_SYMBOL(spa_scan_stat_init); EXPORT_SYMBOL(spa_scan_get_stats); +EXPORT_SYMBOL(spa_set_pre_export_status); /* device manipulation */ EXPORT_SYMBOL(spa_vdev_add); diff --git a/module/zfs/spa_checkpoint.c b/module/zfs/spa_checkpoint.c index b588f7041e5c..6f363d08bf5a 100644 --- a/module/zfs/spa_checkpoint.c +++ b/module/zfs/spa_checkpoint.c @@ -400,6 +400,7 @@ spa_checkpoint_discard_thread(void *arg, zthr_t *zthr) { spa_t *spa = arg; vdev_t *rvd = spa->spa_root_vdev; + int err = 0; for (uint64_t c = 0; c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; @@ -434,9 +435,10 @@ spa_checkpoint_discard_thread(void *arg, zthr_t *zthr) error, vd->vdev_id); } - VERIFY0(dsl_sync_task(spa->spa_name, NULL, + err = dsl_sync_task(spa->spa_name, NULL, spa_checkpoint_discard_thread_sync, vd, - 0, ZFS_SPACE_CHECK_NONE)); + 0, ZFS_SPACE_CHECK_NONE); + VERIFY(err == 0 || spa_exiting_any(spa)); dmu_buf_rele_array(dbp, numbufs, FTAG); } @@ -444,9 +446,10 @@ spa_checkpoint_discard_thread(void *arg, zthr_t *zthr) VERIFY(spa_checkpoint_discard_is_done(spa)); VERIFY0(spa->spa_checkpoint_info.sci_dspace); - VERIFY0(dsl_sync_task(spa->spa_name, NULL, + err = dsl_sync_task(spa->spa_name, NULL, spa_checkpoint_discard_complete_sync, spa, - 0, ZFS_SPACE_CHECK_NONE)); + 0, ZFS_SPACE_CHECK_NONE); + VERIFY(err == 0 || spa_exiting_any(spa)); } diff --git a/module/zfs/spa_config.c b/module/zfs/spa_config.c index 636c04d9f785..b9bcb9245fc0 100644 --- a/module/zfs/spa_config.c +++ b/module/zfs/spa_config.c @@ -556,76 +556,115 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats) return (config); } -/* - * Update all disk labels, generate a fresh config based on the current - * in-core state, and sync the global config cache (do not sync the config - * cache if this is a booting rootpool). 
- */ -void -spa_config_update(spa_t *spa, int what) +static int +spa_config_update_begin(spa_t *spa, const void *tag) { - vdev_t *rvd = spa->spa_root_vdev; - uint64_t txg; - int c; - - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - txg = spa_last_synced_txg(spa) + 1; - if (what == SPA_CONFIG_UPDATE_POOL) { - vdev_config_dirty(rvd); - } else { - /* - * If we have top-level vdevs that were added but have - * not yet been prepared for allocation, do that now. - * (It's safe now because the config cache is up to date, - * so it will be able to translate the new DVAs.) - * See comments in spa_vdev_add() for full details. - */ - for (c = 0; c < rvd->vdev_children; c++) { - vdev_t *tvd = rvd->vdev_child[c]; + return (spa_config_enter_flags(spa, SCL_ALL, tag, RW_WRITER, + SCL_FLAG_NOSUSPEND)); +} - /* - * Explicitly skip vdevs that are indirect or - * log vdevs that are being removed. The reason - * is that both of those can have vdev_ms_array - * set to 0 and we wouldn't want to change their - * metaslab size nor call vdev_expand() on them. - */ - if (!vdev_is_concrete(tvd) || - (tvd->vdev_islog && tvd->vdev_removing)) - continue; +/* Complete a label update. */ +static int +spa_config_update_complete(spa_t *spa, uint64_t txg, boolean_t postsysevent, + const void *tag) +{ + int error = 0; - if (tvd->vdev_ms_array == 0) - vdev_metaslab_set_size(tvd); - vdev_expand(tvd, txg); - } - } - spa_config_exit(spa, SCL_ALL, FTAG); + spa_config_exit(spa, SCL_ALL, tag); /* * Wait for the mosconfig to be regenerated and synced. */ - txg_wait_synced(spa->spa_dsl_pool, txg); + error = txg_wait_synced_tx(spa->spa_dsl_pool, txg, NULL, 0); + if (error == 0 && !spa->spa_is_root) { + /* + * Update the global config cache to reflect the new mosconfig. + * This operation does not perform any pool I/O, so it is + * safe even if one or more of them are suspended. + */ + mutex_enter(&spa_namespace_lock); + spa_write_cachefile(spa, B_FALSE, postsysevent, postsysevent); + mutex_exit(&spa_namespace_lock); + } + + return (error); +} + +/* Update any top-level vdevs needing expansion. */ +static int +spa_config_update_vdevs(spa_t *spa) +{ + vdev_t *rvd = spa->spa_root_vdev; + uint64_t txg; + int c, error; + + error = spa_config_update_begin(spa, FTAG); + if (error != 0) + return (error); + + txg = spa_last_synced_txg(spa) + 1; /* - * Update the global config cache to reflect the new mosconfig. + * If we have top-level vdevs that were added but have + * not yet been prepared for allocation, do that now. + * (It's safe now because the config cache is up to date, + * so it will be able to translate the new DVAs.) + * See comments in spa_vdev_add() for full details. */ - if (!spa->spa_is_root) { - spa_write_cachefile(spa, B_FALSE, - what != SPA_CONFIG_UPDATE_POOL, - what != SPA_CONFIG_UPDATE_POOL); + for (c = 0; c < rvd->vdev_children; c++) { + vdev_t *tvd = rvd->vdev_child[c]; + + /* + * Explicitly skip vdevs that are indirect or + * log vdevs that are being removed. The reason + * is that both of those can have vdev_ms_array + * set to 0 and we wouldn't want to change their + * metaslab size nor call vdev_expand() on them. 
+ */ + if (!vdev_is_concrete(tvd) || + (tvd->vdev_islog && tvd->vdev_removing)) + continue; + + if (tvd->vdev_ms_array == 0) + vdev_metaslab_set_size(tvd); + vdev_expand(tvd, txg); } - if (what == SPA_CONFIG_UPDATE_POOL) - spa_config_update(spa, SPA_CONFIG_UPDATE_VDEVS); + return (spa_config_update_complete(spa, txg, B_TRUE, FTAG)); +} + +/* + * Update all disk labels, generate a fresh config based on the current + * in-core state, and sync the global config cache (do not sync the config + * cache if this is a booting rootpool). + */ +int +spa_config_update_pool(spa_t *spa) +{ + vdev_t *rvd = spa->spa_root_vdev; + uint64_t txg; + int error; + + error = spa_config_update_begin(spa, FTAG); + if (error != 0) + return (error); + + txg = spa_last_synced_txg(spa) + 1; + vdev_config_dirty(rvd); + + error = spa_config_update_complete(spa, txg, B_FALSE, FTAG); + if (error == 0) + error = spa_config_update_vdevs(spa); + + return (error); } EXPORT_SYMBOL(spa_config_load); EXPORT_SYMBOL(spa_all_configs); EXPORT_SYMBOL(spa_config_set); EXPORT_SYMBOL(spa_config_generate); -EXPORT_SYMBOL(spa_config_update); +EXPORT_SYMBOL(spa_config_update_pool); #ifdef __linux__ /* string sysctls require a char array on FreeBSD */ diff --git a/module/zfs/spa_errlog.c b/module/zfs/spa_errlog.c index 3bc8619b51a8..c5174a5b769f 100644 --- a/module/zfs/spa_errlog.c +++ b/module/zfs/spa_errlog.c @@ -190,9 +190,9 @@ spa_log_error(spa_t *spa, const zbookmark_phys_t *zb, const uint64_t *birth) /* * If we are trying to import a pool, ignore any errors, as we won't be - * writing to the pool any time soon. + * writing to the pool any time soon. Same for force exports. */ - if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT) + if (spa_exiting_any(spa) || spa_load_state(spa) == SPA_LOAD_TRYIMPORT) return; mutex_enter(&spa->spa_errlist_lock); @@ -1025,6 +1025,9 @@ sync_error_list(spa_t *spa, avl_tree_t *t, uint64_t *obj, dmu_tx_t *tx) if (avl_numnodes(t) == 0) return; + if (spa_exiting_any(spa)) + goto done; + /* create log if necessary */ if (*obj == 0) *obj = zap_create(spa->spa_meta_objset, DMU_OT_ERROR_LOG, @@ -1077,6 +1080,8 @@ sync_error_list(spa_t *spa, avl_tree_t *t, uint64_t *obj, dmu_tx_t *tx) err_obj, buf, 1, strlen(name) + 1, name, tx); } } + +done: /* purge the error list */ cookie = NULL; while ((se = avl_destroy_nodes(t, &cookie)) != NULL) diff --git a/module/zfs/spa_history.c b/module/zfs/spa_history.c index de036d6c3718..442c9c5281a2 100644 --- a/module/zfs/spa_history.c +++ b/module/zfs/spa_history.c @@ -384,7 +384,7 @@ spa_history_log_nvl(spa_t *spa, nvlist_t *nvl) } tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); - err = dmu_tx_assign(tx, TXG_WAIT); + err = dmu_tx_assign(tx, TXG_WAIT | TXG_NOSUSPEND); if (err) { dmu_tx_abort(tx); return (err); @@ -520,9 +520,10 @@ log_internal(nvlist_t *nvl, const char *operation, spa_t *spa, /* * If this is part of creating a pool, not everything is * initialized yet, so don't bother logging the internal events. - * Likewise if the pool is not writeable. + * Likewise if the pool is not writeable, or is being force exported. 
*/ - if (spa_is_initializing(spa) || !spa_writeable(spa)) { + if (spa_is_initializing(spa) || !spa_writeable(spa) || + spa_exiting_any(spa)) { fnvlist_free(nvl); return; } diff --git a/module/zfs/spa_log_spacemap.c b/module/zfs/spa_log_spacemap.c index 2878e68c6e4b..ba40acdd7e1e 100644 --- a/module/zfs/spa_log_spacemap.c +++ b/module/zfs/spa_log_spacemap.c @@ -919,7 +919,9 @@ spa_cleanup_old_sm_logs(spa_t *spa, dmu_tx_t *tx) ASSERT(avl_is_empty(&spa->spa_sm_logs_by_txg)); return; } - VERIFY0(error); + VERIFY(error == 0 || spa_exiting_any(spa)); + if (error != 0) + return; metaslab_t *oldest = avl_first(&spa->spa_metaslabs_by_flushed); uint64_t oldest_flushed_txg = metaslab_unflushed_txg(oldest); @@ -973,7 +975,9 @@ spa_generate_syncing_log_sm(spa_t *spa, dmu_tx_t *tx) &spacemap_zap, tx)); spa_feature_incr(spa, SPA_FEATURE_LOG_SPACEMAP, tx); } - VERIFY0(error); + VERIFY(error == 0 || spa_exiting_any(spa)); + if (error != 0) + return; uint64_t sm_obj; ASSERT3U(zap_lookup_int_key(mos, spacemap_zap, txg, &sm_obj), diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 54a0eeccf27b..4e4d1d09bc59 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -462,47 +462,32 @@ spa_config_lock_destroy(spa_t *spa) } } -int -spa_config_tryenter(spa_t *spa, int locks, const void *tag, krw_t rw) +static int +spa_config_eval_flags(spa_t *spa, spa_config_flag_t flags) { - for (int i = 0; i < SCL_LOCKS; i++) { - spa_config_lock_t *scl = &spa->spa_config_lock[i]; - if (!(locks & (1 << i))) - continue; - mutex_enter(&scl->scl_lock); - if (rw == RW_READER) { - if (scl->scl_writer || scl->scl_write_wanted) { - mutex_exit(&scl->scl_lock); - spa_config_exit(spa, locks & ((1 << i) - 1), - tag); - return (0); - } - } else { - ASSERT(scl->scl_writer != curthread); - if (scl->scl_count != 0) { - mutex_exit(&scl->scl_lock); - spa_config_exit(spa, locks & ((1 << i) - 1), - tag); - return (0); - } - scl->scl_writer = curthread; - } - scl->scl_count++; - mutex_exit(&scl->scl_lock); + int error = 0; + + if ((flags & SCL_FLAG_TRYENTER) != 0) + error = SET_ERROR(EAGAIN); + if (error == 0 && ((flags & SCL_FLAG_NOSUSPEND) != 0)) { + /* Notification given by zio_suspend(). */ + mutex_enter(&spa->spa_suspend_lock); + error = spa_suspended(spa) ? 
SET_ERROR(EAGAIN) : 0; + mutex_exit(&spa->spa_suspend_lock); } - return (1); + return (error); } -static void -spa_config_enter_impl(spa_t *spa, int locks, const void *tag, krw_t rw, - int mmp_flag) +int +spa_config_enter_flags(spa_t *spa, int locks, const void *tag, krw_t rw, + spa_config_flag_t flags) { - (void) tag; + int error = 0; int wlocks_held = 0; ASSERT3U(SCL_LOCKS, <, sizeof (wlocks_held) * NBBY); - for (int i = 0; i < SCL_LOCKS; i++) { + for (int i = 0; error == 0 && i < SCL_LOCKS; i++) { spa_config_lock_t *scl = &spa->spa_config_lock[i]; if (scl->scl_writer == curthread) wlocks_held |= (1 << i); @@ -511,28 +496,54 @@ spa_config_enter_impl(spa_t *spa, int locks, const void *tag, krw_t rw, mutex_enter(&scl->scl_lock); if (rw == RW_READER) { while (scl->scl_writer || - (!mmp_flag && scl->scl_write_wanted)) { + (!(flags & SCL_FLAG_MMP) && scl->scl_write_wanted)) { + error = spa_config_eval_flags(spa, flags); + if (error != 0) + break; cv_wait(&scl->scl_cv, &scl->scl_lock); } } else { ASSERT(scl->scl_writer != curthread); while (scl->scl_count != 0) { + error = spa_config_eval_flags(spa, flags); + if (error != 0) + break; scl->scl_write_wanted++; cv_wait(&scl->scl_cv, &scl->scl_lock); scl->scl_write_wanted--; } - scl->scl_writer = curthread; + if (error == 0) + scl->scl_writer = curthread; } - scl->scl_count++; + if (error == 0) + scl->scl_count++; mutex_exit(&scl->scl_lock); + + if (error != 0 && i > 0) { + /* Should never happen for classic spa_config_enter. */ + ASSERT3U(flags, !=, 0); + spa_config_exit(spa, locks & ((1 << i) - 1), tag); + } } + ASSERT3U(wlocks_held, <=, locks); + return (error); } void spa_config_enter(spa_t *spa, int locks, const void *tag, krw_t rw) { - spa_config_enter_impl(spa, locks, tag, rw, 0); + spa_config_flag_t flags = 0; + + spa_config_enter_flags(spa, locks, tag, rw, flags); +} + +int +spa_config_tryenter(spa_t *spa, int locks, const void *tag, krw_t rw) +{ + + return (spa_config_enter_flags(spa, locks, tag, rw, + SCL_FLAG_TRYENTER) == 0); } /* @@ -543,11 +554,10 @@ spa_config_enter(spa_t *spa, int locks, const void *tag, krw_t rw) * a single disk in a pool that is responding slowly and presumably about to * fail. */ - void spa_config_enter_mmp(spa_t *spa, int locks, const void *tag, krw_t rw) { - spa_config_enter_impl(spa, locks, tag, rw, 1); + spa_config_enter_flags(spa, locks, tag, rw, SCL_FLAG_MMP); } void @@ -908,6 +918,20 @@ spa_open_ref(spa_t *spa, const void *tag) (void) zfs_refcount_add(&spa->spa_refcount, tag); } +/* + * Remove a reference to a given spa_t. Common routine that also includes + * notifying the exporter if one is registered, when minref has been reached. + */ +static void +spa_close_common(spa_t *spa, const void *tag) +{ + if (zfs_refcount_remove(&spa->spa_refcount, tag) == spa->spa_minref) { + mutex_enter(&spa->spa_evicting_os_lock); + cv_broadcast(&spa->spa_evicting_os_cv); + mutex_exit(&spa->spa_evicting_os_lock); + } +} + /* * Remove a reference to the given spa_t. Must have at least one reference, or * have the namespace lock held.
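spa_config_enter_flags() above makes config-lock acquisition fallible when SCL_FLAG_NOSUSPEND or SCL_FLAG_TRYENTER is passed. The intended calling pattern mirrors spa_config_update_begin() in the spa_config.c hunk; the sketch below uses a hypothetical example_label_update() and assumes only interfaces introduced or retained by this patch.

static int
example_label_update(spa_t *spa)
{
	int error;

	/* Return EAGAIN rather than blocking behind a suspended pool. */
	error = spa_config_enter_flags(spa, SCL_ALL, FTAG, RW_WRITER,
	    SCL_FLAG_NOSUSPEND);
	if (error != 0)
		return (error);

	/* ... dirty vdev configs, rewrite labels ... */

	spa_config_exit(spa, SCL_ALL, FTAG);
	return (0);
}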
@@ -917,7 +941,7 @@ spa_close(spa_t *spa, const void *tag) { ASSERT(zfs_refcount_count(&spa->spa_refcount) > spa->spa_minref || MUTEX_HELD(&spa_namespace_lock)); - (void) zfs_refcount_remove(&spa->spa_refcount, tag); + spa_close_common(spa, tag); } /* @@ -931,7 +955,7 @@ spa_close(spa_t *spa, const void *tag) void spa_async_close(spa_t *spa, const void *tag) { - (void) zfs_refcount_remove(&spa->spa_refcount, tag); + spa_close_common(spa, tag); } /* @@ -1750,6 +1774,19 @@ spa_syncing_txg(spa_t *spa) return (spa->spa_syncing_txg); } +/* + * Verify that the requesting thread isn't dirtying a txg it's not supposed + * to be. Normally, this must be spa_final_dirty_txg(), but if the pool is + * being force exported, no data will be written to stable storage anyway. + */ +void +spa_verify_dirty_txg(spa_t *spa, uint64_t txg) +{ + + if (spa->spa_export_initiator == NULL) + VERIFY3U(txg, <=, spa_final_dirty_txg(spa)); +} + /* * Return the last txg where data can be dirtied. The final txgs * will be used to just clear out any deferred frees that remain. @@ -2000,6 +2037,18 @@ spa_preferred_class(spa_t *spa, uint64_t size, dmu_object_type_t objtype, return (spa_normal_class(spa)); } +void +spa_evicting_os_lock(spa_t *spa) +{ + mutex_enter(&spa->spa_evicting_os_lock); +} + +void +spa_evicting_os_unlock(spa_t *spa) +{ + mutex_exit(&spa->spa_evicting_os_lock); +} + void spa_evicting_os_register(spa_t *spa, objset_t *os) { @@ -2628,6 +2677,31 @@ spa_maxblocksize(spa_t *spa) return (SPA_OLD_MAXBLOCKSIZE); } +boolean_t +spa_exiting_any(spa_t *spa) +{ + return (spa->spa_export_initiator != NULL || spa->spa_pre_exporting); +} + +/* + * NB: must hold spa_namespace_lock or spa_evicting_os_lock if the result of + * this is critical. + */ +boolean_t +spa_exiting(spa_t *spa) +{ + return (spa_exiting_any(spa) && spa->spa_export_initiator != curthread); +} + +int +spa_operation_interrupted(spa_t *spa) +{ + if (issig(JUSTLOOKING) && issig(FORREAL)) + return (SET_ERROR(EINTR)); + if (spa_exiting(spa)) + return (SET_ERROR(ENXIO)); + return (0); +} /* * Returns the txg that the last device removal completed. No indirect mappings @@ -2908,6 +2982,8 @@ EXPORT_SYMBOL(spa_delegation); EXPORT_SYMBOL(spa_meta_objset); EXPORT_SYMBOL(spa_maxblocksize); EXPORT_SYMBOL(spa_maxdnodesize); +EXPORT_SYMBOL(spa_exiting); +EXPORT_SYMBOL(spa_operation_interrupted); /* Miscellaneous support routines */ EXPORT_SYMBOL(spa_guid_exists); diff --git a/module/zfs/space_map.c b/module/zfs/space_map.c index a336ff41eadb..f4dfa5c11246 100644 --- a/module/zfs/space_map.c +++ b/module/zfs/space_map.c @@ -861,7 +861,7 @@ space_map_truncate(space_map_t *sm, int blocksize, dmu_tx_t *tx) ASSERT(dsl_pool_sync_context(dmu_objset_pool(os))); ASSERT(dmu_tx_is_syncing(tx)); - VERIFY3U(dmu_tx_get_txg(tx), <=, spa_final_dirty_txg(spa)); + spa_verify_dirty_txg(spa, dmu_tx_get_txg(tx)); dmu_object_info_from_db(sm->sm_dbuf, &doi); diff --git a/module/zfs/txg.c b/module/zfs/txg.c index ec61cabcaab2..483d0e160cde 100644 --- a/module/zfs/txg.c +++ b/module/zfs/txg.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -255,10 +256,11 @@ txg_thread_wait(tx_state_t *tx, callb_cpr_t *cpr, kcondvar_t *cv, clock_t time) /* * Stop syncing transaction groups. 
*/ -void -txg_sync_stop(dsl_pool_t *dp) +int +txg_sync_stop(dsl_pool_t *dp, txg_wait_flag_t txg_how) { tx_state_t *tx = &dp->dp_tx; + int err; dprintf("pool %p\n", dp); /* @@ -269,7 +271,10 @@ txg_sync_stop(dsl_pool_t *dp) /* * We need to ensure that we've vacated the deferred metaslab trees. */ - txg_wait_synced(dp, tx->tx_open_txg + TXG_DEFER_SIZE); + err = txg_wait_synced_tx(dp, tx->tx_open_txg + TXG_DEFER_SIZE, + NULL, txg_how); + if (err != 0) + return (err); /* * Wake all sync threads and wait for them to die. @@ -290,6 +295,7 @@ txg_sync_stop(dsl_pool_t *dp) tx->tx_exiting = 0; mutex_exit(&tx->tx_sync_lock); + return (0); } /* @@ -514,6 +520,24 @@ txg_has_quiesced_to_sync(dsl_pool_t *dp) return (tx->tx_quiesced_txg != 0); } +/* + * Notify of completion. This is usually only called by the sync thread, + * but in force-export/unmount scenarios, it can be called by another thread + * that has generated an alternative completion scenario. + */ +void +txg_completion_notify(dsl_pool_t *dp) +{ + tx_state_t *tx = &dp->dp_tx; + boolean_t locked = MUTEX_HELD(&tx->tx_sync_lock); + + if (!locked) + mutex_enter(&tx->tx_sync_lock); + cv_broadcast(&tx->tx_sync_done_cv); + if (!locked) + mutex_exit(&tx->tx_sync_lock); +} + static __attribute__((noreturn)) void txg_sync_thread(void *arg) { @@ -596,7 +620,7 @@ txg_sync_thread(void *arg) tx->tx_synced_txg = txg; tx->tx_syncing_txg = 0; DTRACE_PROBE2(txg__synced, dsl_pool_t *, dp, uint64_t, txg); - cv_broadcast(&tx->tx_sync_done_cv); + txg_completion_notify(dp); /* * Dispatch commit callbacks to worker threads. @@ -689,61 +713,88 @@ txg_delay(dsl_pool_t *dp, uint64_t txg, hrtime_t delay, hrtime_t resolution) mutex_exit(&tx->tx_sync_lock); } -static boolean_t -txg_wait_synced_impl(dsl_pool_t *dp, uint64_t txg, boolean_t wait_sig) +int +txg_wait_synced_tx(dsl_pool_t *dp, uint64_t txg, dmu_tx_t *tx, + txg_wait_flag_t flags) { - tx_state_t *tx = &dp->dp_tx; + tx_state_t *dp_tx = &dp->dp_tx; + int error = 0; + objset_t *os = NULL; ASSERT(!dsl_pool_config_held(dp)); - mutex_enter(&tx->tx_sync_lock); - ASSERT3U(tx->tx_threads, ==, 2); + mutex_enter(&dp_tx->tx_sync_lock); + ASSERT3U(dp_tx->tx_threads, ==, 2); if (txg == 0) - txg = tx->tx_open_txg + TXG_DEFER_SIZE; - if (tx->tx_sync_txg_waiting < txg) - tx->tx_sync_txg_waiting = txg; + txg = dp_tx->tx_open_txg + TXG_DEFER_SIZE; + if (dp_tx->tx_sync_txg_waiting < txg) + dp_tx->tx_sync_txg_waiting = txg; + if (tx != NULL && tx->tx_objset != NULL) + os = tx->tx_objset; dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n", - (u_longlong_t)txg, (u_longlong_t)tx->tx_quiesce_txg_waiting, - (u_longlong_t)tx->tx_sync_txg_waiting); - while (tx->tx_synced_txg < txg) { + (u_longlong_t)txg, (u_longlong_t)dp_tx->tx_quiesce_txg_waiting, + (u_longlong_t)dp_tx->tx_sync_txg_waiting); + while (error == 0 && dp_tx->tx_synced_txg < txg) { dprintf("broadcasting sync more " "tx_synced=%llu waiting=%llu dp=%px\n", - (u_longlong_t)tx->tx_synced_txg, - (u_longlong_t)tx->tx_sync_txg_waiting, dp); - cv_broadcast(&tx->tx_sync_more_cv); - if (wait_sig) { + (u_longlong_t)dp_tx->tx_synced_txg, + (u_longlong_t)dp_tx->tx_sync_txg_waiting, dp); + cv_broadcast(&dp_tx->tx_sync_more_cv); + /* + * If we are suspending and exiting, give up, because our + * data isn't going to be pushed. 
+ */ + if (spa_suspended(dp->dp_spa)) { + if ((flags & TXG_WAIT_F_FORCE_EXPORT)) { + error = 0; + break; + } + if ((flags & TXG_WAIT_F_NOSUSPEND) || + spa_exiting_any(dp->dp_spa)) { + error = SET_ERROR(EAGAIN); + } + } + if (error == 0 && os != NULL && dmu_objset_exiting(os)) { + if ((flags & TXG_WAIT_F_FORCE_EXPORT)) { + error = 0; + break; + } + error = SET_ERROR(EAGAIN); + } + if (error != 0) + break; + if (flags & TXG_WAIT_F_SIGNAL) { /* * Condition wait here but stop if the thread receives a * signal. The caller may call txg_wait_synced*() again * to resume waiting for this txg. */ - if (cv_wait_io_sig(&tx->tx_sync_done_cv, - &tx->tx_sync_lock) == 0) { - mutex_exit(&tx->tx_sync_lock); - return (B_TRUE); + if (cv_wait_io_sig(&dp_tx->tx_sync_done_cv, + &dp_tx->tx_sync_lock) == 0) { + error = SET_ERROR(EINTR); + break; } } else { - cv_wait_io(&tx->tx_sync_done_cv, &tx->tx_sync_lock); + cv_wait_io(&dp_tx->tx_sync_done_cv, + &dp_tx->tx_sync_lock); } } - mutex_exit(&tx->tx_sync_lock); - return (B_FALSE); + + mutex_exit(&dp_tx->tx_sync_lock); + dprintf("txg=%llu error=%d\n", (u_longlong_t)txg, error); + return (error); } -void -txg_wait_synced(dsl_pool_t *dp, uint64_t txg) +int +txg_wait_synced_flags(dsl_pool_t *dp, uint64_t txg, txg_wait_flag_t flags) { - VERIFY0(txg_wait_synced_impl(dp, txg, B_FALSE)); + return (txg_wait_synced_tx(dp, txg, NULL, flags)); } -/* - * Similar to a txg_wait_synced but it can be interrupted from a signal. - * Returns B_TRUE if the thread was signaled while waiting. - */ -boolean_t -txg_wait_synced_sig(dsl_pool_t *dp, uint64_t txg) +void +txg_wait_synced(dsl_pool_t *dp, uint64_t txg) { - return (txg_wait_synced_impl(dp, txg, B_TRUE)); + (void) txg_wait_synced_tx(dp, txg, NULL, 0); } /* @@ -821,6 +872,36 @@ txg_sync_waiting(dsl_pool_t *dp) tx->tx_quiesced_txg != 0); } +void +txg_force_export(spa_t *spa) +{ + dsl_pool_t *dp = spa_get_dsl(spa); + tx_state_t *tx = &dp->dp_tx; + uint64_t t, txg; + + /* + * When forcing removal, push through TXG_SIZE TXGs to ensure that + * all state is cleaned up by spa_sync(). While waiting for each + * TXG to complete, cancel any suspended zios that appear. + */ + ASSERT(spa_exiting_any(spa)); + txg = tx->tx_synced_txg + 1; + for (t = 0; t < TXG_SIZE; t++) { + txg_wait_open(dp, txg + t, B_TRUE); + + boolean_t complete = B_FALSE; + while (!complete) { + zio_cancel(spa); + mutex_enter(&tx->tx_sync_lock); + (void) cv_timedwait(&tx->tx_sync_done_cv, + &tx->tx_sync_lock, + ddi_get_lbolt() + MSEC_TO_TICK(100)); + complete = (tx->tx_synced_txg >= (txg + t)); + mutex_exit(&tx->tx_sync_lock); + } + } +} + /* * Verify that this txg is active (open, quiescing, syncing). Non-active * txg's should not be manipulated. diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 4bfd95861e02..61fac7de5da8 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -1027,6 +1027,8 @@ void vdev_free(vdev_t *vd) { spa_t *spa = vd->vdev_spa; + uint64_t t, txg; + metaslab_t *msp; ASSERT3P(vd->vdev_initialize_thread, ==, NULL); ASSERT3P(vd->vdev_trim_thread, ==, NULL); @@ -1051,6 +1053,19 @@ vdev_free(vdev_t *vd) */ vdev_close(vd); + /* If the pool is being forcibly exported, clean up any stragglers. */ + if (spa_exiting_any(spa)) { + for (t = 0, txg = spa_syncing_txg(spa) + 1; t < TXG_SIZE; ) { + msp = txg_list_remove(&vd->vdev_ms_list, t); + if (msp == NULL) { + t++; + continue; + } + VERIFY3U(t, ==, txg & TXG_MASK); + /* Metaslab already destroyed, nothing to do. 
*/ + } + } + ASSERT(!list_link_active(&vd->vdev_config_dirty_node)); ASSERT(!list_link_active(&vd->vdev_state_dirty_node)); @@ -1080,6 +1095,9 @@ vdev_free(vdev_t *vd) vd->vdev_log_mg = NULL; } + if (spa_exiting_any(spa)) + vdev_clear_stats(vd); + ASSERT0(vd->vdev_stat.vs_space); ASSERT0(vd->vdev_stat.vs_dspace); ASSERT0(vd->vdev_stat.vs_alloc); @@ -3379,6 +3397,16 @@ vdev_dtl_sync(vdev_t *vd, uint64_t txg) dmu_tx_t *tx; uint64_t object = space_map_object(vd->vdev_dtl_sm); + /* + * The pool is being forcibly exported. Just discard everything. + */ + if (spa_exiting(spa)) { + mutex_enter(&vd->vdev_dtl_lock); + range_tree_vacate(rt, NULL, NULL); + mutex_exit(&vd->vdev_dtl_lock); + return; + } + ASSERT(vdev_is_concrete(vd)); ASSERT(vd->vdev_ops->vdev_op_leaf); @@ -4892,6 +4920,11 @@ vdev_stat_update(zio_t *zio, uint64_t psize) if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) return; + if (vd == NULL && spa_exiting_any(spa)) { + /* Forced export resulted in partially constructed I/O. */ + return; + } + if (type == ZIO_TYPE_WRITE && txg != 0 && (!(flags & ZIO_FLAG_IO_REPAIR) || (flags & ZIO_FLAG_SCAN_THREAD) || diff --git a/module/zfs/vdev_indirect.c b/module/zfs/vdev_indirect.c index 8c11a574ae86..7207425a441f 100644 --- a/module/zfs/vdev_indirect.c +++ b/module/zfs/vdev_indirect.c @@ -563,13 +563,19 @@ spa_condense_indirect_commit_entry(spa_t *spa, vdev_indirect_mapping_entry_phys_t *vimep, uint32_t count) { spa_condensing_indirect_t *sci = spa->spa_condensing_indirect; + dmu_tx_t *tx; + int txgoff; ASSERT3U(count, <, DVA_GET_ASIZE(&vimep->vimep_dst)); - dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); + tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); dmu_tx_hold_space(tx, sizeof (*vimep) + sizeof (count)); - VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); - int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; + if (dmu_tx_assign(tx, TXG_WAIT) != 0) { + ASSERT(spa_exiting_any(spa)); + dmu_tx_abort(tx); + return; + } + txgoff = dmu_tx_get_txg(tx) & TXG_MASK; /* * If we are the first entry committed this txg, kick off the sync @@ -651,6 +657,7 @@ spa_condense_indirect_thread(void *arg, zthr_t *zthr) { spa_t *spa = arg; vdev_t *vd; + int err = 0; ASSERT3P(spa->spa_condensing_indirect, !=, NULL); spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); @@ -744,9 +751,10 @@ spa_condense_indirect_thread(void *arg, zthr_t *zthr) if (zthr_iscancelled(zthr)) return; - VERIFY0(dsl_sync_task(spa_name(spa), NULL, + err = dsl_sync_task(spa_name(spa), NULL, spa_condense_indirect_complete_sync, sci, 0, - ZFS_SPACE_CHECK_EXTRA_RESERVED)); + ZFS_SPACE_CHECK_EXTRA_RESERVED); + VERIFY(err == 0 || spa_exiting_any(spa)); } /* diff --git a/module/zfs/vdev_initialize.c b/module/zfs/vdev_initialize.c index 75beb0cc3d12..722f443ad9cb 100644 --- a/module/zfs/vdev_initialize.c +++ b/module/zfs/vdev_initialize.c @@ -121,6 +121,14 @@ vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state) vdev_initializing_state_t old_state = vd->vdev_initialize_state; vd->vdev_initialize_state = new_state; + /* + * In this context, a pool vdev is initializing. Normally we would + * want to handle txg assignment failure, but that can only happen if + * the pool becomes suspended and is then forcibly exported. In that + * case the caller is already hung here while holding the namespace + * lock, so there is little that can be done to recover (including + * attempting a force export, which itself requires the namespace lock).
+ */ dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); dsl_sync_task_nowait(spa_get_dsl(spa), vdev_initialize_zap_update_sync, diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index 85c7134ca4c4..fec543a12e05 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -1920,7 +1920,7 @@ vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg) * bailing out and declaring the pool faulted. */ if (error != 0) { - if ((flags & ZIO_FLAG_TRYHARD) != 0) + if (spa_exiting_any(spa) || (flags & ZIO_FLAG_TRYHARD) != 0) return (error); flags |= ZIO_FLAG_TRYHARD; } diff --git a/module/zfs/vdev_rebuild.c b/module/zfs/vdev_rebuild.c index 62aa61b3b9e7..121fbe8122f7 100644 --- a/module/zfs/vdev_rebuild.c +++ b/module/zfs/vdev_rebuild.c @@ -285,7 +285,12 @@ vdev_rebuild_initiate(vdev_t *vd) ASSERT(!vd->vdev_rebuilding); dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); - VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + int err = dmu_tx_assign(tx, TXG_WAIT); + if (err != 0) { + ASSERT(spa_exiting_any(vd->vdev_spa)); + dmu_tx_abort(tx); + return; + } vd->vdev_rebuilding = B_TRUE; @@ -584,7 +589,15 @@ vdev_rebuild_range(vdev_rebuild_t *vr, uint64_t start, uint64_t size) mutex_exit(&vr->vr_io_lock); dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); - VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + int err = dmu_tx_assign(tx, TXG_WAIT); + if (err != 0) { + ASSERT(spa_exiting_any(spa)); + dmu_tx_abort(tx); + mutex_enter(&vr->vr_io_lock); + vr->vr_bytes_inflight -= psize; + mutex_exit(&vr->vr_io_lock); + return (err); + } uint64_t txg = dmu_tx_get_txg(tx); spa_config_enter(spa, SCL_STATE_ALL, vd, RW_READER); @@ -922,9 +935,15 @@ vdev_rebuild_thread(void *arg) dsl_pool_t *dp = spa_get_dsl(spa); dmu_tx_t *tx = dmu_tx_create_dd(dp->dp_mos_dir); - VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + int txerr = dmu_tx_assign(tx, TXG_WAIT); mutex_enter(&vd->vdev_rebuild_lock); + if (txerr != 0) { + ASSERT(spa_exiting_any(vd->vdev_spa)); + vd->vdev_rebuilding = B_FALSE; + dmu_tx_abort(tx); + goto done; + } if (error == 0) { /* * After a successful rebuild clear the DTLs of all ranges @@ -963,6 +982,7 @@ vdev_rebuild_thread(void *arg) dmu_tx_commit(tx); +done: vd->vdev_rebuild_thread = NULL; mutex_exit(&vd->vdev_rebuild_lock); spa_config_exit(spa, SCL_CONFIG, FTAG); diff --git a/module/zfs/vdev_removal.c b/module/zfs/vdev_removal.c index 1249657f9d72..120b5c3fef44 100644 --- a/module/zfs/vdev_removal.c +++ b/module/zfs/vdev_removal.c @@ -1711,7 +1711,15 @@ spa_vdev_remove_thread(void *arg) dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); - VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + /* + * If a tx can't be assigned, just punt and wait for + * the next round. This must be an exiting spa. + */ + if (dmu_tx_assign(tx, TXG_WAIT) != 0) { + ASSERT(spa_exiting_any(spa)); + dmu_tx_abort(tx); + goto done; + } uint64_t txg = dmu_tx_get_txg(tx); /* @@ -1745,6 +1753,7 @@ spa_vdev_remove_thread(void *arg) spa_config_exit(spa, SCL_CONFIG, FTAG); +done: /* * Wait for all copies to finish before cleaning up the vca. 
*/ diff --git a/module/zfs/vdev_trim.c b/module/zfs/vdev_trim.c index 0d71b9434342..6cad490a500b 100644 --- a/module/zfs/vdev_trim.c +++ b/module/zfs/vdev_trim.c @@ -336,7 +336,14 @@ vdev_trim_change_state(vdev_t *vd, vdev_trim_state_t new_state, vd->vdev_trim_state = new_state; dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); - VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + int txerr = dmu_tx_assign(tx, TXG_WAIT); + if (txerr != 0) { + ASSERT(spa_exiting_any(spa)); + dmu_tx_abort(tx); + return; + } + + vd->vdev_trim_state = new_state; dsl_sync_task_nowait(spa_get_dsl(spa), vdev_trim_zap_update_sync, guid, tx); @@ -521,7 +528,15 @@ vdev_trim_range(trim_args_t *ta, uint64_t start, uint64_t size) mutex_exit(&vd->vdev_trim_io_lock); dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); - VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + int err = dmu_tx_assign(tx, TXG_WAIT); + if (err != 0) { + ASSERT(spa_exiting_any(spa)); + mutex_enter(&vd->vdev_trim_io_lock); + vd->vdev_trim_inflight[ta->trim_type]--; + mutex_exit(&vd->vdev_trim_io_lock); + dmu_tx_abort(tx); + return (err); + } uint64_t txg = dmu_tx_get_txg(tx); spa_config_enter(spa, SCL_STATE_ALL, vd, RW_READER); diff --git a/module/zfs/zap.c b/module/zfs/zap.c index dde05d7005c2..1ee4c5e767ca 100644 --- a/module/zfs/zap.c +++ b/module/zfs/zap.c @@ -545,6 +545,13 @@ zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt, ASSERT3U(db->db_size, ==, 1 << bs); ASSERT(blkid != 0); + zap_leaf_phys_t *zap_phys = db->db_data; + if (zap_phys->l_hdr.lh_block_type != ZBT_LEAF || + zap_phys->l_hdr.lh_magic != ZAP_LEAF_MAGIC) { + dmu_buf_rele(db, NULL); + return (SET_ERROR(EIO)); + } + zap_leaf_t *l = dmu_buf_get_user(db); if (l == NULL) @@ -559,8 +566,6 @@ zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt, dmu_buf_will_dirty(db, tx); ASSERT3U(l->l_blkid, ==, blkid); ASSERT3P(l->l_dbuf, ==, db); - ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_block_type, ==, ZBT_LEAF); - ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC); *lp = l; return (0); diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index 22e644f75f95..e1c5c7d807f9 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -1570,7 +1570,9 @@ zfs_ioc_pool_export(zfs_cmd_t *zc) boolean_t force = (boolean_t)zc->zc_cookie; boolean_t hardforce = (boolean_t)zc->zc_guid; - zfs_log_history(zc); + if (!force && !hardforce) + zfs_log_history(zc); + error = spa_export(zc->zc_name, NULL, force, hardforce); return (error); @@ -6984,7 +6986,7 @@ zfs_ioctl_register(const char *name, zfs_ioc_t ioc, zfs_ioc_func_t *func, vec->zvec_nvl_key_count = num_keys; } -static void +void zfs_ioctl_register_pool(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy, boolean_t log_history, zfs_ioc_poolcheck_t pool_check) @@ -7266,9 +7268,9 @@ zfs_ioctl_init(void) * does the logging of those commands. 
*/ zfs_ioctl_register_pool(ZFS_IOC_POOL_DESTROY, zfs_ioc_pool_destroy, - zfs_secpolicy_config, B_FALSE, POOL_CHECK_SUSPENDED); + zfs_secpolicy_config, B_FALSE, POOL_CHECK_NONE); zfs_ioctl_register_pool(ZFS_IOC_POOL_EXPORT, zfs_ioc_pool_export, - zfs_secpolicy_config, B_FALSE, POOL_CHECK_SUSPENDED); + zfs_secpolicy_config, B_FALSE, POOL_CHECK_NONE); zfs_ioctl_register_pool(ZFS_IOC_POOL_STATS, zfs_ioc_pool_stats, zfs_secpolicy_read, B_FALSE, POOL_CHECK_NONE); @@ -7276,10 +7278,10 @@ zfs_ioctl_init(void) zfs_secpolicy_read, B_FALSE, POOL_CHECK_NONE); zfs_ioctl_register_pool(ZFS_IOC_ERROR_LOG, zfs_ioc_error_log, - zfs_secpolicy_inject, B_FALSE, POOL_CHECK_SUSPENDED); + zfs_secpolicy_inject, B_FALSE, POOL_CHECK_NONE); zfs_ioctl_register_pool(ZFS_IOC_DSOBJ_TO_DSNAME, zfs_ioc_dsobj_to_dsname, - zfs_secpolicy_diff, B_FALSE, POOL_CHECK_SUSPENDED); + zfs_secpolicy_diff, B_FALSE, POOL_CHECK_NONE); zfs_ioctl_register_pool(ZFS_IOC_POOL_GET_HISTORY, zfs_ioc_pool_get_history, zfs_secpolicy_config, B_FALSE, POOL_CHECK_SUSPENDED); diff --git a/module/zfs/zil.c b/module/zfs/zil.c index eb26e4b32998..2538ffbe41d3 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -921,7 +921,12 @@ zil_create(zilog_t *zilog) */ if (BP_IS_HOLE(&blk) || BP_SHOULD_BYTESWAP(&blk)) { tx = dmu_tx_create(zilog->zl_os); - VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error != 0) { + ASSERT(dmu_objset_exiting(zilog->zl_os)); + dmu_tx_abort(tx); + return (NULL); + } dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); txg = dmu_tx_get_txg(tx); @@ -1001,6 +1006,7 @@ zil_destroy(zilog_t *zilog, boolean_t keep_first) lwb_t *lwb; dmu_tx_t *tx; uint64_t txg; + int error; /* * Wait for any previous destroy to complete. @@ -1013,7 +1019,12 @@ zil_destroy(zilog_t *zilog, boolean_t keep_first) return (B_FALSE); tx = dmu_tx_create(zilog->zl_os); - VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error != 0) { + ASSERT(dmu_objset_exiting(zilog->zl_os)); + dmu_tx_abort(tx); + return (B_FALSE); + } dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); txg = dmu_tx_get_txg(tx); @@ -1803,7 +1814,11 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) * should not be subject to the dirty data based delays. We * use TXG_NOTHROTTLE to bypass the delay mechanism. */ - VERIFY0(dmu_tx_assign(tx, TXG_WAIT | TXG_NOTHROTTLE)); + if (dmu_tx_assign(tx, TXG_WAIT | TXG_NOTHROTTLE) != 0) { + ASSERT(dmu_objset_exiting(zilog->zl_os)); + dmu_tx_abort(tx); + return (NULL); + } dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); txg = dmu_tx_get_txg(tx); @@ -3155,7 +3170,12 @@ static void zil_commit_itx_assign(zilog_t *zilog, zil_commit_waiter_t *zcw) { dmu_tx_t *tx = dmu_tx_create(zilog->zl_os); - VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + + if (dmu_tx_assign(tx, TXG_WAIT) != 0) { + ASSERT(dmu_objset_exiting(zilog->zl_os)); + dmu_tx_abort(tx); + return; + } itx_t *itx = zil_itx_create(TX_COMMIT, sizeof (lr_t)); itx->itx_sync = B_TRUE; @@ -3317,6 +3337,12 @@ zil_commit(zilog_t *zilog, uint64_t foid) return; } + /* + * If the objset is being forced to exit, there's nothing more to do. + */ + if (dmu_objset_exiting(zilog->zl_os)) + return; + /* * The ->zl_suspend_lock rwlock ensures that all in-flight * zil_commit() operations finish before suspension begins and that @@ -3715,14 +3741,16 @@ zil_close(zilog_t *zilog) * zil_sync() will guarantee all lwbs up to that txg have been * written out, flushed, and cleaned. 
*/ - if (txg != 0) - txg_wait_synced(zilog->zl_dmu_pool, txg); + if (!dmu_objset_exiting(zilog->zl_os)) { + if (txg != 0) + txg_wait_synced(zilog->zl_dmu_pool, txg); - if (zilog_is_dirty(zilog)) - zfs_dbgmsg("zil (%px) is dirty, txg %llu", zilog, - (u_longlong_t)txg); - if (txg < spa_freeze_txg(zilog->zl_spa)) - VERIFY(!zilog_is_dirty(zilog)); + if (zilog_is_dirty(zilog)) + zfs_dbgmsg("zil (%px) is dirty, txg %llu", zilog, + (u_longlong_t)txg); + if (txg < spa_freeze_txg(zilog->zl_spa)) + VERIFY(!zilog_is_dirty(zilog)); + } zilog->zl_get_data = NULL; @@ -3739,7 +3767,15 @@ zil_close(zilog_t *zilog) metaslab_fastwrite_unmark(zilog->zl_spa, &lwb->lwb_blk); list_remove(&zilog->zl_lwb_list, lwb); - zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); + if (lwb->lwb_buf != NULL) { + zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); + } else { + /* + * Pool is being force exported, while this lwb was + * between zil_lwb_flush_vdevs_done and zil_sync. + */ + ASSERT(spa_exiting(zilog->zl_spa)); + } zil_free_lwb(zilog, lwb); } mutex_exit(&zilog->zl_lock); diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 0924fb6f40bc..8567c1ae9324 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -2272,6 +2272,9 @@ zio_wait(zio_t *zio) error = cv_timedwait_io(&zio->io_cv, &zio->io_lock, ddi_get_lbolt() + timeout); + if (error != 0 && spa_exiting_any(zio->io_spa)) { + break; + } if (zfs_deadman_enabled && error == -1 && gethrtime() - zio->io_queued_timestamp > spa_deadman_ziotime(zio->io_spa)) { @@ -2284,6 +2287,14 @@ zio_wait(zio_t *zio) mutex_exit(&zio->io_lock); error = zio->io_error; + if (error != 0 && (zio->io_flags & ZIO_FLAG_CANFAIL) == 0 && + spa_exiting_any(zio->io_spa)) { + /* + * Don't report errors to the callers. In this context, the + * pool is being forcibly exported, so just throw it away. + */ + error = 0; + } zio_destroy(zio); return (error); @@ -2339,11 +2350,22 @@ zio_reexecute(void *arg) pio->io_flags = pio->io_orig_flags; pio->io_stage = pio->io_orig_stage; - pio->io_pipeline = pio->io_orig_pipeline; - pio->io_reexecute = 0; + if (spa_exiting_any(pio->io_spa)) { + /* + * This pool is being forcibly exported; skip everything and + * finish as soon as possible. + */ + pio->io_pipeline = ZIO_INTERLOCK_PIPELINE; + if (pio->io_error == 0) + pio->io_error = SET_ERROR(EIO); + pio->io_reexecute = ZIO_REEXECUTE_CANCELLED; + } else { + pio->io_pipeline = pio->io_orig_pipeline; + pio->io_error = 0; + pio->io_reexecute = 0; + } pio->io_flags |= ZIO_FLAG_REEXECUTED; pio->io_pipeline_trace = 0; - pio->io_error = 0; for (int w = 0; w < ZIO_WAIT_TYPES; w++) pio->io_state[w] = 0; for (int c = 0; c < ZIO_CHILD_TYPES; c++) @@ -2385,6 +2407,8 @@ zio_reexecute(void *arg) void zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t reason) { + dsl_pool_t *dp = spa_get_dsl(spa); + if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC) fm_panic("Pool '%s' has encountered an uncorrectable I/O " "failure and the failure mode property for this pool " @@ -2415,16 +2439,19 @@ zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t reason) } mutex_exit(&spa->spa_suspend_lock); + + /* Notify waiters that might care about this state transition. */ + for (int i = 0; i < SCL_LOCKS; i++) + cv_broadcast(&spa->spa_config_lock[i].scl_cv); + cv_broadcast(&spa->spa_evicting_os_cv); + txg_completion_notify(dp); } -int -zio_resume(spa_t *spa) +static zio_t * +zio_unsuspend(spa_t *spa) { zio_t *pio; - /* - * Reexecute all previously suspended i/o. 
- */ mutex_enter(&spa->spa_suspend_lock); spa->spa_suspended = ZIO_SUSPEND_NONE; cv_broadcast(&spa->spa_suspend_cv); @@ -2432,20 +2459,52 @@ zio_resume(spa_t *spa) spa->spa_suspend_zio_root = NULL; mutex_exit(&spa->spa_suspend_lock); - if (pio == NULL) - return (0); - - zio_reexecute(pio); - return (zio_wait(pio)); + return (pio); } void -zio_resume_wait(spa_t *spa) +zio_cancel(spa_t *spa) { + zio_t *pio; + + /* + * Interrupt all physical zios. + * Only meaningful in the context of a forced export. + */ mutex_enter(&spa->spa_suspend_lock); - while (spa_suspended(spa)) - cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock); + pio = spa->spa_suspend_zio_root; + spa->spa_suspend_zio_root = NULL; + cv_broadcast(&spa->spa_suspend_cv); mutex_exit(&spa->spa_suspend_lock); + if (pio == NULL) + return; + + zio_reexecute(pio); + (void) zio_wait(pio); +} + +int +zio_resume(spa_t *spa) +{ + zio_t *pio; + + /* + * Issue an async request to update the pool's configuration in case + * suspension occurred while such an update was in progress. This + * will restart the update process from the beginning. We could + * make it conditional, but it's safer not to. + */ + spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); + + /* + * Reexecute all previously suspended i/o. + */ + pio = zio_unsuspend(spa); + if (pio == NULL) + return (0); + + zio_reexecute(pio); + return (zio_wait(pio)); } /* @@ -4426,7 +4485,7 @@ zio_ready(zio_t *zio) return (NULL); } - if (zio->io_ready) { + if (zio->io_ready && zio->io_spa->spa_export_initiator == NULL) { ASSERT(IO_IS_ALLOCATING(zio)); ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) || (zio->io_flags & ZIO_FLAG_NOPWRITE)); @@ -4572,6 +4631,13 @@ zio_done(zio_t *zio) return (NULL); } + /* + * If the pool is forcibly exporting, make sure everything is + * thrown away, as nothing can be trusted now. + */ + if (spa_exiting_any(zio->io_spa) && zio->io_error == 0) + zio->io_error = SET_ERROR(EIO); + /* * If the allocation throttle is enabled, then update the accounting. * We only track child I/Os that are part of an allocating async @@ -4691,7 +4757,7 @@ zio_done(zio_t *zio) } } - if (zio->io_error) { + if (zio->io_error && !spa_exiting_any(zio->io_spa)) { /* * If this I/O is attached to a particular vdev, * generate an error message describing the I/O failure @@ -4831,7 +4897,20 @@ zio_done(zio_t *zio) } } - if ((pio = zio_unique_parent(zio)) != NULL) { + if (zio->io_reexecute & ZIO_REEXECUTE_CANCELLED) { + /* + * This zio had been marked for reexecute previously, + * and upon reexecution, found the pool being forcibly + * exported. Nothing to do now but clean up. + * + * This special flag is used because it allows the + * zio pipeline to mark all zios in the tree as + * cancelled, before cleaning them up. + */ + ASSERT3U(zio->io_error, !=, 0); + zio->io_reexecute = 0; + goto finish; + } else if ((pio = zio_unique_parent(zio)) != NULL) { /* * We're not a root i/o, so there's nothing to do * but notify our parent. Don't propagate errors @@ -4863,9 +4942,11 @@ zio_done(zio_t *zio) return (NULL); } - ASSERT(zio->io_child_count == 0); - ASSERT(zio->io_reexecute == 0); - ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL)); +finish: + ASSERT3U(zio->io_child_count, ==, 0); + ASSERT3U(zio->io_reexecute, ==, 0); + ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL) || + zio->io_spa->spa_export_initiator != NULL); /* * Report any checksum errors, since the I/O is complete. 
diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index cc4ce03677cb..ee7f3a7dda1d 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -389,7 +389,8 @@ tags = ['functional', 'cli_root', 'zpool_events'] [tests/functional/cli_root/zpool_export] tests = ['zpool_export_001_pos', 'zpool_export_002_pos', - 'zpool_export_003_neg', 'zpool_export_004_pos'] + 'zpool_export_003_neg', 'zpool_export_004_pos', 'zpool_export_005_pos', + 'zpool_export_006_pos', 'zpool_export_007_pos'] tags = ['functional', 'cli_root', 'zpool_export'] [tests/functional/cli_root/zpool_get] diff --git a/tests/zfs-tests/include/commands.cfg b/tests/zfs-tests/include/commands.cfg index b3cfe149ffa7..bb06c3cccfe7 100644 --- a/tests/zfs-tests/include/commands.cfg +++ b/tests/zfs-tests/include/commands.cfg @@ -52,6 +52,7 @@ export SYSTEM_FILES_COMMON='awk ln ls mkdir + mkfifo mknod mkfifo mktemp @@ -67,6 +68,7 @@ export SYSTEM_FILES_COMMON='awk pkill printf ps + pv python3 readlink rm diff --git a/tests/zfs-tests/include/tunables.cfg b/tests/zfs-tests/include/tunables.cfg index 80e7bcb3bd09..85575f15586b 100644 --- a/tests/zfs-tests/include/tunables.cfg +++ b/tests/zfs-tests/include/tunables.cfg @@ -34,6 +34,7 @@ DEADMAN_SYNCTIME_MS deadman.synctime_ms zfs_deadman_synctime_ms DEADMAN_ZIOTIME_MS deadman.ziotime_ms zfs_deadman_ziotime_ms DISABLE_IVSET_GUID_CHECK disable_ivset_guid_check zfs_disable_ivset_guid_check DMU_OFFSET_NEXT_SYNC dmu_offset_next_sync zfs_dmu_offset_next_sync +FORCED_EXPORT_UNMOUNT UNSUPPORTED zfs_forced_export_unmount_enabled INITIALIZE_CHUNK_SIZE initialize_chunk_size zfs_initialize_chunk_size INITIALIZE_VALUE initialize_value zfs_initialize_value KEEP_LOG_SPACEMAPS_AT_EXPORT keep_log_spacemaps_at_export zfs_keep_log_spacemaps_at_export diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index e671a3f6b02b..639ddb2a1efb 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1031,6 +1031,9 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zpool_export/zpool_export_002_pos.ksh \ functional/cli_root/zpool_export/zpool_export_003_neg.ksh \ functional/cli_root/zpool_export/zpool_export_004_pos.ksh \ + functional/cli_root/zpool_export/zpool_export_005_pos.ksh \ + functional/cli_root/zpool_export/zpool_export_006_pos.ksh \ + functional/cli_root/zpool_export/zpool_export_007_pos.ksh \ functional/cli_root/zpool_get/cleanup.ksh \ functional/cli_root/zpool_get/setup.ksh \ functional/cli_root/zpool_get/vdev_get_001_pos.ksh \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_events/zpool_events_follow.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_events/zpool_events_follow.ksh index 3311eb546676..a87747b65531 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_events/zpool_events_follow.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_events/zpool_events_follow.ksh @@ -49,16 +49,12 @@ log_must eval "zpool events -H -f > $EVENTS_FILE &" pid=$! # 3. Generate some ZFS events -for i in {1..$EVENTS_NUM}; do - log_must zpool clear $TESTPOOL -done +log_must zpool clear $TESTPOOL # wait a bit to allow the kernel module to process new events zpool_events_settle # 4. 
Verify 'zpool events -f' successfully recorded these new events EVENTS_LOG=$(wc -l < $EVENTS_FILE) -if [[ $EVENTS_LOG -ne $EVENTS_NUM ]]; then - log_fail "Unexpected number of events: $EVENTS_LOG != $EVENTS_NUM" -fi +log_must test $EVENTS_LOG -gt 0 log_pass "'zpool events -f' successfully follows new events." diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_005_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_005_pos.ksh index 5e8f1d7053e8..319015a8b43a 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_005_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_005_pos.ksh @@ -88,6 +88,7 @@ fi # online the device so the zpool will use the new space log_must zpool online -e $TESTPOOL1 $SDISK +log_must zpool sync $TESTPOOL1 typeset new_size=$(get_pool_prop size $TESTPOOL1) log_note "new pool size: $new_size" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export.kshlib b/tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export.kshlib index 54f805ea71a4..42b42d980ddc 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export.kshlib +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export.kshlib @@ -25,6 +25,12 @@ . $STF_SUITE/tests/functional/cli_root/zpool_export/zpool_export.cfg +function create_fifo +{ + log_must rm -f $1 + log_must mkfifo $1 +} + function zpool_export_cleanup { [[ -d $TESTDIR0 ]] && log_must rm -rf $TESTDIR0 diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export_005_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export_005_pos.ksh new file mode 100755 index 000000000000..1bd0de168021 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export_005_pos.ksh @@ -0,0 +1,93 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020 by Klara Systems, Inc. All rights reserved. +# + +. $STF_SUITE/tests/functional/cli_root/zpool_export/zpool_export.kshlib + +# +# DESCRIPTION: +# A pool should be force exportable, while a send is running from it. +# +# STRATEGY: +# 1. Initiate a send from pool to a file. +# 2. Slow the send using pv, so it blocks a normal pool export. +# 3. Check that normal export fails. +# 4. Forcibly export pool. +# 5. Verify pool is no longer present in the list output. +# + +verify_runnable "global" + +function cleanup { + [[ -n "$sendpid" ]] && kill -9 "$sendpid" + [[ -n "$pvpid" ]] && kill -9 $pvpid + [[ -n "$snapstream" ]] && rm -f "$snapstream" + zpool_export_cleanup +} + +log_onexit cleanup + +log_assert "Verify a pool can be forcibly exported while sending." 
+ snap=$TESTPOOL1/$TESTFS@$TESTSNAP +snapstream=$TEST_BASE_DIR/send.$$ + +vdev0=$TESTDIR0/$TESTFILE0 +log_must mkdir -p $TESTDIR0 +log_must truncate -s 1G $vdev0 +log_must zpool create -f $TESTPOOL1 $vdev0 +log_must zfs create $TESTPOOL1/$TESTFS + +mntpnt=$(get_prop mountpoint $TESTPOOL1/$TESTFS) +log_must dd if=/dev/urandom of=$mntpnt/$TESTFILE1 bs=1M count=16 + +log_must zfs snapshot $snap + +# Create FIFOs for the send, so the processes can be controlled and +# monitored individually. +create_fifo $TESTDIR0/snapfifo +zfs send $snap > $TESTDIR0/snapfifo & +sendpid=$! +pv -L 1k < $TESTDIR0/snapfifo > $snapstream & +pvpid=$! + +log_note "zfs send pid is $sendpid, pv pid is $pvpid" + +log_mustnot zpool export $TESTPOOL1 + +# Send should still be running; now try force export. +log_must kill -0 $sendpid +log_must zpool export -F $TESTPOOL1 + +lsout=$(ls -l $snapstream) +log_note "snapstream: $lsout" + +# Send should have exited non-zero. +log_mustnot wait $sendpid + +poolexists $TESTPOOL1 && \ + log_fail "$TESTPOOL1 unexpectedly found in 'zpool list' output." + +log_pass "Successfully forcibly exported a pool while sending." diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export_006_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export_006_pos.ksh new file mode 100755 index 000000000000..6b3bd6923d28 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export_006_pos.ksh @@ -0,0 +1,112 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020 by Klara Systems, Inc. All rights reserved. +# + +. $STF_SUITE/tests/functional/cli_root/zpool_export/zpool_export.kshlib + +# +# DESCRIPTION: +# A pool should be force exportable, while a receive into it is running. +# +# STRATEGY: +# 1. Initiate a send from pool A to pool B. +# 2. Slow the send using pv, so it blocks a normal pool export. +# 3. Check that normal export of pool B fails. +# 4. Forcibly export pool B. +# 5. Verify pool B is no longer present in the list output. +# + +verify_runnable "global" + +function cleanup { + [[ -n "$sendpid" ]] && kill -9 "$sendpid" + [[ -n "$recvpid" ]] && kill -9 "$recvpid" + [[ -n "$pvpid" ]] && kill -9 "$pvpid" + zpool_export_cleanup +} + +log_onexit cleanup + +log_assert "Verify a receiving pool can be forcibly exported."
+ +srcsnap=$TESTPOOL1/$TESTFS@$TESTSNAP +dstsnap=$TESTPOOL2/$TESTFS@$TESTSNAP + +vdev0=$TESTDIR0/$TESTFILE0 +vdev1=$TESTDIR0/$TESTFILE1 +log_must mkdir -p $TESTDIR0 +log_must truncate -s 1G $vdev0 $vdev1 +log_must zpool create -f $TESTPOOL1 $vdev0 +log_must zpool create -f $TESTPOOL2 $vdev1 +log_must zfs create $TESTPOOL1/$TESTFS + +mntpnt=$(get_prop mountpoint $TESTPOOL1/$TESTFS) +log_must dd if=/dev/urandom of=$mntpnt/$TESTFILE1 bs=1M count=16 + +log_must zfs snapshot $srcsnap + +# Create FIFOs for send and receive, so the processes can be controlled and +# monitored individually. +create_fifo $TESTDIR0/sendfifo +create_fifo $TESTDIR0/recvfifo + +zfs send $srcsnap > $TESTDIR0/sendfifo & +sendpid=$! +pv -L 1k < $TESTDIR0/sendfifo > $TESTDIR0/recvfifo & +pvpid=$! +zfs recv $dstsnap < $TESTDIR0/recvfifo & +recvpid=$! + +log_note "zfs send pid is $sendpid, recv pid is $recvpid, pv pid is $pvpid" + +log_note "Waiting until zfs receive has a chance to start ..." +typeset -i i=0 +typeset -i timeout=5 +while (( $i < $timeout )); do + zfs list $TESTPOOL2/$TESTFS >/dev/null 2>&1 && break + sleep 1 + ((i = i + 1)) +done +[[ $i -lt $timeout ]] || log_fail "receive failed to start" + +log_must zfs list $TESTPOOL2/$TESTFS + +log_mustnot zpool export $TESTPOOL2 + +# Send & receive should still be running; now try force export. +log_must kill -0 $sendpid +log_must kill -0 $recvpid +log_must zpool export -F $TESTPOOL2 + +# Both zfs send & recv should have exited non-zero. +log_mustnot wait $recvpid +log_mustnot wait $sendpid + +poolexists $TESTPOOL1 || \ + log_fail "$TESTPOOL1 should be in 'zpool list' output." +poolexists $TESTPOOL2 && \ + log_fail "$TESTPOOL2 unexpectedly found in 'zpool list' output." + +log_pass "Successfully forcibly exported a pool while receiving." diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export_007_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export_007_pos.ksh new file mode 100755 index 000000000000..8fb16e92dcf9 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export_007_pos.ksh @@ -0,0 +1,107 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020 by Klara Systems, Inc. All rights reserved. +# + +. $STF_SUITE/tests/functional/cli_root/zpool_export/zpool_export.kshlib + +# +# DESCRIPTION: +# A pool should be force exportable, while POSIX I/O is in flight. +# +# STRATEGY: +# 1. Write to a file that is held open, slowed using pv, so it blocks a +# normal filesystem unmount / pool export. +# 2. Check that normal export fails. +# 3. Forcibly export pool. +# 4. Verify pool is no longer present in the list output. 
+# +verify_runnable "global" + +function cleanup { + [[ -n "$ddinpid" ]] && kill -9 "$ddinpid" + [[ -n "$ddoutpid" ]] && kill -9 "$ddoutpid" + if is_linux; then + log_must set_tunable64 FORCED_EXPORT_UNMOUNT 0 + fi + zpool_export_cleanup +} + +log_onexit cleanup + +log_assert "Verify a pool can be forcibly exported while writing POSIX I/O" + +snap=$TESTPOOL1/$TESTFS@$TESTSNAP +snapstream=$TEST_BASE_DIR/send.$$ + +# On Linux, it's necessary to enable a tunable for the test to be able to +# kick the POSIX I/O user off. +if is_linux; then + log_must set_tunable64 FORCED_EXPORT_UNMOUNT 1 +fi + +vdev0=$TESTDIR0/$TESTFILE0 +log_must mkdir -p $TESTDIR0 +log_must truncate -s 1G $vdev0 +log_must zpool create -f $TESTPOOL1 $vdev0 +log_must zfs create $TESTPOOL1/$TESTFS + +mntpnt=$(get_prop mountpoint $TESTPOOL1/$TESTFS) + +# Create FIFOs for the writes, so the processes can be controlled and +# monitored individually. +create_fifo $TESTDIR0/writefifo +dd if=/dev/urandom bs=1M count=16 | pv -L 1k > $TESTDIR0/writefifo & +ddinpid=$! + +dd of=${mntpnt}/$TESTFILE1 < $TESTDIR0/writefifo & +ddoutpid=$! + +log_note "dd input pid is $ddinpid, dd output pid is $ddoutpid" + +log_note "Waiting until the output file starts filling ..." +typeset -i i=0 +typeset -i timeout=5 +while (( $i < $timeout )); do + test -f ${mntpnt}/$TESTFILE1 && break + sleep 1 + ((i = i + 1)) +done +[[ $i -lt $timeout ]] || log_fail "dd failed to start" + +log_mustnot zpool export $TESTPOOL1 + +# Write should still be running; now try force export. We must do this +# twice: the first attempt kicks dd off and is expected to fail. +log_must kill -0 $ddoutpid +log_mustnot zpool export -F $TESTPOOL1 +# Write should have exited non-zero. +log_mustnot wait $ddoutpid +log_must zpool export -F $TESTPOOL1 + +poolexists $TESTPOOL1 && \ + log_fail "$TESTPOOL1 unexpectedly found in 'zpool list' output." + +log_pass "Successfully forcibly exported a pool while writing POSIX I/O." diff --git a/tests/zfs-tests/tests/functional/fault/auto_online_001_pos.ksh b/tests/zfs-tests/tests/functional/fault/auto_online_001_pos.ksh index 950e80dfc186..a1b8b6a492cb 100755 --- a/tests/zfs-tests/tests/functional/fault/auto_online_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/fault/auto_online_001_pos.ksh @@ -105,7 +105,7 @@ log_must mkfile $FSIZE /$TESTPOOL/data for offline_disk in $autoonline_disks do - log_must zpool export -F $TESTPOOL + log_must zpool export -f $TESTPOOL host=$(get_scsi_host $offline_disk) diff --git a/tests/zfs-tests/tests/functional/mmp/mmp.cfg b/tests/zfs-tests/tests/functional/mmp/mmp.cfg index 9f7e76e27018..d25e96074a51 100644 --- a/tests/zfs-tests/tests/functional/mmp/mmp.cfg +++ b/tests/zfs-tests/tests/functional/mmp/mmp.cfg @@ -20,6 +20,7 @@ export PREV_UBER="$TEST_BASE_DIR/mmp-uber-prev.txt" export CURR_UBER="$TEST_BASE_DIR/mmp-uber-curr.txt" export DISK=${DISKS%% *} +export TESTPOOL="testpool.mmp" export HOSTID_FILE="/etc/hostid" export HOSTID1=01234567 diff --git a/tests/zfs-tests/tests/functional/mmp/mmp_inactive_import.ksh b/tests/zfs-tests/tests/functional/mmp/mmp_inactive_import.ksh index 6e7bb637548d..ee9468ce2a0b 100755 --- a/tests/zfs-tests/tests/functional/mmp/mmp_inactive_import.ksh +++ b/tests/zfs-tests/tests/functional/mmp/mmp_inactive_import.ksh @@ -57,19 +57,19 @@ default_setup_noexit $DISK log_must zpool set multihost=off $TESTPOOL for opt in "" "-f"; do - log_must zpool export -F $TESTPOOL + log_must zpool export -f $TESTPOOL log_must import_no_activity_check $TESTPOOL $opt done # 3.
Verify multihost=off and hostids differ (no activity check) -log_must zpool export -F $TESTPOOL +log_must zpool export -f $TESTPOOL log_must mmp_clear_hostid log_must mmp_set_hostid $HOSTID2 log_mustnot import_no_activity_check $TESTPOOL "" log_must import_no_activity_check $TESTPOOL "-f" # 4. Verify multihost=off and hostid zero allowed (no activity check) -log_must zpool export -F $TESTPOOL +log_must zpool export -f $TESTPOOL log_must mmp_clear_hostid log_mustnot import_no_activity_check $TESTPOOL "" log_must import_no_activity_check $TESTPOOL "-f" @@ -79,19 +79,19 @@ log_must mmp_pool_set_hostid $TESTPOOL $HOSTID1 log_must zpool set multihost=on $TESTPOOL for opt in "" "-f"; do - log_must zpool export -F $TESTPOOL + log_must zpool export -f $TESTPOOL log_must import_no_activity_check $TESTPOOL $opt done # 6. Verify multihost=on and hostids differ (activity check) -log_must zpool export -F $TESTPOOL +log_must zpool export -f $TESTPOOL log_must mmp_clear_hostid log_must mmp_set_hostid $HOSTID2 log_mustnot import_activity_check $TESTPOOL "" log_must import_activity_check $TESTPOOL "-f" # 7. Verify mmp_write and mmp_fail are set correctly -log_must zpool export -F $TESTPOOL +log_must zpool export -f $TESTPOOL log_must verify_mmp_write_fail_present ${DISK[0]} # 8. Verify multihost=on and hostid zero fails (no activity check) diff --git a/tests/zfs-tests/tests/functional/mmp/mmp_reset_interval.ksh b/tests/zfs-tests/tests/functional/mmp/mmp_reset_interval.ksh index 6063c6a3796b..1fd528856a59 100755 --- a/tests/zfs-tests/tests/functional/mmp/mmp_reset_interval.ksh +++ b/tests/zfs-tests/tests/functional/mmp/mmp_reset_interval.ksh @@ -97,15 +97,15 @@ for x in $(seq 10); do log_must mmp_set_hostid $HOSTID1 log_must zpool import $TESTPOOL elif [ $action -eq 1 ]; then - log_must zpool export -F $TESTPOOL + log_must zpool export -f $TESTPOOL log_must zpool import $TESTPOOL elif [ $action -eq 2 ]; then - log_must zpool export -F $TESTPOOL + log_must zpool export -f $TESTPOOL log_must mmp_clear_hostid log_must mmp_set_hostid $HOSTID2 log_must zpool import -f $TESTPOOL elif [ $action -eq 3 ]; then - log_must zpool export -F $TESTPOOL + log_must zpool export -f $TESTPOOL log_must set_tunable64 MULTIHOST_INTERVAL $MMP_INTERVAL_MIN log_must zpool import $TESTPOOL elif [ $action -eq 4 ]; then diff --git a/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_lun_expsz.ksh b/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_lun_expsz.ksh index 7cfc3b1829bc..9637a39217bb 100755 --- a/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_lun_expsz.ksh +++ b/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_lun_expsz.ksh @@ -47,6 +47,7 @@ log_must zpool checkpoint $NESTEDPOOL log_must truncate -s $EXPSZ $FILEDISK1 log_must zpool online -e $NESTEDPOOL $FILEDISK1 +log_must zpool sync $NESTEDPOOL NEWSZ=$(zpool list -v | awk -v d="$FILEDISK1" '$0 ~ d {print $2}') DEXPSZ=$(zpool list -v | awk -v d="$FILEDISK1" '$0 ~ d {print $6}') nested_change_state_after_checkpoint