diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 35a59710c05e..097f57c1e9c6 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -357,7 +357,7 @@ get_usage(zpool_help_t idx) case HELP_DETACH: return (gettext("\tdetach \n")); case HELP_EXPORT: - return (gettext("\texport [-af] ...\n")); + return (gettext("\texport [-afF] ...\n")); case HELP_HISTORY: return (gettext("\thistory [-il] [] ...\n")); case HELP_IMPORT: @@ -1826,7 +1826,7 @@ zpool_export_one(zpool_handle_t *zhp, void *data) { export_cbdata_t *cb = data; - if (zpool_disable_datasets(zhp, cb->force) != 0) + if (zpool_disable_datasets(zhp, cb->force || cb->hardforce) != 0) return (1); /* The history must be logged as part of the export */ @@ -1847,10 +1847,13 @@ zpool_export_one(zpool_handle_t *zhp, void *data) * * -a Export all pools * -f Forcefully unmount datasets + * -F Forcefully export, dropping all outstanding dirty data * * Export the given pools. By default, the command will attempt to cleanly * unmount any active datasets within the pool. If the '-f' flag is specified, - * then the datasets will be forcefully unmounted. + * then the datasets will be forcefully unmounted. If the '-F' flag is + * specified, the pool's dirty data, if any, will simply be dropped after a + * best-effort attempt to forcibly stop all activity. */ int zpool_do_export(int argc, char **argv) diff --git a/include/libzfs.h b/include/libzfs.h index 9ef280636d4c..b1931b924af8 100644 --- a/include/libzfs.h +++ b/include/libzfs.h @@ -397,6 +397,7 @@ typedef enum { ZPOOL_STATUS_NON_NATIVE_ASHIFT, /* (e.g. 512e dev with ashift of 9) */ ZPOOL_STATUS_COMPATIBILITY_ERR, /* bad 'compatibility' property */ ZPOOL_STATUS_INCOMPATIBLE_FEAT, /* feature set outside compatibility */ + ZPOOL_STATUS_FORCE_EXPORTING, /* pool is being force exported */ /* * Finally, the following indicates a healthy pool. 
diff --git a/include/os/freebsd/spl/sys/thread.h b/include/os/freebsd/spl/sys/thread.h index 4fb1a542f55f..9fa900d37d3c 100644 --- a/include/os/freebsd/spl/sys/thread.h +++ b/include/os/freebsd/spl/sys/thread.h @@ -31,4 +31,7 @@ #define getcomm() curthread->td_name #define getpid() curthread->td_tid +#define thread_signal spl_kthread_signal +extern int spl_kthread_signal(kthread_t *tsk, int sig); + #endif diff --git a/include/os/linux/spl/sys/thread.h b/include/os/linux/spl/sys/thread.h index 220742387b62..755a917bf3fa 100644 --- a/include/os/linux/spl/sys/thread.h +++ b/include/os/linux/spl/sys/thread.h @@ -55,6 +55,7 @@ typedef void (*thread_func_t)(void *); #func, arg, len, pp, state, pri) /* END CSTYLED */ +#define thread_signal(t, s) spl_kthread_signal(t, s) #define thread_exit() __thread_exit() #define thread_join(t) VERIFY(0) #define curthread current @@ -67,6 +68,7 @@ extern kthread_t *__thread_create(caddr_t stk, size_t stksize, extern void __thread_exit(void); extern struct task_struct *spl_kthread_create(int (*func)(void *), void *data, const char namefmt[], ...); +extern int spl_kthread_signal(kthread_t *tsk, int sig); extern proc_t p0; diff --git a/include/os/linux/zfs/sys/zfs_vfsops_os.h b/include/os/linux/zfs/sys/zfs_vfsops_os.h index 8e03ae99a7fd..1de4b5149805 100644 --- a/include/os/linux/zfs/sys/zfs_vfsops_os.h +++ b/include/os/linux/zfs/sys/zfs_vfsops_os.h @@ -101,7 +101,8 @@ struct zfsvfs { boolean_t z_utf8; /* utf8-only */ int z_norm; /* normalization flags */ boolean_t z_relatime; /* enable relatime mount option */ - boolean_t z_unmounted; /* unmounted */ + boolean_t z_unmounted; /* mount status */ + boolean_t z_force_unmounted; /* force-unmounted status */ rrmlock_t z_teardown_lock; krwlock_t z_teardown_inactive_lock; list_t z_all_znodes; /* all znodes in the fs */ diff --git a/include/os/linux/zfs/sys/zfs_znode_impl.h b/include/os/linux/zfs/sys/zfs_znode_impl.h index be211c5b51da..bb655f97e583 100644 --- 
a/include/os/linux/zfs/sys/zfs_znode_impl.h +++ b/include/os/linux/zfs/sys/zfs_znode_impl.h @@ -82,11 +82,15 @@ extern "C" { #define zhold(zp) VERIFY3P(igrab(ZTOI((zp))), !=, NULL) #define zrele(zp) iput(ZTOI((zp))) +/* True if the fs is unmounted or force-unmounted; either must fail ZFS_ENTER. */ +#define zfsvfs_is_unmounted(zfsvfs) \ + ((zfsvfs)->z_unmounted || (zfsvfs)->z_force_unmounted) + /* Called on entry to each ZFS inode and vfs operation. */ #define ZFS_ENTER_ERROR(zfsvfs, error) \ do { \ ZFS_TEARDOWN_ENTER_READ(zfsvfs, FTAG); \ - if (unlikely((zfsvfs)->z_unmounted)) { \ + if (unlikely(zfsvfs_is_unmounted(zfsvfs))) { \ ZFS_TEARDOWN_EXIT_READ(zfsvfs, FTAG); \ return (error); \ } \ @@ -94,6 +98,18 @@ do { \ #define ZFS_ENTER(zfsvfs) ZFS_ENTER_ERROR(zfsvfs, EIO) #define ZPL_ENTER(zfsvfs) ZFS_ENTER_ERROR(zfsvfs, -EIO) +/* ZFS_ENTER but ok with forced unmount having begun */ +#define _ZFS_ENTER_UNMOUNTOK(zfsvfs, error) \ +do { \ + rrm_enter_read(&(zfsvfs)->z_teardown_lock, FTAG); \ + if ((zfsvfs)->z_unmounted == B_TRUE) { \ + ZFS_EXIT(zfsvfs); \ + return (error); \ + } \ +} while (0) +#define ZFS_ENTER_UNMOUNTOK(zfsvfs) _ZFS_ENTER_UNMOUNTOK(zfsvfs, EIO) +#define ZPL_ENTER_UNMOUNTOK(zfsvfs) _ZFS_ENTER_UNMOUNTOK(zfsvfs, -EIO) + /* Must be called before exiting the operation. 
*/ #define ZFS_EXIT(zfsvfs) \ do { \ diff --git a/include/sys/arc.h b/include/sys/arc.h index f58fa53b6003..7e5ca12c54a8 100644 --- a/include/sys/arc.h +++ b/include/sys/arc.h @@ -329,6 +329,7 @@ void l2arc_fini(void); void l2arc_start(void); void l2arc_stop(void); void l2arc_spa_rebuild_start(spa_t *spa); +void l2arc_spa_rebuild_stop(spa_t *spa); #ifndef _KERNEL extern boolean_t arc_watch; diff --git a/include/sys/dmu.h b/include/sys/dmu.h index 10e29a45c89f..b1b49db23261 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -276,6 +276,7 @@ typedef enum dmu_object_type { #define TXG_NOWAIT (0ULL) #define TXG_WAIT (1ULL<<0) #define TXG_NOTHROTTLE (1ULL<<1) +#define TXG_NOSUSPEND (1ULL<<2) void byteswap_uint64_array(void *buf, size_t size); void byteswap_uint32_array(void *buf, size_t size); diff --git a/include/sys/dmu_impl.h b/include/sys/dmu_impl.h index def4aadba1d0..eacdc7d6aa94 100644 --- a/include/sys/dmu_impl.h +++ b/include/sys/dmu_impl.h @@ -241,6 +241,7 @@ typedef struct dmu_sendstatus { list_node_t dss_link; int dss_outfd; proc_t *dss_proc; + kthread_t *dss_thread; offset_t *dss_off; uint64_t dss_blocks; /* blocks visited during the sending process */ } dmu_sendstatus_t; diff --git a/include/sys/dmu_objset.h b/include/sys/dmu_objset.h index e89ee64ea686..142f772caa61 100644 --- a/include/sys/dmu_objset.h +++ b/include/sys/dmu_objset.h @@ -172,6 +172,7 @@ struct objset { /* Protected by os_lock */ kmutex_t os_lock; + kthread_t *os_shutdown_initiator; multilist_t os_dirty_dnodes[TXG_SIZE]; list_t os_dnodes; list_t os_downgraded_dbufs; @@ -263,6 +264,10 @@ int dmu_fsname(const char *snapname, char *buf); void dmu_objset_evict_done(objset_t *os); void dmu_objset_willuse_space(objset_t *os, int64_t space, dmu_tx_t *tx); +int dmu_objset_shutdown_register(objset_t *os); +boolean_t dmu_objset_exiting(objset_t *os); +void dmu_objset_shutdown_unregister(objset_t *os); + void dmu_objset_init(void); void dmu_objset_fini(void); diff --git 
a/include/sys/dmu_recv.h b/include/sys/dmu_recv.h index 7188b2a02248..1a9763b9aee9 100644 --- a/include/sys/dmu_recv.h +++ b/include/sys/dmu_recv.h @@ -39,6 +39,7 @@ extern const char *recv_clone_name; typedef struct dmu_recv_cookie { struct dsl_dataset *drc_ds; + kthread_t *drc_initiator; struct dmu_replay_record *drc_drr_begin; struct drr_begin *drc_drrb; const char *drc_tofs; @@ -55,6 +56,8 @@ typedef struct dmu_recv_cookie { nvlist_t *drc_keynvl; uint64_t drc_fromsnapobj; uint64_t drc_ivset_guid; + unsigned int drc_flags; + void *drc_rwa; void *drc_owner; cred_t *drc_cred; proc_t *drc_proc; @@ -81,6 +84,7 @@ int dmu_recv_begin(char *, char *, dmu_replay_record_t *, boolean_t, boolean_t, nvlist_t *, nvlist_t *, char *, dmu_recv_cookie_t *, zfs_file_t *, offset_t *); int dmu_recv_stream(dmu_recv_cookie_t *, offset_t *); +int dmu_recv_close(dsl_dataset_t *ds); int dmu_recv_end(dmu_recv_cookie_t *, void *); boolean_t dmu_objset_is_receiving(objset_t *); diff --git a/include/sys/dmu_send.h b/include/sys/dmu_send.h index d150f816c945..525facf09b92 100644 --- a/include/sys/dmu_send.h +++ b/include/sys/dmu_send.h @@ -60,6 +60,7 @@ int dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap, boolean_t embedok, boolean_t large_block_ok, boolean_t compressok, boolean_t rawok, boolean_t savedok, int outfd, offset_t *off, struct dmu_send_outparams *dso); +int dmu_send_close(struct dsl_dataset *ds); typedef int (*dmu_send_outfunc_t)(objset_t *os, void *buf, int len, void *arg); typedef struct dmu_send_outparams { diff --git a/include/sys/dsl_dataset.h b/include/sys/dsl_dataset.h index ed934f969e92..ac730b734dbb 100644 --- a/include/sys/dsl_dataset.h +++ b/include/sys/dsl_dataset.h @@ -243,6 +243,8 @@ typedef struct dsl_dataset { kmutex_t ds_sendstream_lock; list_t ds_sendstreams; + void *ds_receiver; /* really a dmu_recv_cookie_t */ + /* * When in the middle of a resumable receive, tracks how much * progress we have made. 
@@ -317,7 +319,8 @@ typedef struct dsl_dataset_snapshot_arg { /* flags for holding the dataset */ typedef enum ds_hold_flags { DS_HOLD_FLAG_NONE = 0 << 0, - DS_HOLD_FLAG_DECRYPT = 1 << 0 /* needs access to encrypted data */ + DS_HOLD_FLAG_DECRYPT = 1 << 0, /* needs access to encrypted data */ + DS_HOLD_FLAG_MUST_BE_OPEN = 1 << 1, /* dataset must already be open */ } ds_hold_flags_t; int dsl_dataset_hold(struct dsl_pool *dp, const char *name, void *tag, @@ -445,6 +448,8 @@ void dsl_dataset_long_hold(dsl_dataset_t *ds, void *tag); void dsl_dataset_long_rele(dsl_dataset_t *ds, void *tag); boolean_t dsl_dataset_long_held(dsl_dataset_t *ds); +int dsl_dataset_sendrecv_cancel_all(spa_t *spa); + int dsl_dataset_clone_swap_check_impl(dsl_dataset_t *clone, dsl_dataset_t *origin_head, boolean_t force, void *owner, dmu_tx_t *tx); void dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone, diff --git a/include/sys/dsl_scan.h b/include/sys/dsl_scan.h index fb1f1d65bad4..9983cb4ca3c8 100644 --- a/include/sys/dsl_scan.h +++ b/include/sys/dsl_scan.h @@ -172,7 +172,7 @@ int dsl_scan(struct dsl_pool *, pool_scan_func_t); void dsl_scan_assess_vdev(struct dsl_pool *dp, vdev_t *vd); boolean_t dsl_scan_scrubbing(const struct dsl_pool *dp); int dsl_scrub_set_pause_resume(const struct dsl_pool *dp, pool_scrub_cmd_t cmd); -void dsl_scan_restart_resilver(struct dsl_pool *, uint64_t txg); +int dsl_scan_restart_resilver(struct dsl_pool *, uint64_t txg); boolean_t dsl_scan_resilvering(struct dsl_pool *dp); boolean_t dsl_scan_resilver_scheduled(struct dsl_pool *dp); boolean_t dsl_dataset_unstable(struct dsl_dataset *ds); diff --git a/include/sys/metaslab.h b/include/sys/metaslab.h index ecff65f13de5..bada938de69b 100644 --- a/include/sys/metaslab.h +++ b/include/sys/metaslab.h @@ -111,6 +111,7 @@ boolean_t metaslab_class_throttle_reserve(metaslab_class_t *, int, int, zio_t *, int); void metaslab_class_throttle_unreserve(metaslab_class_t *, int, int, zio_t *); void 
metaslab_class_evict_old(metaslab_class_t *, uint64_t); +void metaslab_class_force_discard(metaslab_class_t *); uint64_t metaslab_class_get_alloc(metaslab_class_t *); uint64_t metaslab_class_get_space(metaslab_class_t *); uint64_t metaslab_class_get_dspace(metaslab_class_t *); diff --git a/include/sys/spa.h b/include/sys/spa.h index d37c6c923d8c..3d352f205d42 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -834,16 +834,13 @@ extern kmutex_t spa_namespace_lock; * SPA configuration functions in spa_config.c */ -#define SPA_CONFIG_UPDATE_POOL 0 -#define SPA_CONFIG_UPDATE_VDEVS 1 - extern void spa_write_cachefile(spa_t *, boolean_t, boolean_t); extern void spa_config_load(void); extern nvlist_t *spa_all_configs(uint64_t *); extern void spa_config_set(spa_t *spa, nvlist_t *config); extern nvlist_t *spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats); -extern void spa_config_update(spa_t *spa, int what); +extern int spa_config_update_pool(spa_t *spa); extern int spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, int atype); @@ -959,6 +956,13 @@ extern void spa_iostats_trim_add(spa_t *spa, trim_type_t type, uint64_t extents_written, uint64_t bytes_written, uint64_t extents_skipped, uint64_t bytes_skipped, uint64_t extents_failed, uint64_t bytes_failed); + +/* Config lock handling flags */ +typedef enum { + SCL_FLAG_TRYENTER = 1U << 0, + SCL_FLAG_NOSUSPEND = 1U << 1, +} spa_config_flag_t; + extern void spa_import_progress_add(spa_t *spa); extern void spa_import_progress_remove(uint64_t spa_guid); extern int spa_import_progress_set_mmp_check(uint64_t pool_guid, @@ -970,6 +974,8 @@ extern int spa_import_progress_set_state(uint64_t pool_guid, /* Pool configuration locks */ extern int spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw); +extern int spa_config_enter_flags(spa_t *spa, int locks, const void *tag, + krw_t rw, spa_config_flag_t flags); extern void spa_config_enter(spa_t *spa, int locks, 
const void *tag, krw_t rw); extern void spa_config_exit(spa_t *spa, int locks, const void *tag); extern int spa_config_held(spa_t *spa, int locks, krw_t rw); @@ -1018,6 +1024,7 @@ extern uint64_t spa_last_synced_txg(spa_t *spa); extern uint64_t spa_first_txg(spa_t *spa); extern uint64_t spa_syncing_txg(spa_t *spa); extern uint64_t spa_final_dirty_txg(spa_t *spa); +extern void spa_verify_dirty_txg(spa_t *spa, uint64_t txg); extern uint64_t spa_version(spa_t *spa); extern pool_state_t spa_state(spa_t *spa); extern spa_load_state_t spa_load_state(spa_t *spa); @@ -1037,6 +1044,8 @@ extern metaslab_class_t *spa_dedup_class(spa_t *spa); extern metaslab_class_t *spa_preferred_class(spa_t *spa, uint64_t size, dmu_object_type_t objtype, uint_t level, uint_t special_smallblk); +extern void spa_evicting_os_lock(spa_t *); +extern void spa_evicting_os_unlock(spa_t *); extern void spa_evicting_os_register(spa_t *, objset_t *os); extern void spa_evicting_os_deregister(spa_t *, objset_t *os); extern void spa_evicting_os_wait(spa_t *spa); @@ -1126,6 +1135,10 @@ extern void spa_history_log_internal_dd(dsl_dir_t *dd, const char *operation, extern const char *spa_state_to_name(spa_t *spa); +extern boolean_t spa_exiting_any(spa_t *spa); +extern boolean_t spa_exiting(spa_t *spa); +extern int spa_operation_interrupted(spa_t *spa); + /* error handling */ struct zbookmark_phys; extern void spa_log_error(spa_t *spa, const zbookmark_phys_t *zb); diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index bc88cfa15e8e..3107e0ec052d 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -239,6 +239,7 @@ struct spa { kmutex_t spa_evicting_os_lock; /* Evicting objset list lock */ list_t spa_evicting_os_list; /* Objsets being evicted. 
*/ kcondvar_t spa_evicting_os_cv; /* Objset Eviction Completion */ + kthread_t *spa_export_initiator; /* thread exporting the pool */ txg_list_t spa_vdev_txg_list; /* per-txg dirty vdev list */ vdev_t *spa_root_vdev; /* top-level vdev container */ uint64_t spa_min_ashift; /* of vdevs in normal class */ diff --git a/include/sys/txg.h b/include/sys/txg.h index 22158bd1a5e6..b0adbd147701 100644 --- a/include/sys/txg.h +++ b/include/sys/txg.h @@ -66,11 +66,25 @@ typedef struct txg_list { } txg_list_t; struct dsl_pool; +struct dmu_tx; + +/* + * TXG wait flags, used by txg_wait_synced_tx and callers to indicate + * modifications to how they wish to wait for a txg. + */ +typedef enum { + /* No special wait flags. */ + TXG_WAIT_F_NONE = 0, + /* Reject the call with EINTR upon receiving a signal. */ + TXG_WAIT_F_SIGNAL = (1U << 0), + /* Reject the call with EAGAIN upon suspension. */ + TXG_WAIT_F_NOSUSPEND = (1U << 1), +} txg_wait_flag_t; extern void txg_init(struct dsl_pool *dp, uint64_t txg); extern void txg_fini(struct dsl_pool *dp); extern void txg_sync_start(struct dsl_pool *dp); -extern void txg_sync_stop(struct dsl_pool *dp); +extern int txg_sync_stop(struct dsl_pool *dp, txg_wait_flag_t txg_how); extern uint64_t txg_hold_open(struct dsl_pool *dp, txg_handle_t *txghp); extern void txg_rele_to_quiesce(txg_handle_t *txghp); extern void txg_rele_to_sync(txg_handle_t *txghp); @@ -84,14 +98,23 @@ extern void txg_kick(struct dsl_pool *dp); * Wait until the given transaction group has finished syncing. * Try to make this happen as soon as possible (eg. kick off any * necessary syncs immediately). If txg==0, wait for the currently open - * txg to finish syncing. + * txg to finish syncing. This may be interrupted due to an exiting pool. + * + * If desired, flags can be specified using txg_wait_synced_tx(), in case + * the caller wants to be interruptible. 
*/ extern void txg_wait_synced(struct dsl_pool *dp, uint64_t txg); +extern int txg_wait_synced_tx(struct dsl_pool *dp, uint64_t txg, + struct dmu_tx *tx, txg_wait_flag_t flags); +extern int txg_wait_synced_flags(struct dsl_pool *dp, uint64_t txg, + txg_wait_flag_t flags); /* - * Wait as above. Returns true if the thread was signaled while waiting. + * Similar to a txg_wait_synced but it can be interrupted from a signal. + * Returns B_TRUE if the thread was signaled while waiting. */ -extern boolean_t txg_wait_synced_sig(struct dsl_pool *dp, uint64_t txg); +#define txg_wait_synced_sig(dp, txg) \ + (txg_wait_synced_tx(dp, txg, NULL, TXG_WAIT_F_SIGNAL) == EINTR) /* * Wait until the given transaction group, or one after it, is @@ -102,6 +125,8 @@ extern boolean_t txg_wait_synced_sig(struct dsl_pool *dp, uint64_t txg); extern void txg_wait_open(struct dsl_pool *dp, uint64_t txg, boolean_t should_quiesce); +void txg_force_export(spa_t *spa); + /* * Returns TRUE if we are "backed up" waiting for the syncing * transaction to complete; otherwise returns FALSE. @@ -113,6 +138,8 @@ extern boolean_t txg_sync_waiting(struct dsl_pool *dp); extern void txg_verify(spa_t *spa, uint64_t txg); +extern void txg_completion_notify(struct dsl_pool *dp); + /* * Wait for pending commit callbacks of already-synced transactions to finish * processing. 
diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h index aa4338ed2859..cbf879183b1e 100644 --- a/include/sys/zfs_context.h +++ b/include/sys/zfs_context.h @@ -230,6 +230,7 @@ typedef pthread_t kthread_t; zk_thread_create(func, arg, stksize, state) #define thread_create(stk, stksize, func, arg, len, pp, state, pri) \ zk_thread_create(func, arg, stksize, state) +#define thread_signal(t, s) pthread_kill((pthread_t)(t), (s)) #define thread_exit() pthread_exit(NULL) #define thread_join(t) pthread_join((pthread_t)(t), NULL) diff --git a/include/sys/zfs_refcount.h b/include/sys/zfs_refcount.h index fc0cbea1cf7c..a981c10be7b2 100644 --- a/include/sys/zfs_refcount.h +++ b/include/sys/zfs_refcount.h @@ -60,7 +60,7 @@ typedef struct refcount { /* * Note: zfs_refcount_t must be initialized with - * refcount_create[_untracked]() + * zfs_refcount_create[_untracked]() */ void zfs_refcount_create(zfs_refcount_t *); diff --git a/include/sys/zio.h b/include/sys/zio.h index c792cb65b67a..2cabd6833d51 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -398,6 +398,7 @@ typedef zio_t *zio_pipe_stage_t(zio_t *zio); */ #define ZIO_REEXECUTE_NOW 0x01 #define ZIO_REEXECUTE_SUSPEND 0x02 +#define ZIO_REEXECUTE_CANCELLED 0x04 /* * The io_trim flags are used to specify the type of TRIM to perform. 
They @@ -623,7 +624,7 @@ extern uint8_t zio_complevel_select(spa_t *spa, enum zio_compress compress, extern void zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t); extern int zio_resume(spa_t *spa); -extern void zio_resume_wait(spa_t *spa); +extern void zio_cancel(spa_t *spa); extern boolean_t zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, boolean_t config_held, enum blk_verify_flag blk_verify); diff --git a/lib/libzfs/libzfs_dataset.c b/lib/libzfs/libzfs_dataset.c index 99e352dd4883..0128932b24ef 100644 --- a/lib/libzfs/libzfs_dataset.c +++ b/lib/libzfs/libzfs_dataset.c @@ -476,8 +476,6 @@ make_dataset_handle_common(zfs_handle_t *zhp, zfs_cmd_t *zc) zfs_handle_t * make_dataset_handle(libzfs_handle_t *hdl, const char *path) { - zfs_cmd_t zc = {"\0"}; - zfs_handle_t *zhp = calloc(1, sizeof (zfs_handle_t)); if (zhp == NULL) @@ -485,20 +483,37 @@ make_dataset_handle(libzfs_handle_t *hdl, const char *path) zhp->zfs_hdl = hdl; (void) strlcpy(zhp->zfs_name, path, sizeof (zhp->zfs_name)); - if (zcmd_alloc_dst_nvlist(hdl, &zc, 0) != 0) { - free(zhp); - return (NULL); - } - if (get_stats_ioctl(zhp, &zc) == -1) { + + if (!hdl->libzfs_force_export) { + zfs_cmd_t zc = {"\0"}; + + if (zcmd_alloc_dst_nvlist(hdl, &zc, 0) != 0) { + free(zhp); + return (NULL); + } + if (get_stats_ioctl(zhp, &zc) == -1) { + zcmd_free_nvlists(&zc); + free(zhp); + return (NULL); + } + if (make_dataset_handle_common(zhp, &zc) == -1) { + free(zhp); + zhp = NULL; + } zcmd_free_nvlists(&zc); - free(zhp); - return (NULL); - } - if (make_dataset_handle_common(zhp, &zc) == -1) { - free(zhp); - zhp = NULL; + } else { + /* + * Called from zpool_disable_datasets() which sets force + * export and uses mount entries, so de facto the dataset + * is a ZFS filesystem. 
Furthermore, we need to avoid + * calling get_stats_ioctl() here since it results in + * zfs_ioc_objset_stats()->dmu_objset_hold() being called by + * the kernel which can potentially cause IO to be issued + * depending on what's currently cached in ARC. + */ + zhp->zfs_dmustats.dds_type = DMU_OST_ZFS; + zhp->zfs_type = ZFS_TYPE_FILESYSTEM; } - zcmd_free_nvlists(&zc); return (zhp); } diff --git a/lib/libzfs/libzfs_impl.h b/lib/libzfs/libzfs_impl.h index ce7373582f0e..21abe1751e40 100644 --- a/lib/libzfs/libzfs_impl.h +++ b/lib/libzfs/libzfs_impl.h @@ -70,6 +70,7 @@ struct libzfs_handle { uint64_t libzfs_max_nvlist; void *libfetch; char *libfetch_load_error; + boolean_t libzfs_force_export; }; struct zfs_handle { diff --git a/lib/libzfs/libzfs_mount.c b/lib/libzfs/libzfs_mount.c index b074a6e6f371..5fbcbccc5284 100644 --- a/lib/libzfs/libzfs_mount.c +++ b/lib/libzfs/libzfs_mount.c @@ -1533,6 +1533,7 @@ zpool_disable_datasets(zpool_handle_t *zhp, boolean_t force) int ret = -1; int flags = (force ? MS_FORCE : 0); + hdl->libzfs_force_export = force; namelen = strlen(zhp->zpool_name); if ((mnttab = fopen(MNTTAB, "re")) == NULL) @@ -1634,9 +1635,15 @@ zpool_disable_datasets(zpool_handle_t *zhp, boolean_t force) goto out; } - for (i = 0; i < used; i++) { - if (datasets[i]) - remove_mountpoint(datasets[i]); + /* + * Remove mountpoints, unless the pool is being forcibly exported. + * In the latter case, avoid potentially initiating I/O on the pool. 
+ */ + if (!hdl->libzfs_force_export) { + for (i = 0; i < used; i++) { + if (datasets[i]) + remove_mountpoint(datasets[i]); + } } ret = 0; diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index c0bf9d067d42..f26630b86c10 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -260,7 +260,9 @@ zpool_get_state_str(zpool_handle_t *zhp) status = zpool_get_status(zhp, NULL, &errata); - if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) { + if (status == ZPOOL_STATUS_FORCE_EXPORTING) { + str = gettext("FORCE-EXPORTING"); + } else if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) { str = gettext("FAULTED"); } else if (status == ZPOOL_STATUS_IO_FAILURE_WAIT || status == ZPOOL_STATUS_IO_FAILURE_MMP) { @@ -1481,7 +1483,9 @@ zpool_destroy(zpool_handle_t *zhp, const char *log_str) } if (zfp) { - remove_mountpoint(zfp); + /* Avoid initiating I/O during a forced export. */ + if (!hdl->libzfs_force_export) + remove_mountpoint(zfp); zfs_close(zfp); } diff --git a/lib/libzfs/libzfs_status.c b/lib/libzfs/libzfs_status.c index 33d6e1bfdf80..be161028bba5 100644 --- a/lib/libzfs/libzfs_status.c +++ b/lib/libzfs/libzfs_status.c @@ -236,6 +236,9 @@ check_status(nvlist_t *config, boolean_t isimport, uint64_t errata = 0; unsigned long system_hostid = get_system_hostid(); + if (config == NULL) + return (ZPOOL_STATUS_FORCE_EXPORTING); + verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) == 0); verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, diff --git a/lib/libzfsbootenv/libzfsbootenv.abi b/lib/libzfsbootenv/libzfsbootenv.abi index 8ef242d2f5ac..8fa725db0cf0 100644 --- a/lib/libzfsbootenv/libzfsbootenv.abi +++ b/lib/libzfsbootenv/libzfsbootenv.abi @@ -1,17 +1,7 @@ - - - - - - - - - - @@ -24,189 +14,345 @@ - - - - - - - + + + + + + + + + + + + + + + - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - - - - - - - - - - - - - - - - - + + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 6da8d42b42bd..f3e680c20c17 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -932,6 +932,18 @@ receive of encrypted datasets. Intended for users whose pools were created with OpenZFS pre-release versions and now have compatibility issues. . +.It Sy zfs_forced_export_unmount Ns = Ns Sy 0 Ns | Ns 1 Pq int +During forced unmount, leave the filesystem in a disabled mode of operation, +in which all new I/Os fail, except for those required to unmount it. +Intended for users trying to forcibly export a pool even when I/Os are in +progress, without the need to find and stop them. This option does not +affect processes that are merely sitting on the filesystem, only those +performing active I/O. +.Pp +This parameter can be set to 1 to enable this behavior. +.Pp +This parameter only applies on Linux. +. .It Sy zfs_key_max_salt_uses Ns = Ns Sy 400000000 Po 4*10^8 Pc Pq ulong Maximum number of uses of a single salt value before generating a new one for encrypted datasets. diff --git a/man/man8/zpool-export.8 b/man/man8/zpool-export.8 index a15291a1f598..59932bbf10e1 100644 --- a/man/man8/zpool-export.8 +++ b/man/man8/zpool-export.8 @@ -37,6 +37,7 @@ .Nm zpool .Cm export .Op Fl f +.Op Fl F .Fl a Ns | Ns Ar pool Ns … . 
.Sh DESCRIPTION @@ -66,6 +67,15 @@ Forcefully unmount all datasets, and allow export of pools with active shared sp This command will forcefully export the pool even if it has a shared spare that is currently being used. This may lead to potential data corruption. +.It Fl F +Forcibly export the pool. +.Pp +This option allows a pool to be exported even when the underlying disks are +offline and the pool is unavailable. +When force exporting a pool, any outstanding dirty data will be discarded. +This option implies the +.Fl f +option. .El . .Sh SEE ALSO diff --git a/module/os/freebsd/spl/spl_misc.c b/module/os/freebsd/spl/spl_misc.c index 0354b986cd5f..30f2590e4c2b 100644 --- a/module/os/freebsd/spl/spl_misc.c +++ b/module/os/freebsd/spl/spl_misc.c @@ -108,6 +108,16 @@ spl_panic(const char *file, const char *func, int line, const char *fmt, ...) va_end(ap); } +int +spl_kthread_signal(kthread_t *td, int sig) +{ + + PROC_LOCK(td->td_proc); + tdsignal(td, sig); + PROC_UNLOCK(td->td_proc); + return (0); +} + SYSINIT(opensolaris_utsname_init, SI_SUB_TUNABLES, SI_ORDER_ANY, opensolaris_utsname_init, NULL); diff --git a/module/os/linux/spl/spl-thread.c b/module/os/linux/spl/spl-thread.c index 834c527117a3..7f03b0afa276 100644 --- a/module/os/linux/spl/spl-thread.c +++ b/module/os/linux/spl/spl-thread.c @@ -209,3 +209,14 @@ issig(int why) } EXPORT_SYMBOL(issig); + +/* + * spl_kthread_signal - Wrapper for sending signals to a thread. 
+ */ +int +spl_kthread_signal(kthread_t *tsk, int sig) +{ + + return (send_sig(sig, tsk, 0)); +} +EXPORT_SYMBOL(spl_kthread_signal); diff --git a/module/os/linux/zfs/zfs_dir.c b/module/os/linux/zfs/zfs_dir.c index 82b32d1cc3fa..f29089c04727 100644 --- a/module/os/linux/zfs/zfs_dir.c +++ b/module/os/linux/zfs/zfs_dir.c @@ -563,7 +563,6 @@ zfs_unlinked_drain(zfsvfs_t *zfsvfs) void zfs_unlinked_drain_stop_wait(zfsvfs_t *zfsvfs) { - ASSERT3B(zfsvfs->z_unmounted, ==, B_FALSE); if (zfsvfs->z_draining) { zfsvfs->z_drain_cancel = B_TRUE; diff --git a/module/os/linux/zfs/zfs_vfsops.c b/module/os/linux/zfs/zfs_vfsops.c index ff0b0d9df8f0..79d6534d3c7d 100644 --- a/module/os/linux/zfs/zfs_vfsops.c +++ b/module/os/linux/zfs/zfs_vfsops.c @@ -1087,7 +1087,7 @@ zfs_statvfs(struct inode *ip, struct kstatfs *statp) uint64_t refdbytes, availbytes, usedobjs, availobjs; int err = 0; - ZFS_ENTER(zfsvfs); + ZFS_ENTER_UNMOUNTOK(zfsvfs); dmu_objset_space(zfsvfs->z_os, &refdbytes, &availbytes, &usedobjs, &availobjs); @@ -1158,7 +1158,7 @@ zfs_root(zfsvfs_t *zfsvfs, struct inode **ipp) znode_t *rootzp; int error; - ZFS_ENTER(zfsvfs); + ZFS_ENTER_UNMOUNTOK(zfsvfs); error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp); if (error == 0) @@ -1297,6 +1297,8 @@ static int zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting) { znode_t *zp; + kthread_t *initiator = NULL; + uint64_t wait_flags = 0; zfs_unlinked_drain_stop_wait(zfsvfs); @@ -1326,6 +1328,15 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting) if (++round > 1 && !unmounting) break; } + initiator = zfsvfs->z_os->os_shutdown_initiator; + /* + * Although it could be argued that a force unmount in + * another thread shouldn't have this apply, once a force + * unmount is in effect, it's pointless for the non-forced + * unmount to not use this flag. 
+ */ + if (initiator != NULL) + wait_flags |= TXG_WAIT_F_NOSUSPEND; } ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG); @@ -1358,6 +1369,10 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting) * or a reopen of z_os failed then just bail out now. */ if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) { + if (initiator == curthread) { + zfsvfs->z_unmounted = B_FALSE; + dmu_objset_shutdown_unregister(zfsvfs->z_os); + } rw_exit(&zfsvfs->z_teardown_inactive_lock); ZFS_TEARDOWN_EXIT(zfsvfs, FTAG); return (SET_ERROR(EIO)); @@ -1426,12 +1441,16 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting) } } if (!zfs_is_readonly(zfsvfs) && os_dirty) { - txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); + (void) txg_wait_synced_tx(dmu_objset_pool(zfsvfs->z_os), 0, + NULL, wait_flags); } dmu_objset_evict_dbufs(zfsvfs->z_os); dsl_dir_t *dd = os->os_dsl_dataset->ds_dir; dsl_dir_cancel_waiters(dd); + if (initiator == curthread) + dmu_objset_shutdown_unregister(zfsvfs->z_os); + return (0); } @@ -1522,6 +1541,7 @@ zfs_domount(struct super_block *sb, zfs_mnt_t *zm, int silent) error = zfs_root(zfsvfs, &root_inode); if (error) { (void) zfs_umount(sb); + zfsvfs = NULL; goto out; } @@ -1529,6 +1549,7 @@ zfs_domount(struct super_block *sb, zfs_mnt_t *zm, int silent) sb->s_root = d_make_root(root_inode); if (sb->s_root == NULL) { (void) zfs_umount(sb); + zfsvfs = NULL; error = SET_ERROR(ENOMEM); goto out; } diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c index 24c016c5fcf1..42f8e0c54019 100644 --- a/module/os/linux/zfs/zfs_vnops_os.c +++ b/module/os/linux/zfs/zfs_vnops_os.c @@ -207,7 +207,7 @@ zfs_close(struct inode *ip, int flag, cred_t *cr) znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); - ZFS_ENTER(zfsvfs); + ZFS_ENTER_UNMOUNTOK(zfsvfs); ZFS_VERIFY_ZP(zp); /* Decrement the synchronous opens in the znode */ @@ -1639,7 +1639,7 @@ zfs_getattr_fast(struct user_namespace *user_ns, struct inode *ip, uint32_t blksize; u_longlong_t 
nblocks; - ZFS_ENTER(zfsvfs); + ZFS_ENTER_UNMOUNTOK(zfsvfs); ZFS_VERIFY_ZP(zp); mutex_enter(&zp->z_lock); diff --git a/module/os/linux/zfs/zpl_super.c b/module/os/linux/zfs/zpl_super.c index c2fd3fee1401..9efb41b472da 100644 --- a/module/os/linux/zfs/zpl_super.c +++ b/module/os/linux/zfs/zpl_super.c @@ -28,7 +28,10 @@ #include #include #include +#include +#include +int zfs_forced_export_unmount_enabled = 0; static struct inode * zpl_inode_alloc(struct super_block *sb) @@ -102,6 +105,31 @@ zpl_evict_inode(struct inode *ip) spl_fstrans_unmark(cookie); } +static void +zpl_umount_begin(struct super_block *sb) +{ + zfsvfs_t *zfsvfs = sb->s_fs_info; + + if (zfsvfs) { + /* + * Flush out all POSIX I/Os. Notify all waiters that they + * must end, then wait for all users to drop their holds on + * z_teardown_*_lock, and evict buffers. + */ + if (zfs_forced_export_unmount_enabled) + zfsvfs->z_force_unmounted = B_TRUE; + (void) dmu_objset_shutdown_register(zfsvfs->z_os); + rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG); + rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_WRITER); + rw_exit(&zfsvfs->z_teardown_inactive_lock); + rrm_exit(&zfsvfs->z_teardown_lock, FTAG); + dmu_objset_evict_dbufs(zfsvfs->z_os); + + dsl_dir_cancel_waiters(zfsvfs->z_os->os_dsl_dataset->ds_dir); + dmu_objset_shutdown_unregister(zfsvfs->z_os); + } +} + static void zpl_put_super(struct super_block *sb) { @@ -185,7 +213,7 @@ zpl_remount_fs(struct super_block *sb, int *flags, char *data) static int __zpl_show_devname(struct seq_file *seq, zfsvfs_t *zfsvfs) { - ZPL_ENTER(zfsvfs); + ZPL_ENTER_UNMOUNTOK(zfsvfs); char *fsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); dmu_objset_name(zfsvfs->z_os, fsname); @@ -349,6 +377,7 @@ const struct super_operations zpl_super_operations = { .write_inode = NULL, .evict_inode = zpl_evict_inode, .put_super = zpl_put_super, + .umount_begin = zpl_umount_begin, .sync_fs = zpl_sync_fs, .statfs = zpl_statfs, .remount_fs = zpl_remount_fs, @@ -363,3 +392,8 @@ struct 
file_system_type zpl_fs_type = { .mount = zpl_mount, .kill_sb = zpl_kill_sb, }; + +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs, zfs_, forced_export_unmount_enabled, INT, ZMOD_RW, + "Enable forced export unmount to keep POSIX I/O users off"); +/* END CSTYLED */ diff --git a/module/zfs/arc.c b/module/zfs/arc.c index b0468159d2e6..84df01d96d89 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -9883,6 +9883,18 @@ l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen) } } +static void +l2arc_dev_rebuild_stop(l2arc_dev_t *l2ad) +{ + mutex_enter(&l2arc_rebuild_thr_lock); + if (l2ad->l2ad_rebuild_began == B_TRUE) { + l2ad->l2ad_rebuild_cancel = B_TRUE; + while (l2ad->l2ad_rebuild == B_TRUE) + cv_wait(&l2arc_rebuild_thr_cv, &l2arc_rebuild_thr_lock); + } + mutex_exit(&l2arc_rebuild_thr_lock); +} + /* * Remove a vdev from the L2ARC. */ @@ -9900,13 +9912,7 @@ l2arc_remove_vdev(vdev_t *vd) /* * Cancel any ongoing or scheduled rebuild. */ - mutex_enter(&l2arc_rebuild_thr_lock); - if (remdev->l2ad_rebuild_began == B_TRUE) { - remdev->l2ad_rebuild_cancel = B_TRUE; - while (remdev->l2ad_rebuild == B_TRUE) - cv_wait(&l2arc_rebuild_thr_cv, &l2arc_rebuild_thr_lock); - } - mutex_exit(&l2arc_rebuild_thr_lock); + l2arc_dev_rebuild_stop(remdev); /* * Remove device from global list @@ -10022,6 +10028,25 @@ l2arc_spa_rebuild_start(spa_t *spa) } } +void +l2arc_spa_rebuild_stop(spa_t *spa) +{ + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + /* + * Locate the spa's l2arc devices and cancel any rebuild in progress. + */ + for (int i = 0; i < spa->spa_l2cache.sav_count; i++) { + l2arc_dev_t *dev = + l2arc_vdev_get(spa->spa_l2cache.sav_vdevs[i]); + if (dev == NULL) { + /* No l2arc device for this vdev, nothing to stop */ + continue; + } + l2arc_dev_rebuild_stop(dev); + } +} + /* * Main entry point for L2ARC rebuilding.
*/ diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 5b072f02613b..0cf09b0a939c 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -985,11 +985,12 @@ dbuf_verify(dmu_buf_impl_t *db) uint32_t txg_prev; ASSERT(MUTEX_HELD(&db->db_mtx)); + ASSERT(db->db_objset != NULL); - if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY)) + if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY) || + dmu_objset_exiting(db->db_objset)) return; - ASSERT(db->db_objset != NULL); DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (dn == NULL) { @@ -1073,7 +1074,8 @@ dbuf_verify(dmu_buf_impl_t *db) if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) && (db->db_buf == NULL || db->db_buf->b_data) && db->db.db_data && db->db_blkid != DMU_BONUS_BLKID && - db->db_state != DB_FILL && !dn->dn_free_txg) { + db->db_state != DB_FILL && !dn->dn_free_txg && + !dmu_objset_exiting(db->db_objset)) { /* * If the blkptr isn't set but they have nonzero data, * it had better be dirty, otherwise we'll lose that @@ -2194,7 +2196,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) * this assertion only if we're not already dirty. */ os = dn->dn_objset; - VERIFY3U(tx->tx_txg, <=, spa_final_dirty_txg(os->os_spa)); + spa_verify_dirty_txg(os->os_spa, dmu_tx_get_txg(tx)); #ifdef ZFS_DEBUG if (dn->dn_objset->os_dsl_dataset != NULL) rrw_enter(&os->os_dsl_dataset->ds_bp_rwlock, RW_READER, FTAG); @@ -4591,9 +4593,14 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) objset_t *os = db->db_objset; dmu_tx_t *tx = os->os_synctx; - ASSERT0(zio->io_error); ASSERT(db->db_blkptr == bp); + if (zio->io_error != 0) { + /* If the pool is exiting, only cleanup in-core state. */ + ASSERT(spa_exiting_any(zio->io_spa)); + goto cleanup; + } + /* * For nopwrites and rewrites we ensure that the bp matches our * original and bypass all the accounting. 
@@ -4606,6 +4613,7 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) dsl_dataset_block_born(ds, bp, tx); } +cleanup: mutex_enter(&db->db_mtx); DBUF_VERIFY(db); diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 1c47430953b1..acca38465659 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -1110,12 +1110,16 @@ dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, { dmu_buf_t **dbp; int numbufs; + int error; if (size == 0) return; - VERIFY0(dmu_buf_hold_array(os, object, offset, size, - FALSE, FTAG, &numbufs, &dbp)); + error = dmu_buf_hold_array(os, object, offset, size, + FALSE, FTAG, &numbufs, &dbp); + VERIFY(error == 0 || spa_exiting_any(os->os_spa)); + if (error != 0) + return; dmu_write_impl(dbp, numbufs, offset, size, buf, tx); dmu_buf_rele_array(dbp, numbufs, FTAG); } @@ -1129,12 +1133,16 @@ dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, { dmu_buf_t **dbp; int numbufs; + int error; if (size == 0) return; - VERIFY0(dmu_buf_hold_array_by_dnode(dn, offset, size, - FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH)); + error = dmu_buf_hold_array_by_dnode(dn, offset, size, + FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH); + VERIFY(error == 0 || spa_exiting_any(dn->dn_objset->os_spa)); + if (error != 0) + return; dmu_write_impl(dbp, numbufs, offset, size, buf, tx); dmu_buf_rele_array(dbp, numbufs, FTAG); } @@ -1166,11 +1174,15 @@ dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset, int compressed_size, int byteorder, dmu_tx_t *tx) { dmu_buf_t *db; + int error; ASSERT3U(etype, <, NUM_BP_EMBEDDED_TYPES); ASSERT3U(comp, <, ZIO_COMPRESS_FUNCTIONS); - VERIFY0(dmu_buf_hold_noread(os, object, offset, - FTAG, &db)); + error = dmu_buf_hold_noread(os, object, offset, + FTAG, &db); + VERIFY(error == 0 || spa_exiting_any(os->os_spa)); + if (error != 0) + return; dmu_buf_write_embedded(db, data, (bp_embedded_type_t)etype, (enum zio_compress)comp, diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c index 
8c244dc4c317..c3250f75479d 100644 --- a/module/zfs/dmu_objset.c +++ b/module/zfs/dmu_objset.c @@ -1566,6 +1566,11 @@ dmu_objset_write_done(zio_t *zio, arc_buf_t *abuf, void *arg) blkptr_t *bp_orig = &zio->io_bp_orig; objset_t *os = arg; + if (zio->io_error != 0) { + ASSERT(spa_exiting_any(zio->io_spa)); + goto done; + } + if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { ASSERT(BP_EQUAL(bp, bp_orig)); } else { @@ -1575,6 +1580,8 @@ dmu_objset_write_done(zio_t *zio, arc_buf_t *abuf, void *arg) (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE); dsl_dataset_block_born(ds, bp, tx); } + +done: kmem_free(bp, sizeof (*bp)); } @@ -1814,6 +1821,7 @@ do_userquota_cacheflush(objset_t *os, userquota_cache_t *cache, dmu_tx_t *tx) { void *cookie; userquota_node_t *uqn; + int error; ASSERT(dmu_tx_is_syncing(tx)); @@ -1825,10 +1833,13 @@ do_userquota_cacheflush(objset_t *os, userquota_cache_t *cache, dmu_tx_t *tx) * zap_increment_int(). It's needed because zap_increment_int() * is not thread-safe (i.e. not atomic). 
*/ - mutex_enter(&os->os_userused_lock); - VERIFY0(zap_increment(os, DMU_USERUSED_OBJECT, - uqn->uqn_id, uqn->uqn_delta, tx)); - mutex_exit(&os->os_userused_lock); + if (!dmu_objset_exiting(os)) { + mutex_enter(&os->os_userused_lock); + error = zap_increment(os, DMU_USERUSED_OBJECT, + uqn->uqn_id, uqn->uqn_delta, tx); + VERIFY(error == 0 || dmu_objset_exiting(os)); + mutex_exit(&os->os_userused_lock); + } kmem_free(uqn, sizeof (*uqn)); } avl_destroy(&cache->uqc_user_deltas); @@ -1836,10 +1847,13 @@ do_userquota_cacheflush(objset_t *os, userquota_cache_t *cache, dmu_tx_t *tx) cookie = NULL; while ((uqn = avl_destroy_nodes(&cache->uqc_group_deltas, &cookie)) != NULL) { - mutex_enter(&os->os_userused_lock); - VERIFY0(zap_increment(os, DMU_GROUPUSED_OBJECT, - uqn->uqn_id, uqn->uqn_delta, tx)); - mutex_exit(&os->os_userused_lock); + if (!dmu_objset_exiting(os)) { + mutex_enter(&os->os_userused_lock); + error = zap_increment(os, DMU_GROUPUSED_OBJECT, + uqn->uqn_id, uqn->uqn_delta, tx); + VERIFY(error == 0 || dmu_objset_exiting(os)); + mutex_exit(&os->os_userused_lock); + } kmem_free(uqn, sizeof (*uqn)); } avl_destroy(&cache->uqc_group_deltas); @@ -1849,8 +1863,9 @@ do_userquota_cacheflush(objset_t *os, userquota_cache_t *cache, dmu_tx_t *tx) while ((uqn = avl_destroy_nodes(&cache->uqc_project_deltas, &cookie)) != NULL) { mutex_enter(&os->os_userused_lock); - VERIFY0(zap_increment(os, DMU_PROJECTUSED_OBJECT, - uqn->uqn_id, uqn->uqn_delta, tx)); + error = zap_increment(os, DMU_PROJECTUSED_OBJECT, + uqn->uqn_id, uqn->uqn_delta, tx); + VERIFY(error == 0 || dmu_objset_exiting(os)); mutex_exit(&os->os_userused_lock); kmem_free(uqn, sizeof (*uqn)); } @@ -1969,6 +1984,7 @@ userquota_updates_task(void *arg) flags = dn->dn_id_flags; ASSERT(flags); + if (flags & DN_ID_OLD_EXIST) { do_userquota_update(os, &cache, dn->dn_oldused, dn->dn_oldflags, dn->dn_olduid, dn->dn_oldgid, @@ -2305,8 +2321,9 @@ dmu_objset_space_upgrade(objset_t *os) if (err != 0) return (err); - if 
(issig(JUSTLOOKING) && issig(FORREAL)) - return (SET_ERROR(EINTR)); + err = spa_operation_interrupted(os->os_spa); + if (err != 0) + return (err); objerr = dmu_bonus_hold(os, obj, FTAG, &db); if (objerr != 0) @@ -2994,6 +3011,52 @@ dmu_objset_willuse_space(objset_t *os, int64_t space, dmu_tx_t *tx) dsl_pool_dirty_space(dmu_tx_pool(tx), space, tx); } +/* + * Notify the objset that it's being shutdown. This is primarily useful + * when attempting to dislodge any references that might be waiting on a txg + * or similar. + */ +int +dmu_objset_shutdown_register(objset_t *os) +{ + int ret = 0; + + mutex_enter(&os->os_lock); + if (os->os_shutdown_initiator == NULL) { + os->os_shutdown_initiator = curthread; + } else { + ret = SET_ERROR(EBUSY); + } + mutex_exit(&os->os_lock); + + /* + * Signal things that will check for objset force export. The calling + * thread must use a secondary mechanism to check for ref drops, + * before calling dmu_objset_shutdown_unregister(). + */ + if (ret == 0) { + txg_completion_notify(spa_get_dsl(dmu_objset_spa(os))); + } + + return (ret); +} + +boolean_t +dmu_objset_exiting(objset_t *os) +{ + + return (os->os_shutdown_initiator != NULL || + spa_exiting_any(os->os_spa)); +} + +void +dmu_objset_shutdown_unregister(objset_t *os) +{ + + ASSERT3P(os->os_shutdown_initiator, ==, curthread); + os->os_shutdown_initiator = NULL; +} + #if defined(_KERNEL) EXPORT_SYMBOL(dmu_objset_zil); EXPORT_SYMBOL(dmu_objset_pool); diff --git a/module/zfs/dmu_recv.c b/module/zfs/dmu_recv.c index a713e1329027..2f8cf0ca241f 100644 --- a/module/zfs/dmu_recv.c +++ b/module/zfs/dmu_recv.c @@ -68,9 +68,11 @@ int zfs_recv_queue_length = SPA_MAXBLOCKSIZE; int zfs_recv_queue_ff = 20; int zfs_recv_write_batch_size = 1024 * 1024; -static char *dmu_recv_tag = "dmu_recv_tag"; const char *recv_clone_name = "%recv"; +/* The receive was closed by an external call. 
*/ +#define DRC_CLOSED (1U << 0) + static int receive_read_payload_and_next_header(dmu_recv_cookie_t *ra, int len, void *buf); @@ -339,6 +341,34 @@ recv_check_large_blocks(dsl_dataset_t *ds, uint64_t featureflags) return (0); } +static void +recv_own(dsl_pool_t *dp, dmu_tx_t *tx, uint64_t dsobj, ds_hold_flags_t dsflags, + dmu_recv_cookie_t *drc, dsl_dataset_t **dsp, objset_t **osp) +{ + dsl_dataset_t *ds; + + /* + * The dataset must be marked inconsistent before exit in any event, + * so dirty it now. This ensures it's cleaned up if interrupted. + */ + VERIFY0(dsl_dataset_own_obj_force(dp, dsobj, dsflags, drc, &ds)); + dmu_buf_will_dirty(ds->ds_dbuf, tx); + dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_INCONSISTENT; + ds->ds_receiver = drc; + *dsp = ds; + VERIFY0(dmu_objset_from_ds(ds, osp)); +} + +static void +recv_disown(dsl_dataset_t *ds, dmu_recv_cookie_t *drc) +{ + ds_hold_flags_t dsflags = (drc->drc_raw) ? 0 : DS_HOLD_FLAG_DECRYPT; + + ASSERT3P(ds->ds_receiver, ==, drc); + ds->ds_receiver = NULL; + dsl_dataset_disown(ds, dsflags, drc); +} + static int recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds, uint64_t fromguid, uint64_t featureflags) @@ -833,8 +863,8 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx) dsl_dir_rele(dd, FTAG); drc->drc_newfs = B_TRUE; } - VERIFY0(dsl_dataset_own_obj_force(dp, dsobj, dsflags, dmu_recv_tag, - &newds)); + recv_own(dp, tx, dsobj, dsflags, drba->drba_cookie, &newds, &os); + if (dsl_dataset_feature_is_active(newds, SPA_FEATURE_REDACTED_DATASETS)) { /* @@ -914,9 +944,6 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx) numredactsnaps, tx); } - dmu_buf_will_dirty(newds->ds_dbuf, tx); - dsl_dataset_phys(newds)->ds_flags |= DS_FLAG_INCONSISTENT; - /* * If we actually created a non-clone, we need to create the objset * in our new dataset. 
If this is a raw send we postpone this until @@ -1090,8 +1117,9 @@ dmu_recv_resume_begin_sync(void *arg, dmu_tx_t *tx) { dmu_recv_begin_arg_t *drba = arg; dsl_pool_t *dp = dmu_tx_pool(tx); - const char *tofs = drba->drba_cookie->drc_tofs; - uint64_t featureflags = drba->drba_cookie->drc_featureflags; + dmu_recv_cookie_t *drc = drba->drba_cookie; + const char *tofs = drc->drc_tofs; + uint64_t featureflags = drc->drc_featureflags; dsl_dataset_t *ds; ds_hold_flags_t dsflags = DS_HOLD_FLAG_NONE; /* 6 extra bytes for /%recv */ @@ -1101,28 +1129,26 @@ dmu_recv_resume_begin_sync(void *arg, dmu_tx_t *tx) recv_clone_name); if (featureflags & DMU_BACKUP_FEATURE_RAW) { - drba->drba_cookie->drc_raw = B_TRUE; + drc->drc_raw = B_TRUE; } else { dsflags |= DS_HOLD_FLAG_DECRYPT; } - if (dsl_dataset_own_force(dp, recvname, dsflags, dmu_recv_tag, &ds) - != 0) { + if (dsl_dataset_own_force(dp, recvname, dsflags, drc, &ds) != 0) { /* %recv does not exist; continue in tofs */ - VERIFY0(dsl_dataset_own_force(dp, tofs, dsflags, dmu_recv_tag, - &ds)); - drba->drba_cookie->drc_newfs = B_TRUE; + VERIFY0(dsl_dataset_own_force(dp, tofs, dsflags, drc, &ds)); + drc->drc_newfs = B_TRUE; } + ds->ds_receiver = drc; ASSERT(DS_IS_INCONSISTENT(ds)); rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); - ASSERT(!BP_IS_HOLE(dsl_dataset_get_blkptr(ds)) || - drba->drba_cookie->drc_raw); + ASSERT(!BP_IS_HOLE(dsl_dataset_get_blkptr(ds)) || drc->drc_raw); rrw_exit(&ds->ds_bp_rwlock, FTAG); - drba->drba_cookie->drc_ds = ds; - VERIFY0(dmu_objset_from_ds(ds, &drba->drba_cookie->drc_os)); - drba->drba_cookie->drc_should_save = B_TRUE; + drc->drc_ds = ds; + VERIFY0(dmu_objset_from_ds(ds, &drc->drc_os)); + drc->drc_should_save = B_TRUE; spa_history_log_internal_ds(ds, "resume receive", tx, " "); } @@ -1141,6 +1167,7 @@ dmu_recv_begin(char *tofs, char *tosnap, dmu_replay_record_t *drr_begin, int err; bzero(drc, sizeof (dmu_recv_cookie_t)); + drc->drc_initiator = curthread; drc->drc_drr_begin = drr_begin; drc->drc_drrb = 
&drr_begin->drr_u.drr_begin; drc->drc_tosnap = tosnap; @@ -1229,6 +1256,16 @@ dmu_recv_begin(char *tofs, char *tosnap, dmu_replay_record_t *drr_begin, } } + if (err == 0 && drc->drc_ds == NULL) { + /* + * Make sure the dataset is destroyed before returning. We + * can't do this in the sync task because a dataset can't be + * synced and destroyed in the same txg. In this scenario, + * it should be flagged as inconsistent so we're ok anyway. + */ + (void) dsl_destroy_head(tofs); + return (SET_ERROR(ENXIO)); + } if (err != 0) { kmem_free(drc->drc_next_rrd, sizeof (*drc->drc_next_rrd)); nvlist_free(drc->drc_begin_nvl); @@ -2314,7 +2351,9 @@ static void dmu_recv_cleanup_ds(dmu_recv_cookie_t *drc) { dsl_dataset_t *ds = drc->drc_ds; + objset_t *os = ds->ds_objset; ds_hold_flags_t dsflags; + int error = 0; dsflags = (drc->drc_raw) ? DS_HOLD_FLAG_NONE : DS_HOLD_FLAG_DECRYPT; /* @@ -2323,20 +2362,28 @@ dmu_recv_cleanup_ds(dmu_recv_cookie_t *drc) * been written out to disk. For raw receives, this ensures * that the user accounting code will not attempt to do anything * after we stopped receiving the dataset. + * + * If this is interrupted due to suspension and the pool is being + * force exported, just exit and cleanup. 
*/ - txg_wait_synced(ds->ds_dir->dd_pool, 0); + for (;;) { + error = txg_wait_synced_tx(ds->ds_dir->dd_pool, 0, + NULL, TXG_WAIT_F_NOSUSPEND); + if (error == 0 || spa_exiting_any(os->os_spa)) + break; + } ds->ds_objset->os_raw_receive = B_FALSE; rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); if (drc->drc_resumable && drc->drc_should_save && !BP_IS_HOLE(dsl_dataset_get_blkptr(ds))) { rrw_exit(&ds->ds_bp_rwlock, FTAG); - dsl_dataset_disown(ds, dsflags, dmu_recv_tag); + recv_disown(ds, drc); } else { char name[ZFS_MAX_DATASET_NAME_LEN]; rrw_exit(&ds->ds_bp_rwlock, FTAG); dsl_dataset_name(ds, name); - dsl_dataset_disown(ds, dsflags, dmu_recv_tag); + recv_disown(ds, drc); (void) dsl_destroy_head(name); } } @@ -2862,6 +2909,35 @@ resume_check(dmu_recv_cookie_t *drc, nvlist_t *begin_nvl) return (0); } +/* + * Cancel the receive stream for the dataset, if there is one. + */ +int +dmu_recv_close(dsl_dataset_t *ds) +{ + int err = 0; + dmu_recv_cookie_t *drc; + + /* + * This lock isn't technically for recv, but it's not worth + * adding a dedicated one for this purpose. + */ + mutex_enter(&ds->ds_sendstream_lock); + drc = ds->ds_receiver; + if (drc != NULL) { + drc->drc_flags |= DRC_CLOSED; + /* + * Send an interrupt to the initiator thread, which will + * cause it to end the stream and clean up. + */ + if (drc->drc_initiator != curthread) + thread_signal(drc->drc_initiator, SIGINT); + } + mutex_exit(&ds->ds_sendstream_lock); + + return (err); +} + /* * Read in the stream's records, one by one, and apply them to the pool. 
There * are two threads involved; the thread that calls this function will spin up a @@ -2878,6 +2954,7 @@ int dmu_recv_stream(dmu_recv_cookie_t *drc, offset_t *voffp) { int err = 0; + spa_t *spa = dsl_dataset_get_spa(drc->drc_ds); struct receive_writer_arg *rwa = kmem_zalloc(sizeof (*rwa), KM_SLEEP); if (dsl_dataset_has_resume_receive_state(drc->drc_ds)) { @@ -2917,7 +2994,7 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, offset_t *voffp) * are sure the rest of the receive succeeded so we stash * the keynvl away until then. */ - err = dsl_crypto_recv_raw(spa_name(drc->drc_os->os_spa), + err = dsl_crypto_recv_raw(spa_name(spa), drc->drc_ds->ds_object, drc->drc_fromsnapobj, drc->drc_drrb->drr_type, keynvl, drc->drc_newfs); if (err != 0) @@ -2945,6 +3022,12 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, offset_t *voffp) */ drc->drc_should_save = B_TRUE; + /* Last chance before kicking off. */ + if (drc->drc_flags & DRC_CLOSED) { + err = SET_ERROR(EINTR); + goto out; + } + (void) bqueue_init(&rwa->q, zfs_recv_queue_ff, MAX(zfs_recv_queue_length, 2 * zfs_max_recordsize), offsetof(struct receive_record_arg, node)); @@ -2960,8 +3043,17 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, offset_t *voffp) list_create(&rwa->write_batch, sizeof (struct receive_record_arg), offsetof(struct receive_record_arg, node.bqn_node)); - (void) thread_create(NULL, 0, receive_writer_thread, rwa, 0, curproc, - TS_RUN, minclsyspri); + /* + * Register the rwa with the drc so it can be interrupted. This + * requires a mutex handshake to ensure validity. + */ + mutex_enter(&drc->drc_ds->ds_sendstream_lock); + drc->drc_rwa = rwa; + mutex_exit(&drc->drc_ds->ds_sendstream_lock); + + kthread_t *rw_td = thread_create(NULL, 0, receive_writer_thread, + rwa, 0, curproc, TS_RUN, minclsyspri); + /* * We're reading rwa->err without locks, which is safe since we are the * only reader, and the worker thread is the only writer. 
It's ok if we @@ -2977,11 +3069,10 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, offset_t *voffp) * it. Finally, if receive_read_record fails or we're at the end of the * stream, then we free drc->drc_rrd and exit. */ - while (rwa->err == 0) { - if (issig(JUSTLOOKING) && issig(FORREAL)) { - err = SET_ERROR(EINTR); + while (rwa->err == 0 && err == 0) { + err = spa_operation_interrupted(dmu_objset_spa(rwa->os)); + if (err) break; - } ASSERT3P(drc->drc_rrd, ==, NULL); drc->drc_rrd = drc->drc_next_rrd; @@ -3008,9 +3099,22 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, offset_t *voffp) mutex_enter(&rwa->mutex); while (!rwa->done) { + boolean_t closed = drc->drc_flags & DRC_CLOSED; + + if (!closed) { + if (err == 0) + err = spa_operation_interrupted(spa); + if (err != 0) { + drc->drc_flags |= DRC_CLOSED; + thread_signal(rw_td, SIGINT); + closed = B_TRUE; + } + } + /* * We need to use cv_wait_sig() so that any process that may - * be sleeping here can still fork. + * be sleeping here can still fork. Also, it allows + * dmu_recv_close to cause an eos marker to be injected. 
*/ (void) cv_wait_sig(&rwa->cv, &rwa->mutex); } @@ -3042,6 +3146,10 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, offset_t *voffp) } } + mutex_enter(&drc->drc_ds->ds_sendstream_lock); + drc->drc_rwa = NULL; + mutex_exit(&drc->drc_ds->ds_sendstream_lock); + cv_destroy(&rwa->cv); mutex_destroy(&rwa->mutex); bqueue_destroy(&rwa->q); @@ -3090,7 +3198,7 @@ dmu_recv_end_check(void *arg, dmu_tx_t *tx) dsl_pool_t *dp = dmu_tx_pool(tx); int error; - ASSERT3P(drc->drc_ds->ds_owner, ==, dmu_recv_tag); + ASSERT3P(drc->drc_ds->ds_receiver, ==, drc); if (!drc->drc_newfs) { dsl_dataset_t *origin_head; @@ -3308,7 +3416,7 @@ dmu_recv_end_sync(void *arg, dmu_tx_t *tx) (void) spa_keystore_remove_mapping(dmu_tx_pool(tx)->dp_spa, drc->drc_ds->ds_object, drc->drc_ds); } - dsl_dataset_disown(drc->drc_ds, 0, dmu_recv_tag); + recv_disown(drc->drc_ds, drc); drc->drc_ds = NULL; } @@ -3374,7 +3482,7 @@ boolean_t dmu_objset_is_receiving(objset_t *os) { return (os->os_dsl_dataset != NULL && - os->os_dsl_dataset->ds_owner == dmu_recv_tag); + os->os_dsl_dataset->ds_receiver != NULL); } /* BEGIN CSTYLED */ diff --git a/module/zfs/dmu_redact.c b/module/zfs/dmu_redact.c index 62c7d01d4bd2..726b2ba8f397 100644 --- a/module/zfs/dmu_redact.c +++ b/module/zfs/dmu_redact.c @@ -564,7 +564,14 @@ commit_rl_updates(objset_t *os, struct merge_data *md, uint64_t object, { dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(os->os_spa)->dp_mos_dir); dmu_tx_hold_space(tx, sizeof (struct redact_block_list_node)); - VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + + int err = dmu_tx_assign(tx, TXG_WAIT); + if (err != 0) { + ASSERT(spa_exiting_any(os->os_spa)); + dmu_tx_abort(tx); + return; + } + uint64_t txg = dmu_tx_get_txg(tx); if (!md->md_synctask_txg[txg & TXG_MASK]) { dsl_sync_task_nowait(dmu_tx_pool(tx), diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c index d654382237c0..33ba850d13ea 100644 --- a/module/zfs/dmu_send.c +++ b/module/zfs/dmu_send.c @@ -2198,6 +2198,7 @@ setup_send_progress(struct dmu_send_params *dspp) 
dssp->dss_outfd = dspp->outfd; dssp->dss_off = dspp->off; dssp->dss_proc = curproc; + dssp->dss_thread = curthread; mutex_enter(&dspp->to_ds->ds_sendstream_lock); list_insert_head(&dspp->to_ds->ds_sendstreams, dssp); mutex_exit(&dspp->to_ds->ds_sendstream_lock); @@ -2487,6 +2488,14 @@ dmu_send_impl(struct dmu_send_params *dspp) } } + /* + * Last chance, bail if possible at this point, now that the send is + * registered and can be cancelled by signalling this thread. + */ + err = spa_operation_interrupted(os->os_spa); + if (err != 0) + goto out; + if (resuming || book_resuming) { err = setup_resume_points(dspp, to_arg, from_arg, rlt_arg, smt_arg, resuming, os, redact_rl, nvl); @@ -2534,8 +2543,8 @@ dmu_send_impl(struct dmu_send_params *dspp) while (err == 0 && !range->eos_marker) { err = do_dump(&dsc, range); range = get_next_range(&srt_arg->q, range); - if (issig(JUSTLOOKING) && issig(FORREAL)) - err = SET_ERROR(EINTR); + if (err == 0) + err = spa_operation_interrupted(os->os_spa); } /* @@ -3070,6 +3079,30 @@ dmu_send_estimate_fast(dsl_dataset_t *origds, dsl_dataset_t *fromds, return (err); } +/* Close all send streams on the dataset. */ +int +dmu_send_close(dsl_dataset_t *ds) +{ + int err = 0; + dmu_sendstatus_t *dss; + + mutex_enter(&ds->ds_sendstream_lock); + dss = list_head(&ds->ds_sendstreams); + while (err == 0 && dss != NULL) { + /* + * Interrupt the initiator thread, which will cause it + * to initiate a cleanup error exit. Also send SIGPIPE + * because this interrupts pipe writes. 
+ */ + thread_signal(dss->dss_thread, SIGINT); + thread_signal(dss->dss_thread, SIGPIPE); + dss = list_next(&ds->ds_sendstreams, dss); + } + mutex_exit(&ds->ds_sendstream_lock); + + return (0); +} + /* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs_send, zfs_send_, corrupt_data, INT, ZMOD_RW, "Allow sending corrupt data"); diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c index 73667915df0f..83a38550de4d 100644 --- a/module/zfs/dmu_tx.c +++ b/module/zfs/dmu_tx.c @@ -867,6 +867,19 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how) if (spa_suspended(spa)) { DMU_TX_STAT_BUMP(dmu_tx_suspended); + if (txg_how & TXG_NOSUSPEND) + return (SET_ERROR(EAGAIN)); + + /* + * If the user is forcibly exporting the pool or the objset, + * indicate to the caller that they need to give up. + */ + if (spa_exiting_any(spa)) + return (SET_ERROR(EIO)); + + if (tx->tx_objset != NULL && dmu_objset_exiting(tx->tx_objset)) + return (SET_ERROR(EIO)); + /* * If the user has indicated a blocking failure mode * then return ERESTART which will block in dmu_tx_wait(). @@ -993,6 +1006,8 @@ dmu_tx_unassign(dmu_tx_t *tx) tx->tx_txg = 0; } +static void dmu_tx_wait_flags(dmu_tx_t *, txg_wait_flag_t); + /* * Assign tx to a transaction group; txg_how is a bitmask: * @@ -1013,6 +1028,11 @@ dmu_tx_unassign(dmu_tx_t *tx) * they have already called dmu_tx_wait() (though most likely on a * different tx). * + * If TXG_NOSUSPEND is set, this indicates that this request must return + * EAGAIN if the pool becomes suspended while it is in progress. This + * ensures that the request does not inadvertently cause conditions that + * cannot be unwound. + * * It is guaranteed that subsequent successful calls to dmu_tx_assign() * will assign the tx to monotonically increasing txgs. 
Of course this is * not strong monotonicity, because the same txg can be returned multiple @@ -1035,7 +1055,7 @@ dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how) int err; ASSERT(tx->tx_txg == 0); - ASSERT0(txg_how & ~(TXG_WAIT | TXG_NOTHROTTLE)); + ASSERT0(txg_how & ~(TXG_NOSUSPEND | TXG_WAIT | TXG_NOTHROTTLE)); ASSERT(!dsl_pool_sync_context(tx->tx_pool)); /* If we might wait, we must not hold the config lock. */ @@ -1050,7 +1070,7 @@ dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how) if (err != ERESTART || !(txg_how & TXG_WAIT)) return (err); - dmu_tx_wait(tx); + dmu_tx_wait_flags(tx, txg_how); } txg_rele_to_quiesce(&tx->tx_txgh); @@ -1058,8 +1078,8 @@ dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how) return (0); } -void -dmu_tx_wait(dmu_tx_t *tx) +static void +dmu_tx_wait_flags(dmu_tx_t *tx, txg_wait_flag_t how) { spa_t *spa = tx->tx_pool->dp_spa; dsl_pool_t *dp = tx->tx_pool; @@ -1104,8 +1124,11 @@ dmu_tx_wait(dmu_tx_t *tx) * has become active after this thread has tried to * obtain a tx. If that's the case then tx_lasttried_txg * would not have been set. + * + * It's also possible the pool will be force exported, in + * which case we'll try again and notice this fact, and exit. */ - txg_wait_synced(dp, spa_last_synced_txg(spa) + 1); + txg_wait_synced_tx(dp, spa_last_synced_txg(spa) + 1, tx, how); } else if (tx->tx_needassign_txh) { dnode_t *dn = tx->tx_needassign_txh->txh_dnode; @@ -1119,13 +1142,23 @@ dmu_tx_wait(dmu_tx_t *tx) * If we have a lot of dirty data just wait until we sync * out a TXG at which point we'll hopefully have synced * a portion of the changes. + * + * It's also possible the pool will be force exported, in + * which case we'll try again and notice this fact, and exit. 
*/ - txg_wait_synced(dp, spa_last_synced_txg(spa) + 1); + txg_wait_synced_tx(dp, spa_last_synced_txg(spa) + 1, tx, how); } spa_tx_assign_add_nsecs(spa, gethrtime() - before); } +void +dmu_tx_wait(dmu_tx_t *tx) +{ + /* Wait on behalf of tx with no additional txg wait flags. */ + dmu_tx_wait_flags(tx, TXG_WAIT_F_NONE); +} + static void dmu_tx_destroy(dmu_tx_t *tx) { diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c index 9b9bb42287d5..872bc7bcb3cf 100644 --- a/module/zfs/dsl_dataset.c +++ b/module/zfs/dsl_dataset.c @@ -44,6 +44,9 @@ #include #include #include +#include +#include +#include #include #include #include @@ -561,8 +564,8 @@ dsl_dataset_try_add_ref(dsl_pool_t *dp, dsl_dataset_t *ds, void *tag) } int -dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag, - dsl_dataset_t **dsp) +dsl_dataset_hold_obj_flags(dsl_pool_t *dp, uint64_t dsobj, + ds_hold_flags_t flags, void *tag, dsl_dataset_t **dsp) { objset_t *mos = dp->dp_meta_objset; dmu_buf_t *dbuf; @@ -585,6 +588,11 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag, ds = dmu_buf_get_user(dbuf); if (ds == NULL) { + if (flags & DS_HOLD_FLAG_MUST_BE_OPEN) { + dmu_buf_rele(dbuf, tag); + return (SET_ERROR(ENXIO)); + } + dsl_dataset_t *winner = NULL; ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP); @@ -726,6 +734,15 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag, } } + if (err == 0 && (flags & DS_HOLD_FLAG_DECRYPT)) { + err = dsl_dataset_create_key_mapping(ds); + if (err != 0) + dsl_dataset_rele(ds, tag); + } + + if (err != 0) + return (err); + ASSERT3P(ds->ds_dbuf, ==, dbuf); ASSERT3P(dsl_dataset_phys(ds), ==, dbuf->db_data); ASSERT(dsl_dataset_phys(ds)->ds_prev_snap_obj != 0 || @@ -749,24 +766,10 @@ dsl_dataset_create_key_mapping(dsl_dataset_t *ds) } int -dsl_dataset_hold_obj_flags(dsl_pool_t *dp, uint64_t dsobj, - ds_hold_flags_t flags, void *tag, dsl_dataset_t **dsp) +dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag, + dsl_dataset_t **dsp) { - int err; - - err = dsl_dataset_hold_obj(dp,
dsobj, tag, dsp); - if (err != 0) - return (err); - - ASSERT3P(*dsp, !=, NULL); - - if (flags & DS_HOLD_FLAG_DECRYPT) { - err = dsl_dataset_create_key_mapping(*dsp); - if (err != 0) - dsl_dataset_rele(*dsp, tag); - } - - return (err); + return (dsl_dataset_hold_obj_flags(dp, dsobj, 0, tag, dsp)); } int @@ -917,6 +920,114 @@ dsl_dataset_long_held(dsl_dataset_t *ds) return (!zfs_refcount_is_zero(&ds->ds_longholds)); } +/* + * Enumerate active datasets. This function is intended for use cases that + * want to avoid I/O, and only operate on those that have been loaded in + * memory. This works by enumerating the objects in the MOS that are known, + * and calling back with each dataset's MOS object IDs. It would be nice if + * the objset_t's were registered in a spa_t global list, but they're not, + * so this implementation is a bit more complex... + */ +static int +dsl_dataset_active_foreach(spa_t *spa, int func(dsl_dataset_t *, void *), + void *cl) +{ + dsl_pool_t *dp = spa_get_dsl(spa); + objset_t *mos = dp->dp_meta_objset; + dnode_t *mdn = DMU_META_DNODE(mos); + dmu_buf_impl_t *db; + uint64_t blkid, dsobj, i; + dnode_children_t *children_dnodes; + dnode_handle_t *dnh; + dsl_dataset_t *ds; + int epb, error; + int ret = 0; + + /* + * For each block of the MOS's meta-dnode's full size: + * - If the block is not cached, skip. + * - If the block has no user, skip. + * - For each dnode child of the meta-dnode block: + * - If not loaded (no dnode pointer), skip. + * - Attempt to hold the dataset, skip on failure. + * - Call the callback, quit if returns non zero, + * - Rele the dataset either way. 
+ */ + rrw_enter(&dp->dp_config_rwlock, RW_READER, FTAG); + rw_enter(&mdn->dn_struct_rwlock, RW_READER); + for (blkid = dsobj = 0; + ret == 0 && blkid <= mdn->dn_maxblkid; + blkid++, dsobj += epb) { + epb = DNODES_PER_BLOCK; + error = dbuf_hold_impl(mdn, 0, blkid, TRUE, TRUE, FTAG, &db); + if (error != 0) { + continue; + } + + epb = db->db.db_size >> DNODE_SHIFT; + children_dnodes = dmu_buf_get_user(&db->db); + if (children_dnodes == NULL) { + goto skip; + } + + for (i = 0; ret == 0 && i < epb; i++) { + dnh = &children_dnodes->dnc_children[i]; + if (!DN_SLOT_IS_PTR(dnh->dnh_dnode)) + continue; + + error = dsl_dataset_hold_obj_flags(dp, dsobj + i, + DS_HOLD_FLAG_MUST_BE_OPEN, FTAG, &ds); + if (error != 0) + continue; + + ret = func(ds, cl); + dsl_dataset_rele(ds, FTAG); + } + +skip: + dbuf_rele(db, FTAG); + } + rw_exit(&mdn->dn_struct_rwlock); + rrw_exit(&dp->dp_config_rwlock, FTAG); + + return (ret); +} + +/* + * Cancellation interfaces for send/receive streams. + * + * If a send/recv wins the race with a forced destroy, their pipes will be + * interrupted, and the destroy will wait for all ioctl references to drop. + * + * If a forced destroy wins the race, the send/receive will fail to start. + */ + +/* dsl_dataset_sendrecv_cancel_all callback for dsl_dataset_active_foreach. */ +static int +dsl_dataset_sendrecv_cancel_cb(dsl_dataset_t *ds, void *arg) +{ + int err; + + err = dmu_send_close(ds); + if (err == 0) + err = dmu_recv_close(ds); + + return (err); +} + +/* + * Cancel all outstanding sends/receives. Used when the pool is trying to + * forcibly exit. Iterates on all datasets in the MOS and cancels any + * running sends/receives by interrupting them. 
+ */ +int +dsl_dataset_sendrecv_cancel_all(spa_t *spa) +{ + + return (dsl_dataset_active_foreach(spa, + dsl_dataset_sendrecv_cancel_cb, NULL)); +} + void dsl_dataset_name(dsl_dataset_t *ds, char *name) { diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index 62ee9bb9ab6c..370886492644 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -834,8 +834,7 @@ dsl_scan(dsl_pool_t *dp, pool_scan_func_t func) (void) spa_vdev_state_exit(spa, NULL, 0); if (func == POOL_SCAN_RESILVER) { - dsl_scan_restart_resilver(spa->spa_dsl_pool, 0); - return (0); + return (dsl_scan_restart_resilver(spa->spa_dsl_pool, 0)); } if (func == POOL_SCAN_SCRUB && dsl_scan_is_paused_scrub(scn)) { @@ -1099,13 +1098,20 @@ dsl_scrub_set_pause_resume(const dsl_pool_t *dp, pool_scrub_cmd_t cmd) /* start a new scan, or restart an existing one. */ -void +int dsl_scan_restart_resilver(dsl_pool_t *dp, uint64_t txg) { + int error; + if (txg == 0) { dmu_tx_t *tx; tx = dmu_tx_create_dd(dp->dp_mos_dir); - VERIFY(0 == dmu_tx_assign(tx, TXG_WAIT)); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error != 0) { + ASSERT(spa_exiting_any(dp->dp_spa)); + dmu_tx_abort(tx); + return (error); + } txg = dmu_tx_get_txg(tx); dp->dp_scan->scn_restart_txg = txg; @@ -1114,6 +1120,7 @@ dsl_scan_restart_resilver(dsl_pool_t *dp, uint64_t txg) dp->dp_scan->scn_restart_txg = txg; } zfs_dbgmsg("restarting resilver txg=%llu", (longlong_t)txg); + return (0); } void diff --git a/module/zfs/dsl_synctask.c b/module/zfs/dsl_synctask.c index 148e8fff2437..ab5576adfece 100644 --- a/module/zfs/dsl_synctask.c +++ b/module/zfs/dsl_synctask.c @@ -57,7 +57,12 @@ dsl_sync_task_common(const char *pool, dsl_checkfunc_t *checkfunc, top: tx = dmu_tx_create_dd(dp->dp_mos_dir); - VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err != 0) { + ASSERT(spa_exiting_any(spa)); + dmu_tx_abort(tx); + return (err); + } dst.dst_pool = dp; dst.dst_txg = dmu_tx_get_txg(tx); diff --git a/module/zfs/metaslab.c 
b/module/zfs/metaslab.c index e588765b3382..1e6d3748e606 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -508,6 +508,16 @@ metaslab_class_get_dspace(metaslab_class_t *mc) return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space); } +void +metaslab_class_force_discard(metaslab_class_t *mc) +{ + + mc->mc_alloc = 0; + mc->mc_deferred = 0; + mc->mc_space = 0; + mc->mc_dspace = 0; +} + void metaslab_class_histogram_verify(metaslab_class_t *mc) { @@ -2769,6 +2779,19 @@ metaslab_fini(metaslab_t *msp) metaslab_group_remove(mg, msp); mutex_enter(&msp->ms_lock); + if (spa_exiting_any(mg->mg_vd->vdev_spa)) { + /* Catch-all cleanup as required for force export. */ + range_tree_vacate(msp->ms_allocatable, NULL, NULL); + range_tree_vacate(msp->ms_freeing, NULL, NULL); + range_tree_vacate(msp->ms_freed, NULL, NULL); + range_tree_vacate(msp->ms_checkpointing, NULL, NULL); + for (int t = 0; t < TXG_SIZE; t++) + range_tree_vacate(msp->ms_allocating[t], NULL, NULL); + for (int t = 0; t < TXG_DEFER_SIZE; t++) + range_tree_vacate(msp->ms_defer[t], NULL, NULL); + msp->ms_deferspace = 0; + } + VERIFY(msp->ms_group == NULL); /* @@ -3937,6 +3960,29 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) return; } + /* + * The pool is being forcibly exported. Just discard everything. 
+ */ + if (spa_exiting_any(spa)) { + mutex_enter(&msp->ms_sync_lock); + mutex_enter(&msp->ms_lock); + range_tree_vacate(alloctree, NULL, NULL); + range_tree_vacate(msp->ms_freeing, NULL, NULL); + range_tree_vacate(msp->ms_freed, NULL, NULL); + range_tree_vacate(msp->ms_trim, NULL, NULL); + range_tree_vacate(msp->ms_checkpointing, NULL, NULL); + range_tree_vacate(msp->ms_allocating[txg & TXG_MASK], + NULL, NULL); + range_tree_vacate(msp->ms_allocating[TXG_CLEAN(txg) & TXG_MASK], + NULL, NULL); + for (int t = 0; t < TXG_DEFER_SIZE; t++) { + range_tree_vacate(msp->ms_defer[t], NULL, NULL); + } + mutex_exit(&msp->ms_lock); + mutex_exit(&msp->ms_sync_lock); + return; + } + /* * Normally, we don't want to process a metaslab if there are no * allocations or frees to perform. However, if the metaslab is being @@ -3956,7 +4002,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) return; - VERIFY3U(txg, <=, spa_final_dirty_txg(spa)); + spa_verify_dirty_txg(spa, txg); /* * The only state that can actually be changing concurrently @@ -4325,7 +4371,7 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) msp->ms_deferspace += defer_delta; ASSERT3S(msp->ms_deferspace, >=, 0); ASSERT3S(msp->ms_deferspace, <=, msp->ms_size); - if (msp->ms_deferspace != 0) { + if (msp->ms_deferspace != 0 && !spa_exiting_any(spa)) { /* * Keep syncing this metaslab until all deferred frees * are back in circulation. 
diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 26995575adaa..3d3124865d1f 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -1349,6 +1349,8 @@ spa_activate(spa_t *spa, spa_mode_t mode) static void spa_deactivate(spa_t *spa) { + int error; + ASSERT(spa->spa_sync_on == B_FALSE); ASSERT(spa->spa_dsl_pool == NULL); ASSERT(spa->spa_root_vdev == NULL); @@ -1388,10 +1390,19 @@ spa_deactivate(spa_t *spa) for (size_t i = 0; i < TXG_SIZE; i++) { ASSERT3P(spa->spa_txg_zio[i], !=, NULL); - VERIFY0(zio_wait(spa->spa_txg_zio[i])); + error = zio_wait(spa->spa_txg_zio[i]); + VERIFY(error == 0 || (spa_exiting_any(spa) && error == EIO)); spa->spa_txg_zio[i] = NULL; } + if (spa_exiting_any(spa)) { + metaslab_class_force_discard(spa->spa_normal_class); + metaslab_class_force_discard(spa->spa_log_class); + metaslab_class_force_discard(spa->spa_embedded_log_class); + metaslab_class_force_discard(spa->spa_special_class); + metaslab_class_force_discard(spa->spa_dedup_class); + } + metaslab_class_destroy(spa->spa_normal_class); spa->spa_normal_class = NULL; @@ -1521,7 +1532,12 @@ static void spa_unload_log_sm_flush_all(spa_t *spa) { dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); - VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + int txerr = dmu_tx_assign(tx, TXG_WAIT); + if (txerr != 0) { + ASSERT(spa_exiting_any(spa)); + dmu_tx_abort(tx); + return; + } ASSERT3U(spa->spa_log_flushall_txg, ==, 0); spa->spa_log_flushall_txg = dmu_tx_get_txg(tx); @@ -1577,9 +1593,13 @@ spa_destroy_aux_threads(spa_t *spa) /* * Opposite of spa_load(). */ -static void -spa_unload(spa_t *spa) +static int +spa_unload(spa_t *spa, txg_wait_flag_t txg_how) { + int err; + vdev_t *vd; + uint64_t t, txg; + ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa_state(spa) != POOL_STATE_UNINITIALIZED); @@ -1614,10 +1634,45 @@ spa_unload(spa_t *spa) * Stop syncing. 
*/ if (spa->spa_sync_on) { - txg_sync_stop(spa->spa_dsl_pool); + err = txg_sync_stop(spa->spa_dsl_pool, txg_how); + if (err != 0) { + spa_async_resume(spa); + return (err); + } spa->spa_sync_on = B_FALSE; } + /* + * If the pool is being forcibly exported, it may be necessary to + * cleanup again. This normally would be handled by spa_sync(), + * except it's possible that followup txg's were skipped, and + * thus the opportunity to have performed these operations. + * + * This is the correct place to perform these operations, as just + * now, spa_sync() and vdev activity has been stopped, and after + * here, the metaslabs are destroyed. + */ + if (spa_exiting_any(spa)) { + spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); + while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL) + vdev_config_clean(vd); + while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) + vdev_state_clean(vd); + /* The only dirty entries should be for spa_syncing_txg + 1. */ + t = 0; + txg = spa_syncing_txg(spa) + 1; + while (t < TXG_SIZE) { + vd = txg_list_remove(&spa->spa_vdev_txg_list, t); + if (vd == NULL) { + t++; + continue; + } + VERIFY3U(t, ==, txg & TXG_MASK); + vdev_sync_done(vd, txg); + } + spa_config_exit(spa, SCL_ALL, spa); + } + /* * This ensures that there is no async metaslab prefetching * while we attempt to unload the spa. 
@@ -1722,6 +1777,7 @@ spa_unload(spa_t *spa) } spa_config_exit(spa, SCL_ALL, spa); + return (0); } /* @@ -2580,9 +2636,10 @@ spa_livelist_delete_cb(void *arg, zthr_t *z) zfs_dbgmsg("deleting sublist (id %llu) from" " livelist %llu, %d remaining", dle->dle_bpobj.bpo_object, ll_obj, count - 1); - VERIFY0(dsl_sync_task(spa_name(spa), NULL, + err = dsl_sync_task(spa_name(spa), NULL, sublist_delete_sync, &sync_arg, 0, - ZFS_SPACE_CHECK_DESTROY)); + ZFS_SPACE_CHECK_DESTROY); + VERIFY(err == 0 || spa_exiting_any(spa)); } else { VERIFY3U(err, ==, EINTR); } @@ -2597,8 +2654,10 @@ spa_livelist_delete_cb(void *arg, zthr_t *z) .zap_obj = zap_obj }; zfs_dbgmsg("deletion of livelist %llu completed", ll_obj); - VERIFY0(dsl_sync_task(spa_name(spa), NULL, livelist_delete_sync, - &sync_arg, 0, ZFS_SPACE_CHECK_DESTROY)); + int err = dsl_sync_task(spa_name(spa), NULL, + livelist_delete_sync, &sync_arg, 0, + ZFS_SPACE_CHECK_DESTROY); + VERIFY(err == 0 || spa_exiting_any(spa)); } } @@ -4444,7 +4503,7 @@ spa_ld_prepare_for_reload(spa_t *spa) spa_mode_t mode = spa->spa_mode; int async_suspended = spa->spa_async_suspended; - spa_unload(spa); + VERIFY0(spa_unload(spa, TXG_WAIT_F_NONE)); spa_deactivate(spa); spa_activate(spa, mode); @@ -4960,7 +5019,7 @@ spa_load_retry(spa_t *spa, spa_load_state_t state) { spa_mode_t mode = spa->spa_mode; - spa_unload(spa); + VERIFY0(spa_unload(spa, TXG_WAIT_F_NONE)); spa_deactivate(spa); spa->spa_load_max_txg = spa->spa_uberblock.ub_txg - 1; @@ -5122,6 +5181,16 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy, return (SET_ERROR(ENOENT)); } + /* + * If the pool is exiting, only the thread forcing it to exit may + * open new references to it. 
+ */ + if (spa_exiting(spa)) { + if (locked) + mutex_exit(&spa_namespace_lock); + return (SET_ERROR(ENXIO)); + } + if (spa->spa_state == POOL_STATE_UNINITIALIZED) { zpool_load_policy_t policy; @@ -5150,7 +5219,7 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy, * this is the case, the config cache is out of sync and * we should remove the pool from the namespace. */ - spa_unload(spa); + VERIFY0(spa_unload(spa, TXG_WAIT_F_NONE)); spa_deactivate(spa); spa_write_cachefile(spa, B_TRUE, B_TRUE); spa_remove(spa); @@ -5172,7 +5241,7 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy, ZPOOL_CONFIG_LOAD_INFO, spa->spa_load_info) == 0); } - spa_unload(spa); + VERIFY0(spa_unload(spa, TXG_WAIT_F_NONE)); spa_deactivate(spa); spa->spa_last_open_failed = error; if (locked) @@ -5846,7 +5915,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, spa_config_exit(spa, SCL_ALL, FTAG); if (error != 0) { - spa_unload(spa); + VERIFY0(spa_unload(spa, TXG_WAIT_F_NONE)); spa_deactivate(spa); spa_remove(spa); mutex_exit(&spa_namespace_lock); @@ -6105,15 +6174,13 @@ spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) if (error != 0 || (props && spa_writeable(spa) && (error = spa_prop_set(spa, props)))) { - spa_unload(spa); + VERIFY0(spa_unload(spa, TXG_WAIT_F_NONE)); spa_deactivate(spa); spa_remove(spa); mutex_exit(&spa_namespace_lock); return (error); } - spa_async_resume(spa); - /* * Override any spares and level 2 cache devices as specified by * the user, as these may have correct device names/devids, etc. @@ -6161,9 +6228,21 @@ spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) /* * Update the config cache to include the newly-imported pool. 
*/ - spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); + mutex_exit(&spa_namespace_lock); + error = spa_config_update_pool(spa); + if (error != 0) { + mutex_enter(&spa_namespace_lock); + VERIFY0(spa_unload(spa, TXG_WAIT_F_NONE)); + spa_deactivate(spa); + spa_remove(spa); + mutex_exit(&spa_namespace_lock); + return (error); + } + mutex_enter(&spa_namespace_lock); } + spa_async_resume(spa); + /* * It's possible that the pool was expanded while it was exported. * We kick off an async task to handle this for us. @@ -6286,7 +6365,7 @@ spa_tryimport(nvlist_t *tryconfig) spa_config_exit(spa, SCL_CONFIG, FTAG); } - spa_unload(spa); + VERIFY0(spa_unload(spa, TXG_WAIT_F_NONE)); spa_deactivate(spa); spa_remove(spa); mutex_exit(&spa_namespace_lock); @@ -6294,6 +6373,17 @@ spa_tryimport(nvlist_t *tryconfig) return (config); } +static void +spa_set_export_initiator(spa_t *spa, void *initiator) +{ + + mutex_enter(&spa->spa_evicting_os_lock); + spa->spa_export_initiator = initiator; + if (initiator != NULL) + txg_completion_notify(spa_get_dsl(spa)); + mutex_exit(&spa->spa_evicting_os_lock); +} + /* * Pool export/destroy * @@ -6309,6 +6399,7 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, { int error; spa_t *spa; + boolean_t force_removal, modifying; if (oldconfig) *oldconfig = NULL; @@ -6329,13 +6420,49 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, } spa->spa_is_exporting = B_TRUE; + /* XXX Should this be chained instead of rejected? */ + if (spa_exiting(spa)) { + mutex_exit(&spa_namespace_lock); + return (SET_ERROR(EBUSY)); + } + + modifying = spa->spa_sync_on && (new_state == POOL_STATE_DESTROYED || + new_state == POOL_STATE_EXPORTED); + /* * Put a hold on the pool, drop the namespace lock, stop async tasks, * reacquire the namespace lock, and see if we can export. */ spa_open_ref(spa, FTAG); + + /* + * Mark the pool as facing impending exit if this is a forced + * destroy or export. 
+ */ + force_removal = hardforce && modifying; + if (force_removal) { + /* Ensure that references see this change after this. */ + spa_set_export_initiator(spa, curthread); + } mutex_exit(&spa_namespace_lock); spa_async_suspend(spa); + + /* + * Cancel all sends/receives if necessary, and wait for their holds + * to expire. This is done without the namespace lock, since some + * operations may require acquiring it (although they will fail). + */ + if (force_removal && spa->spa_sync_on) { + error = dsl_dataset_sendrecv_cancel_all(spa); + if (error != 0) { + spa_set_export_initiator(spa, NULL); + spa_async_resume(spa); + return (error); + } + txg_force_export(spa); + spa_evicting_os_wait(spa); + } + if (spa->spa_zvol_taskq) { zvol_remove_minors(spa, spa_name(spa), B_TRUE); taskq_wait(spa->spa_zvol_taskq); @@ -6345,22 +6472,45 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, if (spa->spa_state == POOL_STATE_UNINITIALIZED) goto export_spa; + /* * The pool will be in core if it's openable, in which case we can * modify its state. Objsets may be open only because they're dirty, * so we have to force it to sync before checking spa_refcnt. */ - if (spa->spa_sync_on) { - txg_wait_synced(spa->spa_dsl_pool, 0); + if (!force_removal && spa->spa_sync_on) { + error = txg_wait_synced_tx(spa->spa_dsl_pool, 0, + NULL, TXG_WAIT_F_NOSUSPEND); + if (error != 0) + goto fail; spa_evicting_os_wait(spa); } + /* + * For forced removal, wait for refcount to drop to minref. At this + * point, all ioctls should be on their way out or getting rejected + * at the front door. 
+ */ + if (force_removal) { + mutex_exit(&spa_namespace_lock); + mutex_enter(&spa->spa_evicting_os_lock); + while (zfs_refcount_count(&spa->spa_refcount) > + spa->spa_minref) { + zio_cancel(spa); + cv_wait(&spa->spa_evicting_os_cv, + &spa->spa_evicting_os_lock); + } + mutex_exit(&spa->spa_evicting_os_lock); + mutex_enter(&spa_namespace_lock); + } + /* * A pool cannot be exported or destroyed if there are active * references. If we are resetting a pool, allow references by * fault injection handlers. */ if (!spa_refcount_zero(spa) || (spa->spa_inject_ref != 0)) { + VERIFY(!force_removal); error = SET_ERROR(EBUSY); goto fail; } @@ -6391,6 +6541,7 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, vdev_trim_stop_all(rvd, VDEV_TRIM_ACTIVE); vdev_autotrim_stop_all(spa); vdev_rebuild_stop_all(spa); + l2arc_spa_rebuild_stop(spa); } /* @@ -6415,7 +6566,14 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_EXPORT); if (spa->spa_state != POOL_STATE_UNINITIALIZED) { - spa_unload(spa); + /* + * If the pool is not being hard forced, throw an error upon + * suspension and abort. + */ + error = spa_unload(spa, + hardforce ? 
TXG_WAIT_F_NONE : TXG_WAIT_F_NOSUSPEND); + if (error != 0) + goto fail; spa_deactivate(spa); } @@ -6423,7 +6581,7 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); if (new_state != POOL_STATE_UNINITIALIZED) { - if (!hardforce) + if (!force_removal) spa_write_cachefile(spa, B_TRUE, B_TRUE); spa_remove(spa); } else { @@ -6439,6 +6597,8 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, return (0); fail: + if (force_removal) + spa_set_export_initiator(spa, NULL); spa->spa_is_exporting = B_FALSE; spa_async_resume(spa); mutex_exit(&spa_namespace_lock); @@ -6641,10 +6801,8 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) */ (void) spa_vdev_exit(spa, vd, txg, 0); - mutex_enter(&spa_namespace_lock); - spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); + spa_config_update_pool(spa); spa_event_notify(spa, NULL, NULL, ESC_ZFS_VDEV_ADD); - mutex_exit(&spa_namespace_lock); return (0); } @@ -6691,6 +6849,9 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, return (spa_vdev_exit(spa, NULL, txg, error)); } + /* If the pool is being force-exported, no vdev changes may occur. 
*/ + ASSERT(!spa_exiting_any(spa)); + if (rebuild) { if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD)) return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); @@ -6857,8 +7018,8 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, vdev_propagate_state(pvd); tvd = newvd->vdev_top; - ASSERT(pvd->vdev_top == tvd); - ASSERT(tvd->vdev_parent == rvd); + ASSERT3P(pvd->vdev_top, ==, tvd); + ASSERT3P(tvd->vdev_parent, ==, rvd); vdev_config_dirty(tvd); @@ -6902,8 +7063,8 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER)) { vdev_defer_resilver(newvd); } else { - dsl_scan_restart_resilver(spa->spa_dsl_pool, - dtl_max_txg); + VERIFY0(dsl_scan_restart_resilver(spa->spa_dsl_pool, + dtl_max_txg)); } } @@ -7763,7 +7924,7 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, return (error); out: - spa_unload(newspa); + VERIFY0(spa_unload(newspa, TXG_WAIT_F_NONE)); spa_deactivate(newspa); spa_remove(newspa); @@ -8081,6 +8242,19 @@ spa_async_autoexpand(spa_t *spa, vdev_t *vd) spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_AUTOEXPAND); } +static uint64_t +spa_pool_space(spa_t *spa) +{ + uint64_t space; + + space = metaslab_class_get_space(spa_normal_class(spa)); + space += metaslab_class_get_space(spa_special_class(spa)); + space += metaslab_class_get_space(spa_dedup_class(spa)); + space += metaslab_class_get_space(spa_embedded_log_class(spa)); + + return (space); +} + static void spa_async_thread(void *arg) { @@ -8101,21 +8275,9 @@ spa_async_thread(void *arg) if (tasks & SPA_ASYNC_CONFIG_UPDATE) { uint64_t old_space, new_space; - mutex_enter(&spa_namespace_lock); - old_space = metaslab_class_get_space(spa_normal_class(spa)); - old_space += metaslab_class_get_space(spa_special_class(spa)); - old_space += metaslab_class_get_space(spa_dedup_class(spa)); - old_space += metaslab_class_get_space( - spa_embedded_log_class(spa)); - - spa_config_update(spa, 
SPA_CONFIG_UPDATE_POOL); - - new_space = metaslab_class_get_space(spa_normal_class(spa)); - new_space += metaslab_class_get_space(spa_special_class(spa)); - new_space += metaslab_class_get_space(spa_dedup_class(spa)); - new_space += metaslab_class_get_space( - spa_embedded_log_class(spa)); - mutex_exit(&spa_namespace_lock); + new_space = old_space = spa_pool_space(spa); + if (spa_config_update_pool(spa) == 0) + new_space = spa_pool_space(spa); /* * If the pool grew as a result of the config update, @@ -8172,7 +8334,7 @@ spa_async_thread(void *arg) !vdev_rebuild_active(spa->spa_root_vdev) && (!dsl_scan_resilvering(dp) || !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_RESILVER_DEFER))) - dsl_scan_restart_resilver(dp, 0); + (void) dsl_scan_restart_resilver(dp, 0); if (tasks & SPA_ASYNC_INITIALIZE_RESTART) { mutex_enter(&spa_namespace_lock); @@ -8425,6 +8587,9 @@ spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) size_t nvsize = 0; dmu_buf_t *db; + if (spa_exiting_any(spa)) + return; + VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); /* @@ -8894,6 +9059,9 @@ vdev_indirect_state_sync_verify(vdev_t *vd) vdev_indirect_mapping_t *vim __maybe_unused = vd->vdev_indirect_mapping; vdev_indirect_births_t *vib __maybe_unused = vd->vdev_indirect_births; + if (spa_exiting_any(vd->vdev_spa)) + return; + if (vd->vdev_ops == &vdev_indirect_ops) { ASSERT(vim != NULL); ASSERT(vib != NULL); @@ -9097,10 +9265,10 @@ spa_sync_rewrite_vdev_config(spa_t *spa, dmu_tx_t *tx) { vdev_t *rvd = spa->spa_root_vdev; uint64_t txg = tx->tx_txg; + boolean_t exiting = B_FALSE; - for (;;) { + while (exiting == B_FALSE) { int error = 0; - /* * We hold SCL_STATE to prevent vdev open/close/etc. * while we're attempting to write the vdev labels. 
@@ -9144,7 +9312,18 @@ spa_sync_rewrite_vdev_config(spa_t *spa, dmu_tx_t *tx) if (error == 0) break; zio_suspend(spa, NULL, ZIO_SUSPEND_IOERR); - zio_resume_wait(spa); + + mutex_enter(&spa->spa_suspend_lock); + for (;;) { + exiting = spa_exiting(spa); + if (exiting || spa_suspended(spa) == B_FALSE) + break; + cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock); + } + mutex_exit(&spa->spa_suspend_lock); + + if (exiting) + zio_cancel(spa); } } @@ -9353,7 +9532,8 @@ spa_sync_allpools(void) continue; spa_open_ref(spa, FTAG); mutex_exit(&spa_namespace_lock); - txg_wait_synced(spa_get_dsl(spa), 0); + txg_wait_synced_flags(spa_get_dsl(spa), 0, + TXG_WAIT_F_NOSUSPEND); mutex_enter(&spa_namespace_lock); spa_close(spa, FTAG); } @@ -9392,7 +9572,7 @@ spa_evict_all(void) spa_close(spa, FTAG); if (spa->spa_state != POOL_STATE_UNINITIALIZED) { - spa_unload(spa); + VERIFY0(spa_unload(spa, TXG_WAIT_F_NONE)); spa_deactivate(spa); } spa_remove(spa); diff --git a/module/zfs/spa_checkpoint.c b/module/zfs/spa_checkpoint.c index 5fb614467273..23c1ed5fd1d2 100644 --- a/module/zfs/spa_checkpoint.c +++ b/module/zfs/spa_checkpoint.c @@ -399,6 +399,7 @@ spa_checkpoint_discard_thread(void *arg, zthr_t *zthr) { spa_t *spa = arg; vdev_t *rvd = spa->spa_root_vdev; + int err = 0; for (uint64_t c = 0; c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; @@ -433,9 +434,10 @@ spa_checkpoint_discard_thread(void *arg, zthr_t *zthr) error, vd->vdev_id); } - VERIFY0(dsl_sync_task(spa->spa_name, NULL, + err = dsl_sync_task(spa->spa_name, NULL, spa_checkpoint_discard_thread_sync, vd, - 0, ZFS_SPACE_CHECK_NONE)); + 0, ZFS_SPACE_CHECK_NONE); + VERIFY(err == 0 || spa_exiting_any(spa)); dmu_buf_rele_array(dbp, numbufs, FTAG); } @@ -443,9 +445,10 @@ spa_checkpoint_discard_thread(void *arg, zthr_t *zthr) VERIFY(spa_checkpoint_discard_is_done(spa)); VERIFY0(spa->spa_checkpoint_info.sci_dspace); - VERIFY0(dsl_sync_task(spa->spa_name, NULL, + err = dsl_sync_task(spa->spa_name, NULL, 
spa_checkpoint_discard_complete_sync, spa, - 0, ZFS_SPACE_CHECK_NONE)); + 0, ZFS_SPACE_CHECK_NONE); + VERIFY(err == 0 || spa_exiting_any(spa)); } diff --git a/module/zfs/spa_config.c b/module/zfs/spa_config.c index ad82932ce567..5b72ee373e91 100644 --- a/module/zfs/spa_config.c +++ b/module/zfs/spa_config.c @@ -541,75 +541,115 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats) return (config); } -/* - * Update all disk labels, generate a fresh config based on the current - * in-core state, and sync the global config cache (do not sync the config - * cache if this is a booting rootpool). - */ -void -spa_config_update(spa_t *spa, int what) +static int +spa_config_update_begin(spa_t *spa, void *tag) { - vdev_t *rvd = spa->spa_root_vdev; - uint64_t txg; - int c; - - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - txg = spa_last_synced_txg(spa) + 1; - if (what == SPA_CONFIG_UPDATE_POOL) { - vdev_config_dirty(rvd); - } else { - /* - * If we have top-level vdevs that were added but have - * not yet been prepared for allocation, do that now. - * (It's safe now because the config cache is up to date, - * so it will be able to translate the new DVAs.) - * See comments in spa_vdev_add() for full details. - */ - for (c = 0; c < rvd->vdev_children; c++) { - vdev_t *tvd = rvd->vdev_child[c]; + return (spa_config_enter_flags(spa, SCL_ALL, tag, RW_WRITER, + SCL_FLAG_NOSUSPEND)); +} - /* - * Explicitly skip vdevs that are indirect or - * log vdevs that are being removed. The reason - * is that both of those can have vdev_ms_array - * set to 0 and we wouldn't want to change their - * metaslab size nor call vdev_expand() on them. - */ - if (!vdev_is_concrete(tvd) || - (tvd->vdev_islog && tvd->vdev_removing)) - continue; +/* Complete a label update. 
*/ +static int +spa_config_update_complete(spa_t *spa, uint64_t txg, boolean_t postsysevent, + void *tag) +{ + int error = 0; - if (tvd->vdev_ms_array == 0) - vdev_metaslab_set_size(tvd); - vdev_expand(tvd, txg); - } - } - spa_config_exit(spa, SCL_ALL, FTAG); + spa_config_exit(spa, SCL_ALL, tag); /* * Wait for the mosconfig to be regenerated and synced. */ - txg_wait_synced(spa->spa_dsl_pool, txg); + error = txg_wait_synced_tx(spa->spa_dsl_pool, txg, NULL, 0); + if (error == 0 && !spa->spa_is_root) { + /* + * Update the global config cache to reflect the new mosconfig. + * This operation does not perform any pool I/O, so it is + * safe even if one or more of them are suspended. + */ + mutex_enter(&spa_namespace_lock); + spa_write_cachefile(spa, B_FALSE, postsysevent); + mutex_exit(&spa_namespace_lock); + } + + return (error); +} + +/* Update any top-level vdevs needing expansion. */ +static int +spa_config_update_vdevs(spa_t *spa) +{ + vdev_t *rvd = spa->spa_root_vdev; + uint64_t txg; + int c, error; + + error = spa_config_update_begin(spa, FTAG); + if (error != 0) + return (error); + + txg = spa_last_synced_txg(spa) + 1; /* - * Update the global config cache to reflect the new mosconfig. + * If we have top-level vdevs that were added but have + * not yet been prepared for allocation, do that now. + * (It's safe now because the config cache is up to date, + * so it will be able to translate the new DVAs.) + * See comments in spa_vdev_add() for full details. */ - if (!spa->spa_is_root) { - spa_write_cachefile(spa, B_FALSE, - what != SPA_CONFIG_UPDATE_POOL); + for (c = 0; c < rvd->vdev_children; c++) { + vdev_t *tvd = rvd->vdev_child[c]; + + /* + * Explicitly skip vdevs that are indirect or + * log vdevs that are being removed. The reason + * is that both of those can have vdev_ms_array + * set to 0 and we wouldn't want to change their + * metaslab size nor call vdev_expand() on them. 
+ */ + if (!vdev_is_concrete(tvd) || + (tvd->vdev_islog && tvd->vdev_removing)) + continue; + + if (tvd->vdev_ms_array == 0) + vdev_metaslab_set_size(tvd); + vdev_expand(tvd, txg); } - if (what == SPA_CONFIG_UPDATE_POOL) - spa_config_update(spa, SPA_CONFIG_UPDATE_VDEVS); + return (spa_config_update_complete(spa, txg, B_TRUE, FTAG)); +} + +/* + * Update all disk labels, generate a fresh config based on the current + * in-core state, and sync the global config cache (do not sync the config + * cache if this is a booting rootpool). + */ +int +spa_config_update_pool(spa_t *spa) +{ + vdev_t *rvd = spa->spa_root_vdev; + uint64_t txg; + int error; + + error = spa_config_update_begin(spa, FTAG); + if (error != 0) + return (error); + + txg = spa_last_synced_txg(spa) + 1; + vdev_config_dirty(rvd); + + error = spa_config_update_complete(spa, txg, B_FALSE, FTAG); + if (error == 0) + error = spa_config_update_vdevs(spa); + + return (error); } EXPORT_SYMBOL(spa_config_load); EXPORT_SYMBOL(spa_all_configs); EXPORT_SYMBOL(spa_config_set); EXPORT_SYMBOL(spa_config_generate); -EXPORT_SYMBOL(spa_config_update); +EXPORT_SYMBOL(spa_config_update_pool); /* BEGIN CSTYLED */ #ifdef __linux__ diff --git a/module/zfs/spa_errlog.c b/module/zfs/spa_errlog.c index fa5120eb61b3..2b66e442c002 100644 --- a/module/zfs/spa_errlog.c +++ b/module/zfs/spa_errlog.c @@ -99,9 +99,9 @@ spa_log_error(spa_t *spa, const zbookmark_phys_t *zb) /* * If we are trying to import a pool, ignore any errors, as we won't be - * writing to the pool any time soon. + * writing to the pool any time soon. Same for force exports. 
*/ - if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT) + if (spa_exiting_any(spa) || spa_load_state(spa) == SPA_LOAD_TRYIMPORT) return; mutex_enter(&spa->spa_errlist_lock); @@ -305,6 +305,9 @@ sync_error_list(spa_t *spa, avl_tree_t *t, uint64_t *obj, dmu_tx_t *tx) void *cookie; if (avl_numnodes(t) != 0) { + if (spa_exiting_any(spa)) + goto done; + /* create log if necessary */ if (*obj == 0) *obj = zap_create(spa->spa_meta_objset, @@ -321,6 +324,7 @@ sync_error_list(spa_t *spa, avl_tree_t *t, uint64_t *obj, dmu_tx_t *tx) *obj, buf, 1, strlen(name) + 1, name, tx); } +done: /* purge the error list */ cookie = NULL; while ((se = avl_destroy_nodes(t, &cookie)) != NULL) diff --git a/module/zfs/spa_history.c b/module/zfs/spa_history.c index 0482e0f6c39d..d9f20cdeab55 100644 --- a/module/zfs/spa_history.c +++ b/module/zfs/spa_history.c @@ -382,7 +382,7 @@ spa_history_log_nvl(spa_t *spa, nvlist_t *nvl) } tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); - err = dmu_tx_assign(tx, TXG_WAIT); + err = dmu_tx_assign(tx, TXG_WAIT | TXG_NOSUSPEND); if (err) { dmu_tx_abort(tx); return (err); @@ -518,9 +518,10 @@ log_internal(nvlist_t *nvl, const char *operation, spa_t *spa, /* * If this is part of creating a pool, not everything is * initialized yet, so don't bother logging the internal events. - * Likewise if the pool is not writeable. + * Likewise if the pool is not writeable, or is being force exported. 
*/ - if (spa_is_initializing(spa) || !spa_writeable(spa)) { + if (spa_is_initializing(spa) || !spa_writeable(spa) || + spa_exiting_any(spa)) { fnvlist_free(nvl); return; } diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 1a2e5abc5335..bf22c856e628 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -463,45 +463,32 @@ spa_config_lock_destroy(spa_t *spa) } } -int -spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw) +static int +spa_config_eval_flags(spa_t *spa, spa_config_flag_t flags) { - for (int i = 0; i < SCL_LOCKS; i++) { - spa_config_lock_t *scl = &spa->spa_config_lock[i]; - if (!(locks & (1 << i))) - continue; - mutex_enter(&scl->scl_lock); - if (rw == RW_READER) { - if (scl->scl_writer || scl->scl_write_wanted) { - mutex_exit(&scl->scl_lock); - spa_config_exit(spa, locks & ((1 << i) - 1), - tag); - return (0); - } - } else { - ASSERT(scl->scl_writer != curthread); - if (!zfs_refcount_is_zero(&scl->scl_count)) { - mutex_exit(&scl->scl_lock); - spa_config_exit(spa, locks & ((1 << i) - 1), - tag); - return (0); - } - scl->scl_writer = curthread; - } - (void) zfs_refcount_add(&scl->scl_count, tag); - mutex_exit(&scl->scl_lock); + int error = 0; + + if ((flags & SCL_FLAG_TRYENTER) != 0) + error = SET_ERROR(EAGAIN); + if (error == 0 && ((flags & SCL_FLAG_NOSUSPEND) != 0)) { + /* Notification given by zio_suspend(). */ + mutex_enter(&spa->spa_suspend_lock); + error = spa_suspended(spa) ? 
SET_ERROR(EAGAIN) : 0; + mutex_exit(&spa->spa_suspend_lock); } - return (1); + return (error); } -void -spa_config_enter(spa_t *spa, int locks, const void *tag, krw_t rw) +int +spa_config_enter_flags(spa_t *spa, int locks, const void *tag, krw_t rw, + spa_config_flag_t flags) { + int error = 0; int wlocks_held = 0; ASSERT3U(SCL_LOCKS, <, sizeof (wlocks_held) * NBBY); - for (int i = 0; i < SCL_LOCKS; i++) { + for (int i = 0; error == 0 && i < SCL_LOCKS; i++) { spa_config_lock_t *scl = &spa->spa_config_lock[i]; if (scl->scl_writer == curthread) wlocks_held |= (1 << i); @@ -510,21 +497,53 @@ spa_config_enter(spa_t *spa, int locks, const void *tag, krw_t rw) mutex_enter(&scl->scl_lock); if (rw == RW_READER) { while (scl->scl_writer || scl->scl_write_wanted) { + error = spa_config_eval_flags(spa, flags); + if (error != 0) + break; cv_wait(&scl->scl_cv, &scl->scl_lock); } } else { ASSERT(scl->scl_writer != curthread); while (!zfs_refcount_is_zero(&scl->scl_count)) { + error = spa_config_eval_flags(spa, flags); + if (error != 0) + break; scl->scl_write_wanted++; cv_wait(&scl->scl_cv, &scl->scl_lock); scl->scl_write_wanted--; } - scl->scl_writer = curthread; + if (error == 0) + scl->scl_writer = curthread; } - (void) zfs_refcount_add(&scl->scl_count, tag); + if (error == 0) + (void) zfs_refcount_add(&scl->scl_count, tag); mutex_exit(&scl->scl_lock); + + if (error != 0 && i > 0) { + /* Should never happen for classic spa_config_enter. 
*/ + ASSERT3U(flags, !=, 0); + spa_config_exit(spa, locks & ((1 << i) - 1), tag); + } } + ASSERT3U(wlocks_held, <=, locks); + return (error); +} + +void +spa_config_enter(spa_t *spa, int locks, const void *tag, krw_t rw) +{ + spa_config_flag_t flags = 0; + + spa_config_enter_flags(spa, locks, tag, rw, flags); +} + +int +spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw) +{ + + return (spa_config_enter_flags(spa, locks, tag, rw, + SCL_FLAG_TRYENTER) == 0); } void @@ -888,6 +907,20 @@ spa_open_ref(spa_t *spa, void *tag) (void) zfs_refcount_add(&spa->spa_refcount, tag); } +/* + * Remove a reference to a given spa_t. Common routine that also includes + * notifying the exporter if one is registered, when minref has been reached. + */ +static void +spa_close_common(spa_t *spa, void *tag) +{ + if (zfs_refcount_remove(&spa->spa_refcount, tag) == spa->spa_minref) { + mutex_enter(&spa->spa_evicting_os_lock); + cv_broadcast(&spa->spa_evicting_os_cv); + mutex_exit(&spa->spa_evicting_os_lock); + } +} + /* * Remove a reference to the given spa_t. Must have at least one reference, or * have the namespace lock held. @@ -897,7 +930,7 @@ spa_close(spa_t *spa, void *tag) { ASSERT(zfs_refcount_count(&spa->spa_refcount) > spa->spa_minref || MUTEX_HELD(&spa_namespace_lock)); - (void) zfs_refcount_remove(&spa->spa_refcount, tag); + spa_close_common(spa, tag); } /* @@ -911,7 +944,7 @@ spa_close(spa_t *spa, void *tag) void spa_async_close(spa_t *spa, void *tag) { - (void) zfs_refcount_remove(&spa->spa_refcount, tag); + spa_close_common(spa, tag); } /* @@ -1741,6 +1774,19 @@ spa_syncing_txg(spa_t *spa) return (spa->spa_syncing_txg); } +/* + * Verify that the requesting thread isn't dirtying a txg it's not supposed + * to be. Normally, this must be spa_final_dirty_txg(), but if the pool is + * being force exported, no data will be written to stable storage anyway. 
+ */ +void +spa_verify_dirty_txg(spa_t *spa, uint64_t txg) +{ + + if (spa->spa_export_initiator == NULL) + VERIFY3U(txg, <=, spa_final_dirty_txg(spa)); +} + /* * Return the last txg where data can be dirtied. The final txgs * will be used to just clear out any deferred frees that remain. @@ -1979,6 +2025,18 @@ spa_preferred_class(spa_t *spa, uint64_t size, dmu_object_type_t objtype, return (spa_normal_class(spa)); } +void +spa_evicting_os_lock(spa_t *spa) +{ + mutex_enter(&spa->spa_evicting_os_lock); +} + +void +spa_evicting_os_unlock(spa_t *spa) +{ + mutex_exit(&spa->spa_evicting_os_lock); +} + void spa_evicting_os_register(spa_t *spa, objset_t *os) { @@ -2603,6 +2661,31 @@ spa_maxblocksize(spa_t *spa) return (SPA_OLD_MAXBLOCKSIZE); } +boolean_t +spa_exiting_any(spa_t *spa) +{ + return (spa->spa_export_initiator != NULL); +} + +/* + * NB: must hold spa_namespace_lock or spa_evicting_os_lock if the result of + * this is critical. + */ +boolean_t +spa_exiting(spa_t *spa) +{ + return (spa_exiting_any(spa) && spa->spa_export_initiator != curthread); +} + +int +spa_operation_interrupted(spa_t *spa) +{ + if (issig(JUSTLOOKING) && issig(FORREAL)) + return (SET_ERROR(EINTR)); + if (spa_exiting(spa)) + return (SET_ERROR(ENXIO)); + return (0); +} /* * Returns the txg that the last device removal completed. 
No indirect mappings @@ -2883,6 +2966,8 @@ EXPORT_SYMBOL(spa_delegation); EXPORT_SYMBOL(spa_meta_objset); EXPORT_SYMBOL(spa_maxblocksize); EXPORT_SYMBOL(spa_maxdnodesize); +EXPORT_SYMBOL(spa_exiting); +EXPORT_SYMBOL(spa_operation_interrupted); /* Miscellaneous support routines */ EXPORT_SYMBOL(spa_guid_exists); diff --git a/module/zfs/space_map.c b/module/zfs/space_map.c index 3db7d199199c..78255df2c2d2 100644 --- a/module/zfs/space_map.c +++ b/module/zfs/space_map.c @@ -860,7 +860,7 @@ space_map_truncate(space_map_t *sm, int blocksize, dmu_tx_t *tx) ASSERT(dsl_pool_sync_context(dmu_objset_pool(os))); ASSERT(dmu_tx_is_syncing(tx)); - VERIFY3U(dmu_tx_get_txg(tx), <=, spa_final_dirty_txg(spa)); + spa_verify_dirty_txg(spa, dmu_tx_get_txg(tx)); dmu_object_info_from_db(sm->sm_dbuf, &doi); diff --git a/module/zfs/txg.c b/module/zfs/txg.c index 497e19dd58eb..80bc0a29b708 100644 --- a/module/zfs/txg.c +++ b/module/zfs/txg.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -255,10 +256,11 @@ txg_thread_wait(tx_state_t *tx, callb_cpr_t *cpr, kcondvar_t *cv, clock_t time) /* * Stop syncing transaction groups. */ -void -txg_sync_stop(dsl_pool_t *dp) +int +txg_sync_stop(dsl_pool_t *dp, txg_wait_flag_t txg_how) { tx_state_t *tx = &dp->dp_tx; + int err; dprintf("pool %p\n", dp); /* @@ -269,7 +271,10 @@ txg_sync_stop(dsl_pool_t *dp) /* * We need to ensure that we've vacated the deferred metaslab trees. */ - txg_wait_synced(dp, tx->tx_open_txg + TXG_DEFER_SIZE); + err = txg_wait_synced_tx(dp, tx->tx_open_txg + TXG_DEFER_SIZE, + NULL, txg_how); + if (err != 0) + return (err); /* * Wake all sync threads and wait for them to die. @@ -290,6 +295,7 @@ txg_sync_stop(dsl_pool_t *dp) tx->tx_exiting = 0; mutex_exit(&tx->tx_sync_lock); + return (0); } /* @@ -522,6 +528,24 @@ txg_has_quiesced_to_sync(dsl_pool_t *dp) return (tx->tx_quiesced_txg != 0); } +/* + * Notify of completion. 
This is usually only called by the sync thread, + * but in force-export/unmount scenarios, it can be called by another thread + * that has generated an alternative completion scenario. + */ +void +txg_completion_notify(dsl_pool_t *dp) +{ + tx_state_t *tx = &dp->dp_tx; + boolean_t locked = MUTEX_HELD(&tx->tx_sync_lock); + + if (!locked) + mutex_enter(&tx->tx_sync_lock); + cv_broadcast(&tx->tx_sync_done_cv); + if (!locked) + mutex_exit(&tx->tx_sync_lock); +} + static void txg_sync_thread(void *arg) { @@ -600,7 +624,7 @@ txg_sync_thread(void *arg) tx->tx_synced_txg = txg; tx->tx_syncing_txg = 0; DTRACE_PROBE2(txg__synced, dsl_pool_t *, dp, uint64_t, txg); - cv_broadcast(&tx->tx_sync_done_cv); + txg_completion_notify(dp); /* * Dispatch commit callbacks to worker threads. @@ -691,59 +715,77 @@ txg_delay(dsl_pool_t *dp, uint64_t txg, hrtime_t delay, hrtime_t resolution) mutex_exit(&tx->tx_sync_lock); } -static boolean_t -txg_wait_synced_impl(dsl_pool_t *dp, uint64_t txg, boolean_t wait_sig) +int +txg_wait_synced_tx(dsl_pool_t *dp, uint64_t txg, dmu_tx_t *tx, + txg_wait_flag_t flags) { - tx_state_t *tx = &dp->dp_tx; + tx_state_t *dp_tx = &dp->dp_tx; + int error = 0; + objset_t *os = NULL; ASSERT(!dsl_pool_config_held(dp)); - mutex_enter(&tx->tx_sync_lock); - ASSERT3U(tx->tx_threads, ==, 2); + mutex_enter(&dp_tx->tx_sync_lock); + ASSERT3U(dp_tx->tx_threads, ==, 2); if (txg == 0) - txg = tx->tx_open_txg + TXG_DEFER_SIZE; - if (tx->tx_sync_txg_waiting < txg) - tx->tx_sync_txg_waiting = txg; + txg = dp_tx->tx_open_txg + TXG_DEFER_SIZE; + if (dp_tx->tx_sync_txg_waiting < txg) + dp_tx->tx_sync_txg_waiting = txg; + if (tx != NULL && tx->tx_objset != NULL) + os = tx->tx_objset; dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n", - txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting); - while (tx->tx_synced_txg < txg) { + txg, dp_tx->tx_quiesce_txg_waiting, dp_tx->tx_sync_txg_waiting); + while (error == 0 && dp_tx->tx_synced_txg < txg) { dprintf("broadcasting sync more " 
"tx_synced=%llu waiting=%llu dp=%px\n", - tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp); - cv_broadcast(&tx->tx_sync_more_cv); - if (wait_sig) { + dp_tx->tx_synced_txg, dp_tx->tx_sync_txg_waiting, dp); + cv_broadcast(&dp_tx->tx_sync_more_cv); + /* + * If we are suspending and exiting, give up, because our + * data isn't going to be pushed. + */ + if (spa_suspended(dp->dp_spa)) { + if ((flags & TXG_WAIT_F_NOSUSPEND) || + spa_exiting_any(dp->dp_spa)) { + error = SET_ERROR(EAGAIN); + } + } + if (error == 0 && os != NULL && dmu_objset_exiting(os)) + error = SET_ERROR(EAGAIN); + if (error != 0) + break; + if (flags & TXG_WAIT_F_SIGNAL) { /* * Condition wait here but stop if the thread receives a * signal. The caller may call txg_wait_synced*() again * to resume waiting for this txg. */ - if (cv_wait_io_sig(&tx->tx_sync_done_cv, - &tx->tx_sync_lock) == 0) { - mutex_exit(&tx->tx_sync_lock); - return (B_TRUE); + if (cv_wait_io_sig(&dp_tx->tx_sync_done_cv, + &dp_tx->tx_sync_lock) == 0) { + error = SET_ERROR(EINTR); + break; } } else { - cv_wait_io(&tx->tx_sync_done_cv, &tx->tx_sync_lock); + cv_wait_io(&dp_tx->tx_sync_done_cv, + &dp_tx->tx_sync_lock); } } - mutex_exit(&tx->tx_sync_lock); - return (B_FALSE); + + mutex_exit(&dp_tx->tx_sync_lock); + dprintf("txg=%llu error=%d\n", txg, error); + return (error); } -void -txg_wait_synced(dsl_pool_t *dp, uint64_t txg) +int +txg_wait_synced_flags(dsl_pool_t *dp, uint64_t txg, txg_wait_flag_t flags) { - VERIFY0(txg_wait_synced_impl(dp, txg, B_FALSE)); + return (txg_wait_synced_tx(dp, txg, NULL, flags)); } -/* - * Similar to a txg_wait_synced but it can be interrupted from a signal. - * Returns B_TRUE if the thread was signaled while waiting. 
- */ -boolean_t -txg_wait_synced_sig(dsl_pool_t *dp, uint64_t txg) +void +txg_wait_synced(dsl_pool_t *dp, uint64_t txg) { - return (txg_wait_synced_impl(dp, txg, B_TRUE)); + (void) txg_wait_synced_tx(dp, txg, NULL, 0); } /* @@ -822,6 +864,34 @@ txg_sync_waiting(dsl_pool_t *dp) tx->tx_quiesced_txg != 0); } +void +txg_force_export(spa_t *spa) +{ + dsl_pool_t *dp = spa_get_dsl(spa); + tx_state_t *tx = &dp->dp_tx; + uint64_t t, txg; + + /* + * When forcing removal, push through TXG_SIZE TXGs to ensure that + * all state is cleaned up by spa_sync(). While waiting for each + * TXG to complete, cancel any suspended zios that appear. + */ + ASSERT(spa_exiting_any(spa)); + txg = tx->tx_synced_txg + 1; + for (t = 0; t < TXG_SIZE; t++) { + txg_wait_open(dp, txg + t, B_TRUE); + + boolean_t complete = B_FALSE; + while (!complete) { + zio_cancel(spa); + mutex_enter(&tx->tx_sync_lock); + cv_wait(&tx->tx_sync_done_cv, &tx->tx_sync_lock); + complete = (tx->tx_synced_txg >= (txg + t)); + mutex_exit(&tx->tx_sync_lock); + } + } +} + /* * Verify that this txg is active (open, quiescing, syncing). Non-active * txg's should not be manipulated. diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 5e14d71f1946..6c695552c6c2 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -971,6 +971,8 @@ void vdev_free(vdev_t *vd) { spa_t *spa = vd->vdev_spa; + uint64_t t, txg; + metaslab_t *msp; ASSERT3P(vd->vdev_initialize_thread, ==, NULL); ASSERT3P(vd->vdev_trim_thread, ==, NULL); @@ -995,6 +997,19 @@ vdev_free(vdev_t *vd) */ vdev_close(vd); + /* If the pool is being forcibly exported, clean up any stragglers. */ + if (spa_exiting_any(spa)) { + for (t = 0, txg = spa_syncing_txg(spa) + 1; t < TXG_SIZE; ) { + msp = txg_list_remove(&vd->vdev_ms_list, t); + if (msp == NULL) { + t++; + continue; + } + VERIFY3U(t, ==, txg & TXG_MASK); + /* Metaslab already destroyed, nothing to do. 
*/ + } + } + ASSERT(!list_link_active(&vd->vdev_config_dirty_node)); ASSERT(!list_link_active(&vd->vdev_state_dirty_node)); @@ -1024,6 +1039,9 @@ vdev_free(vdev_t *vd) vd->vdev_log_mg = NULL; } + if (spa_exiting_any(spa)) + vdev_clear_stats(vd); + ASSERT0(vd->vdev_stat.vs_space); ASSERT0(vd->vdev_stat.vs_dspace); ASSERT0(vd->vdev_stat.vs_alloc); @@ -3228,6 +3246,16 @@ vdev_dtl_sync(vdev_t *vd, uint64_t txg) dmu_tx_t *tx; uint64_t object = space_map_object(vd->vdev_dtl_sm); + /* + * The pool is being forcibly exported. Just discard everything. + */ + if (spa_exiting(spa)) { + mutex_enter(&vd->vdev_dtl_lock); + range_tree_vacate(rt, NULL, NULL); + mutex_exit(&vd->vdev_dtl_lock); + return; + } + ASSERT(vdev_is_concrete(vd)); ASSERT(vd->vdev_ops->vdev_op_leaf); @@ -4643,6 +4671,11 @@ vdev_stat_update(zio_t *zio, uint64_t psize) if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) return; + if (vd == NULL && spa_exiting_any(spa)) { + /* Forced export resulted in partially constructed I/O. 
*/ + return; + } + if (type == ZIO_TYPE_WRITE && txg != 0 && (!(flags & ZIO_FLAG_IO_REPAIR) || (flags & ZIO_FLAG_SCAN_THREAD) || diff --git a/module/zfs/vdev_indirect.c b/module/zfs/vdev_indirect.c index e539e9aa2d70..89d7b4364bb3 100644 --- a/module/zfs/vdev_indirect.c +++ b/module/zfs/vdev_indirect.c @@ -562,13 +562,19 @@ spa_condense_indirect_commit_entry(spa_t *spa, vdev_indirect_mapping_entry_phys_t *vimep, uint32_t count) { spa_condensing_indirect_t *sci = spa->spa_condensing_indirect; + dmu_tx_t *tx; + int txgoff; ASSERT3U(count, <, DVA_GET_ASIZE(&vimep->vimep_dst)); - dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); + tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); dmu_tx_hold_space(tx, sizeof (*vimep) + sizeof (count)); - VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); - int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; + if (dmu_tx_assign(tx, TXG_WAIT) != 0) { + ASSERT(spa_exiting_any(spa)); + dmu_tx_abort(tx); + return; + } + txgoff = dmu_tx_get_txg(tx) & TXG_MASK; /* * If we are the first entry committed this txg, kick off the sync @@ -651,6 +657,7 @@ spa_condense_indirect_thread(void *arg, zthr_t *zthr) { spa_t *spa = arg; vdev_t *vd; + int err = 0; ASSERT3P(spa->spa_condensing_indirect, !=, NULL); spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); @@ -744,9 +751,10 @@ spa_condense_indirect_thread(void *arg, zthr_t *zthr) if (zthr_iscancelled(zthr)) return; - VERIFY0(dsl_sync_task(spa_name(spa), NULL, + err = dsl_sync_task(spa_name(spa), NULL, spa_condense_indirect_complete_sync, sci, 0, - ZFS_SPACE_CHECK_EXTRA_RESERVED)); + ZFS_SPACE_CHECK_EXTRA_RESERVED); + VERIFY(err == 0 || spa_exiting_any(spa)); } /* diff --git a/module/zfs/vdev_initialize.c b/module/zfs/vdev_initialize.c index e9156c32f384..55af45ed25a8 100644 --- a/module/zfs/vdev_initialize.c +++ b/module/zfs/vdev_initialize.c @@ -125,6 +125,14 @@ vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state) vdev_initializing_state_t old_state = vd->vdev_initialize_state; 
vd->vdev_initialize_state = new_state; + /* + * In this context, a pool vdev is initializing. Usually, we would + * want to handle txg failure, but this can only happen if the pool + * becomes suspended and then forcibly exported when this occurs. In + * which case, the caller here hung while holding the namespace lock, + * so there's little that can be done (including attempt a force + * export, which requires the namespace lock) to recover. + */ dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); dsl_sync_task_nowait(spa_get_dsl(spa), vdev_initialize_zap_update_sync, diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index 04202a9f8960..5a411886556c 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -1884,7 +1884,7 @@ vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg) * bailing out and declaring the pool faulted. */ if (error != 0) { - if ((flags & ZIO_FLAG_TRYHARD) != 0) + if (spa_exiting_any(spa) || (flags & ZIO_FLAG_TRYHARD) != 0) return (error); flags |= ZIO_FLAG_TRYHARD; } diff --git a/module/zfs/vdev_rebuild.c b/module/zfs/vdev_rebuild.c index 4d7de0c6c44c..d1a65becf755 100644 --- a/module/zfs/vdev_rebuild.c +++ b/module/zfs/vdev_rebuild.c @@ -283,7 +283,12 @@ vdev_rebuild_initiate(vdev_t *vd) ASSERT(!vd->vdev_rebuilding); dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); - VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + int err = dmu_tx_assign(tx, TXG_WAIT); + if (err != 0) { + ASSERT(spa_exiting_any(vd->vdev_spa)); + dmu_tx_abort(tx); + return; + } vd->vdev_rebuilding = B_TRUE; @@ -571,7 +576,15 @@ vdev_rebuild_range(vdev_rebuild_t *vr, uint64_t start, uint64_t size) mutex_exit(&vr->vr_io_lock); dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); - VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + int err = dmu_tx_assign(tx, TXG_WAIT); + if (err != 0) { + ASSERT(spa_exiting_any(spa)); + dmu_tx_abort(tx); + mutex_enter(&vr->vr_io_lock); + vr->vr_bytes_inflight -= 
psize; + mutex_exit(&vr->vr_io_lock); + return (err); + } uint64_t txg = dmu_tx_get_txg(tx); spa_config_enter(spa, SCL_STATE_ALL, vd, RW_READER); @@ -901,9 +914,15 @@ vdev_rebuild_thread(void *arg) dsl_pool_t *dp = spa_get_dsl(spa); dmu_tx_t *tx = dmu_tx_create_dd(dp->dp_mos_dir); - VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + int txerr = dmu_tx_assign(tx, TXG_WAIT); mutex_enter(&vd->vdev_rebuild_lock); + if (txerr != 0) { + ASSERT(spa_exiting_any(vd->vdev_spa)); + vd->vdev_rebuilding = B_FALSE; + dmu_tx_abort(tx); + goto done; + } if (error == 0) { /* * After a successful rebuild clear the DTLs of all ranges @@ -942,6 +961,7 @@ vdev_rebuild_thread(void *arg) dmu_tx_commit(tx); +done: vd->vdev_rebuild_thread = NULL; mutex_exit(&vd->vdev_rebuild_lock); spa_config_exit(spa, SCL_CONFIG, FTAG); diff --git a/module/zfs/vdev_removal.c b/module/zfs/vdev_removal.c index d7c0641c8c2c..ab0099f949ff 100644 --- a/module/zfs/vdev_removal.c +++ b/module/zfs/vdev_removal.c @@ -1528,7 +1528,15 @@ spa_vdev_remove_thread(void *arg) dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); - VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + /* + * If a tx can't be assigned, just punt and wait for + * the next round. This must be an exiting spa. + */ + if (dmu_tx_assign(tx, TXG_WAIT) != 0) { + ASSERT(spa_exiting_any(spa)); + dmu_tx_abort(tx); + goto done; + } uint64_t txg = dmu_tx_get_txg(tx); /* @@ -1562,6 +1570,7 @@ spa_vdev_remove_thread(void *arg) spa_config_exit(spa, SCL_CONFIG, FTAG); +done: /* * Wait for all copies to finish before cleaning up the vca. 
*/ diff --git a/module/zfs/vdev_trim.c b/module/zfs/vdev_trim.c index deea7fedd770..4770ac76d4fb 100644 --- a/module/zfs/vdev_trim.c +++ b/module/zfs/vdev_trim.c @@ -317,7 +317,14 @@ vdev_trim_change_state(vdev_t *vd, vdev_trim_state_t new_state, vd->vdev_trim_state = new_state; dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); - VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + int txerr = dmu_tx_assign(tx, TXG_WAIT); + if (txerr != 0) { + ASSERT(spa_exiting_any(spa)); + dmu_tx_abort(tx); + return; + } + + vd->vdev_trim_state = new_state; dsl_sync_task_nowait(spa_get_dsl(spa), vdev_trim_zap_update_sync, guid, tx); @@ -502,7 +509,15 @@ vdev_trim_range(trim_args_t *ta, uint64_t start, uint64_t size) mutex_exit(&vd->vdev_trim_io_lock); dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); - VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + int err = dmu_tx_assign(tx, TXG_WAIT); + if (err != 0) { + ASSERT(spa_exiting_any(spa)); + mutex_enter(&vd->vdev_trim_io_lock); + vd->vdev_trim_inflight[ta->trim_type]--; + mutex_exit(&vd->vdev_trim_io_lock); + dmu_tx_abort(tx); + return (err); + } uint64_t txg = dmu_tx_get_txg(tx); spa_config_enter(spa, SCL_STATE_ALL, vd, RW_READER); diff --git a/module/zfs/zap.c b/module/zfs/zap.c index c0c280c52076..9ac2994bcb7d 100644 --- a/module/zfs/zap.c +++ b/module/zfs/zap.c @@ -545,6 +545,13 @@ zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt, ASSERT3U(db->db_size, ==, 1 << bs); ASSERT(blkid != 0); + zap_leaf_phys_t *zap_phys = db->db_data; + if (zap_phys->l_hdr.lh_block_type != ZBT_LEAF || + zap_phys->l_hdr.lh_magic != ZAP_LEAF_MAGIC) { + dmu_buf_rele(db, NULL); + return (SET_ERROR(EIO)); + } + zap_leaf_t *l = dmu_buf_get_user(db); if (l == NULL) @@ -559,8 +566,6 @@ zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt, dmu_buf_will_dirty(db, tx); ASSERT3U(l->l_blkid, ==, blkid); ASSERT3P(l->l_dbuf, ==, db); - ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_block_type, ==, ZBT_LEAF); - 
ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC); *lp = l; return (0); diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index 0d5536cf7cb0..d94a3ec14637 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -1580,7 +1580,9 @@ zfs_ioc_pool_export(zfs_cmd_t *zc) boolean_t force = (boolean_t)zc->zc_cookie; boolean_t hardforce = (boolean_t)zc->zc_guid; - zfs_log_history(zc); + if (!force && !hardforce) + zfs_log_history(zc); + error = spa_export(zc->zc_name, NULL, force, hardforce); return (error); @@ -7154,9 +7156,9 @@ zfs_ioctl_init(void) * does the logging of those commands. */ zfs_ioctl_register_pool(ZFS_IOC_POOL_DESTROY, zfs_ioc_pool_destroy, - zfs_secpolicy_config, B_FALSE, POOL_CHECK_SUSPENDED); + zfs_secpolicy_config, B_FALSE, POOL_CHECK_NONE); zfs_ioctl_register_pool(ZFS_IOC_POOL_EXPORT, zfs_ioc_pool_export, - zfs_secpolicy_config, B_FALSE, POOL_CHECK_SUSPENDED); + zfs_secpolicy_config, B_FALSE, POOL_CHECK_NONE); zfs_ioctl_register_pool(ZFS_IOC_POOL_STATS, zfs_ioc_pool_stats, zfs_secpolicy_read, B_FALSE, POOL_CHECK_NONE); @@ -7164,10 +7166,10 @@ zfs_ioctl_init(void) zfs_secpolicy_read, B_FALSE, POOL_CHECK_NONE); zfs_ioctl_register_pool(ZFS_IOC_ERROR_LOG, zfs_ioc_error_log, - zfs_secpolicy_inject, B_FALSE, POOL_CHECK_SUSPENDED); + zfs_secpolicy_inject, B_FALSE, POOL_CHECK_NONE); zfs_ioctl_register_pool(ZFS_IOC_DSOBJ_TO_DSNAME, zfs_ioc_dsobj_to_dsname, - zfs_secpolicy_diff, B_FALSE, POOL_CHECK_SUSPENDED); + zfs_secpolicy_diff, B_FALSE, POOL_CHECK_NONE); zfs_ioctl_register_pool(ZFS_IOC_POOL_GET_HISTORY, zfs_ioc_pool_get_history, zfs_secpolicy_config, B_FALSE, POOL_CHECK_SUSPENDED); diff --git a/module/zfs/zil.c b/module/zfs/zil.c index d9c3042084e3..4c4fae9e8053 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -692,7 +692,12 @@ zil_create(zilog_t *zilog) */ if (BP_IS_HOLE(&blk) || BP_SHOULD_BYTESWAP(&blk)) { tx = dmu_tx_create(zilog->zl_os); - VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + error = dmu_tx_assign(tx, 
TXG_WAIT); + if (error != 0) { + ASSERT(dmu_objset_exiting(zilog->zl_os)); + dmu_tx_abort(tx); + return (NULL); + } dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); txg = dmu_tx_get_txg(tx); @@ -747,6 +752,7 @@ zil_destroy(zilog_t *zilog, boolean_t keep_first) lwb_t *lwb; dmu_tx_t *tx; uint64_t txg; + int error; /* * Wait for any previous destroy to complete. @@ -759,7 +765,12 @@ zil_destroy(zilog_t *zilog, boolean_t keep_first) return; tx = dmu_tx_create(zilog->zl_os); - VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error != 0) { + ASSERT(dmu_objset_exiting(zilog->zl_os)); + dmu_tx_abort(tx); + return; + } dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); txg = dmu_tx_get_txg(tx); @@ -1490,7 +1501,11 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) * should not be subject to the dirty data based delays. We * use TXG_NOTHROTTLE to bypass the delay mechanism. */ - VERIFY0(dmu_tx_assign(tx, TXG_WAIT | TXG_NOTHROTTLE)); + if (dmu_tx_assign(tx, TXG_WAIT | TXG_NOTHROTTLE) != 0) { + ASSERT(dmu_objset_exiting(zilog->zl_os)); + dmu_tx_abort(tx); + return (NULL); + } dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); txg = dmu_tx_get_txg(tx); @@ -2776,7 +2791,12 @@ static void zil_commit_itx_assign(zilog_t *zilog, zil_commit_waiter_t *zcw) { dmu_tx_t *tx = dmu_tx_create(zilog->zl_os); - VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + + if (dmu_tx_assign(tx, TXG_WAIT) != 0) { + ASSERT(dmu_objset_exiting(zilog->zl_os)); + dmu_tx_abort(tx); + return; + } itx_t *itx = zil_itx_create(TX_COMMIT, sizeof (lr_t)); itx->itx_sync = B_TRUE; @@ -2938,6 +2958,12 @@ zil_commit(zilog_t *zilog, uint64_t foid) return; } + /* + * If the objset is being forced to exit, there's nothing more to do. 
+ */ + if (dmu_objset_exiting(zilog->zl_os)) + return; + /* * If the ZIL is suspended, we don't want to dirty it by calling * zil_commit_itx_assign() below, nor can we write out @@ -3281,13 +3307,15 @@ zil_close(zilog_t *zilog) * ZIL to be clean, and to wait for all pending lwbs to be * written out. */ - if (txg != 0) - txg_wait_synced(zilog->zl_dmu_pool, txg); + if (!dmu_objset_exiting(zilog->zl_os)) { + if (txg != 0) + txg_wait_synced(zilog->zl_dmu_pool, txg); - if (zilog_is_dirty(zilog)) - zfs_dbgmsg("zil (%px) is dirty, txg %llu", zilog, txg); - if (txg < spa_freeze_txg(zilog->zl_spa)) - VERIFY(!zilog_is_dirty(zilog)); + if (zilog_is_dirty(zilog)) + zfs_dbgmsg("zil (%px) is dirty, txg %llu", zilog, txg); + if (txg < spa_freeze_txg(zilog->zl_spa)) + VERIFY(!zilog_is_dirty(zilog)); + } zilog->zl_get_data = NULL; @@ -3304,7 +3332,15 @@ zil_close(zilog_t *zilog) metaslab_fastwrite_unmark(zilog->zl_spa, &lwb->lwb_blk); list_remove(&zilog->zl_lwb_list, lwb); - zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); + if (lwb->lwb_buf != NULL) { + zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); + } else { + /* + * Pool is being force exported, while this lwb was + * between zil_lwb_flush_vdevs_done and zil_sync. + */ + ASSERT(spa_exiting(zilog->zl_spa)); + } zil_free_lwb(zilog, lwb); } mutex_exit(&zilog->zl_lock); diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 66ac545c7981..e55823aad020 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -2243,6 +2243,14 @@ zio_wait(zio_t *zio) mutex_exit(&zio->io_lock); error = zio->io_error; + if (error != 0 && (zio->io_flags & ZIO_FLAG_CANFAIL) == 0 && + spa_exiting_any(zio->io_spa)) { + /* + * Don't report errors to the callers. In this context, the + * pool is being forcibly exported, so just throw it away. 
+ */ + error = 0; + } zio_destroy(zio); return (error); @@ -2297,11 +2305,22 @@ zio_reexecute(zio_t *pio) pio->io_flags = pio->io_orig_flags; pio->io_stage = pio->io_orig_stage; - pio->io_pipeline = pio->io_orig_pipeline; - pio->io_reexecute = 0; + if (spa_exiting_any(pio->io_spa)) { + /* + * This pool is being forcibly exported; skip everything and + * finish as soon as possible. + */ + pio->io_pipeline = ZIO_INTERLOCK_PIPELINE; + if (pio->io_error == 0) + pio->io_error = SET_ERROR(EIO); + pio->io_reexecute = ZIO_REEXECUTE_CANCELLED; + } else { + pio->io_pipeline = pio->io_orig_pipeline; + pio->io_error = 0; + pio->io_reexecute = 0; + } pio->io_flags |= ZIO_FLAG_REEXECUTED; pio->io_pipeline_trace = 0; - pio->io_error = 0; for (int w = 0; w < ZIO_WAIT_TYPES; w++) pio->io_state[w] = 0; for (int c = 0; c < ZIO_CHILD_TYPES; c++) @@ -2343,6 +2362,8 @@ zio_reexecute(zio_t *pio) void zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t reason) { + dsl_pool_t *dp = spa_get_dsl(spa); + if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC) fm_panic("Pool '%s' has encountered an uncorrectable I/O " "failure and the failure mode property for this pool " @@ -2373,16 +2394,19 @@ zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t reason) } mutex_exit(&spa->spa_suspend_lock); + + /* Notify waiters that might care about this state transition. */ + for (int i = 0; i < SCL_LOCKS; i++) + cv_broadcast(&spa->spa_config_lock[i].scl_cv); + cv_broadcast(&spa->spa_evicting_os_cv); + txg_completion_notify(dp); } -int -zio_resume(spa_t *spa) +static zio_t * +zio_unsuspend(spa_t *spa) { zio_t *pio; - /* - * Reexecute all previously suspended i/o. 
- */ mutex_enter(&spa->spa_suspend_lock); spa->spa_suspended = ZIO_SUSPEND_NONE; cv_broadcast(&spa->spa_suspend_cv); @@ -2390,20 +2414,52 @@ zio_resume(spa_t *spa) spa->spa_suspend_zio_root = NULL; mutex_exit(&spa->spa_suspend_lock); - if (pio == NULL) - return (0); - - zio_reexecute(pio); - return (zio_wait(pio)); + return (pio); } void -zio_resume_wait(spa_t *spa) +zio_cancel(spa_t *spa) { + zio_t *pio; + + /* + * Interrupt all physical zios. + * Only meaningful in the context of a forced export. + */ mutex_enter(&spa->spa_suspend_lock); - while (spa_suspended(spa)) - cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock); + pio = spa->spa_suspend_zio_root; + spa->spa_suspend_zio_root = NULL; + cv_broadcast(&spa->spa_suspend_cv); mutex_exit(&spa->spa_suspend_lock); + if (pio == NULL) + return; + + zio_reexecute(pio); + (void) zio_wait(pio); +} + +int +zio_resume(spa_t *spa) +{ + zio_t *pio; + + /* + * Issue an async request to update the pool's configuration in case + * suspension occurred while such an update was in progress. This + * will restart the update process from the beginning. We could + * make it conditional, but it's safer not to. + */ + spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); + + /* + * Reexecute all previously suspended i/o. + */ + pio = zio_unsuspend(spa); + if (pio == NULL) + return (0); + + zio_reexecute(pio); + return (zio_wait(pio)); } /* @@ -4347,7 +4403,7 @@ zio_ready(zio_t *zio) return (NULL); } - if (zio->io_ready) { + if (zio->io_ready && zio->io_spa->spa_export_initiator == NULL) { ASSERT(IO_IS_ALLOCATING(zio)); ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) || (zio->io_flags & ZIO_FLAG_NOPWRITE)); @@ -4493,6 +4549,13 @@ zio_done(zio_t *zio) return (NULL); } + /* + * If the pool is forcibly exporting, make sure everything is + * thrown away, as nothing can be trusted now. 
+ */ + if (spa_exiting_any(zio->io_spa) && zio->io_error == 0) + zio->io_error = SET_ERROR(EIO); + /* * If the allocation throttle is enabled, then update the accounting. * We only track child I/Os that are part of an allocating async @@ -4612,7 +4675,7 @@ zio_done(zio_t *zio) } } - if (zio->io_error) { + if (zio->io_error && !spa_exiting_any(zio->io_spa)) { /* * If this I/O is attached to a particular vdev, * generate an error message describing the I/O failure @@ -4751,7 +4814,20 @@ zio_done(zio_t *zio) } } - if ((pio = zio_unique_parent(zio)) != NULL) { + if (zio->io_reexecute & ZIO_REEXECUTE_CANCELLED) { + /* + * This zio had been marked for reexecute previously, + * and upon reexecution, found the pool being forcibly + * exported. Nothing to do now but clean up. + * + * This special flag is used because it allows the + * zio pipeline to mark all zios in the tree as + * cancelled, before cleaning them up. + */ + ASSERT3U(zio->io_error, !=, 0); + zio->io_reexecute = 0; + goto finish; + } else if ((pio = zio_unique_parent(zio)) != NULL) { /* * We're not a root i/o, so there's nothing to do * but notify our parent. Don't propagate errors @@ -4784,9 +4860,11 @@ zio_done(zio_t *zio) return (NULL); } +finish: ASSERT(zio->io_child_count == 0); ASSERT(zio->io_reexecute == 0); - ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL)); + ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL) || + zio->io_spa->spa_export_initiator != NULL); /* * Report any checksum errors, since the I/O is complete. 
diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index e87c1cd641ff..257da5853963 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -373,7 +373,8 @@ tags = ['functional', 'cli_root', 'zpool_events'] [tests/functional/cli_root/zpool_export] tests = ['zpool_export_001_pos', 'zpool_export_002_pos', - 'zpool_export_003_neg', 'zpool_export_004_pos'] + 'zpool_export_003_neg', 'zpool_export_004_pos', 'zpool_export_005_pos', + 'zpool_export_006_pos', 'zpool_export_007_pos'] tags = ['functional', 'cli_root', 'zpool_export'] [tests/functional/cli_root/zpool_get] diff --git a/tests/zfs-tests/include/commands.cfg b/tests/zfs-tests/include/commands.cfg index 0db9724eead0..93e30643ec0b 100644 --- a/tests/zfs-tests/include/commands.cfg +++ b/tests/zfs-tests/include/commands.cfg @@ -55,6 +55,7 @@ export SYSTEM_FILES_COMMON='arp logname ls mkdir + mkfifo mknod mktemp mount @@ -70,6 +71,7 @@ export SYSTEM_FILES_COMMON='arp printenv printf ps + pv pwd python python2 diff --git a/tests/zfs-tests/include/tunables.cfg b/tests/zfs-tests/include/tunables.cfg index a1b75a48292f..6d297c9d2829 100644 --- a/tests/zfs-tests/include/tunables.cfg +++ b/tests/zfs-tests/include/tunables.cfg @@ -33,6 +33,7 @@ DEADMAN_FAILMODE deadman.failmode zfs_deadman_failmode DEADMAN_SYNCTIME_MS deadman.synctime_ms zfs_deadman_synctime_ms DEADMAN_ZIOTIME_MS deadman.ziotime_ms zfs_deadman_ziotime_ms DISABLE_IVSET_GUID_CHECK disable_ivset_guid_check zfs_disable_ivset_guid_check +FORCED_EXPORT_UNMOUNT UNSUPPORTED zfs_forced_export_unmount_enabled INITIALIZE_CHUNK_SIZE initialize_chunk_size zfs_initialize_chunk_size INITIALIZE_VALUE initialize_value zfs_initialize_value KEEP_LOG_SPACEMAPS_AT_EXPORT keep_log_spacemaps_at_export zfs_keep_log_spacemaps_at_export diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_events/zpool_events_follow.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_events/zpool_events_follow.ksh index 258de033b86c..e1b3ea2040f5 100755 --- 
a/tests/zfs-tests/tests/functional/cli_root/zpool_events/zpool_events_follow.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_events/zpool_events_follow.ksh @@ -49,16 +49,12 @@ log_must eval "zpool events -H -f > $EVENTS_FILE &" pid=$! # 3. Generate some ZFS events -for i in `seq 1 $EVENTS_NUM`; do - log_must zpool clear $TESTPOOL -done +log_must zpool clear $TESTPOOL # wait a bit to allow the kernel module to process new events zpool_events_settle # 4. Verify 'zpool events -f' successfully recorded these new events EVENTS_LOG=$(cat $EVENTS_FILE | wc -l) -if [[ $EVENTS_LOG -ne $EVENTS_NUM ]]; then - log_fail "Unexpected number of events: $EVENTS_LOG != $EVENTS_NUM" -fi +log_must test $EVENTS_LOG -gt 0 log_pass "'zpool events -f' successfully follows new events." diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_export/Makefile.am b/tests/zfs-tests/tests/functional/cli_root/zpool_export/Makefile.am index 1c06d5b59e9b..23c864db19ac 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_export/Makefile.am +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_export/Makefile.am @@ -5,7 +5,10 @@ dist_pkgdata_SCRIPTS = \ zpool_export_001_pos.ksh \ zpool_export_002_pos.ksh \ zpool_export_003_neg.ksh \ - zpool_export_004_pos.ksh + zpool_export_004_pos.ksh \ + zpool_export_005_pos.ksh \ + zpool_export_006_pos.ksh \ + zpool_export_007_pos.ksh dist_pkgdata_DATA = \ zpool_export.cfg \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export.kshlib b/tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export.kshlib index 5484f20674d5..9b2cfec1fb85 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export.kshlib +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export.kshlib @@ -25,6 +25,12 @@ . 
$STF_SUITE/tests/functional/cli_root/zpool_export/zpool_export.cfg +function create_fifo +{ + log_must rm -f "$1" + log_must mkfifo "$1" +} + function zpool_export_cleanup { [[ -d $TESTDIR0 ]] && log_must rm -rf $TESTDIR0 diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export_005_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export_005_pos.ksh new file mode 100755 index 000000000000..1bd0de168021 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export_005_pos.ksh @@ -0,0 +1,93 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020 by Klara Systems, Inc. All rights reserved. +# + +. $STF_SUITE/tests/functional/cli_root/zpool_export/zpool_export.kshlib + +# +# DESCRIPTION: +# A pool should be force exportable, while a send is running from it. +# +# STRATEGY: +# 1. Initiate a send from pool to a file. +# 2. Slow the send using pv, so it blocks a normal pool export. +# 3. Check that normal export fails. +# 4. Forcibly export pool. +# 5. Verify pool is no longer present in the list output. 
+# + +verify_runnable "global" + +function cleanup { + [[ -n "$sendpid" ]] && kill -9 "$sendpid" + [[ -n "$pvpid" ]] && kill -9 "$pvpid" + [[ -n "$snapstream" ]] && rm -f "$snapstream" + zpool_export_cleanup +} + +log_onexit cleanup + +log_assert "Verify a pool can be forcibly exported while sending." + +snap=$TESTPOOL1/$TESTFS@$TESTSNAP +snapstream=$TEST_BASE_DIR/send.$$ + +vdev0=$TESTDIR0/$TESTFILE0 +log_must mkdir -p $TESTDIR0 +log_must truncate -s 1G $vdev0 +log_must zpool create -f $TESTPOOL1 $vdev0 +log_must zfs create $TESTPOOL1/$TESTFS + +mntpnt=$(get_prop mountpoint $TESTPOOL1/$TESTFS) +log_must dd if=/dev/urandom of=$mntpnt/$TESTFILE1 bs=1M count=16 + +log_must zfs snapshot $snap + +# Create FIFOs for the send, so the processes can be controlled and +# monitored individually. +create_fifo $TESTDIR0/snapfifo +zfs send $snap > $TESTDIR0/snapfifo & +sendpid=$! +pv -L 1k < $TESTDIR0/snapfifo > $snapstream & +pvpid=$! + +log_note "zfs send pid is $sendpid, pv pid is $pvpid" + +log_mustnot zpool export $TESTPOOL1 + +# Send should still be running; now try force export. +log_must kill -0 $sendpid +log_must zpool export -F $TESTPOOL1 + +lsout=$(ls -l $snapstream) +log_note "snapstream: $lsout" + +# Send should have exited non-zero. +log_mustnot wait $sendpid + +poolexists $TESTPOOL1 && \ + log_fail "$TESTPOOL1 unexpectedly found in 'zpool list' output." + +log_pass "Successfully forcibly exported a pool while sending." diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export_006_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export_006_pos.ksh new file mode 100755 index 000000000000..6b3bd6923d28 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export_006_pos.ksh @@ -0,0 +1,112 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). 
+# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020 by Klara Systems, Inc. All rights reserved. +# + +. $STF_SUITE/tests/functional/cli_root/zpool_export/zpool_export.kshlib + +# +# DESCRIPTION: +# A pool should be force exportable, while a receive is running into it. +# +# STRATEGY: +# 1. Initiate a send from pool A to pool B. +# 2. Slow the send using pv, so it blocks a normal pool export. +# 3. Check that normal export of pool B fails. +# 4. Forcibly export pool B. +# 5. Verify pool B is no longer present in the list output. +# + +verify_runnable "global" + +function cleanup { + [[ -n "$sendpid" ]] && kill -9 "$sendpid" + [[ -n "$recvpid" ]] && kill -9 "$recvpid" + [[ -n "$pvpid" ]] && kill -9 "$pvpid" + zpool_export_cleanup +} + +log_onexit cleanup + +log_assert "Verify a receiving pool can be forcibly exported." 
+ +srcsnap=$TESTPOOL1/$TESTFS@$TESTSNAP +dstsnap=$TESTPOOL2/$TESTFS@$TESTSNAP + +vdev0=$TESTDIR0/$TESTFILE0 +vdev1=$TESTDIR0/$TESTFILE1 +log_must mkdir -p $TESTDIR0 +log_must truncate -s 1G $vdev0 $vdev1 +log_must zpool create -f $TESTPOOL1 $vdev0 +log_must zpool create -f $TESTPOOL2 $vdev1 +log_must zfs create $TESTPOOL1/$TESTFS + +mntpnt=$(get_prop mountpoint $TESTPOOL1/$TESTFS) +log_must dd if=/dev/urandom of=$mntpnt/$TESTFILE1 bs=1M count=16 + +log_must zfs snapshot $srcsnap + +# Create FIFOs for send and receive, so the processes can be controlled and +# monitored individually. +create_fifo $TESTDIR0/sendfifo +create_fifo $TESTDIR0/recvfifo + +zfs send $srcsnap > $TESTDIR0/sendfifo & +sendpid=$! +pv -L 1k < $TESTDIR0/sendfifo > $TESTDIR0/recvfifo & +pvpid=$! +zfs recv $dstsnap < $TESTDIR0/recvfifo & +recvpid=$! + +log_note "zfs send pid is $sendpid, recv pid is $recvpid, pv pid is $pvpid" + +log_note "Waiting until zfs receive has a chance to start ..." +typeset -i i=0 +typeset -i timeout=5 +while (( $i < $timeout )); do + zfs list $TESTPOOL2/$TESTFS >/dev/null 2>&1 && break + sleep 1 + ((i = i + 1)) +done +[[ $i -lt $timeout ]] || log_fail "receive failed to start" + +log_must zfs list $TESTPOOL2/$TESTFS + +log_mustnot zpool export $TESTPOOL2 + +# Send & receive should still be running; now try force export. +log_must kill -0 $sendpid +log_must kill -0 $recvpid +log_must zpool export -F $TESTPOOL2 + +# Both zfs send & recv should have exited non-zero. +log_mustnot wait $recvpid +log_mustnot wait $sendpid + +poolexists $TESTPOOL1 || \ + log_fail "$TESTPOOL1 should be in 'zpool list' output." +poolexists $TESTPOOL2 && \ + log_fail "$TESTPOOL2 unexpectedly found in 'zpool list' output." + +log_pass "Successfully forcibly exported a pool while receiving." 
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export_007_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export_007_pos.ksh new file mode 100755 index 000000000000..8fb16e92dcf9 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export_007_pos.ksh @@ -0,0 +1,107 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020 by Klara Systems, Inc. All rights reserved. +# + +. $STF_SUITE/tests/functional/cli_root/zpool_export/zpool_export.kshlib + +# +# DESCRIPTION: +# A pool should be force exportable, while POSIX I/O is in flight. +# +# STRATEGY: +# 1. Write to a file that is held open, slowed using pv, so it blocks a +# normal filesystem unmount / pool export. +# 2. Check that normal export fails. +# 3. Forcibly export pool. +# 4. Verify pool is no longer present in the list output. 
+# + +verify_runnable "global" + +function cleanup { + [[ -n "$ddinpid" ]] && kill -9 "$ddinpid" + [[ -n "$ddoutpid" ]] && kill -9 "$ddoutpid" + if is_linux; then + log_must set_tunable64 FORCED_EXPORT_UNMOUNT 0 + fi + zpool_export_cleanup +} + +log_onexit cleanup + +log_assert "Verify a pool can be forcibly exported while writing POSIX I/O" + +snap=$TESTPOOL1/$TESTFS@$TESTSNAP +snapstream=$TEST_BASE_DIR/send.$$ + +# On Linux, it's necessary to enable a tunable for the test to be able to +# kick the POSIX I/O user off. +if is_linux; then + log_must set_tunable64 FORCED_EXPORT_UNMOUNT 1 +fi + +vdev0=$TESTDIR0/$TESTFILE0 +log_must mkdir -p $TESTDIR0 +log_must truncate -s 1G $vdev0 +log_must zpool create -f $TESTPOOL1 $vdev0 +log_must zfs create $TESTPOOL1/$TESTFS + +mntpnt=$(get_prop mountpoint $TESTPOOL1/$TESTFS) + +# Create a FIFO for the writes, so the reader and writer can be monitored +# individually. NB: $! after a pipeline is the PID of pv, not of dd. +create_fifo $TESTDIR0/writefifo +dd if=/dev/urandom bs=1M count=16 | pv -L 1k > $TESTDIR0/writefifo & +ddinpid=$! + +dd of=${mntpnt}/$TESTFILE1 < $TESTDIR0/writefifo & +ddoutpid=$! + +log_note "dd input pid is $ddinpid, dd output pid is $ddoutpid" + +log_note "Waiting until output file is filling ..." +typeset -i i=0 +typeset -i timeout=5 +while (( $i < $timeout )); do + test -f ${mntpnt}/$TESTFILE1 && break + sleep 1 + ((i = i + 1)) +done +[[ $i -lt $timeout ]] || log_fail "dd failed to start" + +log_mustnot zpool export $TESTPOOL1 + +# Write should still be running; now try force export. We must do this +# twice so dd dies initially. +log_must kill -0 $ddoutpid +log_mustnot zpool export -F $TESTPOOL1 +# Write should have exited non-zero. +log_mustnot wait $ddoutpid +log_must zpool export -F $TESTPOOL1 + +poolexists $TESTPOOL1 && \ + log_fail "$TESTPOOL1 unexpectedly found in 'zpool list' output." + +log_pass "Successfully forcibly exported a pool while writing POSIX I/O." 
diff --git a/tests/zfs-tests/tests/functional/fault/auto_online_001_pos.ksh b/tests/zfs-tests/tests/functional/fault/auto_online_001_pos.ksh index 03fc15a8a7cb..f17b6edf2eda 100755 --- a/tests/zfs-tests/tests/functional/fault/auto_online_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/fault/auto_online_001_pos.ksh @@ -105,7 +105,7 @@ log_must mkfile $FSIZE /$TESTPOOL/data for offline_disk in $autoonline_disks do - log_must zpool export -F $TESTPOOL + log_must zpool export -f $TESTPOOL host=$(get_scsi_host $offline_disk) diff --git a/tests/zfs-tests/tests/functional/l2arc/persist_l2arc_004_pos.ksh b/tests/zfs-tests/tests/functional/l2arc/persist_l2arc_004_pos.ksh index 544e9291de29..1f3f7b7e1124 100755 --- a/tests/zfs-tests/tests/functional/l2arc/persist_l2arc_004_pos.ksh +++ b/tests/zfs-tests/tests/functional/l2arc/persist_l2arc_004_pos.ksh @@ -78,6 +78,7 @@ typeset log_blk_end=$(get_arcstat l2_log_blk_writes) typeset log_blk_rebuild_start=$(get_arcstat l2_rebuild_log_blks) log_must zpool import -d $VDIR $TESTPOOL +log_must zpool sync $TESTPOOL typeset l2_hits_start=$(get_arcstat l2_hits) diff --git a/tests/zfs-tests/tests/functional/l2arc/persist_l2arc_005_pos.ksh b/tests/zfs-tests/tests/functional/l2arc/persist_l2arc_005_pos.ksh index ee46e7b8cad6..24cf8369ee97 100755 --- a/tests/zfs-tests/tests/functional/l2arc/persist_l2arc_005_pos.ksh +++ b/tests/zfs-tests/tests/functional/l2arc/persist_l2arc_005_pos.ksh @@ -84,6 +84,7 @@ typeset log_blk_end=$(get_arcstat l2_log_blk_writes) typeset log_blk_rebuild_start=$(get_arcstat l2_rebuild_log_blks) log_must zpool import -d $VDIR $TESTPOOL +log_must zpool sync $TESTPOOL log_must eval "echo $PASSPHRASE | zfs mount -l $TESTPOOL/$TESTFS1" typeset l2_hits_start=$(get_arcstat l2_hits) diff --git a/tests/zfs-tests/tests/functional/mmp/mmp.cfg b/tests/zfs-tests/tests/functional/mmp/mmp.cfg index 9f7e76e27018..d25e96074a51 100644 --- a/tests/zfs-tests/tests/functional/mmp/mmp.cfg +++ 
b/tests/zfs-tests/tests/functional/mmp/mmp.cfg @@ -20,6 +20,7 @@ export PREV_UBER="$TEST_BASE_DIR/mmp-uber-prev.txt" export CURR_UBER="$TEST_BASE_DIR/mmp-uber-curr.txt" export DISK=${DISKS%% *} +export TESTPOOL="testpool.mmp" export HOSTID_FILE="/etc/hostid" export HOSTID1=01234567 diff --git a/tests/zfs-tests/tests/functional/mmp/mmp_inactive_import.ksh b/tests/zfs-tests/tests/functional/mmp/mmp_inactive_import.ksh index 6e7bb637548d..ee9468ce2a0b 100755 --- a/tests/zfs-tests/tests/functional/mmp/mmp_inactive_import.ksh +++ b/tests/zfs-tests/tests/functional/mmp/mmp_inactive_import.ksh @@ -57,19 +57,19 @@ default_setup_noexit $DISK log_must zpool set multihost=off $TESTPOOL for opt in "" "-f"; do - log_must zpool export -F $TESTPOOL + log_must zpool export -f $TESTPOOL log_must import_no_activity_check $TESTPOOL $opt done # 3. Verify multihost=off and hostids differ (no activity check) -log_must zpool export -F $TESTPOOL +log_must zpool export -f $TESTPOOL log_must mmp_clear_hostid log_must mmp_set_hostid $HOSTID2 log_mustnot import_no_activity_check $TESTPOOL "" log_must import_no_activity_check $TESTPOOL "-f" # 4. Verify multihost=off and hostid zero allowed (no activity check) -log_must zpool export -F $TESTPOOL +log_must zpool export -f $TESTPOOL log_must mmp_clear_hostid log_mustnot import_no_activity_check $TESTPOOL "" log_must import_no_activity_check $TESTPOOL "-f" @@ -79,19 +79,19 @@ log_must mmp_pool_set_hostid $TESTPOOL $HOSTID1 log_must zpool set multihost=on $TESTPOOL for opt in "" "-f"; do - log_must zpool export -F $TESTPOOL + log_must zpool export -f $TESTPOOL log_must import_no_activity_check $TESTPOOL $opt done # 6. Verify multihost=on and hostids differ (activity check) -log_must zpool export -F $TESTPOOL +log_must zpool export -f $TESTPOOL log_must mmp_clear_hostid log_must mmp_set_hostid $HOSTID2 log_mustnot import_activity_check $TESTPOOL "" log_must import_activity_check $TESTPOOL "-f" # 7. 
Verify mmp_write and mmp_fail are set correctly -log_must zpool export -F $TESTPOOL +log_must zpool export -f $TESTPOOL log_must verify_mmp_write_fail_present ${DISK[0]} # 8. Verify multihost=on and hostid zero fails (no activity check) diff --git a/tests/zfs-tests/tests/functional/mmp/mmp_reset_interval.ksh b/tests/zfs-tests/tests/functional/mmp/mmp_reset_interval.ksh index 6e3d1fe34d4b..4e4dc2fcafa7 100755 --- a/tests/zfs-tests/tests/functional/mmp/mmp_reset_interval.ksh +++ b/tests/zfs-tests/tests/functional/mmp/mmp_reset_interval.ksh @@ -97,15 +97,15 @@ for x in $(seq 10); do log_must mmp_set_hostid $HOSTID1 log_must zpool import $TESTPOOL elif [ $action -eq 1 ]; then - log_must zpool export -F $TESTPOOL + log_must zpool export -f $TESTPOOL log_must zpool import $TESTPOOL elif [ $action -eq 2 ]; then - log_must zpool export -F $TESTPOOL + log_must zpool export -f $TESTPOOL log_must mmp_clear_hostid log_must mmp_set_hostid $HOSTID2 log_must zpool import -f $TESTPOOL elif [ $action -eq 3 ]; then - log_must zpool export -F $TESTPOOL + log_must zpool export -f $TESTPOOL log_must set_tunable64 MULTIHOST_INTERVAL $MMP_INTERVAL_MIN log_must zpool import $TESTPOOL elif [ $action -eq 4 ]; then diff --git a/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_lun_expsz.ksh b/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_lun_expsz.ksh index 59f64081a977..48b0a871dd1c 100755 --- a/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_lun_expsz.ksh +++ b/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_lun_expsz.ksh @@ -47,6 +47,7 @@ log_must zpool checkpoint $NESTEDPOOL log_must truncate -s $EXPSZ $FILEDISK1 log_must zpool online -e $NESTEDPOOL $FILEDISK1 +log_must zpool sync $NESTEDPOOL NEWSZ=$(zpool list -v | grep "$FILEDISK1" | awk '{print $2}') nested_change_state_after_checkpoint log_mustnot [ "$INITSZ" = "$NEWSZ" ]