From e42a63a0bac36aaf468f1ab6042f3f3b208087c5 Mon Sep 17 00:00:00 2001
From: kevans
Date: Thu, 9 May 2019 03:51:34 +0000
Subject: [PATCH] MFC r346602, r346670-r346671, r347183: tun/tap race fixes

r346602:
tun(4): Defer clearing TUN_OPEN until much later

tun destruction will not continue until TUN_OPEN is cleared. There are
brief moments in tunclose where the mutex is dropped after TUN_OPEN has
already been cleared, so tun_destroy would be able to proceed while
we're still in the middle of cleaning up the tun. tun_destroy should be
blocked until these parts (address/route purges, mostly) are complete.
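As a userland model of this protocol, POSIX threads standing in for the
kernel's mtx(9)/cv(9) primitives, and with tun_state, tun_close_model
and tun_destroy_model all invented for illustration: the closer finishes
its cleanup before clearing OPEN and waking the destroyer, so the
destroyer can never observe a half-cleaned device.

#include <pthread.h>
#include <stdio.h>

#define MODEL_OPEN 0x0008		/* stand-in for TUN_OPEN */

struct tun_state {
	pthread_mutex_t	mtx;
	pthread_cond_t	cv;
	int		flags;
};

static struct tun_state tun = {
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, MODEL_OPEN
};

/* Close path: cleanup happens first, OPEN is cleared last. */
static void
tun_close_model(struct tun_state *tp)
{
	pthread_mutex_lock(&tp->mtx);
	/* The mutex may be dropped mid-close, as in tunclose()... */
	pthread_mutex_unlock(&tp->mtx);
	printf("close: purging addresses/routes\n");
	pthread_mutex_lock(&tp->mtx);
	/* ...but OPEN stays set until the cleanup is done. */
	tp->flags &= ~MODEL_OPEN;
	pthread_cond_broadcast(&tp->cv);
	pthread_mutex_unlock(&tp->mtx);
}

/* Destroy path: blocked until the closer clears OPEN. */
static void *
tun_destroy_model(void *arg)
{
	struct tun_state *tp = arg;

	pthread_mutex_lock(&tp->mtx);
	while ((tp->flags & MODEL_OPEN) != 0)
		pthread_cond_wait(&tp->cv, &tp->mtx);
	pthread_mutex_unlock(&tp->mtx);
	printf("destroy: safe to tear down\n");
	return (NULL);
}

int
main(void)
{
	pthread_t td;

	pthread_create(&td, NULL, tun_destroy_model, &tun);
	tun_close_model(&tun);
	pthread_join(td, NULL);
	return (0);
}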
r346670:
tun/tap: close race between destroy/ioctl handler

It seems that there should be a better way to handle this, but this
seems to be the more common approach, and it should likely get replaced
in all of the places it happens...

Basically, thread 1 is in the process of destroying the tun/tap while
thread 2 is executing one of the ioctls that requires the tun/tap
mutex, and the mutex is destroyed before the ioctl handler can acquire
it. This is only one of the races described/found in PR 233955.
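The fix below serializes the ioctl path against destruction with an sx
lock and a NULLed if_softc. A rough userland analogue, where
pthread_rwlock_t stands in for sx(9) and ifnet_model, softc,
ioctl_model and destroy_model are invented names rather than anything
from the patch:

#include <errno.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct softc {
	int unit;
};

struct ifnet_model {
	struct softc *if_softc;
};

static pthread_rwlock_t ioctl_lock = PTHREAD_RWLOCK_INITIALIZER;

static int
ioctl_model(struct ifnet_model *ifp)
{
	struct softc *sc;
	int error = 0;

	pthread_rwlock_wrlock(&ioctl_lock);
	sc = ifp->if_softc;
	if (sc == NULL) {
		/* Raced with destroy; the private state is gone. */
		error = ENXIO;
		goto bad;
	}
	printf("ioctl on unit %d\n", sc->unit);
bad:
	pthread_rwlock_unlock(&ioctl_lock);
	return (error);
}

static void
destroy_model(struct ifnet_model *ifp)
{
	struct softc *sc;

	/* Detach the softc before freeing it; ioctls now see NULL. */
	pthread_rwlock_wrlock(&ioctl_lock);
	sc = ifp->if_softc;
	ifp->if_softc = NULL;
	pthread_rwlock_unlock(&ioctl_lock);
	free(sc);
}

int
main(void)
{
	struct ifnet_model ifp;

	ifp.if_softc = calloc(1, sizeof(*ifp.if_softc));
	if (ifp.if_softc == NULL)
		return (1);
	printf("ioctl before destroy: %d\n", ioctl_model(&ifp));
	destroy_model(&ifp);
	printf("ioctl after destroy: %d\n", ioctl_model(&ifp));
	return (0);
}

The point of a separate, longer-lived lock is that the per-device mutex
itself is destroyed during teardown, so it cannot be the thing that
protects the softc lookup.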
r346671:
tun(4): Don't allow open of open or dying devices

Previously, a pid check was used to restrict opens of a tun(4); this
works, but may not make the most sense, as we don't prevent the owner
process from opening the tun device multiple times.

The potential race described near tun_pid should not be an issue: if a
tun(4) is to be handed off, its fd has to have been sent via control
message or some other mechanism that duplicates the fd to the receiving
process so that it may set the pid. Otherwise, the pid gets cleared when
the original process closes it, and you have no effective handoff
mechanism.

Close up another potential issue with handing a tun(4) off by not
clobbering state if the closer isn't the controller anymore. If we want
some state to be cleared, we should do that a little more surgically.

Additionally, nothing prevents a dying tun(4) from being "reopened" in
the middle of tun_destroy as soon as the mutex is unlocked, quickly
leading to a bad time. Return EBUSY if we're marked for destruction as
well, and the consumer will need to deal with it. The associated
character device will be destroyed in short order.
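A compressed model of the resulting open/close logic; MODEL_OPEN,
MODEL_DYING, open_model and close_model are illustrative stand-ins for
the driver's TUN_* flags and cdevsw methods, not code from the patch:

#include <errno.h>
#include <stdio.h>
#include <sys/types.h>

#define MODEL_OPEN  0x0008	/* stand-in for TUN_OPEN */
#define MODEL_DYING 0x0200	/* stand-in for TUN_DYING */

struct tun_model {
	int	flags;
	pid_t	pid;
};

static int
open_model(struct tun_model *tp, pid_t pid)
{
	/* Busy if currently open or already being torn down. */
	if ((tp->flags & (MODEL_OPEN | MODEL_DYING)) != 0)
		return (EBUSY);
	tp->pid = pid;
	tp->flags |= MODEL_OPEN;
	return (0);
}

static int
close_model(struct tun_model *tp, pid_t pid)
{
	/*
	 * A non-controlling process may close its descriptor without
	 * clobbering the controller's state (the handoff case).
	 */
	if (pid != tp->pid)
		return (0);
	/* ...pending-output purge and cleanup would go here... */
	tp->flags &= ~MODEL_OPEN;
	tp->pid = 0;
	return (0);
}

int
main(void)
{
	struct tun_model tp = { 0, 0 };

	printf("open by pid 100: %d\n", open_model(&tp, 100));
	printf("reopen by pid 100: %d\n", open_model(&tp, 100)); /* EBUSY */
	printf("close by pid 200: %d\n", close_model(&tp, 200)); /* no-op */
	printf("close by pid 100: %d\n", close_model(&tp, 100));
	tp.flags |= MODEL_DYING;
	printf("open while dying: %d\n", open_model(&tp, 100)); /* EBUSY */
	return (0);
}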
r347183:
geom: fix initialization order

There's a race between the initialization of devsoftc.mtx (by devinit)
and the creation of the geom worker thread g_run_events, which calls
devctl_queue_data_f. Both of those are initialized at SI_SUB_DRIVERS
and SI_ORDER_FIRST, which means the geom worker thread can be created
before the mutex has been initialized, leading to the panic below:

panic: mtx_lock() of spin mutex (null) @ /usr/home/osstest/build.135317.build-amd64-freebsd/freebsd/sys/kern/subr_bus.c:620
cpuid = 3
time = 1
KDB: stack backtrace:
db_trace_self_wrapper() at db_trace_self_wrapper+0x2b/frame 0xfffffe003b968710
vpanic() at vpanic+0x19d/frame 0xfffffe003b968760
panic() at panic+0x43/frame 0xfffffe003b9687c0
__mtx_lock_flags() at __mtx_lock_flags+0x145/frame 0xfffffe003b968810
devctl_queue_data_f() at devctl_queue_data_f+0x6a/frame 0xfffffe003b968840
g_dev_taste() at g_dev_taste+0x463/frame 0xfffffe003b968a00
g_load_class() at g_load_class+0x1bc/frame 0xfffffe003b968a30
g_run_events() at g_run_events+0x197/frame 0xfffffe003b968a70
fork_exit() at fork_exit+0x84/frame 0xfffffe003b968ab0
fork_trampoline() at fork_trampoline+0xe/frame 0xfffffe003b968ab0
--- trap 0, rip = 0, rsp = 0, rbp = 0 ---
KDB: enter: panic
[ thread pid 13 tid 100029 ]
Stopped at      kdb_enter+0x3b: movq    $0,kdb_why

Fix this by initializing geom at SI_ORDER_SECOND instead of
SI_ORDER_FIRST; a toy model of the ordering hazard follows the diff
below.

PR: 233955
---
 sys/geom/geom.h  |  2 +-
 sys/net/if_tap.c | 19 +++++++++++++++++-
 sys/net/if_tun.c | 52 +++++++++++++++++++++++++++++++-----------------
 3 files changed, 53 insertions(+), 20 deletions(-)

diff --git a/sys/geom/geom.h b/sys/geom/geom.h
index 25282805e292..0802c53ef210 100644
--- a/sys/geom/geom.h
+++ b/sys/geom/geom.h
@@ -415,7 +415,7 @@ g_free(void *ptr)
 	static moduledata_t name##_mod = {			\
 		#name, g_modevent, &class			\
 	};							\
-	DECLARE_MODULE(name, name##_mod, SI_SUB_DRIVERS, SI_ORDER_FIRST);
+	DECLARE_MODULE(name, name##_mod, SI_SUB_DRIVERS, SI_ORDER_SECOND);
 
 int g_is_geom_thread(struct thread *td);
 
diff --git a/sys/net/if_tap.c b/sys/net/if_tap.c
index 0600db86f5bb..c9057c0a216c 100644
--- a/sys/net/if_tap.c
+++ b/sys/net/if_tap.c
@@ -41,6 +41,7 @@
 #include
 #include
 #include
+#include <sys/lock.h>
 #include
 #include
 #include
@@ -55,6 +56,7 @@
 #include
 #include
 #include
+#include <sys/sx.h>
 #include
 #include
 #include
@@ -163,6 +165,9 @@ MALLOC_DECLARE(M_TAP);
 MALLOC_DEFINE(M_TAP, CDEV_NAME, "Ethernet tunnel interface");
 SYSCTL_INT(_debug, OID_AUTO, if_tap_debug, CTLFLAG_RW, &tapdebug, 0, "");
 
+static struct sx tap_ioctl_sx;
+SX_SYSINIT(tap_ioctl_sx, &tap_ioctl_sx, "tap_ioctl");
+
 SYSCTL_DECL(_net_link);
 static SYSCTL_NODE(_net_link, OID_AUTO, tap, CTLFLAG_RW, 0,
     "Ethernet tunnel software network interface");
@@ -218,6 +223,10 @@ tap_destroy(struct tap_softc *tp)
 	struct ifnet *ifp = tp->tap_ifp;
 
 	CURVNET_SET(ifp->if_vnet);
+	sx_xlock(&tap_ioctl_sx);
+	ifp->if_softc = NULL;
+	sx_xunlock(&tap_ioctl_sx);
+
 	destroy_dev(tp->tap_dev);
 	seldrain(&tp->tap_rsel);
 	knlist_clear(&tp->tap_rsel.si_note, 0);
@@ -601,12 +610,18 @@ tapifinit(void *xtp)
 static int
 tapifioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
 {
-	struct tap_softc *tp = ifp->if_softc;
+	struct tap_softc *tp;
 	struct ifreq *ifr = (struct ifreq *)data;
 	struct ifstat *ifs = NULL;
 	struct ifmediareq *ifmr = NULL;
 	int dummy, error = 0;
 
+	sx_xlock(&tap_ioctl_sx);
+	tp = ifp->if_softc;
+	if (tp == NULL) {
+		error = ENXIO;
+		goto bad;
+	}
 	switch (cmd) {
 	case SIOCSIFFLAGS: /* XXX -- just like vmnet does */
 	case SIOCADDMULTI:
@@ -649,6 +664,8 @@ tapifioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
 		break;
 	}
 
+bad:
+	sx_xunlock(&tap_ioctl_sx);
 	return (error);
 } /* tapifioctl */
 
diff --git a/sys/net/if_tun.c b/sys/net/if_tun.c
index 8b39fb171d4a..2544ed2b5f9f 100644
--- a/sys/net/if_tun.c
+++ b/sys/net/if_tun.c
@@ -20,6 +20,7 @@
 #include "opt_inet6.h"
 
 #include
+#include <sys/lock.h>
 #include
 #include
 #include
@@ -30,6 +31,7 @@
 #include
 #include
 #include
+#include <sys/sx.h>
 #include
 #include
 #include
@@ -79,16 +81,10 @@ struct tun_softc {
 #define	TUN_RWAIT	0x0040
 #define	TUN_ASYNC	0x0080
 #define	TUN_IFHEAD	0x0100
+#define	TUN_DYING	0x0200
 
 #define	TUN_READY	(TUN_OPEN | TUN_INITED)
 
-	/*
-	 * XXXRW: tun_pid is used to exclusively lock /dev/tun.  Is this
-	 * actually needed?  Can we just return EBUSY if already open?
-	 * Problem is that this involved inherent races when a tun device
-	 * is handed off from one process to another, as opposed to just
-	 * being slightly stale informationally.
-	 */
 	pid_t	tun_pid;		/* owning pid */
 	struct	ifnet *tun_ifp;		/* the interface */
 	struct	sigio *tun_sigio;	/* information for async I/O */
@@ -115,6 +111,9 @@ static struct clonedevs *tunclones;
 static TAILQ_HEAD(,tun_softc)	tunhead = TAILQ_HEAD_INITIALIZER(tunhead);
 SYSCTL_INT(_debug, OID_AUTO, if_tun_debug, CTLFLAG_RW, &tundebug, 0, "");
 
+static struct sx tun_ioctl_sx;
+SX_SYSINIT(tun_ioctl_sx, &tun_ioctl_sx, "tun_ioctl");
+
 SYSCTL_DECL(_net_link);
 static SYSCTL_NODE(_net_link, OID_AUTO, tun, CTLFLAG_RW, 0,
     "IP tunnel software network interface.");
@@ -272,12 +271,17 @@ tun_destroy(struct tun_softc *tp)
 	struct cdev *dev;
 
 	mtx_lock(&tp->tun_mtx);
+	tp->tun_flags |= TUN_DYING;
 	if ((tp->tun_flags & TUN_OPEN) != 0)
 		cv_wait_unlock(&tp->tun_cv, &tp->tun_mtx);
 	else
 		mtx_unlock(&tp->tun_mtx);
 
 	CURVNET_SET(TUN2IFP(tp)->if_vnet);
+	sx_xlock(&tun_ioctl_sx);
+	TUN2IFP(tp)->if_softc = NULL;
+	sx_xunlock(&tun_ioctl_sx);
+
 	dev = tp->tun_dev;
 	bpfdetach(TUN2IFP(tp));
 	if_detach(TUN2IFP(tp));
@@ -464,19 +468,13 @@ tunopen(struct cdev *dev, int flag, int mode, struct thread *td)
 		tp = dev->si_drv1;
 	}
 
-	/*
-	 * XXXRW: This use of tun_pid is subject to error due to the
-	 * fact that a reference to the tunnel can live beyond the
-	 * death of the process that created it.  Can we replace this
-	 * with a simple busy flag?
-	 */
 	mtx_lock(&tp->tun_mtx);
-	if (tp->tun_pid != 0 && tp->tun_pid != td->td_proc->p_pid) {
+	if ((tp->tun_flags & (TUN_OPEN | TUN_DYING)) != 0) {
 		mtx_unlock(&tp->tun_mtx);
 		return (EBUSY);
 	}
-	tp->tun_pid = td->td_proc->p_pid;
 
+	tp->tun_pid = td->td_proc->p_pid;
 	tp->tun_flags |= TUN_OPEN;
 	ifp = TUN2IFP(tp);
 	if_link_state_change(ifp, LINK_STATE_UP);
@@ -500,8 +498,16 @@ tunclose(struct cdev *dev, int foo, int bar, struct thread *td)
 	ifp = TUN2IFP(tp);
 
 	mtx_lock(&tp->tun_mtx);
-	tp->tun_flags &= ~TUN_OPEN;
-	tp->tun_pid = 0;
+	/*
+	 * Simply close the device if this isn't the controlling process.  This
+	 * may happen if, for instance, the tunnel has been handed off to
+	 * another process.  The original controller should be able to close it
+	 * without putting us into an inconsistent state.
+	 */
+	if (td->td_proc->p_pid != tp->tun_pid) {
+		mtx_unlock(&tp->tun_mtx);
+		return (0);
+	}
 
 	/*
 	 * junk all pending output
@@ -540,6 +546,8 @@ tunclose(struct cdev *dev, int foo, int bar, struct thread *td)
 	selwakeuppri(&tp->tun_rsel, PZERO + 1);
 	KNOTE_LOCKED(&tp->tun_rsel.si_note, 0);
 	TUNDEBUG (ifp, "closed\n");
+	tp->tun_flags &= ~TUN_OPEN;
+	tp->tun_pid = 0;
 
 	cv_broadcast(&tp->tun_cv);
 	mtx_unlock(&tp->tun_mtx);
@@ -588,10 +596,16 @@ static int
 tunifioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
 {
 	struct ifreq *ifr = (struct ifreq *)data;
-	struct tun_softc *tp = ifp->if_softc;
+	struct tun_softc *tp;
 	struct ifstat *ifs;
 	int		error = 0;
 
+	sx_xlock(&tun_ioctl_sx);
+	tp = ifp->if_softc;
+	if (tp == NULL) {
+		error = ENXIO;
+		goto bad;
+	}
 	switch(cmd) {
 	case SIOCGIFSTATUS:
 		ifs = (struct ifstat *)data;
@@ -618,6 +632,8 @@ tunifioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
 	default:
 		error = EINVAL;
 	}
+bad:
+	sx_xunlock(&tun_ioctl_sx);
 	return (error);
 }
 
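To make the r347183 ordering hazard above concrete: SYSINIT-style
tables run entries grouped by (subsystem, order), and two entries that
share both values have no guaranteed order relative to each other. A
toy userland model follows; every name in it is invented for
illustration and bears no relation to the real linker-set machinery.

#include <stdio.h>

enum { ORDER_FIRST = 0, ORDER_SECOND = 1 };

struct sysinit_model {
	int	order;			/* within one subsystem */
	void	(*func)(void);
	const char *name;
};

static int devmtx_ready;

static void
devinit_model(void)
{
	devmtx_ready = 1;		/* models mtx_init(&devsoftc.mtx) */
	printf("devctl: mutex initialized\n");
}

static void
geom_model(void)
{
	/* Models g_run_events -> devctl_queue_data_f. */
	if (!devmtx_ready)
		printf("geom: PANIC, mtx_lock() of uninitialized mutex\n");
	else
		printf("geom: queued devctl event safely\n");
}

static void
run_subsystem(struct sysinit_model *set, int n)
{
	int order, i;

	/* Run in ascending order; ties have no guaranteed ordering. */
	for (order = ORDER_FIRST; order <= ORDER_SECOND; order++)
		for (i = 0; i < n; i++)
			if (set[i].order == order)
				set[i].func();
}

int
main(void)
{
	/*
	 * With both entries at ORDER_FIRST, table position alone
	 * decides who runs first; here geom happens to run too early.
	 */
	struct sysinit_model broken[] = {
		{ ORDER_FIRST, geom_model, "geom" },
		{ ORDER_FIRST, devinit_model, "devctl" },
	};
	/* Bumping geom to ORDER_SECOND guarantees devctl runs first. */
	struct sysinit_model fixed[] = {
		{ ORDER_SECOND, geom_model, "geom" },
		{ ORDER_FIRST, devinit_model, "devctl" },
	};

	devmtx_ready = 0;
	run_subsystem(broken, 2);
	devmtx_ready = 0;
	run_subsystem(fixed, 2);
	return (0);
}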