MDEV-16962 Assertion failed in open_purge_table upon concurrent ALTER/FLUSH

FooBarrior · FooBarrior · commit 6ba5f81c7dcb · 2021-04-27T11:51:17.000+03:00
We have a race condition among three threads, resulting in a
deadlock backoff in purge, which is unexpected.

More precisely, the following happens:
T1: NOCOPY ALTER TABLE begins, and eventually it holds MDL_SHARED_NO_WRITE
 lock;
T2: FLUSH TABLES begins. It sets share->tdc->flushed = true;
T3: purge on a record with a virtual column begins. It is going to open
 the table, therefore an MDL_SHARED_READ lock is acquired.
Since share->tdc->flushed is set, it waits for the TDC purge to end.
T1: is going to elevate MDL LOCK to exclusive and therefore has to set
 other waiters to back off.
T3: receives VICTIM status, reports a DEADLOCK, sets OT_BACKOFF_AND_RETRY
 to Open_table_context::m_action

My fix is to allow opening the table in purge while flushing. This is
already done the same way in other maintenance facilities like REPAIR TABLE.

Another way would be to perform an actual backoff, but Open_table_context
does not allow distinguishing it from other failure types, which still
seem to be unexpected. Doing so would require hacking into the
Open_table_context interface for no benefit in comparison to passing
MYSQL_OPEN_IGNORE_FLUSH during table open.
diff --git a/mysql-test/suite/gcol/r/innodb_virtual_debug_purge.result b/mysql-test/suite/gcol/r/innodb_virtual_debug_purge.result
@@ -232,7 +232,6 @@ set debug_sync= "now WAIT_FOR got_no_such_table TIMEOUT 1";
 set global debug_dbug= @saved_dbug;
 drop table t1;
 set debug_sync=reset;
-SET GLOBAL innodb_purge_rseg_truncate_frequency = @saved_frequency;
 #
 # MDEV-18546 ASAN heap-use-after-free
 # in innobase_get_computed_value / row_purge
@@ -277,3 +276,32 @@ pk	b	v
 DROP TABLE t1;
 SET debug_sync= reset;
 set global debug_dbug= @old_dbug;
+# MDEV-16962 Assertion '!error || !ot_ctx.can_recover_from_failed_open()'
+# failed in open_purge_table upon concurrent ALTER and FLUSH
+CREATE TABLE t1 (
+pk SERIAL,
+c VARCHAR(128),
+d DATE,
+vd DATE AS (d) VIRTUAL,
+PRIMARY KEY(pk),
+KEY(vd,c)
+) ENGINE=InnoDB;
+INSERT IGNORE INTO t1 (pk,c) VALUES (1,'foo');
+set debug_sync="now WAIT_FOR purge";
+connect  con1,localhost,root,,test;
+SET GLOBAL innodb_debug_sync="after_open_table_mdl_shared SIGNAL purge WAIT_FOR flush";
+SET global debug_dbug="d,ib_purge_virtual_index_callback";
+REPLACE INTO t1 (pk,c) VALUES (1,'bar');
+connection default;
+SET debug_sync="alter_table_before_rename_result_table WAIT_FOR flush";
+ALTER TABLE t1 ADD FULLTEXT KEY(c), ALGORITHM=COPY;
+connection con1;
+SET debug_sync="after_flush_unlock SIGNAL flush ";
+FLUSH TABLES;
+disconnect con1;
+connection default;
+InnoDB		0 transactions not purged
+DROP TABLE t1;
+SET debug_sync= reset;
+SET global debug_dbug=@old_dbug;
+SET GLOBAL innodb_purge_rseg_truncate_frequency = @saved_frequency;
diff --git a/mysql-test/suite/gcol/t/innodb_virtual_debug_purge.test b/mysql-test/suite/gcol/t/innodb_virtual_debug_purge.test
@@ -322,7 +322,6 @@ drop table t1;
 
 --source include/wait_until_count_sessions.inc
 set debug_sync=reset;
-SET GLOBAL innodb_purge_rseg_truncate_frequency = @saved_frequency;
 
 --echo #
 --echo # MDEV-18546 ASAN heap-use-after-free
@@ -386,3 +385,52 @@ SELECT * FROM t1;
 DROP TABLE t1;
 SET debug_sync= reset;
 set global debug_dbug= @old_dbug;
+
+
+--echo # MDEV-16962 Assertion '!error || !ot_ctx.can_recover_from_failed_open()'
+--echo # failed in open_purge_table upon concurrent ALTER and FLUSH
+
+CREATE TABLE t1 (
+  pk SERIAL,
+  c VARCHAR(128),
+  d DATE,
+  vd DATE AS (d) VIRTUAL,
+  PRIMARY KEY(pk),
+  KEY(vd,c)
+) ENGINE=InnoDB;
+INSERT IGNORE INTO t1 (pk,c) VALUES (1,'foo');
+
+--send
+set debug_sync="now WAIT_FOR purge";
+--connect (con1,localhost,root,,test)
+# Will break innodb purge thread inside open_purge_table after mdl
+# acquired, but before tdc->flushed check
+SET GLOBAL innodb_debug_sync="after_open_table_mdl_shared SIGNAL purge WAIT_FOR flush";
+
+# Workaround to pass trx_undo_roll_ptr_is_insert() in 10.2
+SET global debug_dbug="d,ib_purge_virtual_index_callback";
+
+REPLACE INTO t1 (pk,c) VALUES (1,'bar');
+
+--connection default
+# wait for MDL acquired by purge
+--reap
+# MDL_SHARED will be acquired, but will hang before MDL upgrade started.
+SET debug_sync="alter_table_before_rename_result_table WAIT_FOR flush";
+--send
+ALTER TABLE t1 ADD FULLTEXT KEY(c), ALGORITHM=COPY;
+--connection con1
+# Will hang after tdc->flushed is set, but before emptying tdc cache.
+SET debug_sync="after_flush_unlock SIGNAL flush ";
+FLUSH TABLES;
+
+# Cleanup
+--disconnect con1
+--connection default
+--reap
+--source ../../innodb/include/wait_all_purged.inc
+DROP TABLE t1;
+SET debug_sync= reset;
+SET global debug_dbug=@old_dbug;
+
+SET GLOBAL innodb_purge_rseg_truncate_frequency = @saved_frequency;
diff --git a/sql/sql_class.cc b/sql/sql_class.cc
@@ -4424,7 +4424,7 @@ TABLE *open_purge_table(THD *thd, const char *db, size_t dblen,
   DBUG_ASSERT(thd->open_tables == NULL);
   DBUG_ASSERT(thd->locked_tables_mode < LTM_PRELOCKED);
 
-  Open_table_context ot_ctx(thd, 0);
+  Open_table_context ot_ctx(thd, MYSQL_OPEN_IGNORE_FLUSH);
   TABLE_LIST *tl= (TABLE_LIST*)thd->alloc(sizeof(TABLE_LIST));
 
   tl->init_one_table(db, dblen, tb, tblen, tb, TL_READ);