Skip to content

Commit d781738

Browse files
committed
MDEV-27900: aio handle partial reads/writes
As btrfs showed, a partial read of data in AIO /O_DIRECT circumstances can really confuse MariaDB. Filipe Manana (SuSE)[1] showed how database programmers can assume O_DIRECT is all or nothing. While a fix was done in the kernel side, we can do better in our code by requesting that the rest of the block be read/written synchronously if we do only get a partial read/write. Per the APIs, a partial read/write can occur before an error, so reattempting the request will leave the caller with a concrete error to handle. [1] https://lore.kernel.org/linux-btrfs/CABVffENfbsC6HjGbskRZGR2NvxbnQi17gAuW65eOM+QRzsr8Bg@mail.gmail.com/T/#mb2738e675e48e0e0778a2e8d1537dec5ec0d3d3a Also spell synchronously correctly in other files.
1 parent 4cfb6ed commit d781738

File tree

6 files changed

+62
-29
lines changed

6 files changed

+62
-29
lines changed

storage/innobase/buf/buf0rea.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -297,7 +297,7 @@ buf_read_page_low(
297297
/* Trx sys header is so low in the latching order that we play
298298
safe and do not leave the i/o-completion to an asynchronous
299299
i/o-thread. Change buffer pages must always be read with
300-
syncronous i/o, to make sure they do not get involved in
300+
synchronous i/o, to make sure they do not get involved in
301301
thread deadlocks. */
302302
sync = true;
303303
}

storage/innobase/os/os0file.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2807,7 +2807,7 @@ os_file_set_eof(
28072807

28082808
#endif /* !_WIN32*/
28092809

2810-
/** Does a syncronous read or write depending upon the type specified
2810+
/** Does a synchronous read or write depending upon the type specified
28112811
In case of partial reads/writes the function tries
28122812
NUM_RETRIES_ON_PARTIAL_IO times to read/write the complete data.
28132813
@param[in] type, IO flags

tpool/aio_linux.cc

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,8 @@ class aio_linux final : public aio
131131
{
132132
iocb->m_ret_len= event.res;
133133
iocb->m_err= 0;
134+
if (iocb->m_ret_len != iocb->m_len)
135+
finish_synchronous(iocb);
134136
}
135137
iocb->m_internal_task.m_func= iocb->m_callback;
136138
iocb->m_internal_task.m_arg= iocb;

tpool/aio_simulated.cc

Lines changed: 1 addition & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -136,32 +136,7 @@ class simulated_aio : public aio
136136
static void simulated_aio_callback(void *param)
137137
{
138138
aiocb *cb= (aiocb *) param;
139-
#ifdef _WIN32
140-
size_t ret_len;
141-
#else
142-
ssize_t ret_len;
143-
#endif
144-
int err= 0;
145-
switch (cb->m_opcode)
146-
{
147-
case aio_opcode::AIO_PREAD:
148-
ret_len= pread(cb->m_fh, cb->m_buffer, cb->m_len, cb->m_offset);
149-
break;
150-
case aio_opcode::AIO_PWRITE:
151-
ret_len= pwrite(cb->m_fh, cb->m_buffer, cb->m_len, cb->m_offset);
152-
break;
153-
default:
154-
abort();
155-
}
156-
#ifdef _WIN32
157-
if (static_cast<int>(ret_len) < 0)
158-
err= GetLastError();
159-
#else
160-
if (ret_len < 0)
161-
err= errno;
162-
#endif
163-
cb->m_ret_len = ret_len;
164-
cb->m_err = err;
139+
synchronous(cb);
165140
cb->m_internal_task.m_func= cb->m_callback;
166141
thread_pool *pool= (thread_pool *)cb->m_internal;
167142
pool->submit_task(&cb->m_internal_task);

tpool/tpool.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -156,7 +156,7 @@ class aio
156156
{
157157
public:
158158
/**
159-
Submit asyncronous IO.
159+
Submit asynchronous IO.
160160
On completion, cb->m_callback is executed.
161161
*/
162162
virtual int submit_io(aiocb *cb)= 0;
@@ -165,6 +165,10 @@ class aio
165165
/** "Unind" file to AIO handler (used on Windows only) */
166166
virtual int unbind(const native_file_handle &fd)= 0;
167167
virtual ~aio(){};
168+
protected:
169+
static void synchronous(aiocb *cb);
170+
/** finish a partial read/write callback synchronously */
171+
static void finish_synchronous(aiocb *cb);
168172
};
169173

170174
class timer

tpool/tpool_generic.cc

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,58 @@ namespace tpool
4747
static const std::chrono::milliseconds LONG_TASK_DURATION = std::chrono::milliseconds(500);
4848
static const int OVERSUBSCRIBE_FACTOR = 2;
4949

50+
/**
51+
Process the cb synchronously
52+
*/
53+
void aio::synchronous(aiocb *cb)
54+
{
55+
#ifdef _WIN32
56+
size_t ret_len;
57+
#else
58+
ssize_t ret_len;
59+
#endif
60+
int err= 0;
61+
switch (cb->m_opcode)
62+
{
63+
case aio_opcode::AIO_PREAD:
64+
ret_len= pread(cb->m_fh, cb->m_buffer, cb->m_len, cb->m_offset);
65+
break;
66+
case aio_opcode::AIO_PWRITE:
67+
ret_len= pwrite(cb->m_fh, cb->m_buffer, cb->m_len, cb->m_offset);
68+
break;
69+
default:
70+
abort();
71+
}
72+
#ifdef _WIN32
73+
if (static_cast<int>(ret_len) < 0)
74+
err= GetLastError();
75+
#else
76+
if (ret_len < 0)
77+
{
78+
err= errno;
79+
ret_len= 0;
80+
}
81+
#endif
82+
cb->m_ret_len = ret_len;
83+
cb->m_err = err;
84+
if (!err && cb->m_ret_len != cb->m_len)
85+
finish_synchronous(cb);
86+
}
87+
88+
89+
/**
90+
A partial read/write has occured, continue synchronously.
91+
*/
92+
void aio::finish_synchronous(aiocb *cb)
93+
{
94+
assert(cb->m_ret_len != (unsigned int) cb->m_len && !cb->m_err);
95+
/* partial read/write */
96+
cb->m_buffer= (char *) cb->m_buffer + cb->m_ret_len;
97+
cb->m_len-= (unsigned int) cb->m_ret_len;
98+
cb->m_offset+= cb->m_ret_len;
99+
synchronous(cb);
100+
}
101+
50102
/**
51103
Implementation of generic threadpool.
52104
This threadpool consists of the following components

0 commit comments

Comments
 (0)