@@ -119,6 +119,70 @@ class READ_INFO {
119
119
*to= chr;
120
120
return false ;
121
121
}
122
+
123
+ /* *
124
+ Read a tail of a multi-byte character.
125
+ The first byte of the character is assumed to be already
126
+ read from the file and appended to "str".
127
+
128
+ @returns true - if EOF happened unexpectedly
129
+ @returns false - no EOF happened: found a good multi-byte character,
130
+ or a bad byte sequence
131
+
132
+ Note:
133
+ The return value depends only on EOF:
134
+ - read_mbtail() returns "false" if a good character was read, but also
135
+ - read_mbtail() returns "false" if an incomplete byte sequence was found
136
+ and no EOF happened.
137
+
138
+ For example, suppose we have an ujis file with bytes 0x8FA10A, where:
139
+ - 0x8FA1 is an incomplete prefix of a 3-byte character
140
+ (it should be [8F][A1-FE][A1-FE] to make a full 3-byte character)
141
+ - 0x0A is a line delimiter
142
+ This file has some broken data, the trailing [A1-FE] is missing.
143
+
144
+ In this example it works as follows:
145
+ - 0x8F is read from the file and put into "str" before the call
146
+ for read_mbtail()
147
+ - 0xA1 is read from the file and put into "str" by read_mbtail()
148
+ - 0x0A is kept in the read queue, so the next read iteration after
149
+ the current read_mbtail() call will normally find it and recognize as
150
+ a line delimiter
151
+ - the current call for read_mbtail() returns "false",
152
+ because no EOF happened
153
+ */
154
+ bool read_mbtail (String *str) // str: buffer whose last byte is the character's first byte
155
+ {
156
+ int chlen; // latest my_charlen() result for the bytes examined so far
157
+ if ((chlen= my_charlen (read_charset, str->end () - 1 , str->end ())) == 1 ) // measure only the last byte of str
158
+ return false ; // Single byte character found
159
+ for (uint32 length0= str->length () - 1 ; MY_CS_IS_TOOSMALL (chlen); ) // length0: offset in str where this character starts; loop while the sequence is reported as too short
160
+ {
161
+ int chr= GET; // NOTE(review): GET presumably fetches the next input byte - confirm against the macro definition
162
+ if (chr == my_b_EOF)
163
+ {
164
+ DBUG_PRINT (" info" , (" read_mbtail: chlen=%d; unexpected EOF" , chlen));
165
+ return true ; // EOF
166
+ }
167
+ str->append (chr); // tentatively add the byte to the character being assembled
168
+ chlen= my_charlen (read_charset, str->ptr () + length0, str->end ()); // re-measure from the character start to the current end of str
169
+ if (chlen == MY_CS_ILSEQ)
170
+ {
171
+ /* *
172
+ It has been an incomplete (but a valid) sequence so far,
173
+ but the last byte turned it into a bad byte sequence.
174
+ Unget the very last byte.
175
+ */
176
+ str->length (str->length () - 1 ); // drop the byte appended just above
177
+ PUSH (chr); // NOTE(review): PUSH presumably returns the byte to the read queue for the next GET - confirm
178
+ DBUG_PRINT (" info" , (" read_mbtail: ILSEQ" ));
179
+ return false ; // Bad byte sequence
180
+ }
181
+ }
182
+ DBUG_PRINT (" info" , (" read_mbtail: chlen=%d" , chlen));
183
+ return false ; // Good multi-byte character
184
+ }
185
+
122
186
public:
123
187
bool error,line_cuted,found_null,enclosed;
124
188
uchar *row_start, /* Found row starts here */
@@ -1590,33 +1654,8 @@ int READ_INFO::read_field()
1590
1654
}
1591
1655
}
1592
1656
data.append (chr);
1593
- if (use_mb (read_charset))
1594
- {
1595
- int chlen;
1596
- if ((chlen= my_charlen (read_charset, data.end () - 1 ,
1597
- data.end ())) != 1 )
1598
- {
1599
- for (uint32 length0= data.length () - 1 ; MY_CS_IS_TOOSMALL (chlen); )
1600
- {
1601
- chr= GET;
1602
- if (chr == my_b_EOF)
1603
- goto found_eof;
1604
- data.append (chr);
1605
- chlen= my_charlen (read_charset, data.ptr () + length0, data.end ());
1606
- if (chlen == MY_CS_ILSEQ)
1607
- {
1608
- /* *
1609
- It has been an incomplete (but a valid) sequence so far,
1610
- but the last byte turned it into a bad byte sequence.
1611
- Unget the very last byte.
1612
- */
1613
- data.length (data.length () - 1 );
1614
- PUSH (chr);
1615
- break ;
1616
- }
1617
- }
1618
- }
1619
- }
1657
+ if (use_mb (read_charset) && read_mbtail (&data))
1658
+ goto found_eof;
1620
1659
}
1621
1660
/*
1622
1661
** We come here if buffer is too small. Enlarge it and continue
@@ -1872,26 +1911,8 @@ int READ_INFO::read_value(int delim, String *val)
1872
1911
int chr;
1873
1912
String tmp;
1874
1913
1875
- for (chr= GET; my_tospace (chr) != delim && chr != my_b_EOF;)
1914
+ for (chr= GET; my_tospace (chr) != delim && chr != my_b_EOF; chr= GET )
1876
1915
{
1877
- #ifdef USE_MB
1878
- if (my_mbcharlen (read_charset, chr) > 1 )
1879
- {
1880
- DBUG_PRINT (" read_xml" ,(" multi byte" ));
1881
- int i, ml= my_mbcharlen (read_charset, chr);
1882
- for (i= 1 ; i < ml; i++)
1883
- {
1884
- val->append (chr);
1885
- /*
1886
- Don't use my_tospace() in the middle of a multi-byte character
1887
- TODO: check that the multi-byte sequence is valid.
1888
- */
1889
- chr= GET;
1890
- if (chr == my_b_EOF)
1891
- return chr;
1892
- }
1893
- }
1894
- #endif
1895
1916
if (chr == ' &' )
1896
1917
{
1897
1918
tmp.length (0 );
@@ -1911,8 +1932,11 @@ int READ_INFO::read_value(int delim, String *val)
1911
1932
}
1912
1933
}
1913
1934
else
1935
+ {
1914
1936
val->append (chr);
1915
- chr= GET;
1937
+ if (use_mb (read_charset) && read_mbtail (val))
1938
+ return my_b_EOF;
1939
+ }
1916
1940
}
1917
1941
return my_tospace (chr);
1918
1942
}
0 commit comments