Implement MISC::utf8bytes()

UTF-8で符号化された文字のバイト数を返す関数を追加する。また、バイト数を取得するコードを実装した関数に更新する。
JDimproved · Mar 5, 2022 · 26aed31 · 26aed31
1 parent 2ac67c0
commit 26aed31
Show file tree

Hide file tree

Showing 4 changed files with 79 additions and 32 deletions.
diff --git a/src/article/drawareabase.cpp b/src/article/drawareabase.cpp
@@ -16,6 +16,7 @@
 #include "embeddedimage.h"
 
 #include "jdlib/jdregex.h"
+#include "jdlib/misccharcode.h"
 #include "jdlib/miscgtk.h"
 #include "jdlib/miscmsg.h"
 #include "jdlib/miscutil.h"
@@ -1515,8 +1516,7 @@ bool DrawAreaBase::set_init_wide_mode( const char* str, const int pos_start, con
     int i = pos_start;
     while( i < pos_to ){
 
-        int byte_tmp;
-        MISC::utf8toucs2( str + i, byte_tmp );
+        const int byte_tmp = MISC::utf8bytes( str + i );
 
         // 文字列に全角が含まれていたら全角モードで開始
         if( byte_tmp != 1 ) break;

diff --git a/src/jdlib/misccharcode.cpp b/src/jdlib/misccharcode.cpp
@@ -167,27 +167,29 @@ bool MISC::is_sjis( const char* input, size_t read_byte )
 }
 
 
+namespace {
 /*---- UTF ---------------------------------------------------------*/
 //
-// 0xC0・0xC1はセキュリティ上の問題で使用が禁止されている
-//
-// [ 1バイト目の範囲 ] 0xC2〜0xFD [ RFC2279(破棄) ]
-// [ 1バイト目の範囲 ] 0xC2〜0xF4 [ RFC6329 ]
-#define UTF_RANGE_1( target ) ( (unsigned char)( target - 0xC2 ) < 0x33 )
-//
-// [ 1バイト目 (2バイト文字) ] 先頭2ビットが1
-#define UTF_FLAG_2( target ) ( ( target & 0xC0 ) == 0xC0 )
-//
-// [ 1バイト目 (3バイト文字) ] 先頭3ビットが1
-#define UTF_FLAG_3( target ) ( ( target & 0xE0 ) == 0xE0 )
-//
-// [ 1バイト目 (4バイト文字) ] 先頭4ビットが1
-#define UTF_FLAG_4( target ) ( ( target & 0xF0 ) == 0xF0 )
-//
-// [ 2バイト目以降 ] 0x80〜0xBF
-#define UTF_RANGE_MULTI_BYTE( target ) ( (unsigned char)( target - 0x80 ) < 0x40 )
-//
-bool MISC::is_utf( const char* input, size_t read_byte )
+/// @brief U+0080〜U+10FFFF (2〜4バイト) の1バイト目か調べる。
+///
+/// 0xC0, 0xC1, 0xF5〜0xFF の使用は禁止されている。(RFC3629)
+inline bool utf8_head_multi_byte( std::uint8_t x ) { return static_cast<std::uint8_t>( x - 0xC2 ) <= ( 0xF4 - 0xC2 ); }
+
+/// U+0080〜U+07FF (2バイト) の1バイト目か調べる。先頭ビットは 110, 範囲は 0xC2〜0xDF
+inline bool utf8_head_bytes2( std::uint8_t x ) { return static_cast<std::uint8_t>( x - 0xC2 ) <= ( 0xDF - 0xC2 ); }
+
+/// U+0800〜U+FFFF (3バイト) の1バイト目か調べる。先頭ビットは 1110, 範囲は 0xE0〜0xEF
+inline bool utf8_head_bytes3( std::uint8_t x ) { return ( x & 0xF0 ) == 0xE0; }
+
+/// U+10000〜U+10FFFF (4バイト) の1バイト目か調べる。先頭ビットは 11110, 範囲は 0xF0〜0xF4
+inline bool utf8_head_bytes4( std::uint8_t x ) { return static_cast<std::uint8_t>( x - 0xF0 ) <= ( 0xF4 - 0xF0 ); }
+
+/// U+0080〜 (2〜4バイト) の2バイト目以降か簡易的に調べる。先頭ビットは 10, 範囲は 0x80～0xBF
+inline bool utf8_following_byte( std::uint8_t x ) { return ( x & 0xC0 ) == 0x80; }
+
+} // namespace
+
+bool MISC::is_utf8( const char* input, size_t read_byte )
 {
     if( ! input ) return false;
 
@@ -205,24 +207,24 @@ bool MISC::is_utf( const char* input, size_t read_byte )
             continue;
         }
         // UTF-8の1バイト目の範囲ではない
-        else if( ! UTF_RANGE_1( input[ byte ] ) )
+        else if( ! utf8_head_multi_byte( input[ byte ] ) )
         {
             return false;
         }
 
         int byte_count = 1;
 
         // 4,3,2バイト
-        if( UTF_FLAG_4( input[ byte ] ) ) byte_count = 4;
-        else if( UTF_FLAG_3( input[ byte ] ) ) byte_count = 3;
-        else if( UTF_FLAG_2( input[ byte ] ) ) byte_count = 2;
+        if( utf8_head_bytes4( input[ byte ] ) ) byte_count = 4;
+        else if( utf8_head_bytes3( input[ byte ] ) ) byte_count = 3;
+        else if( utf8_head_bytes2( input[ byte ] ) ) byte_count = 2;
 
         ++byte;
 
         // 2バイト目以降
         while( byte_count > 1 )
         {
-            if( UTF_RANGE_MULTI_BYTE( input[ byte ] ) )
+            if( utf8_following_byte( input[ byte ] ) )
             {
                 ++byte;
             }
@@ -259,7 +261,7 @@ int MISC::judge_char_code( const std::string& str )
     else if( read_byte == str.length() ) code = CHARCODE_ASCII;
     // is_jis()でASCII範囲外のバイトが現れた箇所から判定する
     // UTF-8の範囲
-    else if( is_utf( str.c_str(), read_byte ) ) code = CHARCODE_UTF;
+    else if( is_utf8( str.c_str(), read_byte ) ) code = CHARCODE_UTF;
     // EUC-JPの範囲
     else if( is_euc( str.c_str(), read_byte ) ) code = CHARCODE_EUC_JP;
     // Shift_JISの範囲
@@ -268,3 +270,44 @@ int MISC::judge_char_code( const std::string& str )
     return code;
 }
 
+
+/** @brief utf-8文字のbyte数を返す
+ *
+ * @param [in] utf8str  入力文字 (UTF-8)
+ * @return 文字の長さ(バイト)が ASCII なら 1, UTF-8 なら 2 or 3 or 4, NULまたは不正な文字なら 0 を返す。
+ */
+int MISC::utf8bytes( const char* utf8str )
+{
+    int byte = 0;
+
+    if( utf8str && *utf8str != '\0' ){
+        const auto ch = static_cast<unsigned char>( *utf8str );
+        if( ch <= 0x7F ) byte = 1;
+        else if( utf8_head_bytes3( ch ) ) byte = 3;
+        else if( utf8_head_bytes4( ch ) ) byte = 4;
+        else if( utf8_head_bytes2( ch ) ) byte = 2;
+#ifdef _DEBUG
+        else { // 不正なUTF8
+            std::cout << "MISC::utf8bytes : invalid 1st byte: char = "
+                      << static_cast<unsigned int>( ch ) << std::endl;
+        }
+#endif
+    }
+
+    for( int i = 1; i < byte; ++i ){
+        if( ! utf8_following_byte( utf8str[ i ] ) ){
+#ifdef _DEBUG
+            // 不正なUTF8
+            std::cout << "MISC::utf8bytes : invalid code: char = " << static_cast<unsigned int>( utf8str[0] );
+            std::cout << ", " << static_cast<unsigned int>( utf8str[1] );
+            if( byte > 2 ) std::cout << ", " << static_cast<unsigned int>( utf8str[2] );
+            if( byte > 3 ) std::cout << ", " << static_cast<unsigned int>( utf8str[3] );
+            std::cout << std::endl;
+#endif
+            byte = 0;
+            break;
+        }
+    }
+
+    return byte;
+}
diff --git a/src/jdlib/misccharcode.h b/src/jdlib/misccharcode.h
@@ -22,8 +22,11 @@ namespace MISC
     bool is_euc( const char* input, size_t read_byte );
     bool is_jis( const char* input, size_t& read_byte );
     bool is_sjis( const char* input, size_t read_byte );
-    bool is_utf( const char* input, size_t read_byte );
+    bool is_utf8( const char* input, size_t read_byte );
     int judge_char_code( const std::string& str );
+
+    /// utf-8文字のbyte数を返す
+    int utf8bytes( const char* utf8str );
 }
 
 #endif
diff --git a/src/jdlib/miscutil.cpp b/src/jdlib/miscutil.cpp
@@ -3,11 +3,12 @@
 //#define _DEBUG
 #include "jddebug.h"
 
-#include "miscutil.h"
-#include "miscmsg.h"
+#include "hkana.h"
 #include "jdiconv.h"
 #include "jdregex.h"
-#include "hkana.h"
+#include "misccharcode.h"
+#include "miscmsg.h"
+#include "miscutil.h"
 
 #include "dbtree/spchar_decoder.h"
 #include "dbtree/node.h"
@@ -677,7 +678,7 @@ std::string MISC::cut_str( const std::string& str, const unsigned int maxsize )
     const size_t outstr_length = outstr.length();
 
     for( pos = 0, lng_str = 0; pos < outstr_length; pos += byte ){
-        MISC::utf8toucs2( outstr.c_str()+pos, byte );
+        byte = MISC::utf8bytes( outstr.c_str() + pos );
         if( byte > 1 ) lng_str += 2;
         else ++lng_str;
         if( lng_str >= maxsize ) break;