Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement MISC::get_unicodeblock() #953

Merged
merged 2 commits into from Apr 3, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
16 changes: 8 additions & 8 deletions src/article/drawareabase.cpp
Expand Up @@ -4104,10 +4104,10 @@ bool DrawAreaBase::set_carets_dclick( CARET_POSITION& caret_left, CARET_POSITION

int byte_char_pointer;
const char32_t uch_pointer = MISC::utf8toutf32( layout->text + pos, byte_char_pointer );
const int ucstype_pointer = MISC::get_ucs2mode( uch_pointer );
const MISC::UnicodeBlock block_pointer = MISC::get_unicodeblock( uch_pointer );
#ifdef _DEBUG
std::cout << "utf32 = " << std::hex << uch_pointer << std::dec
<< " type = " << ucstype_pointer << " pos = " << pos << std::endl;
<< " type = " << static_cast<int>( block_pointer ) << " pos = " << pos << std::endl;
#endif

// 区切り文字をダブルクリックした
Expand All @@ -4124,16 +4124,16 @@ bool DrawAreaBase::set_carets_dclick( CARET_POSITION& caret_left, CARET_POSITION

int byte_char;
const char32_t uch = MISC::utf8toutf32( layout->text + pos_tmp, byte_char );
const int ucstype = MISC::get_ucs2mode( uch );
const MISC::UnicodeBlock block = MISC::get_unicodeblock( uch );

int byte_char_next;
const char32_t uch_next = MISC::utf8toutf32( layout->text + pos_tmp + byte_char, byte_char_next );
const int ucstype_next = MISC::get_ucs2mode( uch_next );
const MISC::UnicodeBlock block_next = MISC::get_unicodeblock( uch_next );

// 区切り文字が来たら左位置を移動する
if( uch_next == '\0' || is_separate_char( uch )
// 文字種が変わった
|| ( ucstype != ucstype_pointer && ucstype_next == ucstype_pointer )
|| ( block != block_pointer && block_next == block_pointer )

) pos_left = pos_tmp + byte_char;

Expand All @@ -4146,11 +4146,11 @@ bool DrawAreaBase::set_carets_dclick( CARET_POSITION& caret_left, CARET_POSITION

int byte_char;
const char32_t uch = MISC::utf8toutf32( layout->text + pos_right, byte_char );
const int ucstype = MISC::get_ucs2mode( uch );
const MISC::UnicodeBlock block = MISC::get_unicodeblock( uch );

int byte_char_next;
const char32_t uch_next = MISC::utf8toutf32( layout->text + pos_right + byte_char, byte_char_next );
const int ucstype_next = MISC::get_ucs2mode( uch_next );
const MISC::UnicodeBlock block_next = MISC::get_unicodeblock( uch_next );

// 区切り文字が来たらbreak
if( is_separate_char( uch ) ) break;
Expand All @@ -4159,7 +4159,7 @@ bool DrawAreaBase::set_carets_dclick( CARET_POSITION& caret_left, CARET_POSITION

// 文字種が変わった
if( uch_next == '\0'
|| ( ucstype == ucstype_pointer && ucstype_next != ucstype_pointer )
|| ( block == block_pointer && block_next != block_pointer )
) break;
}

Expand Down
15 changes: 15 additions & 0 deletions src/jdlib/misccharcode.cpp
Expand Up @@ -353,3 +353,18 @@ char32_t MISC::utf8toutf32( const char* utf8str, int& byte )

return unich;
}


/** @brief Report which of the specially handled Unicode blocks a code point belongs to.
 *
 * @param[in] unich Unicode code point
 * @return The matching MISC::UnicodeBlock enumerator; UnicodeBlock::Other when the
 *         code point is outside Basic Latin, Hiragana and Katakana.
 */
MISC::UnicodeBlock MISC::get_unicodeblock( const char32_t unich )
{
    // Basic Latin is [U+0000, U+007F]; char32_t is unsigned, so only the upper bound matters.
    if( unich < 0x0080 ) return UnicodeBlock::BasicLatin;

    // Hiragana [U+3040, U+309F] and Katakana [U+30A0, U+30FF] are adjacent:
    // reject everything outside the combined range first, then split at U+30A0.
    if( unich < 0x3040 || unich > 0x30FF ) return UnicodeBlock::Other;

    return unich < 0x30A0 ? UnicodeBlock::Hira : UnicodeBlock::Kata;
}
13 changes: 13 additions & 0 deletions src/jdlib/misccharcode.h
Expand Up @@ -19,6 +19,16 @@ namespace MISC
CHARCODE_UTF
};

/// @brief Result type of get_unicodeblock()
enum class UnicodeBlock
{
    BasicLatin, ///< Basic Latin [U+0000, U+007F]
    Hira,       ///< Hiragana [U+3040, U+309F]
    Kata,       ///< Katakana [U+30A0, U+30FF]

    Other,      ///< Any code point outside the blocks listed above
};

bool is_euc( const char* input, size_t read_byte );
bool is_jis( const char* input, size_t& read_byte );
bool is_sjis( const char* input, size_t read_byte );
Expand All @@ -33,6 +43,9 @@ namespace MISC
// 出力 : byte 長さ(バイト) utf8str が ASCII なら 1, UTF-8 なら 2 or 3 or 4, それ以外は 0 を入れて返す
// 戻り値 : unicode code point
char32_t utf8toutf32( const char* utf8str, int& byte );

/// 特定のUnicodeブロックかコードポイントを調べる
UnicodeBlock get_unicodeblock( const char32_t unich );
}

#endif
12 changes: 0 additions & 12 deletions src/jdlib/miscutil.cpp
Expand Up @@ -1579,18 +1579,6 @@ int MISC::ucs2toutf8( const int ucs2, char* utfstr )
}


//
// Classify a UCS-2 value
//
// Returns UCS2MODE_BASIC_LATIN for [0x0000, 0x007f], UCS2MODE_HIRA for [0x3040, 0x309f],
// UCS2MODE_KATA for [0x30a0, 0x30ff] and UCS2MODE_OTHER for every other value.
//
int MISC::get_ucs2mode( const int ucs2 )
{
    // Negative values and anything past the last handled range are "other".
    if( ucs2 < 0x0000 || ucs2 > 0x30ff ) return UCS2MODE_OTHER;

    if( ucs2 <= 0x007f ) return UCS2MODE_BASIC_LATIN;

    // Gap between the Basic Latin range and the first kana range.
    if( ucs2 < 0x3040 ) return UCS2MODE_OTHER;

    // Only the two adjacent kana ranges remain; they split at 0x30a0.
    if( ucs2 <= 0x309f ) return UCS2MODE_HIRA;
    return UCS2MODE_KATA;
}

//
// WAVEDASHなどのWindows系UTF-8文字をUnix系文字と相互変換
//
Expand Down
13 changes: 0 additions & 13 deletions src/jdlib/miscutil.h
Expand Up @@ -28,16 +28,6 @@ namespace MISC
SCHEME_SSSP
};

// Return values of get_ucs2mode()
enum
{
    UCS2MODE_BASIC_LATIN = 0, // [0x0000, 0x007f]
    UCS2MODE_HIRA = 1,        // [0x3040, 0x309f]
    UCS2MODE_KATA = 2,        // [0x30a0, 0x30ff]

    UCS2MODE_OTHER = 3        // everything else
};


// utf8_fix_wavedash のモード
enum
Expand Down Expand Up @@ -231,9 +221,6 @@ namespace MISC
// str に含まれる「&#数字;」形式の数字参照文字列を全てユニーコード文字に変換する
std::string decode_spchar_number( const std::string& str );

// ucs2 の種類
int get_ucs2mode( const int ucs2 );

// ucs2 -> utf8 変換
// 出力 : utfstr 変換後の文字
// 戻り値 : バイト数
Expand Down
7 changes: 4 additions & 3 deletions src/skeleton/editview.cpp
Expand Up @@ -14,6 +14,7 @@
#include "environment.h"
#include "session.h"

#include "jdlib/misccharcode.h"
#include "jdlib/miscutil.h"
#include "config/globalconf.h"

Expand Down Expand Up @@ -733,10 +734,10 @@ static gboolean EditTextView_slot_extend_selection( GtkTextView*,
Gtk::TextIter& end_iter = Glib::wrap( end );

if( granularity == GTK_TEXT_EXTEND_SELECTION_WORD ) {
const auto mode = MISC::get_ucs2mode( loc_char );
const auto block = MISC::get_unicodeblock( loc_char );
const bool sep = is_separate_char( loc_char );
const auto find_char = [mode, sep]( char32_t c ) {
return mode != MISC::get_ucs2mode( c ) || sep != is_separate_char( c );
const auto find_char = [block, sep]( char32_t c ) {
return block != MISC::get_unicodeblock( c ) || sep != is_separate_char( c );
};
if( start_iter.backward_find_char( find_char ) ) {
start_iter.forward_char();
Expand Down
31 changes: 31 additions & 0 deletions test/gtest_jdlib_misccharcode.cpp
Expand Up @@ -164,4 +164,35 @@ TEST_F(Utf8ToUtf32Test, invalid_bytes)
EXPECT_EQ( 0, byte );
}

// Fixture grouping the MISC::get_unicodeblock() test cases; it holds no shared state.
class GetUnicodeBlockTest : public ::testing::Test
{
};

TEST_F(GetUnicodeBlockTest, basic_latin)
{
    // Both ends of the inclusive range [U+0000, U+007F] must be reported as BasicLatin.
    constexpr char32_t range_first = 0x0000;
    constexpr char32_t range_last = 0x007F;

    EXPECT_EQ( MISC::UnicodeBlock::BasicLatin, MISC::get_unicodeblock( range_first ) );
    EXPECT_EQ( MISC::UnicodeBlock::BasicLatin, MISC::get_unicodeblock( range_last ) );
}

TEST_F(GetUnicodeBlockTest, hiragana)
{
    // Both ends of the inclusive range [U+3040, U+309F] must be reported as Hira.
    constexpr char32_t range_first = 0x3040;
    constexpr char32_t range_last = 0x309F;

    EXPECT_EQ( MISC::UnicodeBlock::Hira, MISC::get_unicodeblock( range_first ) );
    EXPECT_EQ( MISC::UnicodeBlock::Hira, MISC::get_unicodeblock( range_last ) );
}

// Test name corrected from the misspelled "katanaka" to "katakana".
TEST_F(GetUnicodeBlockTest, katakana)
{
    // Both ends of the inclusive range [U+30A0, U+30FF] must be reported as Kata.
    EXPECT_EQ( MISC::UnicodeBlock::Kata, MISC::get_unicodeblock( 0x30A0 ) );
    EXPECT_EQ( MISC::UnicodeBlock::Kata, MISC::get_unicodeblock( 0x30FF ) );
}

TEST_F(GetUnicodeBlockTest, other)
{
    constexpr auto expected = MISC::UnicodeBlock::Other;

    // First value past the Basic Latin range.
    EXPECT_EQ( expected, MISC::get_unicodeblock( 0x0080 ) );

    // Values immediately outside the kana ranges: one before Hiragana, one after Katakana.
    EXPECT_EQ( expected, MISC::get_unicodeblock( 0x303F ) );
    EXPECT_EQ( expected, MISC::get_unicodeblock( 0x3100 ) );

    // Highest Unicode code point, and the first value beyond the Unicode code space.
    EXPECT_EQ( expected, MISC::get_unicodeblock( 0x10FFFF ) );
    EXPECT_EQ( expected, MISC::get_unicodeblock( 0x110000 ) );
}

} // namespace