Skip to content

Commit

Permalink
Merge pull request unicode-rs#45 from Jules-Bertholet/control
Browse files Browse the repository at this point in the history
Assign width 1 to control characters
  • Loading branch information
Manishearth committed May 9, 2024
2 parents 86970a1 + 4efb180 commit 3063422
Show file tree
Hide file tree
Showing 5 changed files with 547 additions and 565 deletions.
8 changes: 4 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ use unicode_width::UnicodeWidthStr;

fn main() {
let teststr = "Hello, world!";
let width = UnicodeWidthStr::width(teststr);
let width = teststr.width();
println!("{}", teststr);
println!("The above string is {} columns wide.", width);
let width = teststr.width_cjk();
Expand All @@ -34,9 +34,9 @@ extern crate unicode_width;
use unicode_width::UnicodeWidthStr;

fn main() {
assert_eq!(UnicodeWidthStr::width("👩"), 2); // Woman
assert_eq!(UnicodeWidthStr::width("🔬"), 2); // Microscope
assert_eq!(UnicodeWidthStr::width("👩‍🔬"), 4); // Woman scientist
assert_eq!("👩".width(), 2); // Woman
assert_eq!("🔬".width(), 2); // Microscope
assert_eq!("👩‍🔬".width(), 4); // Woman scientist
}
```

Expand Down
41 changes: 2 additions & 39 deletions scripts/unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,21 +165,14 @@ def load_zero_widths() -> "list[bool]":
"""Returns a list `l` where `l[c]` is true if codepoint `c` is considered a zero-width
character. `c` is considered a zero-width character if
- it is a control character,
- or if it has the `Default_Ignorable_Code_Point` property (determined from `DerivedCoreProperties.txt`),
- it has the `Default_Ignorable_Code_Point` property (determined from `DerivedCoreProperties.txt`),
- or if it has the `Grapheme_Extend` property (determined from `DerivedCoreProperties.txt`),
- or if it is one of eight characters that should be `Grapheme_Extend` but aren't due to a Unicode spec bug,
- or if it has a `Hangul_Syllable_Type` of `Vowel_Jamo` or `Trailing_Jamo` (determined from `HangulSyllableType.txt`).
"""

zw_map = [False] * NUM_CODEPOINTS

# Control characters have width 0
for c in range(0x00, 0x20):
zw_map[c] = True
for c in range(0x7F, 0xA0):
zw_map[c] = True

# `Default_Ignorable_Code_Point`s also have 0 width:
# https://www.unicode.org/faq/unsup_char.html#3
# https://www.unicode.org/versions/Unicode15.1.0/ch05.pdf#G40095
Expand Down Expand Up @@ -563,7 +556,7 @@ def emit_module(
/// However, if you change the *actual structure* of the lookup tables (perhaps by editing the
/// `TABLE_CFGS` global in `unicode.py`) you must ensure that this code reflects those changes.
#[inline]
fn lookup_width(c: char, is_cjk: bool) -> usize {
pub fn lookup_width(c: char, is_cjk: bool) -> usize {
let cp = c as usize;
let t1_offset = TABLES_0[cp >> 13 & 0xFF];
Expand Down Expand Up @@ -664,36 +657,6 @@ def emit_module(
"""
)

module.write(
"""
/// Returns the [UAX #11](https://www.unicode.org/reports/tr11/) based width of `c`, or
/// `None` if `c` is a control character other than `'\\x00'`.
/// If `is_cjk == true`, ambiguous width characters are treated as double width; otherwise,
/// they're treated as single width.
#[inline]
pub fn width(c: char, is_cjk: bool) -> Option<usize> {
if c < '\\u{7F}' {
if c >= '\\u{20}' {
// U+0020 to U+007F (exclusive) are single-width ASCII codepoints
Some(1)
} else if c == '\\0' {
// U+0000 *is* a control code, but it's special-cased
Some(0)
} else {
// U+0001 to U+0020 (exclusive) are control codes
None
}
} else if c >= '\\u{A0}' {
// No characters >= U+00A0 are control codes, so we can consult the lookup tables
Some(lookup_width(c, is_cjk))
} else {
// U+007F to U+00A0 (exclusive) are control codes
None
}
}
"""
)

subtable_count = 1
for i, table in enumerate(tables):
new_subtable_count = len(table.buckets())
Expand Down
105 changes: 68 additions & 37 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,12 +34,13 @@
//!
//! 1. [Emoji presentation sequences] have width 2.
//! (The width of a string may therefore differ from the sum of the widths of its characters.)
//! 2. Outside of an East Asian context, [text presentation sequences] have width 1
//! iff their base character fulfills all the following requirements:
//! 2. Outside of an East Asian context, [text presentation sequences] fulfilling all the following requirements
//! have width 1:
//! - Has the [`Emoji_Presentation`] property, and
//! - Not in the [Enclosed Ideographic Supplement] block.
//! 3. [`'\u{115F}'` HANGUL CHOSEONG FILLER](https://util.unicode.org/UnicodeJsps/character.jsp?a=115F) has width 2.
//! 4. The following have width 0:
//! 3. The sequence `"\r\n"` has width 1.
//! 4. [`'\u{115F}'` HANGUL CHOSEONG FILLER](https://util.unicode.org/UnicodeJsps/character.jsp?a=115F) has width 2.
//! 5. The following have width 0:
//! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BDefault_Ignorable_Code_Point%7D)
//! with the [`Default_Ignorable_Code_Point`](https://www.unicode.org/versions/Unicode15.0.0/ch05.pdf#G40095) property.
//! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BGrapheme_Extend%7D)
Expand All @@ -55,9 +56,6 @@
//! - [`'\u{1B43}'` BALINESE VOWEL SIGN PEPET TEDUNG](https://util.unicode.org/UnicodeJsps/character.jsp?a=1B43).
//! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BHangul_Syllable_Type%3DV%7D%5Cp%7BHangul_Syllable_Type%3DT%7D)
//! with a [`Hangul_Syllable_Type`] of `Vowel_Jamo` (`V`) or `Trailing_Jamo` (`T`).
//! - [`'\0'` NUL](https://util.unicode.org/UnicodeJsps/character.jsp?a=0000).
//! 5. The [control characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BCc%7D)
//! have no defined width, and are ignored when determining the width of a string.
//! 6. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DF%7D%5Cp%7BEast_Asian_Width%3DW%7D)
//! with an [`East_Asian_Width`] of [`Fullwidth`] or [`Wide`] have width 2.
//! 7. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DA%7D)
Expand Down Expand Up @@ -99,7 +97,7 @@ mod tables;
/// Methods for determining displayed width of Unicode characters.
pub trait UnicodeWidthChar {
/// Returns the character's displayed width in columns, or `None` if the
/// character is a control character other than `'\x00'`.
/// character is a control character.
///
/// This function treats characters in the Ambiguous category according
/// to [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
Expand All @@ -108,7 +106,7 @@ pub trait UnicodeWidthChar {
fn width(self) -> Option<usize>;

/// Returns the character's displayed width in columns, or `None` if the
/// character is a control character other than `'\x00'`.
/// character is a control character.
///
/// This function treats characters in the Ambiguous category according
/// to [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
Expand All @@ -120,23 +118,42 @@ pub trait UnicodeWidthChar {
impl UnicodeWidthChar for char {
#[inline]
fn width(self) -> Option<usize> {
cw::width(self, false)
single_char_width(self, false)
}

#[inline]
fn width_cjk(self) -> Option<usize> {
cw::width(self, true)
single_char_width(self, true)
}
}

/// Returns the [UAX #11](https://www.unicode.org/reports/tr11/) based width of `c`, or
/// `None` if `c` is a control character.
/// If `is_cjk == true`, ambiguous width characters are treated as double width; otherwise,
/// they're treated as single width.
///
/// Code points below U+00A0 are classified directly by range; only code points
/// at or above U+00A0 are looked up in the generated tables.
#[inline]
fn single_char_width(c: char, is_cjk: bool) -> Option<usize> {
if c < '\u{7F}' {
if c >= '\u{20}' {
// U+0020 to U+007F (exclusive) are single-width ASCII codepoints
Some(1)
} else {
// U+0000 to U+0020 (exclusive) are control codes; NUL is not special-cased
None
}
} else if c >= '\u{A0}' {
// No characters >= U+00A0 are control codes, so we can consult the lookup tables
Some(cw::lookup_width(c, is_cjk))
} else {
// U+007F to U+00A0 (exclusive) are control codes
None
}
}

/// Methods for determining displayed width of Unicode strings.
pub trait UnicodeWidthStr {
/// Returns the string's displayed width in columns.
///
/// Control characters are treated as having zero width,
/// and [emoji presentation sequences](https://unicode.org/reports/tr51/#def_emoji_presentation_sequence)
/// are assigned width 2.
///
/// This function treats characters in the Ambiguous category according
/// to [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
/// as 1 column wide. This is consistent with the recommendations for
Expand All @@ -145,10 +162,6 @@ pub trait UnicodeWidthStr {

/// Returns the string's displayed width in columns.
///
/// Control characters are treated as having zero width,
/// and [emoji presentation sequences](https://unicode.org/reports/tr51/#def_emoji_presentation_sequence)
/// are assigned width 2.
///
/// This function treats characters in the Ambiguous category according
/// to [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
/// as 2 column wide. This is consistent with the recommendations for
Expand All @@ -168,30 +181,48 @@ impl UnicodeWidthStr for str {
}
}

#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum VariationSelector {
/// State threaded through `width_in_str` while a string is scanned: it records
/// what the character *following* the current one was, for the cases where that
/// affects the current character's width.
// NOTE(review): the explicit discriminants coincide with the low byte of the
// corresponding code points (U+000A, U+FE0E, U+FE0F); presumably deliberate — confirm.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
enum NextCharInfo {
/// The following character (if any) needs no special handling.
#[default]
Default,
/// The following character was `'\n'`.
LineFeed = 0x0A,
/// The following character was U+FE0E.
Vs15 = 0x0E,
/// The following character was U+FE0F.
Vs16 = 0x0F,
}

fn str_width(s: &str, is_cjk: bool) -> usize {
s.chars()
.rfold((0, None), |(sum, vsel), c| match c {
'\u{FE0E}' => (sum, Some(VariationSelector::Vs15)),
'\u{FE0F}' => (sum, Some(VariationSelector::Vs16)),
_ => {
let add = match vsel {
Some(VariationSelector::Vs15)
if !is_cjk && cw::starts_non_ideographic_text_presentation_seq(c) =>
{
1
}

Some(VariationSelector::Vs16) if cw::starts_emoji_presentation_seq(c) => 2,
_ => cw::width(c, is_cjk).unwrap_or(0),
};
(sum + add, None)
}
.rfold((0, NextCharInfo::Default), |(sum, next_info), c| {
let (add, info) = width_in_str(c, is_cjk, next_info);
(sum + add, info)
})
.0
}

/// Returns the [UAX #11](https://www.unicode.org/reports/tr11/) based width of `c`.
/// If `is_cjk == true`, ambiguous width characters are treated as double width; otherwise,
/// they're treated as single width.
///
/// Unlike the single-character API, this never returns `None`: every character,
/// control characters included, is assigned a numeric width.
///
/// `next_info` describes the character that comes *after* `c` in the string
/// (`str_width` folds over the characters in reverse). The returned tuple is
/// `(width contributed by c, info to pass along when processing the character before c)`.
#[inline]
fn width_in_str(c: char, is_cjk: bool, next_info: NextCharInfo) -> (usize, NextCharInfo) {
match next_info {
// `c` is followed by U+FE0E: outside a CJK context, a qualifying base
// character is forced to width 1.
NextCharInfo::Vs15 if !is_cjk && cw::starts_non_ideographic_text_presentation_seq(c) => {
(1, NextCharInfo::Default)
}
// `c` is followed by U+FE0F: a qualifying base character is forced to width 2.
NextCharInfo::Vs16 if cw::starts_emoji_presentation_seq(c) => (2, NextCharInfo::Default),
// No applicable sequence (this also covers Vs15/Vs16 whose guard failed).
_ => {
// Everything up to and including U+00A0 is handled here without
// consulting the lookup tables; this includes the control characters.
if c <= '\u{A0}' {
match c {
// Record the line feed so that an immediately preceding '\r' can be detected.
'\n' => (1, NextCharInfo::LineFeed),
// '\r' directly before '\n' adds nothing, so "\r\n" totals width 1.
'\r' if next_info == NextCharInfo::LineFeed => (0, NextCharInfo::Default),
_ => (1, NextCharInfo::Default),
}
} else {
match c {
// The variation selectors contribute no width themselves; they only
// set the state seen by the character that precedes them.
'\u{FE0E}' => (0, NextCharInfo::Vs15),
'\u{FE0F}' => (0, NextCharInfo::Vs16),
_ => (cw::lookup_width(c, is_cjk), NextCharInfo::Default),
}
}
}
}
}
Loading

0 comments on commit 3063422

Please sign in to comment.