From 19821ad23474a3d056feac94f11569841764eb87 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Esteban=20K=C3=BCber?= Date: Wed, 10 Apr 2024 21:12:45 +0000 Subject: [PATCH] Properly handle emojis as literal prefix in macros MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Do not accept the following ```rust macro_rules! lexes {($($_:tt)*) => {}} lexes!(🐛"foo"); ``` Before, invalid emoji identifiers were gated during parsing instead of lexing in all cases, but this didn't account for macro expansion of literal prefixes. Fix #123696. --- compiler/rustc_lexer/src/lib.rs | 6 +++++- compiler/rustc_parse/src/lexer/mod.rs | 5 ++++- src/librustdoc/html/highlight.rs | 7 ++++--- .../rust-analyzer/crates/parser/src/lexed_str.rs | 2 +- tests/ui/lexer/emoji-literal-prefix.rs | 8 ++++++++ tests/ui/lexer/emoji-literal-prefix.stderr | 14 ++++++++++++++ 6 files changed, 36 insertions(+), 6 deletions(-) create mode 100644 tests/ui/lexer/emoji-literal-prefix.rs create mode 100644 tests/ui/lexer/emoji-literal-prefix.stderr diff --git a/compiler/rustc_lexer/src/lib.rs b/compiler/rustc_lexer/src/lib.rs index ca84e930c2439..83fff98bad56c 100644 --- a/compiler/rustc_lexer/src/lib.rs +++ b/compiler/rustc_lexer/src/lib.rs @@ -88,6 +88,10 @@ pub enum TokenKind { /// tokens. UnknownPrefix, + /// Similar to the above, but *always* an error on every edition. This is used + /// for emoji identifier recovery, as those are not meant to be ever accepted. + InvalidPrefix, + /// Examples: `12u8`, `1.0e-40`, `b"123"`. Note that `_` is an invalid /// suffix, but may be present here on string and float literals. Users of /// this type will need to check for and reject that case. @@ -528,7 +532,7 @@ impl Cursor<'_> { // Known prefixes must have been handled earlier. So if // we see a prefix here, it is definitely an unknown prefix. match self.first() { - '#' | '"' | '\'' => UnknownPrefix, + '#' | '"' | '\'' => InvalidPrefix, _ => InvalidIdent, } } diff --git a/compiler/rustc_parse/src/lexer/mod.rs b/compiler/rustc_parse/src/lexer/mod.rs index 69b48bf0aff71..85c4c74e1e90f 100644 --- a/compiler/rustc_parse/src/lexer/mod.rs +++ b/compiler/rustc_parse/src/lexer/mod.rs @@ -205,6 +205,7 @@ impl<'psess, 'src> StringReader<'psess, 'src> { self.ident(start) } rustc_lexer::TokenKind::InvalidIdent + | rustc_lexer::TokenKind::InvalidPrefix // Do not recover an identifier with emoji if the codepoint is a confusable // with a recoverable substitution token, like `➖`. if !UNICODE_ARRAY @@ -302,7 +303,9 @@ impl<'psess, 'src> StringReader<'psess, 'src> { rustc_lexer::TokenKind::Caret => token::BinOp(token::Caret), rustc_lexer::TokenKind::Percent => token::BinOp(token::Percent), - rustc_lexer::TokenKind::Unknown | rustc_lexer::TokenKind::InvalidIdent => { + rustc_lexer::TokenKind::Unknown + | rustc_lexer::TokenKind::InvalidIdent + | rustc_lexer::TokenKind::InvalidPrefix => { // Don't emit diagnostics for sequences of the same invalid token if swallow_next_invalid > 0 { swallow_next_invalid -= 1; diff --git a/src/librustdoc/html/highlight.rs b/src/librustdoc/html/highlight.rs index aa5998876d9ab..336d18a1df1c6 100644 --- a/src/librustdoc/html/highlight.rs +++ b/src/librustdoc/html/highlight.rs @@ -876,9 +876,10 @@ impl<'src> Classifier<'src> { }, Some(c) => c, }, - TokenKind::RawIdent | TokenKind::UnknownPrefix | TokenKind::InvalidIdent => { - Class::Ident(self.new_span(before, text)) - } + TokenKind::RawIdent + | TokenKind::UnknownPrefix + | TokenKind::InvalidPrefix + | TokenKind::InvalidIdent => Class::Ident(self.new_span(before, text)), TokenKind::Lifetime { .. } => Class::Lifetime, TokenKind::Eof => panic!("Eof in advance"), }; diff --git a/src/tools/rust-analyzer/crates/parser/src/lexed_str.rs b/src/tools/rust-analyzer/crates/parser/src/lexed_str.rs index 48e4c8a6225c4..e5fec67de7060 100644 --- a/src/tools/rust-analyzer/crates/parser/src/lexed_str.rs +++ b/src/tools/rust-analyzer/crates/parser/src/lexed_str.rs @@ -178,7 +178,7 @@ impl<'a> Converter<'a> { rustc_lexer::TokenKind::Ident => { SyntaxKind::from_keyword(token_text).unwrap_or(IDENT) } - rustc_lexer::TokenKind::InvalidIdent => { + rustc_lexer::TokenKind::InvalidPrefix | rustc_lexer::TokenKind::InvalidIdent => { err = "Ident contains invalid characters"; IDENT } diff --git a/tests/ui/lexer/emoji-literal-prefix.rs b/tests/ui/lexer/emoji-literal-prefix.rs new file mode 100644 index 0000000000000..ccc8d48d4cc3e --- /dev/null +++ b/tests/ui/lexer/emoji-literal-prefix.rs @@ -0,0 +1,8 @@ +macro_rules! lexes {($($_:tt)*) => {}} + +lexes!(🐛#); //~ ERROR identifiers cannot contain emoji +lexes!(🐛"foo"); +lexes!(🐛'q'); +lexes!(🐛'q); + +fn main() {} diff --git a/tests/ui/lexer/emoji-literal-prefix.stderr b/tests/ui/lexer/emoji-literal-prefix.stderr new file mode 100644 index 0000000000000..25aafed48ea9a --- /dev/null +++ b/tests/ui/lexer/emoji-literal-prefix.stderr @@ -0,0 +1,14 @@ +error: identifiers cannot contain emoji: `🐛` + --> $DIR/emoji-literal-prefix.rs:3:8 + | +LL | lexes!(🐛#); + | ^^ +LL | lexes!(🐛"foo"); + | ^^ +LL | lexes!(🐛'q'); + | ^^ +LL | lexes!(🐛'q); + | ^^ + +error: aborting due to 1 previous error +