From 19821ad23474a3d056feac94f11569841764eb87 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Esteban=20K=C3=BCber?= <esteban@kuber.com.ar>
Date: Wed, 10 Apr 2024 21:12:45 +0000
Subject: [PATCH] Properly handle emojis as literal prefix in macros
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Do not accept the following:

```rust
macro_rules! lexes {($($_:tt)*) => {}}
lexes!(🐛"foo");
```

Previously, invalid emoji identifiers were gated during parsing rather than during lexing in all cases, which did not account for macro expansion of literal prefixes.

Fix #123696.
---
 compiler/rustc_lexer/src/lib.rs                    |  6 +++++-
 compiler/rustc_parse/src/lexer/mod.rs              |  5 ++++-
 src/librustdoc/html/highlight.rs                   |  7 ++++---
 .../rust-analyzer/crates/parser/src/lexed_str.rs   |  2 +-
 tests/ui/lexer/emoji-literal-prefix.rs             |  8 ++++++++
 tests/ui/lexer/emoji-literal-prefix.stderr         | 14 ++++++++++++++
 6 files changed, 36 insertions(+), 6 deletions(-)
 create mode 100644 tests/ui/lexer/emoji-literal-prefix.rs
 create mode 100644 tests/ui/lexer/emoji-literal-prefix.stderr

diff --git a/compiler/rustc_lexer/src/lib.rs b/compiler/rustc_lexer/src/lib.rs
index ca84e930c2439..83fff98bad56c 100644
--- a/compiler/rustc_lexer/src/lib.rs
+++ b/compiler/rustc_lexer/src/lib.rs
@@ -88,6 +88,10 @@ pub enum TokenKind {
     /// tokens.
     UnknownPrefix,
 
+    /// Similar to `UnknownPrefix`, but *always* an error on every edition. This is
+    /// used for emoji identifier recovery, as those are never meant to be accepted.
+    InvalidPrefix,
+
     /// Examples: `12u8`, `1.0e-40`, `b"123"`. Note that `_` is an invalid
     /// suffix, but may be present here on string and float literals. Users of
     /// this type will need to check for and reject that case.
@@ -528,7 +532,7 @@ impl Cursor<'_> {
         // Known prefixes must have been handled earlier. So if
         // we see a prefix here, it is definitely an unknown prefix.
         match self.first() {
-            '#' | '"' | '\'' => UnknownPrefix,
+            '#' | '"' | '\'' => InvalidPrefix,
             _ => InvalidIdent,
         }
     }
diff --git a/compiler/rustc_parse/src/lexer/mod.rs b/compiler/rustc_parse/src/lexer/mod.rs
index 69b48bf0aff71..85c4c74e1e90f 100644
--- a/compiler/rustc_parse/src/lexer/mod.rs
+++ b/compiler/rustc_parse/src/lexer/mod.rs
@@ -205,6 +205,7 @@ impl<'psess, 'src> StringReader<'psess, 'src> {
                     self.ident(start)
                 }
                 rustc_lexer::TokenKind::InvalidIdent
+                | rustc_lexer::TokenKind::InvalidPrefix
                     // Do not recover an identifier with emoji if the codepoint is a confusable
                     // with a recoverable substitution token, like `➖`.
                     if !UNICODE_ARRAY
@@ -302,7 +303,9 @@ impl<'psess, 'src> StringReader<'psess, 'src> {
                 rustc_lexer::TokenKind::Caret => token::BinOp(token::Caret),
                 rustc_lexer::TokenKind::Percent => token::BinOp(token::Percent),
 
-                rustc_lexer::TokenKind::Unknown | rustc_lexer::TokenKind::InvalidIdent => {
+                rustc_lexer::TokenKind::Unknown
+                | rustc_lexer::TokenKind::InvalidIdent
+                | rustc_lexer::TokenKind::InvalidPrefix => {
                     // Don't emit diagnostics for sequences of the same invalid token
                     if swallow_next_invalid > 0 {
                         swallow_next_invalid -= 1;
diff --git a/src/librustdoc/html/highlight.rs b/src/librustdoc/html/highlight.rs
index aa5998876d9ab..336d18a1df1c6 100644
--- a/src/librustdoc/html/highlight.rs
+++ b/src/librustdoc/html/highlight.rs
@@ -876,9 +876,10 @@ impl<'src> Classifier<'src> {
                 },
                 Some(c) => c,
             },
-            TokenKind::RawIdent | TokenKind::UnknownPrefix | TokenKind::InvalidIdent => {
-                Class::Ident(self.new_span(before, text))
-            }
+            TokenKind::RawIdent
+            | TokenKind::UnknownPrefix
+            | TokenKind::InvalidPrefix
+            | TokenKind::InvalidIdent => Class::Ident(self.new_span(before, text)),
             TokenKind::Lifetime { .. } => Class::Lifetime,
             TokenKind::Eof => panic!("Eof in advance"),
         };
diff --git a/src/tools/rust-analyzer/crates/parser/src/lexed_str.rs b/src/tools/rust-analyzer/crates/parser/src/lexed_str.rs
index 48e4c8a6225c4..e5fec67de7060 100644
--- a/src/tools/rust-analyzer/crates/parser/src/lexed_str.rs
+++ b/src/tools/rust-analyzer/crates/parser/src/lexed_str.rs
@@ -178,7 +178,7 @@ impl<'a> Converter<'a> {
                 rustc_lexer::TokenKind::Ident => {
                     SyntaxKind::from_keyword(token_text).unwrap_or(IDENT)
                 }
-                rustc_lexer::TokenKind::InvalidIdent => {
+                rustc_lexer::TokenKind::InvalidPrefix | rustc_lexer::TokenKind::InvalidIdent => {
                     err = "Ident contains invalid characters";
                     IDENT
                 }
diff --git a/tests/ui/lexer/emoji-literal-prefix.rs b/tests/ui/lexer/emoji-literal-prefix.rs
new file mode 100644
index 0000000000000..ccc8d48d4cc3e
--- /dev/null
+++ b/tests/ui/lexer/emoji-literal-prefix.rs
@@ -0,0 +1,8 @@
+macro_rules! lexes {($($_:tt)*) => {}}
+
+lexes!(🐛#); //~ ERROR identifiers cannot contain emoji
+lexes!(🐛"foo");
+lexes!(🐛'q');
+lexes!(🐛'q);
+
+fn main() {}
diff --git a/tests/ui/lexer/emoji-literal-prefix.stderr b/tests/ui/lexer/emoji-literal-prefix.stderr
new file mode 100644
index 0000000000000..25aafed48ea9a
--- /dev/null
+++ b/tests/ui/lexer/emoji-literal-prefix.stderr
@@ -0,0 +1,14 @@
+error: identifiers cannot contain emoji: `🐛`
+  --> $DIR/emoji-literal-prefix.rs:3:8
+   |
+LL | lexes!(🐛#);
+   |        ^^
+LL | lexes!(🐛"foo");
+   |        ^^
+LL | lexes!(🐛'q');
+   |        ^^
+LL | lexes!(🐛'q);
+   |        ^^
+
+error: aborting due to 1 previous error
+