Skip to content

Commit

Permalink
Rollup merge of rust-lang#123752 - estebank:emoji-prefix, r=wesleywiser
Browse files Browse the repository at this point in the history
Properly handle emojis as literal prefix in macros

Do not accept the following

```rust
macro_rules! lexes {($($_:tt)*) => {}}
lexes!(🐛"foo");
```

Before, invalid emoji identifiers were gated during parsing instead of lexing in all cases, but this didn't account for macro pre-expansion of literal prefixes.

Fix rust-lang#123696.
  • Loading branch information
GuillaumeGomez committed Apr 18, 2024
2 parents 1beaaac + 19821ad commit 901707c
Show file tree
Hide file tree
Showing 6 changed files with 36 additions and 6 deletions.
6 changes: 5 additions & 1 deletion compiler/rustc_lexer/src/lib.rs
Expand Up @@ -88,6 +88,10 @@ pub enum TokenKind {
/// tokens.
UnknownPrefix,

/// Similar to the above, but *always* an error on every edition. This is used
/// for emoji identifier recovery, as those are not meant to be ever accepted.
InvalidPrefix,

/// Examples: `12u8`, `1.0e-40`, `b"123"`. Note that `_` is an invalid
/// suffix, but may be present here on string and float literals. Users of
/// this type will need to check for and reject that case.
Expand Down Expand Up @@ -528,7 +532,7 @@ impl Cursor<'_> {
// Known prefixes must have been handled earlier. So if
// we see a prefix here, it is definitely an unknown prefix.
match self.first() {
'#' | '"' | '\'' => UnknownPrefix,
'#' | '"' | '\'' => InvalidPrefix,
_ => InvalidIdent,
}
}
Expand Down
5 changes: 4 additions & 1 deletion compiler/rustc_parse/src/lexer/mod.rs
Expand Up @@ -204,6 +204,7 @@ impl<'psess, 'src> StringReader<'psess, 'src> {
self.ident(start)
}
rustc_lexer::TokenKind::InvalidIdent
| rustc_lexer::TokenKind::InvalidPrefix
// Do not recover an identifier with emoji if the codepoint is a confusable
// with a recoverable substitution token, like `➖`.
if !UNICODE_ARRAY
Expand Down Expand Up @@ -301,7 +302,9 @@ impl<'psess, 'src> StringReader<'psess, 'src> {
rustc_lexer::TokenKind::Caret => token::BinOp(token::Caret),
rustc_lexer::TokenKind::Percent => token::BinOp(token::Percent),

rustc_lexer::TokenKind::Unknown | rustc_lexer::TokenKind::InvalidIdent => {
rustc_lexer::TokenKind::Unknown
| rustc_lexer::TokenKind::InvalidIdent
| rustc_lexer::TokenKind::InvalidPrefix => {
// Don't emit diagnostics for sequences of the same invalid token
if swallow_next_invalid > 0 {
swallow_next_invalid -= 1;
Expand Down
7 changes: 4 additions & 3 deletions src/librustdoc/html/highlight.rs
Expand Up @@ -876,9 +876,10 @@ impl<'src> Classifier<'src> {
},
Some(c) => c,
},
TokenKind::RawIdent | TokenKind::UnknownPrefix | TokenKind::InvalidIdent => {
Class::Ident(self.new_span(before, text))
}
TokenKind::RawIdent
| TokenKind::UnknownPrefix
| TokenKind::InvalidPrefix
| TokenKind::InvalidIdent => Class::Ident(self.new_span(before, text)),
TokenKind::Lifetime { .. } => Class::Lifetime,
TokenKind::Eof => panic!("Eof in advance"),
};
Expand Down
2 changes: 1 addition & 1 deletion src/tools/rust-analyzer/crates/parser/src/lexed_str.rs
Expand Up @@ -178,7 +178,7 @@ impl<'a> Converter<'a> {
rustc_lexer::TokenKind::Ident => {
SyntaxKind::from_keyword(token_text).unwrap_or(IDENT)
}
rustc_lexer::TokenKind::InvalidIdent => {
rustc_lexer::TokenKind::InvalidPrefix | rustc_lexer::TokenKind::InvalidIdent => {
err = "Ident contains invalid characters";
IDENT
}
Expand Down
8 changes: 8 additions & 0 deletions tests/ui/lexer/emoji-literal-prefix.rs
@@ -0,0 +1,8 @@
macro_rules! lexes {($($_:tt)*) => {}}

lexes!(🐛#); //~ ERROR identifiers cannot contain emoji
lexes!(🐛"foo");
lexes!(🐛'q');
lexes!(🐛'q);

fn main() {}
14 changes: 14 additions & 0 deletions tests/ui/lexer/emoji-literal-prefix.stderr
@@ -0,0 +1,14 @@
error: identifiers cannot contain emoji: `🐛`
--> $DIR/emoji-literal-prefix.rs:3:8
|
LL | lexes!(🐛#);
| ^^
LL | lexes!(🐛"foo");
| ^^
LL | lexes!(🐛'q');
| ^^
LL | lexes!(🐛'q);
| ^^
| ^^

error: aborting due to 1 previous error

0 comments on commit 901707c

Please sign in to comment.