Skip to content

Commit

Permalink
Auto merge of rust-lang#75642 - matklad:lexer-comments, r=petrochenkov
Browse files Browse the repository at this point in the history
Move doc comment parsing to rustc_lexer

Plain comments are trivia, while doc comments are not, so it feels
like this belongs to the rustc_lexer.

The specific reason to do this is the desire to use rustc_lexer in
rustdoc for syntax highlighting, without duplicating "is this a doc
comment?" logic there.

r? @ghost
  • Loading branch information
bors committed Aug 21, 2020
2 parents ff5e0f1 + ccbe94b commit b51651a
Show file tree
Hide file tree
Showing 4 changed files with 89 additions and 97 deletions.
48 changes: 4 additions & 44 deletions src/librustc_ast/util/comments.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
use crate::ast::AttrStyle;
use rustc_span::source_map::SourceMap;
use rustc_span::{BytePos, CharPos, FileName, Pos, Symbol};

Expand All @@ -24,45 +23,6 @@ pub struct Comment {
pub pos: BytePos,
}

/// For a full line comment string returns its doc comment style if it's a doc comment
/// and returns `None` if it's a regular comment.
pub fn line_doc_comment_style(line_comment: &str) -> Option<AttrStyle> {
let line_comment = line_comment.as_bytes();
assert!(line_comment.starts_with(b"//"));
match line_comment.get(2) {
// `//!` is an inner line doc comment.
Some(b'!') => Some(AttrStyle::Inner),
Some(b'/') => match line_comment.get(3) {
// `////` (more than 3 slashes) is not considered a doc comment.
Some(b'/') => None,
// Otherwise `///` is an outer line doc comment.
_ => Some(AttrStyle::Outer),
},
_ => None,
}
}

/// For a full block comment string returns its doc comment style if it's a doc comment
/// and returns `None` if it's a regular comment.
pub fn block_doc_comment_style(block_comment: &str, terminated: bool) -> Option<AttrStyle> {
let block_comment = block_comment.as_bytes();
assert!(block_comment.starts_with(b"/*"));
assert!(!terminated || block_comment.ends_with(b"*/"));
match block_comment.get(2) {
// `/*!` is an inner block doc comment.
Some(b'!') => Some(AttrStyle::Inner),
Some(b'*') => match block_comment.get(3) {
// `/***` (more than 2 stars) is not considered a doc comment.
Some(b'*') => None,
// `/**/` is not considered a doc comment.
Some(b'/') if block_comment.len() == 4 => None,
// Otherwise `/**` is an outer block doc comment.
_ => Some(AttrStyle::Outer),
},
_ => None,
}
}

/// Makes a doc string more presentable to users.
/// Used by rustdoc and perhaps other tools, but not by rustc.
pub fn beautify_doc_string(data: Symbol) -> String {
Expand Down Expand Up @@ -216,8 +176,8 @@ pub fn gather_comments(sm: &SourceMap, path: FileName, src: String) -> Vec<Comme
}
}
}
rustc_lexer::TokenKind::BlockComment { terminated } => {
if block_doc_comment_style(token_text, terminated).is_none() {
rustc_lexer::TokenKind::BlockComment { doc_style, .. } => {
if doc_style.is_none() {
let code_to_the_right = match text[pos + token.len..].chars().next() {
Some('\r' | '\n') => false,
_ => true,
Expand All @@ -238,8 +198,8 @@ pub fn gather_comments(sm: &SourceMap, path: FileName, src: String) -> Vec<Comme
comments.push(Comment { style, lines, pos: pos_in_file })
}
}
rustc_lexer::TokenKind::LineComment => {
if line_doc_comment_style(token_text).is_none() {
rustc_lexer::TokenKind::LineComment { doc_style } => {
if doc_style.is_none() {
comments.push(Comment {
style: if code_to_the_left {
CommentStyle::Trailing
Expand Down
7 changes: 0 additions & 7 deletions src/librustc_ast/util/comments/tests.rs
Original file line number Diff line number Diff line change
@@ -1,13 +1,6 @@
use super::*;
use rustc_span::with_default_session_globals;

#[test]
fn line_doc_comments() {
assert!(line_doc_comment_style("///").is_some());
assert!(line_doc_comment_style("/// blah").is_some());
assert!(line_doc_comment_style("////").is_none());
}

#[test]
fn test_block_doc_comment_1() {
with_default_session_globals(|| {
Expand Down
35 changes: 30 additions & 5 deletions src/librustc_lexer/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -51,12 +51,12 @@ impl Token {
pub enum TokenKind {
// Multi-char tokens:
/// "// comment"
LineComment,
LineComment { doc_style: Option<DocStyle> },
/// `/* block comment */`
///
/// Block comments can be recursive, so the sequence like `/* /* */`
/// will not be considered terminated and will result in a parsing error.
BlockComment { terminated: bool },
BlockComment { doc_style: Option<DocStyle>, terminated: bool },
/// Any whitespace characters sequence.
Whitespace,
/// "ident" or "continue"
Expand Down Expand Up @@ -129,6 +129,12 @@ pub enum TokenKind {
Unknown,
}

#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum DocStyle {
Outer,
Inner,
}

#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum LiteralKind {
/// "12_u8", "0o100", "0b120i99"
Expand Down Expand Up @@ -188,7 +194,7 @@ pub fn strip_shebang(input: &str) -> Option<usize> {
// a doc comment (due to `TokenKind::(Line,Block)Comment` ambiguity at lexer level),
// then it may be valid Rust code, so consider it Rust code.
let next_non_whitespace_token = tokenize(input_tail).map(|tok| tok.kind).find(|tok|
!matches!(tok, TokenKind::Whitespace | TokenKind::LineComment | TokenKind::BlockComment { .. })
!matches!(tok, TokenKind::Whitespace | TokenKind::LineComment { .. } | TokenKind::BlockComment { .. })
);
if next_non_whitespace_token != Some(TokenKind::OpenBracket) {
// No other choice than to consider this a shebang.
Expand Down Expand Up @@ -410,13 +416,32 @@ impl Cursor<'_> {
fn line_comment(&mut self) -> TokenKind {
debug_assert!(self.prev() == '/' && self.first() == '/');
self.bump();

let doc_style = match self.first() {
// `//!` is an inner line doc comment.
'!' => Some(DocStyle::Inner),
// `////` (more than 3 slashes) is not considered a doc comment.
'/' if self.second() != '/' => Some(DocStyle::Outer),
_ => None,
};

self.eat_while(|c| c != '\n');
LineComment
LineComment { doc_style }
}

fn block_comment(&mut self) -> TokenKind {
debug_assert!(self.prev() == '/' && self.first() == '*');
self.bump();

let doc_style = match self.first() {
// `/*!` is an inner block doc comment.
'!' => Some(DocStyle::Inner),
// `/***` (more than 2 stars) is not considered a doc comment.
// `/**/` is not considered a doc comment.
'*' if !matches!(self.second(), '*' | '/') => Some(DocStyle::Outer),
_ => None,
};

let mut depth = 1usize;
while let Some(c) = self.bump() {
match c {
Expand All @@ -438,7 +463,7 @@ impl Cursor<'_> {
}
}

BlockComment { terminated: depth == 0 }
BlockComment { doc_style, terminated: depth == 0 }
}

fn whitespace(&mut self) -> TokenKind {
Expand Down
96 changes: 55 additions & 41 deletions src/librustc_parse/lexer/mod.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
use rustc_ast::ast::AttrStyle;
use rustc_ast::token::{self, CommentKind, Token, TokenKind};
use rustc_ast::util::comments;
use rustc_data_structures::sync::Lrc;
use rustc_errors::{error_code, Applicability, DiagnosticBuilder, FatalError};
use rustc_lexer::Base;
Expand All @@ -15,7 +15,7 @@ mod tokentrees;
mod unescape_error_reporting;
mod unicode_chars;

use rustc_lexer::unescape::Mode;
use rustc_lexer::{unescape::Mode, DocStyle};
use unescape_error_reporting::{emit_unescape_error, push_escaped_char};

#[derive(Clone, Debug)]
Expand Down Expand Up @@ -168,25 +168,23 @@ impl<'a> StringReader<'a> {
/// symbols and runs additional validation.
fn cook_lexer_token(&self, token: rustc_lexer::TokenKind, start: BytePos) -> TokenKind {
match token {
rustc_lexer::TokenKind::LineComment => {
let string = self.str_from(start);
if let Some(attr_style) = comments::line_doc_comment_style(string) {
self.forbid_bare_cr(start, string, "bare CR not allowed in doc-comment");
// Opening delimiter of the length 3 is not included into the symbol.
token::DocComment(CommentKind::Line, attr_style, Symbol::intern(&string[3..]))
} else {
token::Comment
rustc_lexer::TokenKind::LineComment { doc_style } => {
match doc_style {
Some(doc_style) => {
// Opening delimiter of the length 3 is not included into the symbol.
let content_start = start + BytePos(3);
let content = self.str_from(content_start);

self.cook_doc_comment(content_start, content, CommentKind::Line, doc_style)
}
None => token::Comment,
}
}
rustc_lexer::TokenKind::BlockComment { terminated } => {
let string = self.str_from(start);
let attr_style = comments::block_doc_comment_style(string, terminated);

rustc_lexer::TokenKind::BlockComment { doc_style, terminated } => {
if !terminated {
let msg = if attr_style.is_some() {
"unterminated block doc-comment"
} else {
"unterminated block comment"
let msg = match doc_style {
Some(_) => "unterminated block doc-comment",
None => "unterminated block comment",
};
let last_bpos = self.pos;
self.sess
Expand All @@ -199,18 +197,17 @@ impl<'a> StringReader<'a> {
.emit();
FatalError.raise();
}

if let Some(attr_style) = attr_style {
self.forbid_bare_cr(start, string, "bare CR not allowed in block doc-comment");
// Opening delimiter of the length 3 and closing delimiter of the length 2
// are not included into the symbol.
token::DocComment(
CommentKind::Block,
attr_style,
Symbol::intern(&string[3..string.len() - if terminated { 2 } else { 0 }]),
)
} else {
token::Comment
match doc_style {
Some(doc_style) => {
// Opening delimiter of the length 3 and closing delimiter of the length 2
// are not included into the symbol.
let content_start = start + BytePos(3);
let content_end = self.pos - BytePos(if terminated { 2 } else { 0 });
let content = self.str_from_to(content_start, content_end);

self.cook_doc_comment(content_start, content, CommentKind::Block, doc_style)
}
None => token::Comment,
}
}
rustc_lexer::TokenKind::Whitespace => token::Whitespace,
Expand Down Expand Up @@ -319,6 +316,34 @@ impl<'a> StringReader<'a> {
}
}

fn cook_doc_comment(
&self,
content_start: BytePos,
content: &str,
comment_kind: CommentKind,
doc_style: DocStyle,
) -> TokenKind {
if content.contains('\r') {
for (idx, _) in content.char_indices().filter(|&(_, c)| c == '\r') {
self.err_span_(
content_start + BytePos(idx as u32),
content_start + BytePos(idx as u32 + 1),
match comment_kind {
CommentKind::Line => "bare CR not allowed in doc-comment",
CommentKind::Block => "bare CR not allowed in block doc-comment",
},
);
}
}

let attr_style = match doc_style {
DocStyle::Outer => AttrStyle::Outer,
DocStyle::Inner => AttrStyle::Inner,
};

token::DocComment(comment_kind, attr_style, Symbol::intern(content))
}

fn cook_lexer_literal(
&self,
start: BytePos,
Expand Down Expand Up @@ -472,17 +497,6 @@ impl<'a> StringReader<'a> {
&self.src[self.src_index(start)..self.src_index(end)]
}

fn forbid_bare_cr(&self, start: BytePos, s: &str, errmsg: &str) {
let mut idx = 0;
loop {
idx = match s[idx..].find('\r') {
None => break,
Some(it) => idx + it + 1,
};
self.err_span_(start + BytePos(idx as u32 - 1), start + BytePos(idx as u32), errmsg);
}
}

fn report_raw_str_error(&self, start: BytePos, opt_err: Option<RawStrError>) {
match opt_err {
Some(RawStrError::InvalidStarter { bad_char }) => {
Expand Down

0 comments on commit b51651a

Please sign in to comment.