Skip to content

Commit

Permalink
Update syntax to just WITH HEADER (cols)
Browse files Browse the repository at this point in the history
  • Loading branch information
quodlibetor committed Jul 29, 2021
1 parent 095db1b commit fcda97a
Show file tree
Hide file tree
Showing 6 changed files with 71 additions and 62 deletions.
20 changes: 9 additions & 11 deletions src/sql-parser/src/ast/defs/ddl.rs
Original file line number Diff line number Diff line change
Expand Up @@ -205,26 +205,24 @@ impl<T: AstInfo> Format<T> {
pub enum CsvColumns {
/// `WITH count COLUMNS`
Count(usize),
/// Column count not specified by syntax
Header,
/// `WITH HEADER COLUMNS (ident, ...)`
HeaderNamed(Vec<Ident>),
/// `WITH HEADER (ident, ...)?`: `names` is empty if there are no names specified
Header { names: Vec<Ident> },
}

impl AstDisplay for CsvColumns {
fn fmt<W: fmt::Write>(&self, f: &mut AstFormatter<W>) {
match self {
CsvColumns::Header => {
f.write_str("HEADER");
}
CsvColumns::Count(n) => {
f.write_str(n);
f.write_str(" COLUMNS")
}
CsvColumns::HeaderNamed(names) => {
f.write_str("HEADER COLUMNS (");
f.write_node(&display::comma_separated(&names));
f.write_str(")");
CsvColumns::Header { names } => {
f.write_str("HEADER");
if !names.is_empty() {
f.write_str(" (");
f.write_node(&display::comma_separated(&names));
f.write_str(")");
}
}
}
}
Expand Down
7 changes: 2 additions & 5 deletions src/sql-parser/src/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1464,11 +1464,8 @@ impl<'a> Parser<'a> {
} else if self.parse_keyword(CSV) {
self.expect_keyword(WITH)?;
let columns = if self.parse_keyword(HEADER) || self.parse_keyword(HEADERS) {
if self.parse_keyword(COLUMNS) {
let columns = self.parse_parenthesized_column_list(Mandatory)?;
CsvColumns::HeaderNamed(columns)
} else {
CsvColumns::Header
CsvColumns::Header {
names: self.parse_parenthesized_column_list(Optional)?,
}
} else {
let n_cols = self.parse_literal_uint()? as usize;
Expand Down
10 changes: 5 additions & 5 deletions src/sql-parser/tests/testdata/ddl
Original file line number Diff line number Diff line change
Expand Up @@ -410,14 +410,14 @@ CREATE SOURCE foo FROM FILE 'bar' WITH (tail = false) FORMAT CSV WITH HEADER
----
CREATE SOURCE foo FROM FILE 'bar' WITH (tail = false) FORMAT CSV WITH HEADER
=>
CreateSource(CreateSourceStatement { name: UnresolvedObjectName([Ident("foo")]), col_names: [], connector: File { path: "bar", compression: None }, with_options: [Value { name: Ident("tail"), value: Boolean(false) }], format: Bare(Csv { columns: Header, delimiter: ',' }), key_envelope: None, envelope: None, if_not_exists: false, materialized: false, key_constraint: None })
CreateSource(CreateSourceStatement { name: UnresolvedObjectName([Ident("foo")]), col_names: [], connector: File { path: "bar", compression: None }, with_options: [Value { name: Ident("tail"), value: Boolean(false) }], format: Bare(Csv { columns: Header { names: [] }, delimiter: ',' }), key_envelope: None, envelope: None, if_not_exists: false, materialized: false, key_constraint: None })

parse-statement
CREATE SOURCE foo FROM FILE 'bar' WITH (tail = false) FORMAT CSV WITH HEADER COLUMNS (a, b, c)
CREATE SOURCE foo FROM FILE 'bar' WITH (tail = false) FORMAT CSV WITH HEADER (a, b, c)
----
CREATE SOURCE foo FROM FILE 'bar' WITH (tail = false) FORMAT CSV WITH HEADER COLUMNS (a, b, c)
CREATE SOURCE foo FROM FILE 'bar' WITH (tail = false) FORMAT CSV WITH HEADER (a, b, c)
=>
CreateSource(CreateSourceStatement { name: UnresolvedObjectName([Ident("foo")]), col_names: [], connector: File { path: "bar", compression: None }, with_options: [Value { name: Ident("tail"), value: Boolean(false) }], format: Bare(Csv { columns: HeaderNamed([Ident("a"), Ident("b"), Ident("c")]), delimiter: ',' }), key_envelope: None, envelope: None, if_not_exists: false, materialized: false, key_constraint: None })
CreateSource(CreateSourceStatement { name: UnresolvedObjectName([Ident("foo")]), col_names: [], connector: File { path: "bar", compression: None }, with_options: [Value { name: Ident("tail"), value: Boolean(false) }], format: Bare(Csv { columns: Header { names: [Ident("a"), Ident("b"), Ident("c")] }, delimiter: ',' }), key_envelope: None, envelope: None, if_not_exists: false, materialized: false, key_constraint: None })

parse-statement
CREATE SOURCE foo FROM FILE 'bar' WITH (tail = false) FORMAT CSV WITH 3 COLUMNS
Expand All @@ -431,7 +431,7 @@ CREATE SOURCE foo (one, two) FROM FILE 'bar' FORMAT CSV WITH HEADER
----
CREATE SOURCE foo (one, two) FROM FILE 'bar' FORMAT CSV WITH HEADER
=>
CreateSource(CreateSourceStatement { name: UnresolvedObjectName([Ident("foo")]), col_names: [Ident("one"), Ident("two")], connector: File { path: "bar", compression: None }, with_options: [], format: Bare(Csv { columns: Header, delimiter: ',' }), key_envelope: None, envelope: None, if_not_exists: false, materialized: false, key_constraint: None })
CreateSource(CreateSourceStatement { name: UnresolvedObjectName([Ident("foo")]), col_names: [Ident("one"), Ident("two")], connector: File { path: "bar", compression: None }, with_options: [], format: Bare(Csv { columns: Header { names: [] }, delimiter: ',' }), key_envelope: None, envelope: None, if_not_exists: false, materialized: false, key_constraint: None })

parse-statement
CREATE SOURCE foo FROM FILE 'bar' WITH (tail = true) FORMAT CSV WITH 3 COLUMNS DELIMITED BY '|'
Expand Down
12 changes: 7 additions & 5 deletions src/sql/src/plan/statement/ddl.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1059,13 +1059,15 @@ fn get_encoding_inner<T: sql_parser::ast::AstInfo>(
}
Format::Csv { columns, delimiter } => {
let columns = match columns {
CsvColumns::HeaderNamed(ns) => {
ColumnSpec::HeaderNames(ns.iter().cloned().map(|n| n.into_string()).collect())
CsvColumns::Header { names } => {
if names.is_empty() {
bail!("[internal error] column spec should get names in purify")
}
ColumnSpec::HeaderNames(
names.iter().cloned().map(|n| n.into_string()).collect(),
)
}
CsvColumns::Count(n) => ColumnSpec::Count(*n),
CsvColumns::Header => {
bail!("[internal error] column spec should get names in purify")
}
};
DataEncoding::Csv(CsvEncoding {
columns,
Expand Down
76 changes: 44 additions & 32 deletions src/sql/src/pure.rs
Original file line number Diff line number Diff line change
Expand Up @@ -492,19 +492,27 @@ async fn purify_format_single(
.collect::<Vec<_>>(),
),
Ok(None) => {
if matches!(columns, CsvColumns::Header) {
bail!(
"CSV file expected to have at least one line \
to determine column names, but is empty"
);
if let CsvColumns::Header { names } = columns {
if names.is_empty() {
bail!(
"CSV file expected to have at least one line \
to determine column names, but is empty"
);
} else {
None
}
} else {
None
}
}
Err(e) => {
// TODO(#7562): support compressed files
if matches!(columns, CsvColumns::Header) {
bail!("Cannot determine header by reading CSV file: {}", e);
if let CsvColumns::Header { names } = columns {
if names.is_empty() {
bail!("Cannot determine header by reading CSV file: {}", e);
} else {
None
}
} else {
None
}
Expand All @@ -515,9 +523,36 @@ async fn purify_format_single(
};

match (&columns, first_row) {
(CsvColumns::Header, Some(cols)) => {
*columns = CsvColumns::HeaderNamed(cols.into_iter().map(Ident::from).collect())
(CsvColumns::Header { names }, Some(headers)) if names.is_empty() => {
*columns = CsvColumns::Header {
names: headers.into_iter().map(Ident::from).collect(),
};
}
(CsvColumns::Header { names }, Some(headers)) => {
if names.len() != headers.len() {
bail!(
"Named column count ({}) does not match \
number of columns discovered ({})",
names.len(),
headers.len()
);
} else if let Some((sql, csv)) = names
.iter()
.zip(headers.iter())
.find(|(sql, csv)| sql.as_str() != &**csv)
{
bail!("Header columns do not match named columns from CREATE SOURCE statement. \
First mismatched columns: {} != {}", sql, csv);
}
}
(CsvColumns::Header { names }, None) if names.is_empty() => {
bail!("WITH HEADER requires a way to determine the header row, but file does not exist");
}
(CsvColumns::Header { names }, None) => {
    // The statement supplied the header names, so there is nothing to verify
    // when no header row could be read from the file. The guarded arm above
    // already handles the empty-`names` case, so reaching this arm with an
    // empty list means the match arms were reordered incorrectly.
    assert!(
        !names.is_empty(),
        "match arm moved into the wrong position"
    );
}

(CsvColumns::Count(n), first_line) => {
if let Some(columns) = first_line {
if *n != columns.len() {
Expand All @@ -537,29 +572,6 @@ async fn purify_format_single(
.collect();
}
}
(CsvColumns::HeaderNamed(_), None) => {
// we don't need to do any verification if we are told the names of the headers
}
(CsvColumns::HeaderNamed(names), Some(headers)) => {
if names.len() != headers.len() {
bail!(
"Named column count ({}) does not match \
number of columns discovered ({})",
names.len(),
headers.len()
);
} else if let Some((sql, csv)) = names
.iter()
.zip(headers.iter())
.find(|(sql, csv)| sql.as_str() != &**csv)
{
bail!("Header columns do not match named columns from CREATE SOURCE statement. \
First mismatched columns: {} != {}", sql, csv);
}
}
(CsvColumns::Header, None) => {
bail!("WITH HEADER requires a way to determine the header row, but file does not exist");
}
}
}
Format::Bytes | Format::Regex(_) | Format::Json | Format::Text => (),
Expand Down
8 changes: 4 additions & 4 deletions test/testdrive/csv-sources.td
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ Specified column count (WITH 2 COLUMNS) does not match number of columns in CSV

> CREATE MATERIALIZED SOURCE matching_column_names
FROM FILE '${testdrive.temp-dir}/static.csv'
FORMAT CSV WITH HEADER COLUMNS (city, state, zip)
FORMAT CSV WITH HEADER (city, state, zip)

> SELECT * FROM matching_column_names where zip = '14618'
city state zip mz_line_no
Expand All @@ -37,7 +37,7 @@ Rochester NY 14618 1

> CREATE MATERIALIZED SOURCE matching_column_names_alias (a, b, c)
FROM FILE '${testdrive.temp-dir}/static.csv'
FORMAT CSV WITH HEADER COLUMNS (city, state, zip)
FORMAT CSV WITH HEADER (city, state, zip)

> SELECT * FROM matching_column_names_alias where c = '14618'
a b c mz_line_no
Expand All @@ -46,12 +46,12 @@ Rochester NY 14618 1

! CREATE SOURCE mismatched_column_names
FROM FILE '${testdrive.temp-dir}/static.csv'
FORMAT CSV WITH HEADER COLUMNS (cities, country, zip)
FORMAT CSV WITH HEADER (cities, country, zip)
Header columns do not match named columns from CREATE SOURCE statement. First mismatched columns: cities != city

! CREATE SOURCE mismatched_column_names_count
FROM FILE '${testdrive.temp-dir}/static.csv'
FORMAT CSV WITH HEADER COLUMNS (cities, state)
FORMAT CSV WITH HEADER (cities, state)
Named column count (2) does not match number of columns discovered (3)

# Static CSV without headers.
Expand Down

0 comments on commit fcda97a

Please sign in to comment.