Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixed a lot of parsing errors #11

Open
wants to merge 21 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
c06ca03
Added support for timezone abbreviations
terhechte Sep 29, 2021
eef0baf
Add support for the 9:23:47 time format
terhechte Sep 29, 2021
c3607e7
Fix compile issue when enabling MIME
terhechte Sep 29, 2021
fd66dae
Support for latin1, Shift_JIS, etc. codepage chars in subjects
terhechte Sep 29, 2021
ce4c06f
Support emails with no sender but multiple from addresses
terhechte Sep 29, 2021
9c2caf6
Support '.' character in from names (e.g. "From: hey.io <info@heo.io>")
terhechte Sep 29, 2021
6d8b94d
Support more address chars in From. Add '@'
terhechte Sep 29, 2021
1767c3a
Support for years in dates with only two digits. E.g. 11 for 2011
terhechte Sep 29, 2021
a0614de
Support display names in quotes with any characters inside
terhechte Sep 29, 2021
7c7acc0
Add support for '+00:00' timezone format
terhechte Sep 29, 2021
3e644dc
Some emails add a comment at the end of the date to indicate a timezone
terhechte Sep 29, 2021
305329f
Allow multiple 'to' values in headers as this is something gmail does…
terhechte Sep 29, 2021
c6199ef
Add feature flag to allow multiple headers for all fields. Vecs are j…
terhechte Sep 29, 2021
b2c93e0
Some addresses have additional whitespace after the closing angle
terhechte Sep 29, 2021
2cdebe6
Fix for an issue where unknown fields had weird unicode letters in th…
terhechte Sep 29, 2021
ff8e2a5
Support broken headers by moving them into unsupported (making parsin…
terhechte Sep 29, 2021
2100e15
Add feature to disable mime body decoding
terhechte Sep 29, 2021
88ba128
Add option to get the number for a month
terhechte Sep 29, 2021
21021c9
Added tests for many of the issues that were fixed in the past commits
terhechte Sep 29, 2021
da8582a
Update dependency
terhechte Sep 29, 2021
dba59d8
Support Apple Mail messages which use LF instead of CRLF
terhechte Oct 9, 2021
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
3 changes: 3 additions & 0 deletions email-parser/Cargo.toml
Expand Up @@ -12,6 +12,7 @@ keywords = ["email", "mail", "mime", "parser"]

[dependencies]
textcode = {version="0.2", optional=true}
timezone-abbreviations = "0.1.0"
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Correct me if I am wrong, but custom timezones are defined by RFC 822, and this crate focuses on RFC 5322. There is a feature named compatibility-fixes to allow older syntaxes. Please put everything timezone-related behind this feature gate (including the new dependency).


[features]
default = ["headers"]
Expand All @@ -35,6 +36,8 @@ compatibility-fixes = []
content-disposition = ["mime"]
unrecognized-headers = ["mime"]
mime = ["textcode"]
allow-duplicate-headers = []
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think we need a feature for that. I would rather leave the Email struct untouched and create a new PermissiveEmail struct that stores each header as a Vec of its values, so that it allows duplicate and even missing headers. Email itself should designate a compliant email.

decode-mime-body = []

[dev-dependencies]
email = "0.0.21"
Expand Down
171 changes: 81 additions & 90 deletions email-parser/src/email.rs
Expand Up @@ -145,91 +145,47 @@ impl<'a> Email<'a> {
match field {
#[cfg(feature = "from")]
Field::From(mailboxes) => {
if from.is_none() {
from = Some(mailboxes)
} else {
return Err(Error::DuplicateHeader("From"));
}
merge_headers(&mut from, mailboxes, "From")?;
}
#[cfg(feature = "sender")]
Field::Sender(mailbox) => {
if sender.is_none() {
sender = Some(mailbox)
} else {
return Err(Error::DuplicateHeader("Sender"));
}
assign_header(&mut sender, mailbox, "Sender")?;
}
#[cfg(feature = "subject")]
Field::Subject(data) => {
if subject.is_none() {
subject = Some(data)
} else {
return Err(Error::DuplicateHeader("Subject"));
}
assign_header(&mut subject, data, "Subject")?;
}
#[cfg(feature = "date")]
Field::Date(data) => {
if date.is_none() {
date = Some(data)
} else {
return Err(Error::DuplicateHeader("Date"));
}
assign_header(&mut date, data, "Date")?;
}
#[cfg(feature = "to")]
Field::To(addresses) => {
if to.is_none() {
to = Some(addresses)
} else {
return Err(Error::DuplicateHeader("To"));
}
merge_headers(&mut to, addresses, "To")?;
}
#[cfg(feature = "cc")]
Field::Cc(addresses) => {
if cc.is_none() {
cc = Some(addresses)
} else {
return Err(Error::DuplicateHeader("Cc"));
}
merge_headers(&mut cc, addresses, "Cc")?;
}
#[cfg(feature = "bcc")]
Field::Bcc(addresses) => {
if bcc.is_none() {
bcc = Some(addresses)
} else {
return Err(Error::DuplicateHeader("Bcc"));
}
merge_headers(&mut bcc, addresses, "Bcc")?;
}
#[cfg(feature = "message-id")]
Field::MessageId(id) => {
if message_id.is_none() {
message_id = Some(id)
} else {
return Err(Error::DuplicateHeader("Message-ID"));
}
assign_header(&mut message_id, id, "Message-ID")?;
}
#[cfg(feature = "in-reply-to")]
Field::InReplyTo(ids) => {
if in_reply_to.is_none() {
in_reply_to = Some(ids)
} else {
return Err(Error::DuplicateHeader("In-Reply-To"));
}
merge_headers(&mut in_reply_to, ids, "In-Reply-To")?;
}
#[cfg(feature = "references")]
Field::References(ids) => {
if references.is_none() {
references = Some(ids)
} else {
return Err(Error::DuplicateHeader("References"));
}
merge_headers(&mut references, ids, "References")?;
}
#[cfg(feature = "reply-to")]
Field::ReplyTo(mailboxes) => {
if reply_to.is_none() {
reply_to = Some(mailboxes)
} else {
return Err(Error::DuplicateHeader("Reply-To"));
}
merge_headers(&mut reply_to, mailboxes, "Reply-To")?;
}
#[cfg(feature = "comments")]
Field::Comments(data) => comments.push(data),
Expand All @@ -247,55 +203,39 @@ impl<'a> Email<'a> {
}
#[cfg(feature = "mime")]
Field::MimeVersion(major, minor) => {
if mime_version.is_none() {
mime_version = Some((major, minor))
} else {
return Err(Error::DuplicateHeader("Mime-Version"));
}
assign_header(&mut mime_version, (major, minor), "Mime-Version")?;
}
#[cfg(feature = "mime")]
Field::ContentType {
mime_type,
subtype,
parameters,
} => {
if content_type.is_none() {
content_type = Some((mime_type, subtype, parameters))
} else {
return Err(Error::DuplicateHeader("Content-Type"));
}
assign_header(
&mut content_type,
(mime_type, subtype, parameters),
"Content-Type",
)?;
}
#[cfg(feature = "mime")]
Field::ContentTransferEncoding(encoding) => {
if content_transfer_encoding.is_none() {
content_transfer_encoding = Some(encoding)
} else {
return Err(Error::DuplicateHeader("Content-Transfer-Encoding"));
}
assign_header(
&mut content_transfer_encoding,
encoding,
"Content-Transfer-Encoding",
)?;
}
#[cfg(feature = "mime")]
Field::ContentId(id) => {
if content_id.is_none() {
content_id = Some(id)
} else {
return Err(Error::DuplicateHeader("Content-Id"));
}
assign_header(&mut content_id, id, "Content-Id")?;
}
#[cfg(feature = "mime")]
Field::ContentDescription(description) => {
if content_description.is_none() {
content_description = Some(description)
} else {
return Err(Error::DuplicateHeader("Content-Description"));
}
assign_header(&mut content_description, description, "Content-Description")?;
}
#[cfg(feature = "content-disposition")]
Field::ContentDisposition(disposition) => {
if content_disposition.is_none() {
content_disposition = Some(disposition)
} else {
return Err(Error::DuplicateHeader("Content-Disposition"));
}
assign_header(&mut content_disposition, disposition, "Content-Disposition")?;
}
Field::Unknown { name, value } => {
unknown_fields.push((name, value));
Expand All @@ -312,7 +252,7 @@ impl<'a> Email<'a> {
let sender = match sender {
Some(sender) => sender,
None => {
if from.len() == 1 {
if from.len() >= 1 {
from[0].clone()
} else {
return Err(Error::MissingHeader("Sender"));
Expand All @@ -329,11 +269,21 @@ impl<'a> Email<'a> {
.into_iter()
.collect(),
)),
#[allow(unused_variables)]
if let Some(body) = body {
Some(crate::parsing::mime::entity::decode_value(
Cow::Borrowed(body),
content_transfer_encoding.unwrap_or(ContentTransferEncoding::SevenBit),
)?)
#[cfg(feature = "decode-mime-body")]
{
crate::parsing::mime::entity::decode_value(
Cow::Borrowed(body),
content_transfer_encoding.unwrap_or(ContentTransferEncoding::SevenBit),
)
.ok()
}

#[cfg(not(feature = "decode-mime-body"))]
{
None
}
} else {
None
},
Expand Down Expand Up @@ -395,6 +345,45 @@ impl<'a> std::convert::TryFrom<&'a [u8]> for Email<'a> {
}
}

#[allow(unused_variables, unused_mut)]
fn merge_headers<T>(
existing: &mut Option<Vec<T>>,
mut new: Vec<T>,
name: &'static str,
) -> Result<(), Error> {
#[cfg(not(feature = "allow-duplicate-headers"))]
if existing.is_some() {
return Err(Error::DuplicateHeader(name));
} else {
*existing = Some(new);
}

#[cfg(feature = "allow-duplicate-headers")]
if let Some(value) = existing.as_mut() {
let value: &mut Vec<T> = value;
value.append(&mut new);
} else {
*existing = Some(new);
}
Ok(())
}

#[allow(unused_variables)]
fn assign_header<T>(existing: &mut Option<T>, new: T, name: &'static str) -> Result<(), Error> {
#[cfg(not(feature = "allow-duplicate-headers"))]
if existing.is_some() {
return Err(Error::DuplicateHeader(name));
} else {
*existing = Some(new);
}

#[cfg(feature = "allow-duplicate-headers")]
if existing.is_none() {
*existing = Some(new);
}
Ok(())
}

#[cfg(test)]
mod test {
use super::*;
Expand Down Expand Up @@ -423,6 +412,7 @@ mod test {
)
.is_err());

#[cfg(not(feature = "allow-duplicate-headers"))]
assert!(Email::parse(
// 2 date fields
b"\
Expand All @@ -443,6 +433,7 @@ mod test {
)
.is_err());

#[cfg(not(feature = "allow-duplicate-headers"))]
assert!(Email::parse(
// 2 from fields
b"\
Expand Down
9 changes: 8 additions & 1 deletion email-parser/src/parsing/address.rs
Expand Up @@ -66,7 +66,12 @@ pub fn angle_addr(input: &[u8]) -> Res<EmailAddress> {
}

pub fn name_addr(input: &[u8]) -> Res<Mailbox> {
let (input, display_name) = optional(input, phrase);
let (input, display_name) = if let (input, Some(display_name)) = optional(input, in_quotes) {
(input, Some(display_name))
} else {
optional(input, phrase)
};

let (input, angle_addr) = angle_addr(input)?;

Ok((
Expand Down Expand Up @@ -144,6 +149,8 @@ pub fn mailbox_list(input: &[u8]) -> Res<Vec<Mailbox>> {
mailboxes.push(new_mailbox);
}

let (input, _) = skip_whitespace(&input)?;

Ok((input, mailboxes))
}

Expand Down
5 changes: 5 additions & 0 deletions email-parser/src/parsing/character_sets.rs
Expand Up @@ -17,6 +17,11 @@ pub fn is_vchar(character: u8) -> bool {
character >= 0x21 && character <= 0x7e
}

/// Returns `true` when `character` lies in the inclusive byte range
/// `0x21..=0xfe`.
///
/// Space (`0x20`), the control bytes below it, and `0xff` are rejected.
#[inline]
pub fn is_codepage_vchar(character: u8) -> bool {
    matches!(character, 0x21..=0xfe)
}

#[inline]
pub fn is_alpha(c: u8) -> bool {
(c >= 0x41 && c <= 0x5a) || (c >= 0x61 && c <= 0x7a)
Expand Down
29 changes: 29 additions & 0 deletions email-parser/src/parsing/combinators.rs
@@ -1,6 +1,15 @@
use crate::prelude::*;
use std::borrow::Cow;

/// Consumes a line terminator from the start of `input`.
///
/// With the `compatibility-fixes` feature enabled, both `\r\n` and a bare
/// `\n` are accepted (`\r\n` is tried first). Without it, only `\r\n` is
/// accepted. `error_message` is forwarded to the underlying tag parser and
/// used when no terminator is found.
#[inline]
pub(crate) fn newline<'a>(input: &'a [u8], error_message: &'static str) -> Res<'a, ()> {
    #[cfg(feature = "compatibility-fixes")]
    {
        tag2(input, b"\r\n", b"\n", error_message)
    }

    #[cfg(not(feature = "compatibility-fixes"))]
    {
        tag(input, b"\r\n", error_message)
    }
}

#[inline]
pub(crate) fn tag<'a>(
input: &'a [u8],
Expand All @@ -15,6 +24,26 @@ pub(crate) fn tag<'a>(
}
}

#[inline]
pub(crate) fn tag2<'a>(
input: &'a [u8],
expected1: &'static [u8],
expected2: &'static [u8],
error_message: &'static str,
) -> Res<'a, ()> {
debug_assert!(std::str::from_utf8(expected1).is_ok());
if input.starts_with(expected1) {
Ok((unsafe { input.get_unchecked(expected1.len()..) }, ()))
} else {
debug_assert!(std::str::from_utf8(expected2).is_ok());
if input.starts_with(expected2) {
Ok((unsafe { input.get_unchecked(expected2.len()..) }, ()))
} else {
Err(Error::Explicit(error_message))
}
}
}

#[inline]
pub(crate) fn tag_no_case<'a>(
input: &'a [u8],
Expand Down