Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Count surrogate pairs. #2090

Draft
wants to merge 2 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions src/CsvHelper/Configuration/CsvConfiguration.cs
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,23 @@ public void Validate()
if (lineEndings.Contains(Delimiter)) throw new ConfigurationException($"The delimiter '{Delimiter}' cannot be a line ending. ('\\r', '\\n', '\\r\\n')");
if (whiteSpaceChars.Contains(Delimiter)) throw new ConfigurationException($"The delimiter '{Delimiter}' cannot be a WhiteSpaceChar.");

// Reject delimiters that contain an unpaired surrogate. A surrogate code unit is only
// acceptable as the first half of a valid high/low pair.
for (int i = 0; i < Delimiter.Length; i++)
{
	if (!char.IsSurrogate(Delimiter[i]))
	{
		continue;
	}

	// A surrogate at the very end has no partner; otherwise the next code unit must
	// complete a valid pair (high surrogate followed by low surrogate).
	if (i + 1 >= Delimiter.Length || !char.IsSurrogatePair(Delimiter[i], Delimiter[i + 1]))
	{
		throw new ConfigurationException($"The delimiter '{Delimiter}' has invalid surrogate characters.");
	}

	// Step over the low surrogate that was just validated. Without this the next
	// iteration would see the low surrogate on its own and wrongly reject a valid pair.
	i++;
}

// Detect Delimiter
if (DetectDelimiter && DetectDelimiterValues.Length == 0) throw new ConfigurationException($"At least one value is required for {nameof(DetectDelimiterValues)} when {nameof(DetectDelimiter)} is enabled.");
}
Expand Down
53 changes: 47 additions & 6 deletions src/CsvHelper/CsvParser.cs
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@ public class CsvParser : IParser, IDisposable
private string delimiter;
private char delimiterFirstChar;
private char[] buffer;
private char[] surrogateBuffer;
private char[] charCountBuffer;
private int bufferSize;
private int charsRead;
private int bufferPosition;
Expand All @@ -73,6 +75,7 @@ public class CsvParser : IParser, IDisposable
private bool fieldIsBadData;
private bool fieldIsQuoted;
private bool isProcessingField;
private bool isPartiallyProcessedSurrogate;
private bool isRecordProcessed;
private string[]? record;

Expand Down Expand Up @@ -204,6 +207,11 @@ public CsvParser(TextReader reader, IParserConfiguration configuration, bool lea
quote = configuration.Quote;
whiteSpaceChars = configuration.WhiteSpaceChars;
trimOptions = configuration.TrimOptions;
if (configuration.CountBytes)
{
surrogateBuffer = new char[2];
charCountBuffer = new char[1];
}

buffer = new char[bufferSize];
processFieldBuffer = new char[processFieldBufferSize];
Expand Down Expand Up @@ -347,7 +355,7 @@ private ReadLineResult ReadLine(ref char c, ref char cPrev)

if (countBytes)
{
byteCount += encoding.GetByteCount(new char[] { c });
CountBytes(ref c);
}

if (maxFieldSize > 0 && bufferPosition - fieldStartPosition - 1 > maxFieldSize)
Expand Down Expand Up @@ -504,13 +512,41 @@ private ReadLineResult ReadSpaces(ref char c)
charCount++;
if (countBytes)
{
byteCount += encoding.GetByteCount(new char[] { c });
CountBytes(ref c);
}
}

return ReadLineResult.Complete;
}

/// <summary>
/// Adds the encoded size of <paramref name="c"/> to <c>byteCount</c>.
/// A surrogate is not counted on its own: the first surrogate seen is held in
/// <c>surrogateBuffer</c> and the byte count is added once the following char
/// completes the pair.
/// </summary>
/// <param name="c">The char that was just read from the buffer.</param>
/// <exception cref="ParserException">
/// Thrown when a held surrogate is followed by a char that does not form a valid surrogate pair with it.
/// </exception>
private void CountBytes(ref char c)
{
	if (!isPartiallyProcessedSurrogate)
	{
		if (!char.IsSurrogate(c))
		{
			// Ordinary char: measure it through the reusable single-element buffer.
			charCountBuffer[0] = c;
			byteCount += encoding.GetByteCount(charCountBuffer);
			return;
		}

		// First half of a potential pair: hold it until the next char arrives.
		surrogateBuffer[0] = c;
		isPartiallyProcessedSurrogate = true;
		return;
	}

	// A surrogate is pending, so this char must complete the pair.
	bool completesPair = char.IsSurrogatePair(surrogateBuffer[0], c);
	if (!completesPair)
	{
		throw new ParserException(Context, "CSV file contains invalid surrogate pairs.");
	}

	surrogateBuffer[1] = c;
	byteCount += encoding.GetByteCount(surrogateBuffer);
	isPartiallyProcessedSurrogate = false;
}

private ReadLineResult ReadBlankLine(ref char c)
{
while (bufferPosition < charsRead)
Expand All @@ -534,7 +570,7 @@ private ReadLineResult ReadBlankLine(ref char c)
charCount++;
if (countBytes)
{
byteCount += encoding.GetByteCount(new char[] { c });
CountBytes(ref c);
}
}

Expand Down Expand Up @@ -565,7 +601,7 @@ private ReadLineResult ReadDelimiter(ref char c)
charCount++;
if (countBytes)
{
byteCount += encoding.GetByteCount(new[] { c });
CountBytes(ref c);
}

if (bufferPosition >= charsRead)
Expand Down Expand Up @@ -603,7 +639,7 @@ private ReadLineResult ReadLineEnding(ref char c)
charCount++;
if (countBytes)
{
byteCount += encoding.GetByteCount(new char[] { c });
CountBytes(ref c);
}
}
}
Expand Down Expand Up @@ -642,7 +678,7 @@ private ReadLineResult ReadNewLine(ref char c)
charCount++;
if (countBytes)
{
byteCount += encoding.GetByteCount(new[] { c });
CountBytes(ref c);
}

if (bufferPosition >= charsRead)
Expand Down Expand Up @@ -747,6 +783,11 @@ private bool FillBuffer()
charsRead = reader.Read(buffer, charsLeft, buffer.Length - charsLeft);
if (charsRead == 0)
{
if (isPartiallyProcessedSurrogate)
{
throw new ParserException(Context, "CSV file contains invalid surrogate pairs.");
}

return false;
}

Expand Down