Skip to content

Commit

Permalink
fix: Fixed encoding detection for gutter (close #526) (#590)
Browse files Browse the repository at this point in the history
  • Loading branch information
edgardmessias authored and JohnstonCode committed Jun 1, 2019
1 parent 73f0ec6 commit 22e40f9
Show file tree
Hide file tree
Showing 6 changed files with 182 additions and 61 deletions.
44 changes: 32 additions & 12 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 0 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,6 @@
"style-check": "npx prettylint src/**/*.ts"
},
"dependencies": {
"is-utf8": "^0.2.1",
"minimatch": "^3.0.4",
"original-fs": "^1.0.0",
"semver": "^6.0.0",
Expand Down
2 changes: 1 addition & 1 deletion src/common/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,7 @@ export enum PropStatus {

export interface ICpOptions extends SpawnOptions {
cwd?: string;
encoding?: string;
encoding?: string | null;
log?: boolean;
username?: string;
password?: string;
Expand Down
72 changes: 72 additions & 0 deletions src/encoding.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
import { jschardet } from "./vscodeModules";

// One-time tuning of the shared jschardet module, applied when this file is loaded.
// Sets jschardet's MINIMUM_THRESHOLD constant to 0.2.
// NOTE(review): presumably the lowest confidence at which jschardet reports a guess — confirm against jschardet.
jschardet.Constants.MINIMUM_THRESHOLD = 0.2;
// Raises the Mac Cyrillic model's typical positive ratio by 0.001.
// NOTE(review): the reason is not visible here; presumably it biases detection between Cyrillic encodings — confirm.
jschardet.MacCyrillicModel.mTypicalPositiveRatio += 0.001;

/**
 * Identifies a text encoding from the byte-order mark at the start of a buffer.
 *
 * Recognized marks: FE FF ("utf16be"), FF FE ("utf16le") and EF BB BF ("utf8").
 *
 * @param buffer Raw bytes to inspect; a missing buffer is tolerated.
 * @returns The encoding name for the matching mark, or `null` when the buffer
 *          is missing, too short, or does not start with a known mark.
 */
function detectEncodingByBOM(buffer: Buffer): string | null {
  if (!buffer) {
    return null;
  }

  // Each entry pairs an encoding name with the exact leading bytes of its mark.
  const signatures: Array<[string, number[]]> = [
    ["utf16be", [0xfe, 0xff]],
    ["utf16le", [0xff, 0xfe]],
    ["utf8", [0xef, 0xbb, 0xbf]]
  ];

  for (const [name, mark] of signatures) {
    // A mark can only match when the buffer holds at least that many bytes.
    const startsWithMark =
      buffer.length >= mark.length &&
      mark.every((byte, offset) => buffer[offset] === byte);

    if (startsWithMark) {
      return name;
    }
  }

  return null;
}

// Detected encoding names (compared in lower case) for which detectEncoding()
// discards the guess and returns null.
const IGNORE_ENCODINGS = ["ascii", "utf-8", "utf-16", "utf-32"];

// Overrides applied by detectEncoding() after a detected name has been reduced
// to lower-case alphanumerics: key is the reduced name, value is the name
// returned instead. Names without an entry are returned in their reduced form.
const JSCHARDET_TO_ICONV_ENCODINGS: { [name: string]: string } = {
ibm866: "cp866",
big5: "cp950"
};

/**
 * Determines the text encoding of a buffer.
 *
 * A byte-order mark at the start of the buffer takes precedence. Otherwise the
 * jschardet guess is used, unless it is missing, has a confidence below 0.8,
 * or names one of the encodings listed in IGNORE_ENCODINGS.
 *
 * @param buffer Raw bytes whose encoding should be determined.
 * @returns The encoding name (lower-case alphanumerics, remapped through
 *          JSCHARDET_TO_ICONV_ENCODINGS when an entry exists), or `null` when
 *          no usable guess is available.
 */
export function detectEncoding(buffer: Buffer): string | null {
  const fromBom = detectEncodingByBOM(buffer);
  if (fromBom) {
    return fromBom;
  }

  const guess = jschardet.detect(buffer);
  if (!guess || !guess.encoding) {
    return null;
  }
  if (guess.confidence < 0.8) {
    return null;
  }

  const guessedName = guess.encoding;

  // Ignore encodings that jschardet cannot guess correctly
  // (http://chardet.readthedocs.io/en/latest/supported-encodings.html)
  if (IGNORE_ENCODINGS.includes(guessedName.toLowerCase())) {
    return null;
  }

  // Strip punctuation such as "-" and "_" first, then fold to lower case.
  const alphanumericOnly = guessedName.replace(/[^a-zA-Z0-9]/g, "");
  const canonicalName = alphanumericOnly.toLowerCase();

  return JSCHARDET_TO_ICONV_ENCODINGS[canonicalName] || canonicalName;
}
52 changes: 21 additions & 31 deletions src/svn.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import * as cp from "child_process";
import { EventEmitter } from "events";
import isUtf8 = require("is-utf8");
import * as proc from "process";
import { Readable } from "stream";
import {
Expand All @@ -9,12 +8,13 @@ import {
IExecutionResult,
ISvnOptions
} from "./common/types";
import * as encodeUtil from "./encoding";
import { configuration } from "./helpers/configuration";
import { parseInfoXml } from "./infoParser";
import SvnError from "./svnError";
import { Repository } from "./svnRepository";
import { dispose, IDisposable, toDisposable } from "./util";
import { iconv, jschardet } from "./vscodeModules";
import { iconv } from "./vscodeModules";

export const svnErrorCodes: { [key: string]: string } = {
AuthorizationFailed: "E170001",
Expand Down Expand Up @@ -102,9 +102,14 @@ export class Svn {
// Force non interactive environment
args.push("--non-interactive");

let encoding = options.encoding || "";
let encoding: string | undefined | null = options.encoding;
delete options.encoding;

// SVN with '--xml' always returns UTF-8 output, but jschardet misdetects it as 'TIS-620'
if (args.includes("--xml")) {
encoding = "utf8";
}

const defaults: cp.SpawnOptions = {
env: proc.env
};
Expand Down Expand Up @@ -156,35 +161,20 @@ export class Svn {

dispose(disposables);

// SVN with '--xml' always returns UTF-8 output, but jschardet misdetects it as 'TIS-620'
if (args.includes("--xml")) {
encoding = "utf8";
} else if (encoding === "") {
encoding = "utf8"; // Initial encoding

const defaultEncoding = configuration.get<string>("default.encoding");
if (defaultEncoding) {
if (!iconv.encodingExists(defaultEncoding)) {
this.logOutput(
"svn.default.encoding: Invalid Parameter: '" +
defaultEncoding +
"'.\n"
);
} else if (!isUtf8(stdout)) {
encoding = defaultEncoding;
}
} else {
jschardet.MacCyrillicModel.mTypicalPositiveRatio += 0.001;

const encodingGuess = jschardet.detect(stdout);

if (
encodingGuess.confidence > 0.8 &&
iconv.encodingExists(encodingGuess.encoding)
) {
encoding = encodingGuess.encoding;
}
if (!encoding) {
encoding = encodeUtil.detectEncoding(stdout);
}

// if not detected
if (!encoding) {
encoding = configuration.get<string>("default.encoding");
}

if (!iconv.encodingExists(encoding)) {
if (encoding) {
console.warn(`SVN: The encoding "${encoding}" is invalid`);
}
encoding = "utf8";
}

const decodedStdout = iconv.decode(stdout, encoding);
Expand Down
Loading

0 comments on commit 22e40f9

Please sign in to comment.