From 22e40f9303afdd349223c7123c0c00666ecb026b Mon Sep 17 00:00:00 2001 From: Edgard Lorraine Messias Date: Sat, 1 Jun 2019 09:09:46 -0300 Subject: [PATCH] fix: Fixed encoding detection for gutter (close #526) (#590) --- package-lock.json | 44 +++++++++++++++++++-------- package.json | 1 - src/common/types.ts | 2 +- src/encoding.ts | 72 ++++++++++++++++++++++++++++++++++++++++++++ src/svn.ts | 52 +++++++++++++------------------- src/svnRepository.ts | 72 ++++++++++++++++++++++++++++++++++---------- 6 files changed, 182 insertions(+), 61 deletions(-) create mode 100644 src/encoding.ts diff --git a/package-lock.json b/package-lock.json index fa6e4755..c19b4d1e 100644 --- a/package-lock.json +++ b/package-lock.json @@ -2765,7 +2765,8 @@ "ansi-regex": { "version": "2.1.1", "bundled": true, - "dev": true + "dev": true, + "optional": true }, "aproba": { "version": "1.2.0", @@ -2786,12 +2787,14 @@ "balanced-match": { "version": "1.0.0", "bundled": true, - "dev": true + "dev": true, + "optional": true }, "brace-expansion": { "version": "1.1.11", "bundled": true, "dev": true, + "optional": true, "requires": { "balanced-match": "^1.0.0", "concat-map": "0.0.1" @@ -2806,17 +2809,20 @@ "code-point-at": { "version": "1.1.0", "bundled": true, - "dev": true + "dev": true, + "optional": true }, "concat-map": { "version": "0.0.1", "bundled": true, - "dev": true + "dev": true, + "optional": true }, "console-control-strings": { "version": "1.1.0", "bundled": true, - "dev": true + "dev": true, + "optional": true }, "core-util-is": { "version": "1.0.2", @@ -2933,7 +2939,8 @@ "inherits": { "version": "2.0.3", "bundled": true, - "dev": true + "dev": true, + "optional": true }, "ini": { "version": "1.3.5", @@ -2945,6 +2952,7 @@ "version": "1.0.0", "bundled": true, "dev": true, + "optional": true, "requires": { "number-is-nan": "^1.0.0" } @@ -2959,6 +2967,7 @@ "version": "3.0.4", "bundled": true, "dev": true, + "optional": true, "requires": { "brace-expansion": "^1.1.7" } @@ -2966,12 +2975,14 @@ "minimist": { "version": "0.0.8", "bundled": true, - "dev": true + "dev": true, + "optional": true }, "minipass": { "version": "2.3.5", "bundled": true, "dev": true, + "optional": true, "requires": { "safe-buffer": "^5.1.2", "yallist": "^3.0.0" @@ -2990,6 +3001,7 @@ "version": "0.5.1", "bundled": true, "dev": true, + "optional": true, "requires": { "minimist": "0.0.8" } @@ -3070,7 +3082,8 @@ "number-is-nan": { "version": "1.0.1", "bundled": true, - "dev": true + "dev": true, + "optional": true }, "object-assign": { "version": "4.1.1", @@ -3082,6 +3095,7 @@ "version": "1.4.0", "bundled": true, "dev": true, + "optional": true, "requires": { "wrappy": "1" } @@ -3167,7 +3181,8 @@ "safe-buffer": { "version": "5.1.2", "bundled": true, - "dev": true + "dev": true, + "optional": true }, "safer-buffer": { "version": "2.1.2", @@ -3203,6 +3218,7 @@ "version": "1.0.2", "bundled": true, "dev": true, + "optional": true, "requires": { "code-point-at": "^1.0.0", "is-fullwidth-code-point": "^1.0.0", @@ -3222,6 +3238,7 @@ "version": "3.0.1", "bundled": true, "dev": true, + "optional": true, "requires": { "ansi-regex": "^2.0.0" } @@ -3265,12 +3282,14 @@ "wrappy": { "version": "1.0.2", "bundled": true, - "dev": true + "dev": true, + "optional": true }, "yallist": { "version": "3.0.3", "bundled": true, - "dev": true + "dev": true, + "optional": true } } }, @@ -4099,7 +4118,8 @@ "is-utf8": { "version": "0.2.1", "resolved": "https://registry.npmjs.org/is-utf8/-/is-utf8-0.2.1.tgz", - "integrity": "sha1-Sw2hRCEE0bM2NA6AeX6GXPOffXI=" + "integrity": "sha1-Sw2hRCEE0bM2NA6AeX6GXPOffXI=", + "dev": true }, "is-windows": { "version": "1.0.2", diff --git a/package.json b/package.json index 3516edd8..6458c743 100644 --- a/package.json +++ b/package.json @@ -46,7 +46,6 @@ "style-check": "npx prettylint src/**/*.ts" }, "dependencies": { - "is-utf8": "^0.2.1", "minimatch": "^3.0.4", "original-fs": "^1.0.0", "semver": "^6.0.0", diff --git a/src/common/types.ts b/src/common/types.ts index 3797e522..44fd9fe5 100644 --- a/src/common/types.ts +++ b/src/common/types.ts @@ -190,7 +190,7 @@ export enum PropStatus { export interface ICpOptions extends SpawnOptions { cwd?: string; - encoding?: string; + encoding?: string | null; log?: boolean; username?: string; password?: string; diff --git a/src/encoding.ts b/src/encoding.ts new file mode 100644 index 00000000..5f4587eb --- /dev/null +++ b/src/encoding.ts @@ -0,0 +1,72 @@ +import { jschardet } from "./vscodeModules"; + +jschardet.Constants.MINIMUM_THRESHOLD = 0.2; +jschardet.MacCyrillicModel.mTypicalPositiveRatio += 0.001; + +function detectEncodingByBOM(buffer: Buffer): string | null { + if (!buffer || buffer.length < 2) { + return null; + } + + const b0 = buffer.readUInt8(0); + const b1 = buffer.readUInt8(1); + + // UTF-16 BE + if (b0 === 0xfe && b1 === 0xff) { + return "utf16be"; + } + + // UTF-16 LE + if (b0 === 0xff && b1 === 0xfe) { + return "utf16le"; + } + + if (buffer.length < 3) { + return null; + } + + const b2 = buffer.readUInt8(2); + + // UTF-8 + if (b0 === 0xef && b1 === 0xbb && b2 === 0xbf) { + return "utf8"; + } + + return null; +} + +const IGNORE_ENCODINGS = ["ascii", "utf-8", "utf-16", "utf-32"]; + +const JSCHARDET_TO_ICONV_ENCODINGS: { [name: string]: string } = { + ibm866: "cp866", + big5: "cp950" +}; + +export function detectEncoding(buffer: Buffer): string | null { + const result = detectEncodingByBOM(buffer); + + if (result) { + return result; + } + + const detected = jschardet.detect(buffer); + + if (!detected || !detected.encoding || detected.confidence < 0.8) { + return null; + } + + const encoding = detected.encoding; + + // Ignore encodings that cannot guess correctly + // (http://chardet.readthedocs.io/en/latest/supported-encodings.html) + if (0 <= IGNORE_ENCODINGS.indexOf(encoding.toLowerCase())) { + return null; + } + + const normalizedEncodingName = encoding + .replace(/[^a-zA-Z0-9]/g, "") + .toLowerCase(); + const mapped = JSCHARDET_TO_ICONV_ENCODINGS[normalizedEncodingName]; + + return mapped || normalizedEncodingName; +} diff --git a/src/svn.ts b/src/svn.ts index 44b46549..cded3104 100644 --- a/src/svn.ts +++ b/src/svn.ts @@ -1,6 +1,5 @@ import * as cp from "child_process"; import { EventEmitter } from "events"; -import isUtf8 = require("is-utf8"); import * as proc from "process"; import { Readable } from "stream"; import { @@ -9,12 +8,13 @@ import { IExecutionResult, ISvnOptions } from "./common/types"; +import * as encodeUtil from "./encoding"; import { configuration } from "./helpers/configuration"; import { parseInfoXml } from "./infoParser"; import SvnError from "./svnError"; import { Repository } from "./svnRepository"; import { dispose, IDisposable, toDisposable } from "./util"; -import { iconv, jschardet } from "./vscodeModules"; +import { iconv } from "./vscodeModules"; export const svnErrorCodes: { [key: string]: string } = { AuthorizationFailed: "E170001", @@ -102,9 +102,14 @@ export class Svn { // Force non interactive environment args.push("--non-interactive"); - let encoding = options.encoding || ""; + let encoding: string | undefined | null = options.encoding; delete options.encoding; + // SVN with '--xml' always return 'UTF-8', and jschardet detects this encoding: 'TIS-620' + if (args.includes("--xml")) { + encoding = "utf8"; + } + const defaults: cp.SpawnOptions = { env: proc.env }; @@ -156,35 +161,20 @@ export class Svn { dispose(disposables); - // SVN with '--xml' always return 'UTF-8', and jschardet detects this encoding: 'TIS-620' - if (args.includes("--xml")) { - encoding = "utf8"; - } else if (encoding === "") { - encoding = "utf8"; // Initial encoding - - const defaultEncoding = configuration.get("default.encoding"); - if (defaultEncoding) { - if (!iconv.encodingExists(defaultEncoding)) { - this.logOutput( - "svn.default.encoding: Invalid Parameter: '" + - defaultEncoding + - "'.\n" - ); - } else if (!isUtf8(stdout)) { - encoding = defaultEncoding; - } - } else { - jschardet.MacCyrillicModel.mTypicalPositiveRatio += 0.001; - - const encodingGuess = jschardet.detect(stdout); - - if ( - encodingGuess.confidence > 0.8 && - iconv.encodingExists(encodingGuess.encoding) - ) { - encoding = encodingGuess.encoding; - } + if (!encoding) { + encoding = encodeUtil.detectEncoding(stdout); + } + + // if not detected + if (!encoding) { + encoding = configuration.get("default.encoding"); + } + + if (!iconv.encodingExists(encoding)) { + if (encoding) { + console.warn(`SVN: The encoding "${encoding}" is invalid`); } + encoding = "utf8"; } const decodedStdout = iconv.decode(stdout, encoding); diff --git a/src/svnRepository.ts b/src/svnRepository.ts index 9b897b2d..2b9744b7 100644 --- a/src/svnRepository.ts +++ b/src/svnRepository.ts @@ -12,6 +12,7 @@ import { SvnDepth } from "./common/types"; import { sequentialize } from "./decorators"; +import * as encodeUtil from "./encoding"; import { exists, writeFile } from "./fs"; import { getBranchName } from "./helpers/branch"; import { configuration } from "./helpers/configuration"; @@ -20,7 +21,12 @@ import { parseSvnList } from "./listParser"; import { parseSvnLog } from "./logParser"; import { parseStatusXml } from "./statusParser"; import { Svn } from "./svn"; -import { fixPathSeparator, fixPegRevision, unwrap } from "./util"; +import { + fixPathSeparator, + fixPegRevision, + normalizePath, + unwrap +} from "./util"; export class Repository { private _infoCache: { [index: string]: ISvnInfo } = {}; @@ -165,20 +171,23 @@ export class Repository { public async show(file: string | Uri, revision?: string): Promise { const args = ["cat"]; - let target: string; + + let uri: Uri; + let filePath: string; + if (file instanceof Uri) { - target = file.toString(true); + uri = file; + filePath = file.toString(true); } else { - target = file; + uri = Uri.file(file); + filePath = file; } + + let target: string = this.removeAbsolutePath(filePath); if (revision) { args.push("-r", revision); - if ( - typeof file === "string" && - !["BASE", "COMMITTED", "PREV"].includes(revision.toUpperCase()) - ) { + if (["BASE", "COMMITTED", "PREV"].includes(revision.toUpperCase())) { const info = await this.getInfo(); - target = this.removeAbsolutePath(target); target = info.url + "/" + target.replace(/\\/g, "/"); // TODO move to SvnRI } @@ -186,13 +195,44 @@ export class Repository { args.push(target); - let encoding = "utf8"; - if (typeof file === "string") { - const uri = Uri.file(file); - file = this.removeAbsolutePath(file); - encoding = workspace - .getConfiguration("files", uri) - .get("encoding", encoding); + /** + * ENCODE DETECTION + * if TextDocuments exists and autoGuessEncoding is true, + * try detect current encoding of content + */ + const configs = workspace.getConfiguration("files", uri); + + let encoding: string | undefined | null = configs.get("encoding"); + let autoGuessEncoding: boolean = configs.get( + "autoGuessEncoding", + false + ); + + const textDocument = workspace.textDocuments.find( + doc => normalizePath(doc.uri.fsPath) === normalizePath(filePath) + ); + + if (textDocument) { + // Load encoding by languageId + const languageConfigs = workspace.getConfiguration( + `[${textDocument.languageId}]`, + uri + ); + if (languageConfigs["files.encoding"] !== undefined) { + encoding = languageConfigs["files.encoding"]; + } + if (languageConfigs["files.autoGuessEncoding"] !== undefined) { + autoGuessEncoding = languageConfigs["files.autoGuessEncoding"]; + } + + if (autoGuessEncoding) { + // The `getText` return a `utf-8` string + const buffer = Buffer.from(textDocument.getText(), "utf-8"); + const detectedEncoding = encodeUtil.detectEncoding(buffer); + if (detectedEncoding) { + encoding = detectedEncoding; + } + } } const result = await this.exec(args, { encoding });