GT-3407 Fix UnsupportedOperationException with JISAutoDetect charset. #1358

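Fixes GitHub issue #1358. Some character sets (such as auto-detecting, decode-only charsets) do not support encoding, so asking them to encode a character throws UnsupportedOperationException; the charset is now checked with canEncode() before it is used to re-encode a character.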
NationalSecurityAgency · Dec 23, 2019 · 93bcabe · 93bcabe
1 parent 4c57727
commit 93bcabe
Show file tree

Hide file tree

Showing 2 changed files with 39 additions and 22 deletions.
diff --git a/...ramework/SoftwareModeling/src/main/java/ghidra/program/model/data/StringDataInstance.java b/...ramework/SoftwareModeling/src/main/java/ghidra/program/model/data/StringDataInstance.java
@@ -684,8 +684,8 @@ private String getStringRep(char quoteChar, char quoteCharMulti) {
 
 		// if we get the same number of characters out that we put into the decoder,
 		// then its a good chance there is a one-to-one correspondence between original char
-		// and decoded char.
-		boolean canRecoverOriginalCharBytes =
+		// offsets and decoded char offsets.
+		boolean isByteToStringCharEquiv =
 			stringValue.length() == ((stringBytes.length - aci.byteStartOffset) / charSize);
 
 		stringValue = stringLayout.shouldTrimTrailingNulls() ? trimNulls(stringValue) : stringValue;
@@ -701,22 +701,10 @@ private String getStringRep(char quoteChar, char quoteCharMulti) {
 		// For each 32bit character in the java string try to add it to the StringRenderBuilder
 		for (int i = 0, strLength = stringValue.length(); i < strLength;) {
 			int codePoint = stringValue.codePointAt(i);
-			byte[] originalCharBytes;
-			if (canRecoverOriginalCharBytes) {
-				originalCharBytes = new byte[charSize];
-				System.arraycopy(stringBytes, i * charSize + aci.byteStartOffset, originalCharBytes,
-					0, charSize);
-			}
-			else {
-				// can't get original bytes, cheat and run the codePoint through the charset
-				// to get what should be the same as the original bytes.
-				String singleCharStr = new String(new int[] { codePoint }, 0, 1);
-				originalCharBytes = convertStringToBytes(singleCharStr, aci);
-			}
 
 			RENDER_ENUM currentCharRenderSetting = renderSetting;
-			if (codePoint == StringUtilities.UNICODE_REPLACEMENT && canRecoverOriginalCharBytes &&
-				isMismatchedCharBytes(originalCharBytes, codePoint)) {
+			if (codePoint == StringUtilities.UNICODE_REPLACEMENT && isByteToStringCharEquiv &&
+				!isReplacementCharAt(stringBytes, i * charSize + aci.byteStartOffset)) {
 				// if this is a true decode error and we can recover the original bytes,
 				// then force the render mode to byte seq.
 				currentCharRenderSetting = RENDER_ENUM.BYTE_SEQ;
@@ -753,7 +741,8 @@ else if (Character.isISOControl(codePoint) || !Character.isDefined(codePoint) ||
 						strBuf.addCodePointChar(codePoint);
 						break;
 					case BYTE_SEQ:
-						strBuf.addByteSeq(originalCharBytes);
+						strBuf.addByteSeq(getOriginalBytes(isByteToStringCharEquiv, i, codePoint,
+							stringBytes, aci));
 						break;
 					case ESC_SEQ:
 						strBuf.addEscapedCodePoint(codePoint);
@@ -779,6 +768,26 @@ else if (Character.isISOControl(codePoint) || !Character.isDefined(codePoint) ||
 		return prefix + strBuf.toString();
 	}
 
+	private byte[] getOriginalBytes(boolean isByteToStringCharEquiv, int charOffset, int codePoint,
+			byte[] stringBytes, AdjustedCharsetInfo aci) {
+
+		if (isByteToStringCharEquiv) {
+			byte[] originalCharBytes = new byte[charSize];
+			System.arraycopy(stringBytes, charOffset * charSize + aci.byteStartOffset,
+				originalCharBytes, 0, charSize);
+			return originalCharBytes;
+		}
+
+		// can't get original bytes, cheat and run the codePoint through the charset
+		// to get what should be the same as the original bytes.
+		String singleCharStr = new String(new int[] { codePoint }, 0, 1);
+		Charset cs = Charset.isSupported(aci.charsetName) ? Charset.forName(aci.charsetName) : null;
+		if (cs == null || !cs.canEncode()) {
+			return null;
+		}
+		return singleCharStr.getBytes(cs);
+	}
+
 	/**
 	 * Trims trailing nulls off the end of the string.
 	 *
@@ -837,10 +846,13 @@ public String getCharRepresentation() {
 			StringRenderBuilder.DOUBLE_QUOTE);
 	}
 
-	private boolean isMismatchedCharBytes(byte[] originalCharBytes, int codePoint) {
-		long originalValue = DataConverter.getInstance(buf.isBigEndian()).getValue(
-			originalCharBytes, Math.min(charSize, originalCharBytes.length));
-		return originalValue != codePoint;
+	private boolean isReplacementCharAt(byte[] stringBytes, int byteOffset) {
+		if (byteOffset + charSize > stringBytes.length) {
+			return false;
+		}
+		long origCodePointValue = DataConverter.getInstance(buf.isBigEndian()).getValue(stringBytes,
+			byteOffset, charSize);
+		return origCodePointValue == StringUtilities.UNICODE_REPLACEMENT;
 	}
 
 	private static String getTranslatedStringRepresentation(String translatedString) {

diff --git a/...amework/SoftwareModeling/src/main/java/ghidra/program/model/data/StringRenderBuilder.java b/...amework/SoftwareModeling/src/main/java/ghidra/program/model/data/StringRenderBuilder.java
@@ -112,9 +112,14 @@ public void addCodePointValue(int codePoint) {
 	 * <p>
 	 * {@literal { 0, 1, 2 } -> 00,01,02}
 	 *
-	 * @param bytes
+	 * @param bytes to convert to hex and append.  If null, append "???"
 	 */
 	public void addByteSeq(byte[] bytes) {
+		if (bytes == null) {
+			ensureByteMode();
+			sb.append("???");
+			return;
+		}
 		for (int i = 0; i < bytes.length; i++) {
 			ensureByteMode();
 			String valStr = Integer.toHexString(bytes[i] & 0xff).toUpperCase();