Resolved additional encoding issues which could take place when displaying RT_VERSION resources. Now most strings handled by the program should be UTF-8 encoded.
JusticeRage · May 29, 2016 · af2b664 · af2b664
1 parent 7d1827c
commit af2b664
Show file tree

Hide file tree

Showing 7 changed files with 80 additions and 27 deletions.
diff --git a/include/manacommons/escape.h b/include/manacommons/escape.h
@@ -64,6 +64,8 @@ struct escaped_string_raw
  *
  *	Paths contained in debug information insert unescaped backslashes which cause
  *	the resulting JSON to be invalid.
+ *	Non-printable characters are not escaped in this grammar, because we expect
+ *	UTF-8 strings.
  *
  *	WARNING: Single quotes are NOT escaped.
  */
@@ -82,9 +84,7 @@ struct escaped_string_json
 			('\r', "\\r")('\t', "\\t")('\v', "\\v")('\\', "\\\\")
 			('\"', "\\\"");
 
-		// JSON fails miserably on non-printable characters, but
-		// at the same time doesn't support the \x notation.
-		esc_str = *(esc_char | boost::spirit::karma::iso8859_1::print | "\\u00" << karma::right_align(2, 0)[karma::hex]);
+		esc_str = *(esc_char | boost::spirit::karma::char_);
 	}
 
 	karma::rule<OutputIterator, std::string()> esc_str;

diff --git a/include/manape/utils.h b/include/manape/utils.h
@@ -70,7 +70,7 @@ std::string read_ascii_string(FILE* f, unsigned int max_bytes = 0);
  *
  *	@param	FILE* f The file from which to read. The read will occur at the cursor's current position!
  *
- *	@return	The string at the current location in the file, converted to ASCII.
+ *	@return	The string at the current location in the file, encoded as UTF-8.
  */
 std::string read_prefixed_unicode_string(FILE* f);
 
@@ -94,7 +94,7 @@ std::wstring read_prefixed_unicode_wstring(FILE* f);
  *	@param	int max_bytes The maximum number of bytes to read from the file. 0 means no limit.
  *			If this parameter is odd, it will be rounded to max_bytes-1 since bytes are read two by two.
  *
- *	@return	The string at the current location in the file, converted to ASCII.
+ *	@return	The string at the current location in the file, encoded as UTF-8.
  */
 std::string read_unicode_string(FILE* f, unsigned int max_bytes = 0);
 

diff --git a/manape/resources.cpp b/manape/resources.cpp
@@ -292,16 +292,8 @@ DECLSPEC const_shared_strings Resource::interpret_as()
 	// RT_STRING resources are made of 16 contiguous "unicode" strings.
 	for (int i = 0; i < 16; ++i)
 	{
-		std::wstring s = utils::read_prefixed_unicode_wstring(f);
+		res->push_back(utils::read_prefixed_unicode_string(f));
 		std::vector<boost::uint8_t> utf8result;
-		try 
-		{
-			utf8::utf16to8(s.begin(), s.end(), std::back_inserter(utf8result));
-			res->push_back(std::string(utf8result.begin(), utf8result.end()));
-		}
-		catch (utf8::invalid_utf16)  {
-			PRINT_WARNING << "Couldn't convert a string from a RT_STRING resource to UTF-8!" << std::endl;
-		}
 	}
 
 	END:

diff --git a/manape/utils.cpp b/manape/utils.cpp
@@ -61,11 +61,17 @@ std::string read_unicode_string(FILE* f, unsigned int max_bytes)
 		}
 	}
 
-	// Convert the wstring into a string
-	auto conv = boost::shared_array<char>(new char[s.size() + 1]);
-	memset(conv.get(), 0, sizeof(char) * (s.size() + 1));
-	wcstombs(conv.get(), s.c_str(), s.size());
-	return std::string(conv.get());
+	try
+	{
+		std::vector<boost::uint8_t> utf8result;
+		utf8::utf16to8(s.begin(), s.end(), std::back_inserter(utf8result));
+		return std::string(utf8result.begin(), utf8result.end());
+	}
+	catch (utf8::invalid_utf16) {
+		PRINT_WARNING << "Couldn't convert a string from a RT_STRING resource to UTF-8!" 
+					  << DEBUG_INFO << std::endl;
+	}
+	return "";
 }
 
 // ----------------------------------------------------------------------------
@@ -95,11 +101,17 @@ std::string read_prefixed_unicode_string(FILE* f)
 {
 	std::wstring s = read_prefixed_unicode_wstring(f);
 
-	// Convert the wstring into a string
-	auto conv = boost::shared_array<char>(new char[s.size() + 1]);
-	memset(conv.get(), 0, sizeof(char) * (s.size() + 1));
-	wcstombs(conv.get(), s.c_str(), s.size());
-	return std::string(conv.get());
+	try
+	{
+		std::vector<boost::uint8_t> utf8result;
+		utf8::utf16to8(s.begin(), s.end(), std::back_inserter(utf8result));
+		return std::string(utf8result.begin(), utf8result.end());
+	}
+	catch (utf8::invalid_utf16) {
+		PRINT_WARNING << "Couldn't convert a string from a RT_STRING resource to UTF-8!" 
+					  << DEBUG_INFO << std::endl;
+	}
+	return "";
 }
 
 // ----------------------------------------------------------------------------

diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
@@ -2,7 +2,7 @@ cmake_minimum_required (VERSION 2.6)
 project (manalyze-tests)
 include_directories(${PROJECT_SOURCE_DIR}/include)
 
-add_executable(manalyze-tests fixtures.cpp hash-library.cpp pe.cpp imports.cpp resources.cpp section.cpp escape.cpp
+add_executable(manalyze-tests fixtures.cpp hash-library.cpp pe.cpp imports.cpp resources.cpp section.cpp escape.cpp encoding.cpp
                               ../src/import_hash.cpp)
 
 target_link_libraries(

diff --git a/test/encoding.cpp b/test/encoding.cpp
@@ -0,0 +1,50 @@
+/*
+    This file is part of Manalyze.
+
+    Manalyze is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Manalyze is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Manalyze.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <string>
+#include <vector>
+
+#include <boost/test/unit_test.hpp>
+#include <boost/cstdint.hpp>
+
+#include "manape/utf8/utf8.h"
+
+// ----------------------------------------------------------------------------
+
+void check_conversion(std::wstring input, std::string expected)
+{
+	std::vector<boost::uint8_t> utf8result;
+	utf8::utf16to8(input.begin(), input.end(), std::back_inserter(utf8result));
+	std::string result(utf8result.begin(), utf8result.end());
+	BOOST_CHECK_EQUAL(result, expected);
+}
+
+// ----------------------------------------------------------------------------
+
+BOOST_AUTO_TEST_CASE(test_utf16_to_utf8)
+{
+	check_conversion(L"Simple ascii string", "Simple ascii string");
+	check_conversion(L"é", "\xc3\xa9");
+	check_conversion(L"©", "\xc2\xa9");
+	check_conversion(L"© Microsoft Corporation. All rights reserved.", "\xc2\xa9 Microsoft Corporation. All rights reserved.");
+	check_conversion(L"Ūnĭcōde̽", "\xc5\xaa\x6e\xc4\xad\x63\xc5\x8d\x64\x65\xcc\xbd");
+	check_conversion(L"Юникод", "\xd0\xae\xd0\xbd\xd0\xb8\xd0\xba\xd0\xbe\xd0\xb4");
+	check_conversion(L"უნიკოდი", "\xe1\x83\xa3\xe1\x83\x9c\xe1\x83\x98\xe1\x83\x99\xe1\x83\x9d\xe1\x83\x93\xe1\x83\x98");
+	check_conversion(L"標準萬國碼", "\xe6\xa8\x99\xe6\xba\x96\xe8\x90\xac\xe5\x9c\x8b\xe7\xa2\xbc");
+}
+
+// ----------------------------------------------------------------------------
diff --git a/test/escape.cpp b/test/escape.cpp
@@ -86,7 +86,6 @@ BOOST_AUTO_TEST_CASE(test_string_escape_json)
 	check_string_escaping_json("\"", "\\\"");
 	check_string_escaping_json("\\", "\\\\");
 	check_string_escaping_json("\\\\", "\\\\\\\\");
-	check_string_escaping_json("é", "\\u00e9");
-	check_string_escaping_json("\x01", "\\u0001");
+	check_string_escaping_json("\x01", "\x01");
 	check_string_escaping_json("\r\n", "\\r\\n");
 }