Add a new function DetectEncodings that returns all possible encodings that were detected.
Ghabry · Aug 11, 2016 · 6602cf1 · fdelapena · Aug 11, 2016 · 6602cf1
1 parent a33ba3a
commit 6602cf1
Show file tree

Hide file tree

Showing 2 changed files with 56 additions and 46 deletions.
diff --git a/src/reader_util.cpp b/src/reader_util.cpp
@@ -71,12 +71,22 @@ std::string ReaderUtil::CodepageToEncoding(int codepage) {
 	return outs;
 }
 
-std::string ReaderUtil::DetectEncoding(const std::string& database_file) {
-	std::string encoding;
+std::string ReaderUtil::DetectEncoding(const std::string &database_file) {
+	std::vector<std::string> encodings = DetectEncodings(database_file);
+
+	if (encodings.empty()) {
+		return "";
+	}
+
+	return encodings.front();
+}
+
+std::vector<std::string> ReaderUtil::DetectEncodings(const std::string& database_file) {
+	std::vector<std::string> encodings;
 #ifdef LCF_SUPPORT_ICU
 	std::ostringstream text;
 
-	//Populate Data::terms and Data::system or will empty by default even if load fails
+	// Populate Data::terms and Data::system; they are empty by default even if the load fails
 	LDB_Reader::Load(database_file, "");
 
 	text <<
@@ -124,58 +134,48 @@ std::string ReaderUtil::DetectEncoding(const std::string& database_file) {
 	Data::system.battletest_background <<
 	Data::system.frame_name;
 
-	if (!text.str().empty())
-	{
+	if (!text.str().empty()) {
 		UErrorCode status = U_ZERO_ERROR;
 		UCharsetDetector* detector = ucsdet_open(&status);
 
 		std::string s = text.str();
 		ucsdet_setText(detector, s.c_str(), s.length(), &status);
 
-		const UCharsetMatch* match = ucsdet_detect(detector, &status);
-		if (match != NULL)
-		{
-			encoding = ucsdet_getName(match, &status);
+		int32_t matches_count;
+		const UCharsetMatch** matches = ucsdet_detectAll(detector, &matches_count, &status);
+
+		if (matches != NULL) {
+			// Collect all candidates, most confident comes first
+			for (int i = 0; i < matches_count; ++i) {
+				std::string encoding = ucsdet_getName(matches[i], &status);
+
+				// Fixes to ensure proper Windows encodings
+				if (encoding == "Shift_JIS") {
+					encodings.push_back("ibm-943_P15A-2003"); // Japanese with \ as backslash
+				} else if (encoding == "EUC-KR") {
+					encodings.push_back("windows-949-2000"); // Korean with \ as backslash
+				} else if (encoding == "ISO-8859-1" || encoding == "windows-1252") {
+					encodings.push_back("ibm-5348_P100-1997"); // Occidental with Euro
+				} else if (encoding == "ISO-8859-2" || encoding == "windows-1250") {
+					encodings.push_back("ibm-5346_P100-1998"); // Central Europe with Euro
+				} else if (encoding == "ISO-8859-5" || encoding == "windows-1251") {
+					encodings.push_back("ibm-5347_P100-1998"); // Cyrillic with Euro
+				} else if (encoding == "ISO-8859-6" || encoding == "windows-1256") {
+					encodings.push_back("ibm-9448_X100-2005"); // Arabic with Euro + 8 chars
+				} else if (encoding == "ISO-8859-7" || encoding == "windows-1253") {
+					encodings.push_back("ibm-5349_P100-1998"); // Greek with Euro
+				} else if (encoding == "ISO-8859-8" || encoding == "windows-1255") {
+					encodings.push_back("ibm-9447_P100-2002"); // Hebrew with Euro
+				} else {
+					encodings.push_back(encoding);
+				}
+			}
 		}
 		ucsdet_close(detector);
-
-		// Fixes to ensure proper Windows encodings
-		if (encoding == "Shift_JIS")
-		{
-			encoding = "ibm-943_P15A-2003"; // Japanese with \ as backslash
-		}
-		else if (encoding == "EUC-KR")
-		{
-			encoding = "windows-949-2000"; // Korean with \ as backlash
-		}
-		else if (encoding == "ISO-8859-1" || encoding == "windows-1252")
-		{
-			encoding = "ibm-5348_P100-1997"; // Occidental with Euro
-		}
-		else if (encoding == "ISO-8859-2" || encoding == "windows-1250")
-		{
-			encoding = "ibm-5346_P100-1998"; // Central Europe with Euro
-		}
-		else if (encoding == "ISO-8859-5" || encoding == "windows-1251")
-		{
-			encoding = "ibm-5347_P100-1998"; // Cyrillic with Euro
-		}
-		else if (encoding == "ISO-8859-6" || encoding == "windows-1256")
-		{
-			encoding = "ibm-9448_X100-2005"; // Arabic with Euro + 8 chars
-		}
-		else if (encoding == "ISO-8859-7" || encoding == "windows-1253")
-		{
-			encoding = "ibm-5349_P100-1998"; // Greek with Euro
-		}
-		else if (encoding == "ISO-8859-8" || encoding == "windows-1255")
-		{
-			encoding = "ibm-9447_P100-2002"; // Hebrew with Euro
-		}
 	}
 #endif
 
-	return encoding;
+	return encodings;
 }
 
 std::string ReaderUtil::GetEncoding(const std::string& ini_file) {

diff --git a/src/reader_util.h b/src/reader_util.h
@@ -22,14 +22,24 @@ namespace ReaderUtil {
 	std::string CodepageToEncoding(int codepage);
 
 	/**
-	 * Detects the encoding based on text analysis.
+ 	 * Detects the encoding based on text analysis.
 	 *
 	 * @param text a string with few hundred of words to analyze.
 	 *
 	 * @return encoding or empty string if not detected.
-	 */
+ 	 */
 	std::string DetectEncoding(const std::string& database_file);
 
+	/**
+	 * Detects the encoding based on text analysis and returns a vector of
+	 * possible candidates, with the most confident candidate first.
+	 *
+	 * @param database_file database file whose text is loaded and analyzed.
+	 *
+	 * @return list of encodings or empty if not detected
+	 */
+	std::vector<std::string> DetectEncodings(const std::string& database_file);
+
 	/**
 	 * Returns the encoding set in the ini file.
 	 *