Skip to content

Commit

Permalink
Add a new function DetectEncodings that returns all possible encoding…
Browse files Browse the repository at this point in the history
…s that were detected.
  • Loading branch information
Ghabry committed Aug 11, 2016
1 parent a33ba3a commit 6602cf1
Show file tree
Hide file tree
Showing 2 changed files with 56 additions and 46 deletions.
88 changes: 44 additions & 44 deletions src/reader_util.cpp
Expand Up @@ -71,12 +71,22 @@ std::string ReaderUtil::CodepageToEncoding(int codepage) {
return outs;
}

std::string ReaderUtil::DetectEncoding(const std::string& database_file) {
std::string encoding;
// Returns the best encoding candidate for the given database file: the first
// entry reported by DetectEncodings, or an empty string when there is none.
std::string ReaderUtil::DetectEncoding(const std::string &database_file) {
std::vector<std::string> encodings = DetectEncodings(database_file);

// Nothing was detected, so report "no encoding" as an empty string.
if (encodings.empty()) {
return "";
}

// DetectEncodings collects candidates with the most confident one first.
return encodings.front();
}

std::vector<std::string> ReaderUtil::DetectEncodings(const std::string& database_file) {
std::vector<std::string> encodings;
#ifdef LCF_SUPPORT_ICU
std::ostringstream text;

//Populate Data::terms and Data::system or will empty by default even if load fails
// Populate Data::terms and Data::system; they are left empty by default even if the load fails
LDB_Reader::Load(database_file, "");

text <<
Expand Down Expand Up @@ -124,58 +134,48 @@ std::string ReaderUtil::DetectEncoding(const std::string& database_file) {
Data::system.battletest_background <<
Data::system.frame_name;

if (!text.str().empty())
{
if (!text.str().empty()) {
UErrorCode status = U_ZERO_ERROR;
UCharsetDetector* detector = ucsdet_open(&status);

std::string s = text.str();
ucsdet_setText(detector, s.c_str(), s.length(), &status);

const UCharsetMatch* match = ucsdet_detect(detector, &status);
if (match != NULL)
{
encoding = ucsdet_getName(match, &status);
int32_t matches_count;
const UCharsetMatch** matches = ucsdet_detectAll(detector, &matches_count, &status);

if (matches != NULL) {
// Collect all candidates, most confident comes first
for (int i = 0; i < matches_count; ++i) {
std::string encoding = ucsdet_getName(matches[i], &status);

// Fixes to ensure proper Windows encodings
if (encoding == "Shift_JIS") {
encodings.push_back("ibm-943_P15A-2003"); // Japanese with \ as backslash
} else if (encoding == "EUC-KR") {
encodings.push_back("windows-949-2000"); // Korean with \ as backslash
} else if (encoding == "ISO-8859-1" || encoding == "windows-1252") {
encodings.push_back("ibm-5348_P100-1997"); // Occidental with Euro
} else if (encoding == "ISO-8859-2" || encoding == "windows-1250") {
encodings.push_back("ibm-5346_P100-1998"); // Central Europe with Euro
} else if (encoding == "ISO-8859-5" || encoding == "windows-1251") {
encodings.push_back("ibm-5347_P100-1998"); // Cyrillic with Euro
} else if (encoding == "ISO-8859-6" || encoding == "windows-1256") {
encodings.push_back("ibm-9448_X100-2005"); // Arabic with Euro + 8 chars
} else if (encoding == "ISO-8859-7" || encoding == "windows-1253") {
encodings.push_back("ibm-5349_P100-1998"); // Greek with Euro
} else if (encoding == "ISO-8859-8" || encoding == "windows-1255") {
encodings.push_back("ibm-9447_P100-2002"); // Hebrew with Euro
} else {
encodings.push_back(encoding);
}
}
}
ucsdet_close(detector);

// Fixes to ensure proper Windows encodings
if (encoding == "Shift_JIS")
{
encoding = "ibm-943_P15A-2003"; // Japanese with \ as backslash
}
else if (encoding == "EUC-KR")
{
encoding = "windows-949-2000"; // Korean with \ as backlash
}
else if (encoding == "ISO-8859-1" || encoding == "windows-1252")
{
encoding = "ibm-5348_P100-1997"; // Occidental with Euro
}
else if (encoding == "ISO-8859-2" || encoding == "windows-1250")
{
encoding = "ibm-5346_P100-1998"; // Central Europe with Euro
}
else if (encoding == "ISO-8859-5" || encoding == "windows-1251")
{
encoding = "ibm-5347_P100-1998"; // Cyrillic with Euro
}
else if (encoding == "ISO-8859-6" || encoding == "windows-1256")
{
encoding = "ibm-9448_X100-2005"; // Arabic with Euro + 8 chars
}
else if (encoding == "ISO-8859-7" || encoding == "windows-1253")
{
encoding = "ibm-5349_P100-1998"; // Greek with Euro
}
else if (encoding == "ISO-8859-8" || encoding == "windows-1255")
{
encoding = "ibm-9447_P100-2002"; // Hebrew with Euro
}
}
#endif

return encoding;
return encodings;
}

std::string ReaderUtil::GetEncoding(const std::string& ini_file) {
Expand Down
14 changes: 12 additions & 2 deletions src/reader_util.h
Expand Up @@ -22,14 +22,24 @@ namespace ReaderUtil {
std::string CodepageToEncoding(int codepage);

/**
* Detects the encoding based on text analysis.
* Detects the encoding based on text analysis.
*
* @param database_file database file whose text is loaded and analyzed.
*
* @return encoding or empty string if not detected.
*/
*/

This comment has been minimized.

Copy link
@fdelapena

fdelapena Aug 11, 2016

Stray spaces added

std::string DetectEncoding(const std::string& database_file);

/**
* Detects the encoding based on text analysis and returns a vector with
* the possible candidates, the most confident candidate being at the
* beginning.
*
* @param database_file database file whose text is loaded and analyzed.
*
* @return list of candidate encodings, or an empty list if none was detected.
*/
std::vector<std::string> DetectEncodings(const std::string& database_file);

/**
* Returns the encoding set in the ini file.
*
Expand Down

0 comments on commit 6602cf1

Please sign in to comment.