Skip to content

Commit

Permalink
added possibility to pass the "und" language value to ucto. Language d…
Browse files Browse the repository at this point in the history
…etection code gets even messier
  • Loading branch information
kosloot committed Feb 7, 2023
1 parent 818c7e9 commit eeeb7e4
Showing 1 changed file with 23 additions and 11 deletions.
34 changes: 23 additions & 11 deletions src/FrogAPI.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -267,6 +267,10 @@ bool FrogAPI::collect_options( TiCC::CL_Options& Opts,
<< " option. " << endl;
return false;
}
auto it = find( lang_v.begin(), lang_v.end(), "und" );
if ( it != lang_v.end() ){
lang_v.erase( it );
}
language = lang_v[0]; // the first mentioned is the default.
for ( const auto& l : lang_v ){
options.languages.insert( l );
Expand All @@ -276,8 +280,8 @@ bool FrogAPI::collect_options( TiCC::CL_Options& Opts,
<< " with more then one language " << endl
<< "\t specified. These values will be handled to the tokenizer,"
<< " but Frog"<< endl
<< "\t will only handle the first language: " << language
<< " for further processing!" << endl;
<< "\t will only handle the first language: '" << language
<< "' for further processing!" << endl;
}
configFileName = FrogAPI::defaultConfigFile(language);
if ( !TiCC::isFile( configFileName ) ){
Expand Down Expand Up @@ -315,13 +319,9 @@ bool FrogAPI::collect_options( TiCC::CL_Options& Opts,
if ( !vers.empty() ){
LOG << "configuration version = " << vers << endl;
}
string langs = configuration.getatt( "languages", "tokenizer" );
if ( !langs.empty() ){
vector<string> lang_v = TiCC::split_at( langs, "," );
options.default_language = lang_v[0];
for ( const auto& l : lang_v ){
options.languages.insert( l );
}
string tmp = configuration.lookUp( "languages", "tokenizer" );
if ( !tmp.empty() ){
languages = tmp;
}
}
else {
Expand All @@ -333,17 +333,29 @@ bool FrogAPI::collect_options( TiCC::CL_Options& Opts,
if ( !languages.empty() ){
set<string> ucto_languages = Tokenizer::Setting::installed_languages();
vector<string> lang_v = TiCC::split_at( languages, "," );
bool add_und = false;
auto l = lang_v.begin();
while ( l != lang_v.end() ){
if ( ucto_languages.find( *l ) == ucto_languages.end() ){
LOG << "remove unknow language '" << *l << "'" << endl;
if ( *l == "und" ){
add_und = true;
l = lang_v.erase(l);
}
else if ( ucto_languages.find( *l ) == ucto_languages.end() ){
LOG << "remove unknown language '" << *l << "'" << endl;
l = lang_v.erase(l);
}
else {
++l;
}
}
options.default_language = lang_v[0];
for ( const auto& l : lang_v ){
options.languages.insert( l );
}
languages = TiCC::join( lang_v, "," );
if ( add_und ){
languages += ",und";
}
LOG << "configuring languages = '" << languages << "'" << endl;
configuration.setatt( "languages", languages, "tokenizer" );
}
Expand Down

0 comments on commit eeeb7e4

Please sign in to comment.