Skip to content

Commit

Permalink
A lot of ugly hacking to fix language option issues.
Browse files Browse the repository at this point in the history
  • Loading branch information
kosloot committed Feb 6, 2023
1 parent 2cfffe1 commit fd3cb1a
Show file tree
Hide file tree
Showing 2 changed files with 53 additions and 18 deletions.
39 changes: 28 additions & 11 deletions src/FrogAPI.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -116,15 +116,17 @@ string FrogAPI::defaultConfigDir( const string& language ){
if ( language.empty() ){
if (TiCC::isDir(localConfigDir)) {
return localConfigDir;
} else {
return configDir;
}
else {
return configDir;
}
}
else {
if (TiCC::isDir(localConfigDir + language)) {
return localConfigDir + language +"/";
} else {
return configDir+language+"/";
}
else {
return configDir+language+"/";
}
}
}
Expand Down Expand Up @@ -297,13 +299,14 @@ bool FrogAPI::collect_options( TiCC::CL_Options& Opts,
string localConfigFileName = localConfigDir + configFileName;
if (TiCC::isFile( localConfigFileName )) {
configFileName = localConfigFileName;
} else {
LOG << " not found locally (" << configFileName << ")" << endl;
//global (final fallback)
configFileName = configDir + configFileName;
if (!TiCC::isFile( configFileName )) {
LOG << " not found globally (" << configFileName << ")" << endl;
}
}
else {
LOG << " not found locally (" << configFileName << ")" << endl;
//global (final fallback)
configFileName = configDir + configFileName;
if (!TiCC::isFile( configFileName )) {
LOG << " not found globally (" << configFileName << ")" << endl;
}
}
}
if ( configuration.fill( configFileName ) ){
Expand All @@ -328,6 +331,20 @@ bool FrogAPI::collect_options( TiCC::CL_Options& Opts,
return false;
}
if ( !languages.empty() ){
set<string> ucto_languages = Tokenizer::Setting::installed_languages();
vector<string> lang_v = TiCC::split_at( languages, "," );
auto l = lang_v.begin();
while ( l != lang_v.end() ){
if ( ucto_languages.find( *l ) == ucto_languages.end() ){
LOG << "remove unknow language '" << *l << "'" << endl;
l = lang_v.erase(l);
}
else {
++l;
}
}
languages = TiCC::join( lang_v, "," );
LOG << "configuring languages = '" << languages << "'" << endl;
configuration.setatt( "languages", languages, "tokenizer" );
}
string opt_val;
Expand Down
32 changes: 25 additions & 7 deletions src/ucto_tokenizer_mod.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -140,27 +140,45 @@ bool UctoTokenizer::init( const TiCC::Configuration& config ){
}
// when a language (list) is specified on the command line,
// it overrules the language from the config file
string rulesName;
if ( language_list.empty() ){
rulesName = config.lookUp( "rulesFile", "tokenizer" );
}
string rulesName = config.lookUp( "rulesFile", "tokenizer" );
if ( rulesName.empty() ){
if ( language_list.empty() ){
LOG << "no 'rulesFile' or 'languages' found in configuration" << endl;
return false;
}
LOG << "init tokenizer for languages: " << language_list << endl;
if ( !tokenizer->init( language_list ) ){
return false;
}
tokenizer->setLanguage( language_list[0] );
}
else {
rulesName = resolve_configdir( rulesName, config.configDir() );
LOG << "using tokenizer configuration: " << rulesName << endl;
if ( !tokenizer->init( rulesName ) ){
return false;
string r_lang;
auto pos = rulesName.find( "tokconfig-" );
if ( pos != string::npos ){
r_lang = rulesName.substr( pos+10 );
}
if ( !r_lang.empty()
&& !language_list.empty()
&& *language_list.begin() != r_lang ){
language_list.insert( language_list.begin(), r_lang );
tokenizer->setLangDetection(false);
}
if ( !language_list.empty() ){
LOG << "init tokenizer for languages: " << language_list << endl;
if ( !tokenizer->init( language_list ) ){
return false;
}
}
else {
LOG << "using tokenizer configuration: " << rulesName << endl;
if ( !tokenizer->init( rulesName ) ){
return false;
}
}
if ( !language_list.empty() ){
LOG << "default tokenizer language = " << language_list[0] << endl;
tokenizer->setLanguage( language_list[0] );
}
}
Expand Down

0 comments on commit fd3cb1a

Please sign in to comment.