Skip to content

Commit

Permalink
Use UnicodeString inside mwu module
Browse files Browse the repository at this point in the history
  • Loading branch information
kosloot committed Mar 13, 2023
1 parent 179eb07 commit a636a63
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 27 deletions.
20 changes: 10 additions & 10 deletions include/frog/mwu_chunker_mod.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,13 +43,13 @@
/// \brief a helper class for Mwu. Stores needed information.
class mwuAna {
friend std::ostream& operator<< (std::ostream&, const mwuAna& );
public:
mwuAna( const std::string&, bool, size_t );
public:
mwuAna( const icu::UnicodeString&, bool, size_t );
virtual ~mwuAna() {};

void merge( const mwuAna * );

std::string getWord() const {
icu::UnicodeString getWord() const {
return word;
}

Expand All @@ -58,18 +58,18 @@ class mwuAna {
size_t mwu_start;
size_t mwu_end;

protected:
mwuAna(){};
std::string word;
bool spec;
protected:
mwuAna(){};
icu::UnicodeString word;
bool spec;
};

#define mymap2 std::multimap<std::string, std::vector<std::string> >
#define mymap2 std::multimap<icu::UnicodeString, std::vector<icu::UnicodeString> >

/// \brief provide all functionality to detect MWU's
class Mwu {
friend std::ostream& operator<< (std::ostream&, const Mwu& );
public:
public:
explicit Mwu( TiCC::LogStream*, TiCC::LogStream* );
~Mwu();
void reset();
Expand All @@ -82,7 +82,7 @@ class Mwu {
/// return the value for \e mwu_tagset. (set via Configuration)
std::string getTagset() const { return mwu_tagset; };
std::string version() const { return _version; };
private:
private:
bool readsettings( const std::string&, const std::string&);
bool read_mwus( const std::string& );
void Classify();
Expand Down
41 changes: 24 additions & 17 deletions src/mwu_chunker_mod.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -46,11 +46,12 @@

using namespace std;
using TiCC::operator<<;
using icu::UnicodeString;

#define LOG *TiCC::Log(errLog)
#define DBG *TiCC::Log(dbgLog)

mwuAna::mwuAna( const string& wrd,
mwuAna::mwuAna( const UnicodeString& wrd,
bool glue_tag,
size_t index ){
/// create a mwu Analysis record
Expand Down Expand Up @@ -110,14 +111,13 @@ void Mwu::add( frog_record& fd ){
/*!
\param fd The frog_data structure with the information to use
*/
icu::UnicodeString tmp = fd.word;
icu::UnicodeString word = fd.word;
if ( filter ){
tmp = filter->filter( tmp );
word = filter->filter( word );
}
string txt = TiCC::UnicodeToUTF8( tmp );
bool glue = ( fd.tag == glue_tag );
size_t index = mWords.size();
mWords.push_back( new mwuAna( txt, glue, index ) );
mWords.push_back( new mwuAna( word, glue, index ) );
}

bool Mwu::read_mwus( const string& fname) {
Expand All @@ -131,14 +131,14 @@ bool Mwu::read_mwus( const string& fname) {
LOG << "reading of " << fname << " FAILED" << endl;
return false;
}
string line;
while( getline( mwufile, line ) ) {
vector<string> res1 = TiCC::split_at(line, " ");
UnicodeString line;
while( TiCC::getline( mwufile, line ) ) {
vector<UnicodeString> res1 = TiCC::split_at(line, " ");
if ( res1.size() == 2 ){
vector<string> res2 = TiCC::split_at(res1[0], "_");;
vector<UnicodeString> res2 = TiCC::split_at(res1[0], "_");;
//res1 has mwus and tags, res2 has ind. words
if ( res2.size() >= 2 ){
string key = res2[0];
UnicodeString key = res2[0];
res2.erase(res2.begin());
MWUs.insert( make_pair( key, res2 ) );
}
Expand Down Expand Up @@ -276,9 +276,16 @@ void Mwu::Classify( frog_data& sent ){
sent.resolve_mwus();
}

string decap( const string& word ){
string result = word;
result[0] = tolower( result[0] );
/// \brief return a copy of \e word with its first character lowercased
/*!
  \param word the UnicodeString to convert
  \return a copy of \e word in which the first code point is replaced by its
  u_tolower() mapping. An empty \e word yields an empty result.

  The first character is handled as a full code point, not as a single
  UTF-16 code unit, so a leading character outside the BMP (stored as a
  surrogate pair) is lowercased too. All remaining characters are copied
  unchanged.
*/
UnicodeString decap( const UnicodeString& word ){
  UnicodeString result = word;
  if ( !result.isEmpty() ){
    const UChar32 first = result.char32At( 0 );
    // number of UTF-16 code units occupied by the first code point (1 or 2)
    const int32_t first_len = result.moveIndex32( 0, 1 );
    result.replace( 0, first_len, u_tolower( first ) );
  }
  return result;
}


Expand All @@ -300,19 +307,19 @@ void Mwu::Classify(){
// add all current sequences of the glue_tag words to MWUs
for ( size_t i=0; i < max-1; ++i ) {
if ( mWords[i]->isSpec() && mWords[i+1]->isSpec() ) {
vector<string> newmwu;
vector<UnicodeString> newmwu;
while ( i < max && mWords[i]->isSpec() ){
newmwu.push_back(mWords[i]->getWord());
i++;
}
string key = newmwu[0];
UnicodeString key = newmwu[0];
newmwu.erase( newmwu.begin() );
MWUs.insert( make_pair(key, newmwu) );
}
}
size_t i;
for ( i = 0; i < max; i++) {
string word = mWords[i]->getWord();
UnicodeString word = mWords[i]->getWord();
if ( debug > 1 ){
DBG << "checking word[" << i <<"]: " << word << endl;
}
Expand All @@ -335,7 +342,7 @@ void Mwu::Classify(){
}
while( current_match != matches.second
&& current_match != MWUs.end() ){
vector<string> match = current_match->second;
vector<UnicodeString> match = current_match->second;
size_t max_match = match.size();
size_t j = 0;
if ( debug > 1 ){
Expand Down

0 comments on commit a636a63

Please sign in to comment.