Skip to content

Commit

Permalink
MWU: small change to detect MWU's at start of the sentence. (with an …
Browse files Browse the repository at this point in the history
…uppercase first letter)
  • Loading branch information
kosloot committed Mar 7, 2023
1 parent 2e7a77b commit 179eb07
Showing 1 changed file with 22 additions and 4 deletions.
26 changes: 22 additions & 4 deletions src/mwu_chunker_mod.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
#include "frog/Frog-util.h" // defines etc.

using namespace std;
using TiCC::operator<<;

#define LOG *TiCC::Log(errLog)
#define DBG *TiCC::Log(dbgLog)
Expand Down Expand Up @@ -126,7 +127,8 @@ bool Mwu::read_mwus( const string& fname) {
*/
LOG << "read mwus " + fname << endl;
ifstream mwufile(fname, ios::in);
if(mwufile.bad()){
if ( !mwufile ){
LOG << "reading of " << fname << " FAILED" << endl;
return false;
}
string line;
Expand Down Expand Up @@ -254,7 +256,7 @@ void Mwu::add_provenance( folia::Document& doc,
}

void Mwu::Classify( frog_data& sent ){
/// run the Mwu classifier on e sentence in frog_data format
/// run the Mwu classifier on a sentence in frog_data format
/*!
\param sent a frog_data structure with unresolved MWU's
*/
Expand All @@ -274,6 +276,12 @@ void Mwu::Classify( frog_data& sent ){
sent.resolve_mwus();
}

std::string decap( const std::string& word ){
  /// return a copy of \e word with its first byte converted to lowercase
  /*!
    \param word the input string (may be empty)
    \return a copy of word in which only the first byte has been passed
    through tolower(); all remaining bytes are left untouched

    Only a single byte is converted, so a leading multi-byte (e.g. UTF-8)
    character is returned unchanged in the default "C" locale.
  */
  std::string result = word;
  if ( !result.empty() ){
    // tolower() requires a value representable as unsigned char (or EOF).
    // Plain char may be signed, so bytes >= 0x80 must be converted first
    // to avoid undefined behaviour.
    const unsigned char first = static_cast<unsigned char>( result[0] );
    result[0] = static_cast<char>( tolower( first ) );
  }
  return result;
}

void Mwu::Classify(){
/// examine the Mwu's internal mwuAna nodes to determine the spans of
/// all mwu's found
Expand Down Expand Up @@ -308,8 +316,18 @@ void Mwu::Classify(){
if ( debug > 1 ){
DBG << "checking word[" << i <<"]: " << word << endl;
}
const auto matches = MWUs.equal_range(word);
if ( matches.first != MWUs.end() ) {
auto matches = MWUs.equal_range(word);
if ( i == 0
&& matches.first == matches.second ) {
// no match on first word. try decaped version.
// we do this ONLY for the very first word in the sentence!
word = decap( word );
if ( debug > 1 ){
DBG << "checking decapped word [" << i <<"]: " << word << endl;
}
matches = MWUs.equal_range(word);
}
if ( matches.first != matches.second ) {
//match
auto current_match = matches.first;
if ( debug > 1 ) {
Expand Down

0 comments on commit 179eb07

Please sign in to comment.