// Unicode.h
#ifndef TICC_UNICODE_H
#define TICC_UNICODE_H
/*
Copyright (c) 2006 - 2019
CLST - Radboud University
ILK - Tilburg University
This file is part of ticcutils
ticcutils is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 of the License, or
(at your option) any later version.
ticcutils is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, see <http://www.gnu.org/licenses/>.
For questions and suggestions, see:
https://github.com/LanguageMachines/ticcutils/issues
or send mail to:
lamasoftware (at ) science.ru.nl
*/
#include <string>
#include <vector>
#include "unicode/unistr.h"
#include "unicode/ustream.h"
#include "unicode/normalizer2.h"
#include "unicode/translit.h"
#include "unicode/regex.h"
namespace TiCC {
// NOTE(review): this using-directive lives in a header, so every file that
// includes it gets the icu names visible inside namespace TiCC. Code elsewhere
// may depend on that; verify before changing it.
using namespace icu;
// Takes a UnicodeString and returns a std::string. Declared only; the
// definition is not in this header. Presumably the result is UTF-8 encoded,
// going by the name -- confirm against the implementation.
std::string UnicodeToUTF8( const UnicodeString& );
// Takes a std::string and returns a UnicodeString. Declared only; the
// definition is not in this header. The second argument may be omitted and
// then defaults to "UTF8"; presumably it names the encoding of the first
// argument -- confirm against the implementation.
UnicodeString UnicodeFromEnc( const std::string& ,
const std::string& = "UTF8" );
// Convenience wrapper: hands the std::string `s` to ICU's
// UnicodeString::fromUTF8() and returns the resulting UnicodeString.
inline UnicodeString UnicodeFromUTF8( const std::string& s ){
  UnicodeString converted = UnicodeString::fromUTF8( s );
  return converted;
}
// Holds a pointer to a const ICU Normalizer2 together with a mode string.
// Apart from getMode(), the members are only declared here; their
// definitions are not part of this header.
class UnicodeNormalizer {
public:
  // The string argument may be omitted and then defaults to "".
  // (Presumably it is the initial mode -- confirm against the implementation.)
  UnicodeNormalizer( const std::string& = "" );
  ~UnicodeNormalizer();

  // Takes a UnicodeString and returns a UnicodeString.
  UnicodeString normalize( const UnicodeString& );

  // Takes a std::string and returns a std::string by value.
  // (What exactly is returned is not visible here -- confirm against the
  // implementation.)
  const std::string setMode( const std::string& );

  // Returns a copy of the stored mode string.
  const std::string getMode() const {
    return mode;
  }

private:
  const Normalizer2 *_normalizer;
  std::string mode;
};
// Bundles an ICU RegexPattern and RegexMatcher (both held by raw pointer)
// with a vector of UnicodeString results, a constant name and a debug flag.
// Only set_debug() is defined here; all other members are declared only.
class UnicodeRegexMatcher {
public:
  // The second argument, `name`, may be omitted and then defaults to "".
  UnicodeRegexMatcher( const UnicodeString&, const UnicodeString& name="" );
  ~UnicodeRegexMatcher();

  bool match_all( const UnicodeString&, UnicodeString&, UnicodeString& );
  const UnicodeString get_match( unsigned int ) const;
  int NumOfMatches() const;
  int split( const UnicodeString&, std::vector<UnicodeString>& );
  UnicodeString Pattern() const;

  // Stores `b` as the new debug flag and returns the value the flag had
  // before the call.
  bool set_debug( bool b ){
    const bool previous = _debug;
    _debug = b;
    return previous;
  }

private:
  // Special members declared private so that code outside the class cannot
  // default-construct or copy a matcher.
  UnicodeRegexMatcher();
  UnicodeRegexMatcher( const UnicodeRegexMatcher& ); // inhibit copies
  UnicodeRegexMatcher& operator=( const UnicodeRegexMatcher& ); // inhibit copies

  RegexPattern *pattern;
  RegexMatcher *matcher;
  std::vector<UnicodeString> results;
  const UnicodeString _name;
  bool _debug;
};
// Owns or refers to an ICU Transliterator through the raw pointer `_trans`.
// Only is_initialized() is defined here; all other members are declared only.
//
// NOTE(review): the class declares a destructor and holds a raw pointer but
// does not restrict copying. If the destructor releases `_trans`, copies of a
// UniFilter would share and double-release it -- confirm against the
// implementation.
class UniFilter {
  // Stream output operator; declared a friend so it can read `_trans`.
  friend std::ostream& operator<<( std::ostream&, const UniFilter& );

public:
  UniFilter();
  ~UniFilter();

  bool init( const UnicodeString&, const UnicodeString& );

  // True once `_trans` holds a non-null pointer.
  bool is_initialized() const {
    const bool has_transliterator = ( _trans != 0 );
    return has_transliterator;
  }

  // The second argument may be omitted and then defaults to "".
  bool fill( const std::string&, const std::string& = "" );

  // add() is overloaded for std::string and UnicodeString arguments.
  bool add( const std::string& );
  bool add( const UnicodeString& );

  UnicodeString filter( const UnicodeString& );
  UnicodeString get_rules() const;

private:
  Transliterator *_trans;
};
// Takes a UnicodeString and returns a UnicodeString. Declared only; the
// definition is not in this header. Presumably the result is the input with
// diacritics removed, going by the name -- confirm against the implementation.
UnicodeString filter_diacritics( const UnicodeString& );
// Splitting helpers. All three are declared only (definitions are not in this
// header) and all return a std::vector of UnicodeString.
// The trailing size_t argument may be omitted and then defaults to 0.
// NOTE(review): presumably it limits the number of parts, with 0 meaning
// "no limit" -- confirm against the implementation.

// Takes two UnicodeStrings (presumably the text and a separator string --
// confirm) plus the optional size_t.
std::vector<UnicodeString> split_at( const UnicodeString&,
const UnicodeString&,
size_t = 0 );
// Same parameter list as split_at(). Presumably the second argument is a set
// of separator characters rather than one separator string -- confirm.
std::vector<UnicodeString> split_at_first_of( const UnicodeString&,
const UnicodeString&,
size_t = 0 );
// Takes only one UnicodeString plus the optional size_t; no separator
// argument (presumably splits on whitespace -- confirm).
std::vector<UnicodeString> split( const UnicodeString&,
size_t = 0 );
// Case-conversion helpers working on std::string in and out. Declared only;
// the definitions are not in this header. Presumably input and output are
// UTF-8 encoded, going by the names -- confirm against the implementation.
std::string utf8_lowercase( const std::string& ); // Unicode safe version
std::string utf8_uppercase( const std::string& ); // Unicode safe version
}
#endif // TICC_UNICODE_H