-
Notifications
You must be signed in to change notification settings - Fork 191
/
detector_factory.py
137 lines (111 loc) · 4.27 KB
/
detector_factory.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
import os
from os import path
import sys
try:
import simplejson as json
except ImportError:
import json
from .detector import Detector
from .lang_detect_exception import ErrorCode, LangDetectException
from .utils.lang_profile import LangProfile
class DetectorFactory(object):
'''
Language Detector Factory Class.
This class manages an initialization and constructions of Detector.
Before using language detection library,
load profiles with DetectorFactory.load_profile(str)
and set initialization parameters.
When the language detection,
construct Detector instance via DetectorFactory.create().
See also Detector's sample code.
'''
seed = None
def __init__(self):
self.word_lang_prob_map = {}
self.langlist = []
def load_profile(self, profile_directory):
list_files = os.listdir(profile_directory)
if not list_files:
raise LangDetectException(ErrorCode.NeedLoadProfileError, 'Not found profile: ' + profile_directory)
langsize, index = len(list_files), 0
for filename in list_files:
if filename.startswith('.'):
continue
filename = path.join(profile_directory, filename)
if not path.isfile(filename):
continue
f = None
try:
if sys.version_info[0] < 3:
f = open(filename, 'r')
else:
f = open(filename, 'r', encoding='utf-8')
json_data = json.load(f)
profile = LangProfile(**json_data)
self.add_profile(profile, index, langsize)
index += 1
except IOError:
raise LangDetectException(ErrorCode.FileLoadError, 'Cannot open "%s"' % filename)
except:
raise LangDetectException(ErrorCode.FormatError, 'Profile format error in "%s"' % filename)
finally:
if f:
f.close()
def load_json_profile(self, json_profiles):
langsize, index = len(json_profiles), 0
if langsize < 2:
raise LangDetectException(ErrorCode.NeedLoadProfileError, 'Need more than 2 profiles.')
for json_profile in json_profiles:
try:
json_data = json.loads(json_profile)
profile = LangProfile(**json_data)
self.add_profile(profile, index, langsize)
index += 1
except:
raise LangDetectException(ErrorCode.FormatError, 'Profile format error.')
def add_profile(self, profile, index, langsize):
lang = profile.name
if lang in self.langlist:
raise LangDetectException(ErrorCode.DuplicateLangError, 'Duplicate the same language profile.')
self.langlist.append(lang)
for word in profile.freq:
if word not in self.word_lang_prob_map:
self.word_lang_prob_map[word] = [0.0] * langsize
length = len(word)
if 1 <= length <= 3:
prob = 1.0 * profile.freq.get(word) / profile.n_words[length - 1]
self.word_lang_prob_map[word][index] = prob
def clear(self):
self.langlist = []
self.word_lang_prob_map = {}
def create(self, alpha=None):
'''Construct Detector instance with smoothing parameter.'''
detector = self._create_detector()
if alpha is not None:
detector.set_alpha(alpha)
return detector
def _create_detector(self):
if not self.langlist:
raise LangDetectException(ErrorCode.NeedLoadProfileError, 'Need to load profiles.')
return Detector(self)
def set_seed(self, seed):
self.seed = seed
def get_lang_list(self):
return list(self.langlist)
PROFILES_DIRECTORY = path.join(path.dirname(__file__), 'profiles')
_factory = None
def init_factory():
global _factory
if _factory is None:
_factory = DetectorFactory()
_factory.load_profile(PROFILES_DIRECTORY)
def detect(text):
init_factory()
detector = _factory.create()
detector.append(text)
return detector.detect()
def detect_langs(text):
init_factory()
detector = _factory.create()
detector.append(text)
return detector.get_probabilities()