forked from songboyu/NLP-test
-
Notifications
You must be signed in to change notification settings - Fork 0
/
seg.py
92 lines (86 loc) · 2.87 KB
/
seg.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# -*- coding:utf-8 -*-
'''
author: songboyu
modify: 2014-12-06
summary: 正向、逆向最大匹配分词测试
'''
import os, re
from common import u,strQ2B,tsplit
from seg_method.fwd_max import fwd_mm_seg
from seg_method.bwd_max import bwd_mm_seg
CODEC = 'utf8'
def file_seg_process(filename, method):
    '''
    Segment one corpus file with maximum-matching and write the result.

    Reads the word list from 'dict.txt' (first whitespace-separated field of
    each line), reads 'corpus/<filename>', and writes the segmented text to
    'corpus_seg/<filename>'. Each paragraph of the input becomes one block of
    output in which every token is followed by '/', and a line break is
    emitted at clause-ending punctuation.

    @param filename: name of the file inside the 'corpus' directory
    @param method: segmentation algorithm { 0: forward, otherwise: backward }
    '''
    # Load the dictionary into memory. Blank lines are skipped so that an
    # empty line in dict.txt does not raise IndexError.
    word_dict = {}
    with open('dict.txt') as fp_dict:
        for entry in fp_dict:
            fields = entry.split()
            if fields:
                word_dict[u(fields[0], CODEC)] = 1

    # 'with' guarantees both files are closed even if segmentation raises.
    with open('corpus/' + filename) as fp_input:
        with open('corpus_seg/' + filename, 'w') as fp_output:
            for paragraph in _iter_paragraphs(fp_input):
                segmented = _segment_paragraph(paragraph, word_dict, method)
                # Skip paragraphs that are empty after processing.
                if not segmented.strip():
                    continue
                # Encode once, at the I/O boundary.
                fp_output.write((segmented + u'\n').encode(CODEC))


def _iter_paragraphs(lines):
    '''
    Group raw input lines into paragraphs and yield each paragraph's text.

    A line whose converted text starts with ' ' begins a new paragraph; every
    other line is appended to the current one. The final paragraph is yielded
    when the input is exhausted, so the end of the file is not lost.
    (strQ2B presumably maps full-width characters, including the full-width
    indent space, to half-width -- confirm in common.)
    '''
    parts = []
    for raw_line in lines:
        text = strQ2B(u(raw_line.strip(), CODEC))
        if text.startswith(' ') and parts:
            yield u''.join(parts)
            parts = []
        parts.append(text)
    if parts:
        yield u''.join(parts)


def _segment_paragraph(text, word_dict, method):
    '''
    Return the segmented form of one paragraph as a single string.

    ASCII words/numbers are copied through followed by '/', runs of Chinese
    characters are split by the selected maximum-matching function with each
    word followed by '/', clause-ending punctuation becomes a line break, and
    any other character is skipped.
    '''
    pieces = []
    pos = 0
    end = len(text)
    while pos < end:
        # English letters or digits -- output unchanged.
        m = _ASCII_WORD_RE.match(text, pos)
        if m is not None:
            pieces.append(m.group() + u'/')
            pos = m.end()
            continue
        # Clause-ending mark -- output a line break. A run of consecutive
        # marks (e.g. '?!') produces a single break, not blank lines.
        if text[pos] in _CLAUSE_END:
            if not pieces or pieces[-1] != u'\n':
                pieces.append(u'\n')
            pos += 1
            # Restart the scan here so the character following the mark is
            # classified normally instead of being discarded.
            continue
        # Chinese characters -- segment and output word/word/...
        m = _HANZI_RE.match(text, pos)
        if m is not None:
            run = m.group()
            if method == 0:
                # Forward maximum matching.
                words = fwd_mm_seg(word_dict, _MAX_WORD_LEN, run)
            else:
                # Backward maximum matching.
                words = bwd_mm_seg(word_dict, _MAX_WORD_LEN, run)
            pieces.extend(word + u'/' for word in words)
            pos = m.end()
            continue
        # Any other special character -- skip it.
        pos += 1
    return u''.join(pieces)


# Explicit ASCII class: identical to the default (flag-less) meaning of \w in
# Python 2, and it keeps Chinese characters out of this branch on any version.
_ASCII_WORD_RE = re.compile(r'[a-zA-Z0-9_]+')
_HANZI_RE = re.compile(u'[\u4e00-\u9fa5]+')
# Clause-ending marks: , ! ? : and the ideographic full stop (U+3002).
_CLAUSE_END = frozenset(u',\u3002!?:')
# Second argument handed to the segmenters; presumably the maximum word
# length they try to match -- confirm in seg_method.
_MAX_WORD_LEN = 8
if __name__ == '__main__':
    # Segment every file directly inside 'corpus' using backward maximum
    # matching (method 1), printing each file name as it is processed.
    for _, _, filenames in os.walk('corpus'):
        for filename in filenames:
            print(filename)
            file_seg_process(filename, 1)
        # Stop after the top-level directory: file_seg_process() opens
        # 'corpus/' + filename, so names found in sub-directories would
        # resolve to the wrong path.
        break