forked from songboyu/NLP-test
-
Notifications
You must be signed in to change notification settings - Fork 0
/
ngram.py
98 lines (86 loc) · 2.65 KB
/
ngram.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
# -*- coding:utf-8 -*-
'''
author: songboyu
modify: 2014-12-06
summary: 统计unigram,bigram词频
'''
import io
import os
import re
CODEC = 'utf8'


class NGram(object):
    """Unigram and bigram frequency counter for '/'-segmented text.

    A token is counted only when it contains at least one CJK ideograph
    (U+4E00..U+9FA5) or one of the sentence markers ``<li>`` / ``</li>``.

    Attributes:
        unigram: dict mapping token -> occurrence count.
        bigram: dict mapping "current|previous" token pair -> count.
        wordDict: list reserved for a word dictionary; never filled here.
    """

    # Same pattern text as before, but compiled once instead of being looked
    # up for every token.
    _COUNTABLE = re.compile(u'([\u4e00-\u9fa5]|<li>|</li>)+')

    def __init__(self):
        self.unigram = {}
        self.bigram = {}
        # Loading a word list from dict.txt was sketched here but disabled;
        # the attribute is kept so existing readers of it do not break.
        self.wordDict = []

    @staticmethod
    def _as_text(token):
        """Return *token* as unicode text, decoding byte strings with CODEC."""
        if isinstance(token, bytes):
            return token.decode(CODEC)
        return token

    @staticmethod
    def _as_native(text):
        """Return unicode *text* as the interpreter's native ``str`` type.

        On Python 2 that is a CODEC-encoded byte string (the key type this
        class has always used); on Python 3 the text is returned unchanged.
        """
        if isinstance(text, str):
            return text
        return text.encode(CODEC)

    def _dump(self, counts, path):
        """Write *counts* to *path* as ``key<TAB>count`` lines in CODEC."""
        # The with-block guarantees the handle is closed even if a write fails.
        with io.open(path, 'w', encoding=CODEC) as out:
            for key, value in counts.items():
                out.write(u'%s\t%d\n' % (self._as_text(key), value))

    def scan(self, lines, out_dir='freq'):
        """Count n-grams over *lines* and write both frequency tables to disk.

        Every line is wrapped in ``<li>`` ... ``</li>`` markers, split on '/',
        and tokens that are empty or whitespace-only are dropped. All lines
        are counted together in one pass, so the pair formed by one line's
        ``</li>`` and the next line's ``<li>`` is counted as a bigram too.

        Args:
            lines: iterable of sentences whose tokens are separated by '/'.
                Byte strings are decoded with CODEC; text is used as is.
            out_dir: directory receiving ``word_freq.txt`` and
                ``bigram_freq.txt``. It is created when missing. Defaults to
                ``freq``, the location that used to be hard-coded.

        Returns:
            None.
        """
        words = []
        for line in lines:
            words.append('<li>')
            words.extend(
                self._as_native(token)
                for token in self._as_text(line).split('/')
                if token.strip()
            )
            words.append('</li>')
        self.ngram(words)
        print('[ Hashed ]')

        # Previously a missing output directory aborted the run after all the
        # counting work had already been done.
        if out_dir and not os.path.isdir(out_dir):
            os.makedirs(out_dir)

        self._dump(self.unigram, os.path.join(out_dir, 'word_freq.txt'))
        print('[ Unigram file finish ]')

        self._dump(self.bigram, os.path.join(out_dir, 'bigram_freq.txt'))
        print('[ Bigram file finish ]')

    def ngram(self, words):
        """Add the unigram and bigram counts of *words* to the tables.

        Args:
            words: list of tokens (native strings) in reading order.

        Returns:
            None. ``self.unigram`` and ``self.bigram`` are updated in place.
        """
        # Classify each token once; both passes below reuse the flags.
        countable = [
            self._COUNTABLE.search(self._as_text(word)) is not None
            for word in words
        ]

        for word, keep in zip(words, countable):
            if keep:
                self.unigram[word] = self.unigram.get(word, 0) + 1

        # A pair is counted only when both of its tokens are countable.
        pairs = zip(words[1:], words[:-1], countable[1:], countable[:-1])
        for current, previous, keep_current, keep_previous in pairs:
            if keep_current and keep_previous:
                # NOTE(review): the key is "current|previous" (reverse of
                # reading order). Kept unchanged because the frequency files
                # are presumably consumed in this format -- confirm.
                key = current + '|' + previous
                self.bigram[key] = self.bigram.get(key, 0) + 1
if __name__ == '__main__':
    # Collect every non-blank line of every file below corpus_seg/, then
    # count n-grams over the whole corpus in one go.
    lines = []
    for parent, _, filenames in os.walk('corpus_seg'):
        for filename in filenames:
            print(filename)
            path = os.path.join(parent, filename)
            # Read raw bytes: NGram.scan decodes each line with CODEC itself.
            # The with-block closes every corpus file; they used to be left
            # open, one leaked handle per file.
            with open(path, 'rb') as corpus:
                for line in corpus:
                    stripped = line.strip()
                    if stripped:
                        lines.append(stripped)
    n = NGram()
    n.scan(lines)