In [1]:
import collections
import tarfile
import re
import string
import jieba
__all__ = ['build_dict', 'train', 'test', 'convert']
def word_dict(cutoff=150):
    """
    从语料中建立一个. Build a word dictionary from the corpus.
    :return: Word dictionary
    :rtype: dict
    """
    return build_dict(re.compile("glodon/((train)|(test))/key./.*?/.*\.txt$"), cutoff)
def build_dict(pattern, cutoff):
    """
    Build a word dictionary from the corpus. Keys of 
    the dictionary are words, and values are zero-based
    IDs of these words.
    """
    word_freq = collections.defaultdict(int)
    for doc, _ in tokenize(pattern):
        for word in doc:
            word_freq[word] += 1
    # not sure if we should prune less-frequent words here.
    word_freq = filter(lambda x: x[1] > cutoff, word_freq.items())
    
    dictionary = sorted(word_freq, key=lambda x: (-x[1], x[0]))
    words, _ = list(zip(*dictionary))
    word_idx = dict(zip(words, xrange(len(words))))
    word_idx['<unk>'] = len(words)
    return word_idx
def tokenize(pattern):
    """
    Read files that match the given pattern.
    Tokenize and yield each file.
    :return:(list,label)
    """
    labelpattern = re.compile("glodon/((train)|(test))/key./(?P<label>.*?)/.*\.txt$")
    with tarfile.open('glodon.tar.gz') as tarf:
        tf = tarf.next()
        while tf != None:
            if bool(pattern.match(tf.name)):
                # newline and punctuations removal and ad-hoc tokenization.
#                 print("filename is %s" % tf.name)
                match = labelpattern.match(tf.name)
                label_idx = match.group("label")
#                 print('<>'.join(jieba.cut(tarf.extractfile(tf).read().rstrip("\n\r").translate(None, string.punctuation).lower())))
                yield (jieba.cut(tarf.extractfile(tf).read().rstrip("\n\r").translate(None, string.punctuation).lower()), label_idx)
            tf = tarf.next()

In [2]:
x = word_dict()

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.575 seconds.
Prefix dict has been built succesfully.


In [4]:
for y in sorted(x, key=x.get, reverse=True):
    print("key---%s----value--%s" % (y,x[y]))

key---<unk>----value--1840
key---错位----value--1839
key---钱----value--1838
key---通讯----value--1837
key---视图----value--1836
key---表是----value--1835
key---群----value--1834
key---码----value--1833
key---楼盖----value--1832
key---显示图----value--1831
key---旧----value--1830
key---开----value--1829
key---建设----value--1828
key---字母----value--1827
key---多条----value--1826
key---地坪----value--1825
key---分摊----value--1824
key---分为----value--1823
key---server----value--1822
key---80----value--1821
key---通知----value--1820
key---资料----value--1819
key---调价----value--1818
key---设定----value--1817
key---角筋----value--1816
key---角柱----value--1815
key---网格----value--1814
key---绘制图----value--1813
key---窗户----value--1812
key---比重----value--1811
key---梅花----value--1810
key---放到----value--1809
key---折梁----value--1808
key---序列----value--1807
key---展开----value--1806
key---导出来----value--1805
key---单梁----value--1804
key---内蒙----value--1803
key---倾斜----value--1802
key---《----value--1801
key---gqi2015----value--1800
key---1

key---广东地区----value--1224
key---对于----value--1223
key---办法----value--1222
key---l----value--1221
key---网页----value--1220
key---给排水----value--1219
key---文字----value--1218
key---换----value--1217
key---墙洞----value--1216
key---课堂----value--1215
key---草图----value--1214
key---看见----value--1213
key---栏----value--1212
key---墩----value--1211
key---厚----value--1210
key---一笔----value--1209
key---端部----value--1208
key---横向----value--1207
key---提醒----value--1206
key---才----value--1205
key---运输----value--1204
key---概况----value--1203
key---方案----value--1202
key---平面----value--1201
key---帮助----value--1200
key---局部----value--1199
key---北京地区----value--1198
key---踢脚----value--1197
key---绑扎----value--1196
key---目录----value--1195
key---招投标----value--1194
key---双向----value--1193
key---加载----value--1192
key---随着----value--1191
key---检验----value--1190
key---延伸----value--1189
key---平行----value--1188
key---客户----value--1187
key---会议----value--1186
key---于----value--1185
key---竖向----value--1184
key---石化----value

key---门框----value--725
key---加----value--724
key---广材网----value--723
key---北京----value--722
key---执行----value--721
key---先----value--720
key---输出----value--719
key---年----value--718
key---乱码----value--717
key---最新----value--716
key---一直----value--715
key---外墙----value--714
key---不变----value--713
key---驱动----value--712
key---加筋----value--711
key---砂浆----value--710
key---电线----value--709
key---其----value--708
key---权限----value--707
key---功能键----value--706
key---电缆----value--705
key---密码----value--704
key---发送----value--703
key---乘以----value--702
key---not----value--701
key---材料费----value--700
key---进入----value--699
key---智能----value--698
key---断开----value--697
key---拉----value--696
key---地方----value--695
key---一根----value--694
key---成功----value--693
key---图标----value--692
key---整数----value--691
key---上去----value--690
key---g----value--689
key---放坡----value--688
key---面筋----value--687
key---另----value--686
key---预留----value--685
key---运行----value--684
key---大样----value--683
key---设计----va

key---点----value--210
key---金额----value--209
key---【----value--208
key---好----value--207
key---节点----value--206
key---算量----value--205
key---at----value--204
key---支座----value--203
key---图形----value--202
key---出现----value--201
key---该----value--200
key---时报----value--199
key---一致----value--198
key---of----value--197
key---要----value--196
key---框架----value--195
key---解决----value--194
key---这个----value--193
key---管道----value--192
key---广材----value--191
key---怎样----value--190
key---所有----value--189
key---提取----value--188
key---截面----value--187
key---原因----value--186
key---excel----value--185
key---错误----value--184
key---对----value--183
key---按----value--182
key---造价----value--181
key---承台----value--180
key---或----value--179
key---区别----value--178
key---绘图----value--177
key---两个----value--176
key---失败----value--175
key---0----value--174
key---扣减----value--173
key---不到----value--172
key---出来----value--171
key---数据----value--170
key---市场价----value--169
key---功能----value--168
key---spanspan--

In [5]:
def train(word_idx, level=3):
    """
    training set creator..
    It returns a reader creator , each sample
    in the reader is an zero-based ID sequence 
    and label in [0,1]
    """
    return reader_creator(re.compile("glodon/train/key%d/.*?/.*\.txt$" % level), word_idx)
def reader_creator(pattern, word_idx):
    UNK = word_idx['<unk>']
    INS = []
    def load(pattern, out):
        for doc, label in tokenize(pattern):
            out.append(([word_idx.get(w, UNK) for w in doc], label))
    load(pattern, INS)
#     def reader():
#         for doc, label in INS:
#             yield doc, label
#     return reader
#         def reader():
    for doc, label in INS:
        yield doc, label
#     return reader
def test(word_idx, level=3):
    return reader_creator(re.compile("glodon/test/key%d/.*?/.*\.txt$" % level), word_idx)

In [7]:
y = train()
for doc, label in train(x):
    print("---%s----%s---" % ('='.join([str(e) for e in doc]), str(label)))


TypeError: 'function' object is not iterable