In [1]:
import pandas as pd
import math
import re

In [2]:
# Width, in characters, of the n-grams treated as candidate words
# (passed to tokens() and used to slice prefixes/suffixes further down).
word_len = 2
# Chapter number; it is interpolated into the chapter file name below.
part_num = 2

In [3]:
# File locations, all derived from the book title.
book_name = "大秦帝国2"
# Text of the whole book.
whole_txt = "{}.txt".format(book_name)
# Text of the single chapter selected by part_num.
part_txt = "{}/第{}章.txt".format(book_name, part_num)
# NOTE(review): res_txt is not referenced by any cell visible here.
res_txt = "{}/key_words.txt".format(book_name)


In [4]:
# Matches one maximal run of "word" characters: CJK ideographs U+4E00..U+9FD5,
# ASCII letters and digits, and the symbols + # & . _ % -
# The capture group makes re.split() return the matched runs together with
# the separators between them.
# Written as a raw string because "\." and "\-" are invalid escape sequences
# in an ordinary string literal (DeprecationWarning since Python 3.6,
# SyntaxWarning since 3.12); the re module resolves the \uXXXX escapes itself.
re_han = re.compile(r"([\u4E00-\u9FD5a-zA-Z0-9+#&\._%\-]+)", re.U)

def txtcnt(txt):
    """Count the characters of ``txt`` that lie inside runs matched by ``re_han``.

    Everything ``re_han`` does not match (the separators between runs) is
    left out of the total.
    """
    # re_han has a single capture group, so findall() yields the matched runs.
    return sum(len(run) for run in re_han.findall(txt))

def tokens(txt,word_len):
    """Return the n-grams of every ``re_han`` run found in ``txt``.

    ``txt`` is split with ``re_han``; each piece that ``re_han`` matches is
    handed to ``ngram(piece, word_len)`` and the results are concatenated in
    text order.
    """
    grams = []
    for piece in re_han.split(txt):
        # Empty strings and separator pieces (no match at their start) are skipped.
        if piece and re_han.match(piece):
            grams.extend(ngram(piece, word_len))
    return grams

def ngram(txt,word_len):
    """Slide a window of ``word_len`` characters over ``txt``, one step at a time.

    Returns the list of windows from left to right. A text that is not longer
    than ``word_len`` is returned unchanged as the only element.
    """
    last_start = len(txt) - word_len
    if last_start <= 0:
        return [txt]
    return [txt[start:start + word_len] for start in range(last_start + 1)]

In [5]:
def _count_ngrams(counts, line, n):
    """Add 1 to ``counts[w]`` for every n-gram ``w`` of exactly ``n`` characters in ``line``."""
    for word in tokens(line, n):
        # Runs shorter than n come back from ngram() unchanged; they are not n-grams.
        if len(word) == n:
            counts[word] = counts.get(word, 0) + 1


whole_word_dic = {}    # word_len-gram     -> occurrences in the whole book
part_word_dic = {}     # word_len-gram     -> occurrences in the chapter
part_lword_dic = {}    # (word_len+1)-gram -> occurrences in the chapter
part_word_all_cnt = 0  # characters counted by txtcnt() over the whole chapter

# The encoding is given explicitly so decoding does not depend on the
# platform locale.
# NOTE(review): assumes the text files are UTF-8 — adjust if they are stored
# in another encoding (e.g. GBK).
with open(part_txt, 'r', encoding='utf-8') as f:
    for line in f:
        part_word_all_cnt += txtcnt(line)
        _count_ngrams(part_word_dic, line, word_len)
        _count_ngrams(part_lword_dic, line, word_len + 1)

with open(whole_txt, 'r', encoding='utf-8') as f:
    for line in f:
        _count_ngrams(whole_word_dic, line, word_len)

In [6]:
# One row per distinct (word_len+1)-gram of the chapter, together with its
# first and last word_len characters.
partLWords = pd.DataFrame(list(part_lword_dic.items()), columns=['part_lword', 'part_lword_cnt'])
partLWords['prefix'] = partLWords['part_lword'].str[:word_len]
partLWords['suffix'] = partLWords['part_lword'].str[-word_len:]
partLWords.head(1)

Unnamed: 0,part_lword,part_lword_cnt,prefix,suffix
0,第二章,1,第二,二章


In [7]:
# For each word_len-gram: how many rows of partLWords (distinct longer
# n-grams) start with it.
prefix = partLWords.groupby('prefix')['prefix'].count().to_frame()
prefix.head(1)

Unnamed: 0_level_0,prefix
prefix,Unnamed: 1_level_1
一一,1


In [8]:
# For each word_len-gram: how many rows of partLWords (distinct longer
# n-grams) end with it.
suffix = partLWords.groupby('suffix')['suffix'].count().to_frame()
suffix.head(1)

Unnamed: 0_level_0,suffix
suffix,Unnamed: 1_level_1
一上,1


In [9]:
# Line both counts up on the word; a word that is missing on one side gets 0.
tmpWords = suffix.join(prefix, how='outer').fillna(0)
tmpWords.head(5)

Unnamed: 0,suffix,prefix
一一,0.0,1.0
一上,1.0,1.0
一下,2.0,1.0
一丛,1.0,1.0
一丝,5.0,4.0


In [10]:
# Chapter counts of every word_len-gram, extended with the suffix/prefix
# counts from tmpWords (0 where a word has none).
# In the recorded output the word column is called 'index' after reset_index().
partWords = (
    pd.DataFrame(list(part_word_dic.items()), columns=['part_word', 'part_word_cnt'])
    .set_index('part_word')
    .join(tmpWords, how='outer')
    .fillna(0)
    .reset_index()
)

partWords.head(1)

Unnamed: 0,index,part_word_cnt,suffix,prefix
0,一一,1,0.0,1.0


In [11]:
# edge = suffix count + prefix count.
# A word survives only if it has both counts above zero, at least as large a
# prefix count as suffix count, and occurs at least `edge` times in the chapter.
partWords = (
    partWords
    .assign(edge=lambda df: df['suffix'] + df['prefix'])
    .loc[lambda df: (df['prefix'] > 0)
                    & (df['suffix'] > 0)
                    & (df['prefix'] >= df['suffix'])
                    & (df['part_word_cnt'] >= df['edge'])]
    .reset_index(drop=True)
)

partWords.head(10)

Unnamed: 0,index,part_word_cnt,suffix,prefix,edge
0,一切,10,3.0,4.0,7.0
1,一思,3,1.0,1.0,2.0
2,一抖,2,1.0,1.0,2.0
3,一拱,2,1.0,1.0,2.0
4,一看,2,1.0,1.0,2.0
5,一说,3,1.0,1.0,2.0
6,一顾,2,1.0,1.0,2.0
7,万里,3,1.0,1.0,2.0
8,三人,2,1.0,1.0,2.0
9,三千,2,1.0,1.0,2.0


In [12]:
# Keep only the word and its chapter count.
# Rename first, then select by label: DataFrame.filter(items=...) silently
# ignores labels that are absent, so re-running this cell after 'index' had
# already been renamed used to drop the word column without any error.
# This form is safe to re-run and raises KeyError if the word column is missing.
partWords = partWords.rename(columns={'index': 'part_word'})[['part_word', 'part_word_cnt']]
partWords.head(1)

Unnamed: 0,part_word,part_word_cnt
0,一切,10


In [13]:
# Occurrence count of every word_len-gram over the whole book.
wholeWords = pd.DataFrame(list(whole_word_dic.items()), columns=['whole_word', 'whole_word_cnt'])
wholeWords.head(1)

Unnamed: 0,whole_word,whole_word_cnt
0,大秦,18


In [14]:
# Attach the chapter's total character count to every row (same scalar in each).
partWords = partWords.assign(part_word_all_cnt=part_word_all_cnt)
partWords.head(1)

Unnamed: 0,part_word,part_word_cnt,part_word_all_cnt
0,一切,10,38599


In [15]:
# Left join: every chapter word keeps its row and picks up its whole-book
# count (NaN if the word does not occur in wholeWords).
words = (
    partWords
    .merge(wholeWords.set_index('whole_word'), how='left',
           left_on='part_word', right_index=True)
    .reset_index(drop=True)
)

In [16]:
words.head(1)

Unnamed: 0,part_word,part_word_cnt,part_word_all_cnt,whole_word_cnt
0,一切,10,38599,70


In [17]:
# Renumber the rows 0..n-1; drop=True discards the previous index instead of
# turning it into a column.
words = words.reset_index(drop=True)
def score(row):
    """Return the TF-IDF-style weight of one word row.

    ``row`` must provide ``part_word_cnt``, ``part_word_all_cnt`` and
    ``whole_word_cnt``. With add-one smoothing on every count:

        tf    = (part_word_cnt + 1) / (part_word_all_cnt + 1)
        idf   = ln((whole_word_cnt + 1) / (part_word_cnt + 1))
        score = tf * idf
    """
    part_cnt = row['part_word_cnt'] + 1
    tf = part_cnt / (row['part_word_all_cnt'] + 1)
    idf = math.log((row['whole_word_cnt'] + 1) / part_cnt)
    return tf * idf

# Evaluate score() row by row and store the result in a 'score' column.
words = words.assign(score=words.apply(score, axis=1))

In [18]:
# Take the 20 best-scoring words, then list them by chapter frequency.
res = (
    words
    .sort_values('score', ascending=False)
    .head(20)
    .reset_index(drop=True)
    .sort_values('part_word_cnt', ascending=False)
)
res.filter(items=['part_word', 'part_word_cnt', 'whole_word_cnt'])

Unnamed: 0,part_word,part_word_cnt,part_word_all_cnt,whole_word_cnt,score
0,苏秦,289,38599,2785,0.016998
1,张仪,227,38599,2619,0.014422
2,洛阳,110,38599,225,0.002045
3,父亲,84,38599,197,0.001862
4,孟子,66,38599,139,0.001279
5,如此,59,38599,684,0.003785
6,张兄,59,38599,205,0.001917
7,如何,58,38599,834,0.00405
8,绯云,56,38599,453,0.003064
9,国人,52,38599,192,0.001775
