In [1]:
import pandas as pd
import numpy as np

In [2]:
# Tunable parameters
# Maximum distance (in token positions) between a word and its context words
rrange = 10
# Number of near-synonyms to return per word
topk = 10

# 1.导入数据

In [3]:
# Load the segmented word list and discard its 'content' column in one step.
table = pd.read_csv('wordlist_v3.csv').drop(columns=['content'])

In [4]:
# Display the loaded table (rich DataFrame preview of the first and last rows).
table

Unnamed: 0,ID,Poem_id,line_number,simple,words
0,1,4371,-100,##饯唐永昌( 一作饯唐郎中洛阳令),饯 唐 永昌 一 作 饯 唐 郎中 洛阳 令
1,2,4371,-1,$$沈佺期,沈 期
2,3,4371,1,洛阳旧有( 一作出) 神明宰,洛阳 旧有 一 作 出 神明 宰
3,4,4371,2,辇毂由来天地中,辇毂 由来 天地 中
4,5,4371,3,馀邑政成何足贵,馀 邑 政成 何足 贵
...,...,...,...,...,...
46272,46273,39205,-1,$$李舜弦,李 舜弦
46273,46274,39205,1,饮筵中散酒微醒,饮 筵 中 散 酒 微 醒
46274,46275,39205,2,蒙蒙雨草瑶阶溼,蒙蒙 雨 草 瑶阶 溼
46275,46276,39205,3,钟晓愁吟独倚屏,钟 晓 愁吟 独倚 屏


In [5]:
# Tokenise: split each row's space-separated 'words' string into one column per
# token, then stack the wide frame into long form (one row per token).
token_columns = table['words'].str.split(' ', expand=True)
split_words = token_columns.stack().rename('word').reset_index()
# Attach the Poem_id of the source row to every token. After reset_index,
# 'level_0' is the row label in `table` and 'level_1' the token's column number
# (its position inside that row).
new_data = pd.merge(
    left=table['Poem_id'],
    right=split_words,
    left_index=True,
    right_on='level_0',
)
new_data

Unnamed: 0,Poem_id,level_0,level_1,word
0,4371,0,0,饯
1,4371,0,1,唐
2,4371,0,2,永昌
3,4371,0,3,一
4,4371,0,4,作
...,...,...,...,...
200373,39205,46275,4,屏
200374,39205,46276,0,尽日
200375,39205,46276,1,池边
200376,39205,46276,2,钓


In [6]:
# Remove the token '一作' (presumably an editorial "variant reading" marker).
# NOTE(review): in the preview output above the marker shows up as two separate
# tokens ('一', '作'), so this filter may match nothing - confirm against the data.
keep_mask = new_data['word'] != '一作'
new_data = new_data[keep_mask]

# 2.计算上下文TF-IDF矩阵

In [7]:
# Drop the source-row label; from here on only Poem_id, the token and its
# order inside the poem matter.
new_data1 = new_data.drop('level_0', axis=1)


def process(x):
    """Return a copy of `x` with a 'position' column numbering its rows 0..len(x)-1.

    Kept so that any other code calling `process` keeps working. It returns
    a new frame instead of mutating the one passed in.
    """
    return x.assign(position=range(len(x)))


# Record each token's running position inside its poem.
# cumcount() numbers the rows of every Poem_id group in their existing order and
# keeps the original index. The previous groupby(...).apply(process) mutated the
# group inside the callback, and on pandas >= 2.0 it returns Poem_id both as an
# index level and as a column, which makes the merge on 'Poem_id' below ambiguous.
new_data2 = new_data1.assign(position=new_data1.groupby('Poem_id').cumcount())
new_data3 = new_data2.drop('level_1', axis=1)
# Self-join on Poem_id: every ordered pair of tokens from the same poem becomes
# one row, with the two sides suffixed _x and _y. This grows quadratically with
# poem length; the next cell keeps only pairs inside the context window.
new_data4 = new_data3.merge(new_data3, left_on='Poem_id', right_on='Poem_id', how='left')

In [None]:
# Context window: keep token pairs at most `rrange` positions apart, and drop
# the pairing of a token with itself (distance 0).
position_gap = (new_data4['position_x'] - new_data4['position_y']).abs()
within_window = position_gap <= rrange
not_self = position_gap != 0
new_data5 = new_data4[within_window & not_self]

## 2.1计算TF

In [None]:
# Down-weight context pairs whose two words differ in character length:
# equal-length pairs get 1, all others get `punish_coef`.
punish_coef = 0.4
# Vectorised length comparison instead of a Python-level apply(axis=1) over every
# pair row. assign() returns a new frame, so no column is written into the
# filtered slice (which triggers SettingWithCopyWarning).
same_length = new_data5['word_x'].str.len() == new_data5['word_y'].str.len()
new_data5 = new_data5.assign(same_len=np.where(same_length, 1, punish_coef))

In [None]:
# Display the context-pair table, now including the 'same_len' column.
new_data5

In [None]:
# Term-frequency weight of each context pair.
# Distance weighting: every position inside the window counts equally.
# Disabled distance-decay alternatives from the original cell:
#   1/abs(new_data5['position_x'] - new_data5['position_y'])
#   1/np.sqrt(abs(new_data5['position_x'] - new_data5['position_y']))
distance_weight = 1
# Final weight = distance weight scaled by the length-match factor.
new_data5['weight'] = distance_weight * new_data5['same_len']

In [None]:
# Total weight per (word_x, word_y) pair.
# The built-in grouped sum replaces apply(lambda x: sum(x.weight)), which ran a
# Python-level sum once per group.
new_data6 = new_data5.groupby(['word_x', 'word_y'])['weight'].sum()
# One-column frame ('weight') indexed by (word_x, word_y), ready to be unstacked.
new_data7 = new_data6.to_frame('weight')

In [None]:
# Pivot into the TF matrix: the innermost index level (word_y) becomes the
# columns, leaving one row per word_x.
new_data8 = new_data7.unstack(level=-1)
# Pairs that never co-occur have no entry after the pivot; give them weight 0.
new_data9 = new_data8.fillna(value=0)

In [None]:
# Only mine words that occur more than 10 times in the corpus.
# size() counts the rows of each word directly, so no helper 'freq' column has to
# be written into new_data (a filtered frame, where column assignment triggers
# SettingWithCopyWarning).
a = new_data.groupby('word').size()
# Words above the threshold, in the sorted order produced by groupby.
target_list = a[a > 10].index.values

In [None]:
# Keep only the TF rows of the target words, in target_list order.
new_data10 = new_data9.loc[target_list, :]
# TF matrix as a plain array: one row per target word, one column per context word.
arr = new_data10.values

## 2.2计算IDF

In [None]:
# N: number of distinct (Poem_id, position_y) combinations among the context pairs.
N = new_data5.groupby(['Poem_id', 'position_y']).ngroups

In [None]:
# IDF per word: log(N / number of context-pair rows in which the word is word_x),
# where N is the number of distinct (Poem_id, position_y) token positions.
# NOTE(review): count() tallies context rows (repeats included), not the number
# of distinct neighbouring words - confirm this is the intended definition.
pairs_per_word = new_data5.groupby('word_x')['word_y'].count()
IDF = np.log(N / pairs_per_word)

In [None]:
# Turn the IDF values into a (1, n_words) row vector so that it broadcasts
# across the rows of the TF matrix.
idf = IDF.values[np.newaxis, :]

In [None]:
# TF-IDF vectors: scale every TF column by its IDF (row-vector broadcast).
# NOTE(review): this relies on the TF columns (word_y) and the IDF entries
# (word_x) being in the same order - presumably both sorted by groupby; confirm.
word_vec = np.multiply(arr, idf)

# 3.计算余弦相似度，对结果排序

In [None]:
# Cosine similarity between the TF-IDF vectors of all target words. For every
# word, d[word] holds its `topk` most similar other words as
# (word, similarity) tuples, best first.

# Row norms do not change inside the loop, so compute them once.
row_norms = np.linalg.norm(word_vec, axis=1)
# An all-zero vector has norm 0; divide by 1 there so its similarities come out
# as 0 instead of NaN (NaN would make the ranking below meaningless).
safe_norms = np.where(row_norms > 0, row_norms, 1.0)

d = {}
for i, word in enumerate(target_list):
    sims = np.dot(word_vec, word_vec[i]) / (safe_norms[i] * safe_norms)
    # Stable descending order: tied words keep their target_list order.
    ranked = np.argsort(-sims, kind='stable')
    # Exclude the query word by position and key the result by the query word
    # itself. The old code assumed the top-ranked hit was always the query word,
    # which fails on ties or NaN and silently mis-keyed or overwrote entries.
    ranked = ranked[ranked != i][:topk]
    d[word] = [(target_list[j], sims[j]) for j in ranked]

In [None]:
# Spot check: show the neighbour list stored for the word '明月'.
d['明月']

# 4.和benchmark进行比较

In [None]:
import gensim

## 4.1处理输入

In [None]:
# Re-read the word list; `wl` holds its 'simple' column as a plain Python list.
df = pd.read_csv('wordlist_v3.csv')
wl = df['simple'].tolist()
# Weight table indexed by its 'word' column.
# NOTE(review): the column labels are presumably poem/document ids - confirm.
weight = pd.read_csv('weight1.csv', index_col='word', engine='c', encoding="utf-8")

# For every column of `weight`, collect the words whose entry in that column is
# not NaN (x == x is False only for NaN).
id_word_dict = {}
for col in weight.columns:
    column = weight[col]
    id_word_dict[col] = column[column == column].index.values

# One training "sentence" per column: its words in the row order of `weight`.
# NOTE(review): that is the table's row order, not necessarily the order in
# which the words occur in the text - confirm this is acceptable for Word2Vec.
sentences = [list(words) for words in id_word_dict.values()]

In [None]:
# Train the benchmark Word2Vec model in a single call. (The same can be done in
# three steps - create an empty Word2Vec(), build_vocab(sentences), then
# train(sentences) - but that is not needed here.)
#
# Parameter notes, per the gensim Word2Vec documentation:
#   sg=1       skip-gram (more sensitive to rare words); the default sg=0 is CBOW.
#   window     maximum distance between the current word and a predicted word.
#   min_count  words with a total frequency below this are ignored (default 5).
#   sample     threshold above which frequent words are randomly down-sampled
#              (default 1e-3).
#   negative   if > 0, negative sampling is used with this many noise words.
#   hs=1       hierarchical softmax is used; with hs=0 and negative > 0,
#              negative sampling is used instead.
#   workers    number of worker threads. NOTE(review): the original note said
#              this needs "Cpython"; presumably Cython is meant - confirm.
#   size / vector_size (not set here, library default applies) is the
#   dimensionality of the word vectors; larger values cost memory and time.
w2v_options = dict(
    sg=1,
    window=rrange,
    min_count=10,
    negative=3,
    sample=0.001,
    hs=1,
    workers=4,
)
model = gensim.models.Word2Vec(sentences, **w2v_options)

# Save only the word vectors in word2vec format. The file can be opened as text
# (a `binary` flag selects the binary format instead), but the internal tree
# structure is not stored, so training cannot be continued from this file.
model.wv.save_word2vec_format("benchmark")
# Reload the saved vectors; from here on `model` is the loaded vectors object,
# not the trainable Word2Vec model.
model = model.wv.load_word2vec_format('benchmark')

In [None]:
# All words in the weight table: the values of its 'word' index.
all_words = weight.index.values

In [None]:
# Agreement with the benchmark: for each neighbourhood size n, res[n] is the
# average fraction of a word's first 5 neighbours in `d` that also appear among
# the benchmark model's top-n most similar words.
topn_selected = [5, 10, 30, 50, 100]
res = {}
for n in topn_selected:
    ratios = []
    for word in target_list:
        try:
            benchmark_top = {t[0] for t in model.most_similar(word, topn=n)}
            own_top = [t[0] for t in d[word][0:5]]
        except KeyError:
            # The word is missing from the benchmark vocabulary or from `d`: skip
            # it. Only KeyError is caught; the old bare `except:` also hid real
            # failures (e.g. a wrong model object) and yielded an empty result.
            continue
        hits = sum(1 for w in own_top if w in benchmark_top)
        ratios.append(hits / 5)
    # No comparable word at all -> NaN (same value np.mean([]) gives, minus the warning).
    res[n] = np.mean(ratios) if ratios else float('nan')

In [None]:
# Display the overlap ratio obtained for each benchmark top-n size.
res

In [None]:
#np.save('similar_dict.npy', d) 

# Load


In [None]:
#read_dictionary = np.load('similar_dict.npy',allow_pickle=True).item()
#print(read_dictionary['明月']) # displays the saved neighbour list for '明月'