##  导入相关包

In [1]:
import pandas as pd 
import numpy as np
import math
import jieba
from gensim.models import Word2Vec

## 读取数据

In [5]:
# 所有实体
all_entity = pd.read_csv('data/所有实体.csv')
# OTA,UGC数据
df1 = pd.read_csv('data/游记攻略标注后的数据.csv')
df2 = pd.read_csv('data/酒店评论标注后的数据.csv')
df3 = pd.read_csv('data/景区标注后的数据.csv')
df4 = pd.read_csv('data/餐饮标注后的数据.csv')
df_entity = pd.read_csv('data/result2-2.csv')

In [6]:
df_entity

Unnamed: 0,产品ID,产品名称,产品热度,年份
0,ID26,海滨公园,0.85,2021
1,ID56,叉烧饭,0.44,2021
2,ID72,脆皮热狗卷,0.41,2020
3,ID54,榴莲千层,0.38,2019
4,ID33,御水古温泉,0.38,2018
...,...,...,...,...
355,ID37,玉湖,0.05,2018
356,ID14,红树林栈道,0.05,2018
357,ID13,荔枝,0.05,2020
358,ID67,蛋挞,0.05,2020


In [7]:
texts = df1['正文'].astype(str).tolist() + df2['评论内容'].astype(str).tolist() + df3['评论内容'].astype(str).tolist() + df4['评论内容'].astype(str).tolist()

## 关联分析

### PMI互信息值

In [8]:
def pmi(texts=None, entity1=None, entity2=None):
    '''
    texts：语料文档
    entity：某个实体
    '''
    tl = len(texts)  # 文档长度
    pmi_value = 0
    w1 = 0
    w2 = 0
    w12 = 0
    for text in texts:
        if entity1 in text:
            w1 += 1
        if entity2 in text:
            w2 += 1
        if entity1 in text and entity2 in text:
            w12 += 1
    try:
        p = np.log2((w12/tl) / ((w1/tl) * (w2/tl)))
        if p == -np.inf:
            p = 0
        pmi_value += p # 计算PMI
    except:
        pass
    return round(pmi_value, 2)

In [9]:
entitys_pmis = {}
idxs = df_entity['产品ID'].tolist()
entitys = df_entity['产品名称'].tolist()
for e1 in  range(len(entitys)-1):
    for e2 in  range(e1+1, len(entitys)):
        entitys_pmis[entitys[e1] + '-' + entitys[e2]] = [pmi(texts, entitys[e1], entitys[e2]), idxs[e1] + '-' + idxs[e2]]



In [10]:
# entitys_pmis

{'海滨公园-叉烧饭': [0, 'ID26-ID56'],
 '海滨公园-脆皮热狗卷': [0, 'ID26-ID72'],
 '海滨公园-榴莲千层': [0, 'ID26-ID54'],
 '海滨公园-御水古温泉': [0, 'ID26-ID33'],
 '海滨公园-大角湾': [0, 'ID26-ID27'],
 '海滨公园-雅米乐园': [0, 'ID26-ID51'],
 '海滨公园-甜点': [0, 'ID26-ID12'],
 '海滨公园-蛋挞': [0, 'ID26-ID67'],
 '海滨公园-文化广场': [0, 'ID26-ID6'],
 '海滨公园-小东江': [0, 'ID26-ID9'],
 '海滨公园-拉面': [0, 'ID26-ID65'],
 '海滨公园-东南码头': [0, 'ID26-ID40'],
 '海滨公园-骑楼': [6.05, 'ID26-ID31'],
 '海滨公园-凉拌菜': [0, 'ID26-ID89'],
 '海滨公园-甜品': [0, 'ID26-ID49'],
 '海滨公园-UNPOCO柏蔻': [0, 'ID26-ID80'],
 '海滨公园-沙滩': [3.98, 'ID26-ID5'],
 '海滨公园-烧鸭饭': [0, 'ID26-ID57'],
 '海滨公园-海滨公园': [9.05, 'ID26-ID26'],
 '海滨公园-煲仔饭': [0, 'ID26-ID88'],
 '海滨公园-北洛湾': [0, 'ID26-ID25'],
 '海滨公园-面包': [0, 'ID26-ID11'],
 '海滨公园-水东湾': [5.15, 'ID26-ID45'],
 '海滨公园-炸鸡': [0, 'ID26-ID59'],
 '海滨公园-火腿': [0, 'ID26-ID71'],
 '海滨公园-肉松': [0, 'ID26-ID63'],
 '海滨公园-东湖': [6.25, 'ID26-ID15'],
 '海滨公园-I': [0, 'ID26-ID8'],
 '海滨公园-黄油': [0, 'ID26-ID75'],
 '海滨公园-三元塔': [0, 'ID26-ID32'],
 '海滨公园-湖光岩': [5.05, 'ID26-ID36'],
 '海滨公园-蔬菜': [0, 'ID26-ID5

### word2vec词向量相似度

In [11]:
all_entity.head()

Unnamed: 0,实体
0,炒乌冬
1,凤凰酒店
2,兰州拉面
3,甜品
4,中国第一滩一家人公寓


In [12]:
# 分词
with open('data/myworddict.txt', 'w+', newline='', encoding='utf-8') as fp:
    for e in all_entity['实体'].tolist():
        fp.write(e)
        fp.write('\r')

In [13]:
# 停用词
with open('data/stopword.txt', encoding='utf-8') as fp:
    words = fp.readlines()
    stopwords = [s.replace('\n', '').strip() for s in words] + ['\n', ' ', '茂名', '茂名市', '广东', '广东省', '中国',
                '国家', '全国', '哒']

In [14]:
jieba.load_userdict('data/myworddict.txt')
text_cut = list(map(jieba.lcut, texts))
text_stop = [[w for w in tc if w not in stopwords] for tc in text_cut]
text_stop

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\HTHDS\AppData\Local\Temp\jieba.cache
Loading model cost 0.901 seconds.
Prefix dict has been built successfully.


[['放鸡岛',
  '放鸡岛',
  '原名',
  '湾',
  '舟岛',
  '又名',
  '汾洲岛',
  '位于',
  '电',
  '白区',
  '博贺镇',
  '东南',
  '14.5',
  '公里',
  '岛屿',
  '岛上',
  '最高点',
  '高',
  '122',
  '米',
  '面积',
  '1.9',
  '平方公里',
  '海岛',
  '惠州',
  '巽',
  '寮',
  '湾',
  '惠州',
  '巽',
  '寮',
  '湾',
  '读',
  'x',
  'ù',
  'nli',
  'á',
  'ow',
  'ā',
  'n',
  '位于',
  '惠州市',
  '巽',
  '寮',
  '湾',
  '绵延',
  '11',
  '公里',
  '海滩',
  '沙质',
  '洁白',
  '细幼',
  '形',
  '白金',
  '海沙',
  '含硅量',
  '达',
  '99%',
  '世界',
  '罕见',
  '海水',
  '更是',
  '青翠',
  '碧绿',
  '原生态',
  '级',
  '水质',
  '更属',
  '国内',
  '罕有',
  '海滩',
  '百米',
  '水深',
  '超过',
  '米',
  '属',
  '全世界',
  '顶级',
  '天然',
  '海水浴场',
  '空气',
  '负氧离子',
  '含量',
  '达',
  '65856',
  'm3',
  '高于',
  '三亚',
  '国内',
  '顶级',
  '天然',
  '氧吧'],
 ['位于',
  '西部',
  '简称',
  '粤西',
  '介绍',
  '说',
  '湛江',
  '旁',
  '广西',
  '很近',
  '哈哈哈哈',
  '说',
  '潮汕',
  '美食',
  '聚集地',
  '去过',
  '汕头',
  '发现',
  '粤东',
  '粤西',
  '美食',
  '地方',
  '东西',
  '好吃',
  '分散',
  '当地人',
  '很难',
  '找到',
  '帖子',
  '外面',
  '太',
  '想念',
  '家里',


In [15]:
def word2vec_model(text_list=None):
    model = Word2Vec(text_list, vector_size=200, window=5, min_count=1, seed=1, workers=4)
    model.save('data/word2vec.model')

In [18]:
word2vec_model(text_stop)

In [21]:
model = Word2Vec.load('data/word2vec.model')
wv = model.wv

In [22]:
wv.similarity('海滩', '沙滩')

0.9982938

### 删除关联度为0的实体

In [16]:
# new_entitys_pmis = {i:j for i, j in zip(entitys_pmis.keys(), list(entitys_pmis.values())) if j[0] != 0}

In [23]:
new_entitys_pmis = entitys_pmis

### 保存数据

In [42]:
entity_values = list(new_entitys_pmis.values())
entity_values

[[0, 'ID26-ID56'],
 [0, 'ID26-ID72'],
 [0, 'ID26-ID54'],
 [0, 'ID26-ID33'],
 [0, 'ID26-ID27'],
 [0, 'ID26-ID51'],
 [0, 'ID26-ID12'],
 [0, 'ID26-ID67'],
 [0, 'ID26-ID6'],
 [0, 'ID26-ID9'],
 [0, 'ID26-ID65'],
 [0, 'ID26-ID40'],
 [6.05, 'ID26-ID31'],
 [0, 'ID26-ID89'],
 [0, 'ID26-ID49'],
 [0, 'ID26-ID80'],
 [3.98, 'ID26-ID5'],
 [0, 'ID26-ID57'],
 [9.05, 'ID26-ID26'],
 [0, 'ID26-ID88'],
 [0, 'ID26-ID25'],
 [0, 'ID26-ID11'],
 [5.15, 'ID26-ID45'],
 [0, 'ID26-ID59'],
 [0, 'ID26-ID71'],
 [0, 'ID26-ID63'],
 [6.25, 'ID26-ID15'],
 [0, 'ID26-ID8'],
 [0, 'ID26-ID75'],
 [0, 'ID26-ID32'],
 [5.05, 'ID26-ID36'],
 [0, 'ID26-ID58'],
 [0, 'ID26-ID78'],
 [0, 'ID26-ID84'],
 [2.5, 'ID26-ID66'],
 [0, 'ID26-ID50'],
 [0, 'ID26-ID38'],
 [0, 'ID26-ID77'],
 [0, 'ID26-ID16'],
 [0, 'ID26-ID39'],
 [0, 'ID26-ID62'],
 [0, 'ID26-ID42'],
 [0, 'ID26-ID74'],
 [0, 'ID26-ID17'],
 [0, 'ID26-ID70'],
 [0, 'ID26-ID85'],
 [7.21, 'ID26-ID23'],
 [0, 'ID26-ID3'],
 [6.55, 'ID26-ID24'],
 [0, 'ID26-ID79'],
 [0, 'ID26-ID19'],
 [8.05, 'I

In [43]:
entity_names = [str(entity).split('-') for entity in list(new_entitys_pmis.keys())]
idxs = [str(idx[1]).split('-') for idx in entity_values]
sims = [i[0] for i in entity_values]

In [44]:
entity_names

[['海滨公园', '叉烧饭'],
 ['海滨公园', '脆皮热狗卷'],
 ['海滨公园', '榴莲千层'],
 ['海滨公园', '御水古温泉'],
 ['海滨公园', '大角湾'],
 ['海滨公园', '雅米乐园'],
 ['海滨公园', '甜点'],
 ['海滨公园', '蛋挞'],
 ['海滨公园', '文化广场'],
 ['海滨公园', '小东江'],
 ['海滨公园', '拉面'],
 ['海滨公园', '东南码头'],
 ['海滨公园', '骑楼'],
 ['海滨公园', '凉拌菜'],
 ['海滨公园', '甜品'],
 ['海滨公园', 'UNPOCO柏蔻'],
 ['海滨公园', '沙滩'],
 ['海滨公园', '烧鸭饭'],
 ['海滨公园', '海滨公园'],
 ['海滨公园', '煲仔饭'],
 ['海滨公园', '北洛湾'],
 ['海滨公园', '面包'],
 ['海滨公园', '水东湾'],
 ['海滨公园', '炸鸡'],
 ['海滨公园', '火腿'],
 ['海滨公园', '肉松'],
 ['海滨公园', '东湖'],
 ['海滨公园', 'I'],
 ['海滨公园', '黄油'],
 ['海滨公园', '三元塔'],
 ['海滨公园', '湖光岩'],
 ['海滨公园', '蔬菜'],
 ['海滨公园', '日式蟹肉卷'],
 ['海滨公园', '油条碎'],
 ['海滨公园', '牛肉'],
 ['海滨公园', '小炒'],
 ['海滨公园', '野菠萝公园'],
 ['海滨公园', '和牛盖饭'],
 ['海滨公园', '饮料'],
 ['海滨公园', '温德姆酒店'],
 ['海滨公园', '泡芙'],
 ['海滨公园', '牛排'],
 ['海滨公园', '牛肚'],
 ['海滨公园', '晏镜岭'],
 ['海滨公园', '刺身'],
 ['海滨公园', '自助菜'],
 ['海滨公园', '游乐场'],
 ['海滨公园', '中心广场'],
 ['海滨公园', '摩天轮'],
 ['海滨公园', '高钙大骨鸳鸯锅'],
 ['海滨公园', '天马山'],
 ['海滨公园', '赤坎老街'],
 ['海滨公园', '玉湖'],
 ['海滨公园', '菠萝皮'],
 ['海滨公园', '热狗'],
 ['海滨公园

In [45]:
idxs

[['ID26', 'ID56'],
 ['ID26', 'ID72'],
 ['ID26', 'ID54'],
 ['ID26', 'ID33'],
 ['ID26', 'ID27'],
 ['ID26', 'ID51'],
 ['ID26', 'ID12'],
 ['ID26', 'ID67'],
 ['ID26', 'ID6'],
 ['ID26', 'ID9'],
 ['ID26', 'ID65'],
 ['ID26', 'ID40'],
 ['ID26', 'ID31'],
 ['ID26', 'ID89'],
 ['ID26', 'ID49'],
 ['ID26', 'ID80'],
 ['ID26', 'ID5'],
 ['ID26', 'ID57'],
 ['ID26', 'ID26'],
 ['ID26', 'ID88'],
 ['ID26', 'ID25'],
 ['ID26', 'ID11'],
 ['ID26', 'ID45'],
 ['ID26', 'ID59'],
 ['ID26', 'ID71'],
 ['ID26', 'ID63'],
 ['ID26', 'ID15'],
 ['ID26', 'ID8'],
 ['ID26', 'ID75'],
 ['ID26', 'ID32'],
 ['ID26', 'ID36'],
 ['ID26', 'ID58'],
 ['ID26', 'ID78'],
 ['ID26', 'ID84'],
 ['ID26', 'ID66'],
 ['ID26', 'ID50'],
 ['ID26', 'ID38'],
 ['ID26', 'ID77'],
 ['ID26', 'ID16'],
 ['ID26', 'ID39'],
 ['ID26', 'ID62'],
 ['ID26', 'ID42'],
 ['ID26', 'ID74'],
 ['ID26', 'ID17'],
 ['ID26', 'ID70'],
 ['ID26', 'ID85'],
 ['ID26', 'ID23'],
 ['ID26', 'ID3'],
 ['ID26', 'ID24'],
 ['ID26', 'ID79'],
 ['ID26', 'ID19'],
 ['ID26', 'ID48'],
 ['ID26', 'ID37']

In [46]:
# 关联度归一化
def MaxMinNormalization(x,Max,Min):
    x = (x - Min) / (Max - Min)
    return x

In [47]:
new_sims = [MaxMinNormalization(i, max(sims), min(sims)) for i in sims] 
new_sims 

[0.11984021304926763,
 0.11984021304926763,
 0.11984021304926763,
 0.11984021304926763,
 0.11984021304926763,
 0.11984021304926763,
 0.11984021304926763,
 0.11984021304926763,
 0.11984021304926763,
 0.11984021304926763,
 0.11984021304926763,
 0.11984021304926763,
 0.5226364846870838,
 0.11984021304926763,
 0.11984021304926763,
 0.11984021304926763,
 0.38482023968042606,
 0.11984021304926763,
 0.72237017310253,
 0.11984021304926763,
 0.11984021304926763,
 0.11984021304926763,
 0.46271637816245004,
 0.11984021304926763,
 0.11984021304926763,
 0.11984021304926763,
 0.5359520639147803,
 0.11984021304926763,
 0.11984021304926763,
 0.11984021304926763,
 0.4560585885486018,
 0.11984021304926763,
 0.11984021304926763,
 0.11984021304926763,
 0.28628495339547266,
 0.11984021304926763,
 0.11984021304926763,
 0.11984021304926763,
 0.11984021304926763,
 0.11984021304926763,
 0.11984021304926763,
 0.11984021304926763,
 0.11984021304926763,
 0.11984021304926763,
 0.11984021304926763,
 0.1198402130492

In [49]:
result3 = pd.DataFrame(data=idxs, columns=['产品1ID', '产品2ID'])
result3['关联度'] = sims
result3

Unnamed: 0,产品1ID,产品2ID,关联度
0,ID26,ID56,0.00
1,ID26,ID72,0.00
2,ID26,ID54,0.00
3,ID26,ID33,0.00
4,ID26,ID27,0.00
...,...,...,...
7903,ID29,ID19,0.00
7904,ID29,ID62,0.00
7905,ID29,ID12,0.00
7906,ID29,ID76,4.83


In [50]:
prods = pd.DataFrame(data=entity_names, columns=['产品1', '产品2'])
prods

Unnamed: 0,产品1,产品2
0,海滨公园,叉烧饭
1,海滨公园,脆皮热狗卷
2,海滨公园,榴莲千层
3,海滨公园,御水古温泉
4,海滨公园,大角湾
...,...,...
7903,露天矿,天马山
7904,露天矿,泡芙
7905,露天矿,甜点
7906,露天矿,酱汁


In [94]:
# result3.to_csv('data/result3.csv', index=False)

In [53]:
result_name = result3
result_name['产品1'] = prods['产品1']
result_name['产品2'] = prods['产品2']
result_name.head()

Unnamed: 0,产品1ID,产品2ID,关联度,产品1,产品2
0,ID26,ID56,0.0,海滨公园,叉烧饭
1,ID26,ID72,0.0,海滨公园,脆皮热狗卷
2,ID26,ID54,0.0,海滨公园,榴莲千层
3,ID26,ID33,0.0,海滨公园,御水古温泉
4,ID26,ID27,0.0,海滨公园,大角湾


In [96]:
# result_name.to_csv('data/new_result3.csv', index=False)

In [54]:
result_name.shape

(7908, 5)

In [55]:
dup_result = []
for i, j in zip(result_name['产品1'].values, result_name['产品2'].values):
    if i == j:
        dup_result.append(False)
    else:
        dup_result.append(True)
dup_result

[True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 Tru

In [56]:
sum(dup_result)

7818

In [57]:
new_result = result3[dup_result]
all_result = result_name[dup_result]

In [61]:
submit = new_result[['产品1ID', '产品2ID', '关联度']]
submit.head()

Unnamed: 0,产品1ID,产品2ID,关联度
0,ID26,ID56,0.0
1,ID26,ID72,0.0
2,ID26,ID54,0.0
3,ID26,ID33,0.0
4,ID26,ID27,0.0


In [62]:
submit.to_csv('data/result3.csv', index=False)
all_result.to_csv('data/result3(包含产品名).csv', index=False)