In [2]:
import pandas as pd
import numpy as np
import itertools
from sklearn.cluster import KMeans
import pprint

## 1. Prepare input for node2vec

> We'll use a CSV file where each row represents a single recommendable item: it contains a comma separated list of the named entities that appear in the item's title.

一个样本为一个序列特征。

In [59]:
named_entities_df = pd.read_csv('../output/event_features.csv')
named_entities_df.columns = ['named_entities']
# named_entities_df['named_entities'] = named_entities_df.named_entities.str.replace(" ", ",")
# 为了适配代码才做的
named_entities_df.head()

Unnamed: 0,named_entities
0,3bfd1a65 a52b92d5 7da34a02 25fa8af4
1,f56e0afc ec138c1c 4a4c3d21 e37a2b78 17113b36 4...
2,c74f40cd c74f40cd a52b92d5 7da34a02 7da34a02 f...
3,3bfd1a65 a1e4395d c74f40cd 28ed704e a1e4395d 7...
4,7525289a f6947f54 bdf49a58


> First, we'll have to tokenize the named entities, since `node2vec` expects integers.

处理成节点特征。

In [60]:
tokenizer = dict()
named_entities_df['named_entities'] = named_entities_df['named_entities'].astype(str).apply(
    lambda named_entities: [tokenizer.setdefault(named_entitie, len(tokenizer)) for named_entitie in named_entities.split(' ')]
)
named_entities_df.head()
# https://blog.csdn.net/u012535605/article/details/81709834
# astype(str)

Unnamed: 0,named_entities
0,"[0, 1, 2, 3]"
1,"[4, 5, 6, 7, 8, 6, 9, 6, 7, 8, 8, 9, 9, 6]"
2,"[10, 10, 1, 2, 2, 11, 12, 10]"
3,"[0, 13, 10, 14, 13, 2, 12, 15]"
4,"[16, 17, 18]"


In [61]:
pprint.pprint(list(tokenizer.items())[0:5])

[('3bfd1a65', 0),
 ('a52b92d5', 1),
 ('7da34a02', 2),
 ('25fa8af4', 3),
 ('f56e0afc', 4)]


In [92]:
print( '一共有',len(list(tokenizer.items())) ,'event','对它们进行 embedding')

一共有 98 event 对它们进行 embedding


In order to construct the graph on which we'll run node2vec, we first need to understand which named entities appear together.

In [70]:
named_entities_df.shape

(17690, 1)

In [62]:
pairs_df = named_entities_df['named_entities'].apply(lambda named_entities: list(itertools.combinations(named_entities, 2)))
pairs_df = pairs_df[pairs_df.apply(len) > 0]
pairs_df = pd.DataFrame(np.concatenate(pairs_df.values), columns=['named_entity_1', 'named_entity_2'])
pairs_df.head()

Unnamed: 0,named_entity_1,named_entity_2
0,0,1
1,0,2
2,0,3
3,1,2
4,1,3


In [71]:
pairs_df.shape

(1011209, 2)

Now we can construct the graph. The weight of an edge connecting two named entities will be the number of times these named entities appear together in our dataset.

In [69]:
pairs_df.groupby(['named_entity_1', 'named_entity_2']).size().reset_index(name='weight').head()

Unnamed: 0,named_entity_1,named_entity_2,weight
0,0,1,452
1,0,2,1251
2,0,3,229
3,0,10,454
4,0,11,645


In [93]:
NAMED_ENTITIES_CO_OCCURENCE_THRESHOLD = 0
# By default, 25

edges_df = pairs_df.groupby(['named_entity_1', 'named_entity_2']).size().reset_index(name='weight')
edges_df = edges_df[edges_df['weight'] > NAMED_ENTITIES_CO_OCCURENCE_THRESHOLD]
edges_df[['named_entity_1', 'named_entity_2', 'weight']].to_csv('edges.csv', header=False, index=False, sep=' ')
# 为了作为文本输入，这里需要按照`' '`进行切分
# https://github.com/aditya-grover/node2vec/issues/42
edges_df.head()

Unnamed: 0,named_entity_1,named_entity_2,weight
0,0,1,452
1,0,2,1251
2,0,3,229
3,0,10,454
4,0,11,645


In [94]:
edges_df.shape

(1381, 3)

Next, we'll run `node2vec`, which will output the result embeddings in a file called `emb`.  
We'll use the open source implementation developed by [Stanford](https://github.com/snap-stanford/snap/tree/master/examples/node2vec).

In [95]:
# !git clone https://github.com/JiaxiangBU/node2vec.git
# 下载后，调用 node2vec 代码，基于 word2vec 开发，我调整了 Python 3 版本适用。

In [96]:
!python node2vec/src/main.py --input edges.csv --output emb --weighted

Walk iteration:
1 / 10
2 / 10
3 / 10
4 / 10
5 / 10
6 / 10
7 / 10
8 / 10
9 / 10
10 / 10


## 2. Read embedding and run KMeans clusterring:

In [97]:
emb_df = pd.read_csv('emb', sep=' ', skiprows=[0], header=None)
emb_df.set_index(0, inplace=True)
emb_df.index.name = 'named_entity'
emb_df.head()

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,119,120,121,122,123,124,125,126,127,128
named_entity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
51,0.201396,0.396243,-0.266897,-0.580919,0.309138,-0.215859,0.177556,-0.281016,-0.131219,-0.013089,...,-0.158429,0.280645,0.427394,-0.277015,-0.268398,0.309646,0.308136,0.149765,0.139561,0.137682
2,-0.37751,0.1178,-0.56841,-0.389599,0.084741,-0.241372,-0.283782,-0.20328,-0.153969,-0.230429,...,0.138216,0.316665,0.105284,-0.261846,-0.60253,0.031347,-0.658672,-0.371573,-0.082173,-0.06179
35,0.086804,-0.426829,-0.11655,-0.151494,-0.107983,-0.178468,0.257112,-0.354095,0.221099,-0.24283,...,-0.095036,0.616571,0.214014,-0.083181,-0.456401,0.249818,0.158148,0.136226,0.003339,-0.519082
24,0.049161,-0.354999,-0.16584,0.084412,-0.240874,-0.380403,0.409854,0.238315,-0.26564,-0.33351,...,0.04381,0.042521,0.2631,0.179329,-0.232597,0.271666,0.244623,-0.231822,-0.327294,0.041148
9,0.034572,-0.339031,-0.173325,0.068606,-0.245434,-0.380075,0.41075,0.238807,-0.27912,-0.34191,...,0.054638,0.022407,0.265012,0.188955,-0.232845,0.279593,0.242585,-0.242018,-0.338466,0.055907


In [98]:
emb_df.shape

(97, 128)

基本上每个类别都有 embedding 了。

> Each column is a dimension in the embedding space. Each row contains the dimensions of the embedding of one named entity.  

每一列是一个 embedding 的维度。

> We'll now cluster the embeddings using a simple clustering algorithm such as k-means.

下面利用 embedding 进行聚类。

In [81]:
NUM_CLUSTERS = 2
# By default 10

kmeans = KMeans(n_clusters=NUM_CLUSTERS)
kmeans.fit(emb_df)
labels = kmeans.predict(emb_df)
emb_df['cluster'] = labels
clusters_df = emb_df.reset_index()[['named_entity','cluster']]
clusters_df.head()

Unnamed: 0,named_entity,cluster
0,51,0
1,35,1
2,2,1
3,24,0
4,9,0


## 3. Prepare input for Gephi:

[Gephi](https://gephi.org) (Java 1.8 or higher) is a nice visualization tool for graphical data.  
We'll output our data into a format recognizable by Gephi.

In [82]:
id_to_named_entity = {named_entity_id: named_entity
                      for named_entity, named_entity_id in tokenizer.items()}

with open('clusters.gdf', 'w') as f:
    f.write('nodedef>name VARCHAR,cluster_id VARCHAR,label VARCHAR\n')
    for index, row in clusters_df.iterrows():
        f.write('{},{},{}\n'.format(row['named_entity'], row['cluster'], id_to_named_entity[row['named_entity']]))
    f.write('edgedef>node1 VARCHAR,node2 VARCHAR, weight DOUBLE\n')
    for index, row in edges_df.iterrows(): 
        f.write('{},{},{}\n'.format(row['named_entity_1'], row['named_entity_2'], row['weight']))

Finally, we can open `clusters.gdf` using Gephi in order to inspect the clusters.