In [36]:
# 数据处理
import pandas as pd
import numpy as np
import json
import csv
import random
import math
from itertools import product


# 网络分析与可视化
import networkx as nx
from networkx.readwrite import json_graph
from pyvis import network as net
import matplotlib.pyplot as plt
import powerlaw # Power laws are probability distributions with the form:p(x)∝x−α

In [37]:
# Load the concept network (node-link JSON) and the per-concept metadata table.
# Alternative input file: '02_Graph/MasterConceptNetwork.json'
#
# The encoding is given explicitly: the graph carries Chinese category labels,
# and open() would otherwise fall back to a platform-dependent default.
with open('02_Graph/MasterConceptNetwork_Word2Vec-0.62_Baseline-Test.json', encoding='utf-8') as f:
    js_graph = json.load(f)

# Building the graph does not need the file handle, so it happens after the file is closed.
G_MasterConceptNetwork = json_graph.node_link_graph(js_graph)

df_concept_info = pd.read_csv('00_Raw Data/CCFD_concept_info.csv')

In [66]:
# List every distinct value that the nodes of a graph take for one attribute
def NodeAttributeValueList(G,attribute):
    """Return the set of values found under ``attribute`` across all nodes of ``G``.

    The per-node values are packed into a NumPy array and flattened before
    being collected, so the returned set holds NumPy scalars.

    Args:
        G: graph-like object exposing ``G.nodes`` (iterable of node ids that
            can also be indexed by node id to get the attribute mapping).
        attribute: name of the node attribute to read.

    Returns:
        set: the distinct attribute values.

    Raises:
        KeyError: if some node does not carry ``attribute``.
    """
    per_node_values = []
    for node_id in G.nodes:
        per_node_values.append(G.nodes[node_id][attribute])
    return set(np.asarray(per_node_values).ravel())

def get_children(df, row_name):
    """Return the set of distinct values stored in one column of ``df``.

    The column is read directly instead of walking the frame with
    ``iterrows()``: that avoids building a Series per row and keeps the
    column's own dtype (``iterrows()`` upcasts each row to a common dtype, so
    integers in a frame that also has float columns came back as floats).

    Args:
        df: pandas DataFrame to read from.
        row_name: label of the column whose values are collected (despite the
            parameter name, this is a column label).

    Returns:
        set: distinct values of the column as plain Python objects; missing
        values are collapsed to a single entry.

    Raises:
        KeyError: if ``row_name`` is not a column of ``df``.
    """
    return set(df[row_name].unique().tolist())

In [67]:
# Distinct 'maincategory' and 'subcategory' values found on the graph's nodes
maincategorys, subcategorys = (
    NodeAttributeValueList(G_MasterConceptNetwork, attribute_name)
    for attribute_name in ('maincategory', 'subcategory')
)

### Series for ECharts parallel coordinates

In [63]:
# Convert the full dataset into the `series` structure used by an ECharts
# parallel-coordinates chart: one series per main category, one data row
# [id, laplacian, closeness, degree, familiarity] per concept node.

maincategories = [
        "人造物",
        "交通工具",
        "动物",
        "植物",
        "食物",
        "自然物",
        "身体部位",
      ]

series = [
    {
        'name': category,
        'type': 'parallel',
        'lineStyle': {
            'width': 1,
            'opacity': 0.01
        },
        'data': []
    }
    for category in maincategories
]

# Lookup tables built once, instead of rescanning `series` and the DataFrame
# for every node.
series_by_name = {serie['name']: serie for serie in series}

# Keep the first row per Id (the row the original positional lookup selected).
# .tolist() yields plain Python scalars, so the export contains bare numbers
# rather than NumPy scalar representations.
concept_rows = df_concept_info.drop_duplicates(subset='Id')
familiarity_by_id = dict(zip(concept_rows['Id'].tolist(), concept_rows['familiarity'].tolist()))

for node in js_graph['nodes']:
    array = [
        node['id'],
        node['laplacian_centrality'],
        node['closeness_centrality'],
        node['degree_centrality'],
        familiarity_by_id[node['id']],   # KeyError names the concept missing from the table
    ]
    # KeyError here names a main category that is absent from `maincategories`
    series_by_name[node['maincategory']]['data'].append(array)


# Written as JSON text (also a valid JavaScript literal) in UTF-8, so the
# Chinese category names survive regardless of the platform default encoding.
with open('02_VASystemData/ParallelCoordinate.txt', 'w', encoding='utf-8') as f:
    json.dump(series, f, ensure_ascii=False)

### Hierarchy for yEd

In [96]:
# Turn the full dataset into a concept hierarchy for yEd:
# upper-level category -> lower-level category -> concept, saved as GraphML.
# Nodes are keyed by their name, so equal names on different levels share one node.

G_Tree = nx.Graph()

upper_column, lower_column = '上级类别', '下级类别'

for upper_name in get_children(df_concept_info, upper_column):
    G_Tree.add_node(upper_name, label=upper_name)

    # Rows of the concept table that belong to this upper-level category
    rows_of_upper = df_concept_info.loc[df_concept_info[upper_column] == upper_name]

    for lower_name in get_children(rows_of_upper, lower_column):
        G_Tree.add_node(lower_name, label=lower_name)
        G_Tree.add_edge(upper_name, lower_name)


# Attach every concept of the network to the node named after its subcategory
for concept, concept_attrs in G_MasterConceptNetwork.nodes(data=True):
    G_Tree.add_node(concept, label=concept)
    G_Tree.add_edge(concept, concept_attrs['subcategory'])


nx.write_graphml_lxml(G_Tree, "tree.graphml")

### Hierarchy for D3

In [121]:
# Convert the full dataset into the nested {"name", "children"} hierarchy used
# by D3: root -> upper-level category -> lower-level category -> concept.

root = {
    "name": "semantic network",
    "children": []
}

# (upper-level category, lower-level category) -> that subcategory's "children"
# list, so each concept is placed with one dictionary lookup instead of two
# linear scans of the tree.
concepts_by_category = {}

for main_name in get_children(df_concept_info, '上级类别'):
    rows_of_main = df_concept_info[df_concept_info['上级类别'] == main_name]
    class_dic = {
        "name": main_name,
        "children": []
    }
    root['children'].append(class_dic)

    for sub_name in get_children(rows_of_main, '下级类别'):
        subclass_dic = {
            "name": sub_name,
            "children": []
        }
        class_dic['children'].append(subclass_dic)
        concepts_by_category[(main_name, sub_name)] = subclass_dic['children']

for n, v in G_MasterConceptNetwork.nodes(data=True):
    concept_dict = {
        "name": n,
        # Leaf size: sum of the three centrality scores stored on the node
        "value": v['closeness_centrality'] + v['laplacian_centrality'] + v['degree_centrality']
    }
    # A KeyError here names the (maincategory, subcategory) pair that is
    # missing from the concept table.
    concepts_by_category[(v['maincategory'], v['subcategory'])].append(concept_dict)


# Save; UTF-8 is explicit because ensure_ascii=False writes the Chinese names as-is
with open("02_Graph/Concepts_Tree.json", "w", encoding="utf-8") as f:
    json.dump(root, f, ensure_ascii=False)
    

### Graph Summary

In [None]:
# Graph summary at the subcategory level: each subcategory becomes one node and
# the edge between two subcategories carries the summed weight of all
# concept-level edges that connect them.

# Summary edges whose accumulated weight is below this value are dropped.
min_summary_edge_weight = 5.9

G_Summary = nx.Graph()
G_Summary.add_nodes_from(subcategorys)
# Start from a complete graph in which every edge has weight 0
G_Summary.add_edges_from(
    ((a, b) for a, b in product(subcategorys, subcategorys) if a != b),
    weight=0,
)

# Count picture-naming outcomes per subcategory in one pass over the concept
# nodes (instead of rescanning the whole network for every summary node).
# Only values equal to True are counted; nodes without the key are ignored.
picture_naming_tested = {}
picture_naming_successed = {}
for _, concept_attrs in G_MasterConceptNetwork.nodes(data=True):
    group = concept_attrs['subcategory']
    if concept_attrs.get('picture_naming_status') == True:
        picture_naming_tested[group] = picture_naming_tested.get(group, 0) + 1
    if concept_attrs.get('picture_naming_result') == True:
        picture_naming_successed[group] = picture_naming_successed.get(group, 0) + 1

# Attach per-subcategory statistics to each summary node
for name, attrs in G_Summary.nodes(data=True):
    rows = df_concept_info[df_concept_info['下级类别'] == name]
    attrs['count'] = len(rows)
    attrs['subcategory'] = name
    # First upper-level category listed for this subcategory; raises IndexError
    # if the subcategory does not occur in the concept table.
    attrs['maincategory'] = rows['上级类别'].unique()[0]
    attrs['picture_naming_tested'] = picture_naming_tested.get(name, 0)
    attrs['picture_naming_successed'] = picture_naming_successed.get(name, 0)

# Accumulate concept-level edge weights onto the matching summary edge;
# edges inside a single subcategory are skipped.
for u, v, e in G_MasterConceptNetwork.edges(data=True):
    u_subcategory = G_MasterConceptNetwork.nodes[u]['subcategory']
    v_subcategory = G_MasterConceptNetwork.nodes[v]['subcategory']
    if u_subcategory != v_subcategory:
        G_Summary[u_subcategory][v_subcategory]['weight'] += e['weight']

# Collect first, remove afterwards: the graph must not change while it is iterated
removelist = [
    [u, v]
    for u, v, weight in G_Summary.edges(data='weight')
    if weight < min_summary_edge_weight
]
G_Summary.remove_edges_from(removelist)

# Export for G6, which expects the edge list under "edges". Only the top-level
# key is renamed, so text inside the data is never rewritten, and this also
# works when networkx already emits the key as "edges".
summary_data = nx.node_link_data(G_Summary)
if 'links' in summary_data:
    summary_data['edges'] = summary_data.pop('links')

with open('02_Graph/MasterConceptNetwork_Word2Vec-0.62_Baseline-Test_Summary-Subcategory_G6.json', 'w', encoding='utf-8') as f:
    json.dump(summary_data, f, ensure_ascii=False)

# Optional export for Gephi:
# nx.write_gexf(G_Summary, "02_Graph/ConceptNetwork_Summary_CCFD-Similarity-Matrix_Weight-0.52.gexf")

In [9]:
# Graph summary at the main-category level: each main category becomes one node
# and the edge between two main categories carries the summed weight of all
# concept-level edges that connect them.

# Summary edges whose accumulated weight is below this value are dropped.
min_summary_edge_weight = 5.9

G_Summary = nx.Graph()
G_Summary.add_nodes_from(maincategorys)
# Start from a complete graph in which every edge has weight 0
G_Summary.add_edges_from(
    ((a, b) for a, b in product(maincategorys, maincategorys) if a != b),
    weight=0,
)

# Count picture-naming outcomes per main category in one pass over the concept
# nodes (instead of rescanning the whole network for every summary node).
# Only values equal to True are counted; nodes without the key are ignored.
picture_naming_tested = {}
picture_naming_successed = {}
for _, concept_attrs in G_MasterConceptNetwork.nodes(data=True):
    group = concept_attrs['maincategory']
    if concept_attrs.get('picture_naming_status') == True:
        picture_naming_tested[group] = picture_naming_tested.get(group, 0) + 1
    if concept_attrs.get('picture_naming_result') == True:
        picture_naming_successed[group] = picture_naming_successed.get(group, 0) + 1

# Attach per-category statistics to each summary node
for name, attrs in G_Summary.nodes(data=True):
    attrs['count'] = len(df_concept_info[df_concept_info['上级类别'] == name])
    attrs['maincategory'] = name
    attrs['picture_naming_tested'] = picture_naming_tested.get(name, 0)
    attrs['picture_naming_successed'] = picture_naming_successed.get(name, 0)

# Accumulate concept-level edge weights onto the matching summary edge;
# edges inside a single main category are skipped.
for u, v, e in G_MasterConceptNetwork.edges(data=True):
    u_maincategory = G_MasterConceptNetwork.nodes[u]['maincategory']
    v_maincategory = G_MasterConceptNetwork.nodes[v]['maincategory']
    if u_maincategory != v_maincategory:
        G_Summary[u_maincategory][v_maincategory]['weight'] += e['weight']

# Collect first, remove afterwards: the graph must not change while it is iterated
removelist = [
    [u, v]
    for u, v, weight in G_Summary.edges(data='weight')
    if weight < min_summary_edge_weight
]
G_Summary.remove_edges_from(removelist)

# Export for G6, which expects the edge list under "edges". Only the top-level
# key is renamed, so text inside the data is never rewritten, and this also
# works when networkx already emits the key as "edges".
summary_data = nx.node_link_data(G_Summary)
if 'links' in summary_data:
    summary_data['edges'] = summary_data.pop('links')

with open('02_Graph/MasterConceptNetwork_Word2Vec-0.62_Baseline-Test_Summary-Maincategory_G6.json', 'w', encoding='utf-8') as f:
    json.dump(summary_data, f, ensure_ascii=False)

# Optional export for Gephi:
# nx.write_gexf(G_Summary, "02_Graph/ConceptNetwork_Summary_CCFD-Similarity-Matrix_Weight-0.52.gexf")