In [1]:
# 数据处理
import pandas as pd
import numpy as np
import json
import csv
import random
import math
from itertools import product


# 网络分析与可视化
import networkx as nx
from pyvis import network as net
import matplotlib.pyplot as plt
import powerlaw # Power laws are probability distributions with the form:p(x)∝x−α

In [2]:
# 读取已经处理好的完整语义网络数据

df_complete = pd.read_csv('01_Processed Data/Complete-Data.csv')
df_complete.head()

Unnamed: 0,Concept,Rel,Feature,Frequency,familiarity,concretness,上级类别,下级类别,coverage,cue_validity,categorical_distinctiveness
0,安全带,可以,可以-保护,27.096774,9.0,9.0,人造物,工具,0.851345,0.03832,0.158537
1,安全带,是,是-带子,22.258065,9.0,9.0,人造物,工具,0.851345,0.286019,0.012195
2,安全带,其他,其他-车,17.419355,9.0,9.0,人造物,工具,0.851345,0.122012,0.021341
3,安全带,是,是-安保的,16.451613,9.0,9.0,人造物,工具,0.851345,0.089938,0.041159
4,安全带,可以,可以-保护人,15.483871,9.0,9.0,人造物,工具,0.851345,0.149063,0.035061


In [3]:
# with open('02_Graph/MasterConceptNetwork.json') as f:
with open('02_Graph/MasterConceptNetwork_Word2Vec-0.62_Baseline-Test.json') as f:
    js_graph = json.load(f)
    G_MasterConceptNetwork = nx.json_graph.node_link_graph(js_graph)

In [12]:
G_MasterConceptNetwork.nodes['安全带']

{'subcategory': '工具',
 'maincategory': '人造物',
 'degree_centrality': 0.05494505494505494,
 'pagerank': 0.06060983413463747,
 'closeness_centrality': 0.5241060346953804,
 'laplacian_centrality': 0.021495841590250742,
 'clustering_coefficent': 0.23774567830802595,
 'cluster': 0,
 'forceAtalasX': 1936.9679,
 'forceAtalasY': -3296.95}

In [15]:
data={}
data['node']=[x[0] for x in G_MasterConceptNetwork.nodes(data=True)]
data['maincategory'] = [x[1]['maincategory'] for x in G_MasterConceptNetwork.nodes(data=True)]
data['subcategory'] = [x[1]['subcategory'] for x in G_MasterConceptNetwork.nodes(data=True)]
data['closeness_centrality'] = [x[1]['closeness_centrality'] for x in G_MasterConceptNetwork.nodes(data=True)]
df1 = pd.DataFrame(data)

In [30]:
def getBaselineOriginal(x):
    if x in cue_words:
        return 1
    else:
        return 0

def getBaselineNew(x):
    if x in cue_words_generated:
        return 1
    else:
        return 0

In [32]:
df1['baseline_original']=df1['node'].apply(lambda x :getBaselineOriginal(x))
df1['baseline_new']=df1['node'].apply(lambda x :getBaselineNew(x))
df1.to_csv('cuewords_compare.csv',header=True, index=False, encoding='utf_8_sig')

In [4]:
# 将子图内的节点按重要性排序，以logspace采样出指定数量的节点
def NodeSampling(G,num_of_sample, mode):
    # pr = nx.pagerank(G, alpha=0.85,weight='weight')
    # words = list(dict(sorted(pr.items(), key=lambda item: item[1], reverse=True)))
    words = list(sorted(G.nodes(), key=lambda n: G.nodes[n]['closeness_centrality'], reverse=True))
    if len(words) > num_of_sample:
        if mode == 'log':
            # np.logspace(start, end, num_of_samples, endpoint=True, base=10.0)
                index_list = np.logspace(0, math.log(len(words)-1,10), num_of_sample, endpoint=True)
                index_list = sorted(set([int(x) for x in index_list]))
                print(index_list)
                words = [words[i] for i in index_list]
        elif mode == 'top':
                words = words[0:num_of_sample]

    return words

# 查看全部节点在某个属性的所有选项
def NodeAttributeValueList(G,attribute):
    return set(np.array([G.nodes[n][attribute] for n in G.nodes]).flatten())

# 类别子图
def ClusterFilter(G,nodeAttribute, nodeValue, edgeAttribute, edgeValue, writeFile, format):

    # 筛选子图 Node & Edge Attribute Filter
    def filter_node(node):
        if G.nodes[node][nodeAttribute] == nodeValue:
            return node 

    def filter_edge(u,v):
        if G[u][v][edgeAttribute] > edgeValue:
            return G[u][v]

    view = nx.subgraph_view(G,filter_node=filter_node, filter_edge=filter_edge,)
    largest_cc = max(nx.connected_components(view), key=len)
    view = view.subgraph(largest_cc)

    # 储存文件
    if writeFile == True:
        if format == 'Gephi':
            fileName = '02_Graph/ConceptNetwork_'+nodeAttribute+'_'+str(nodeValue)+'_'+edgeAttribute+str(edgeValue)+'.gexf'
            nx.write_gexf(view, fileName)

        else:
            fileName = '02_Graph/ConceptNetwork_'+nodeAttribute+'_'+str(nodeValue)+'_'+edgeAttribute+str(edgeValue)+'.json'
            # fileName = '02_Graph/Wenyue/ConceptNetwork_'+nodeAttribute+'_'+str(nodeValue)+'.json'
            with open(fileName,'w+') as f:
                if format == 'G6':
                    f.write(json.dumps(nx.node_link_data(view), ensure_ascii=False).replace('links','edges'))
                if format == 'D3':
                    f.write(json.dumps(nx.node_link_data(view), ensure_ascii=False))

    # 打印提示
    print("Number of Nodes:",len(view.nodes))
    # print("Node Degree Hist")
    # NodeDegreeHist(view)
    # print("Edge Weight Hist")
    # EdgeWeightHist(view)
    
    return view

## 初始基线测试用词

In [None]:
# 每个大category20词
# 按subcategory的总词数来比例分配词汇
# 选每个subcategory中最重要的一些词

In [7]:
maincategorys

{'交通工具', '人造物', '动物', '植物', '自然物', '身体部位', '食物'}

In [24]:
cue_words_generated = []

maincategorys = NodeAttributeValueList(G_MasterConceptNetwork,'maincategory')
for i in maincategorys:
    print(i)
    view = ClusterFilter(G_MasterConceptNetwork,'maincategory',i,'weight',0,True,'Gephi')
    # subcategorys = NodeAttributeValueList(view,'subcategory')
    # for j in subcategorys:
    #     nodes = [x for x,y in view.nodes(data=True) if y['subcategory']==j]
    #     print(j, len(nodes))
    cue_words_maincategory = NodeSampling(view,20,'log')
    cue_words_generated = cue_words_generated + cue_words_maincategory
    print(cue_words_maincategory)

身体部位
Number of Nodes: 32
[1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 15, 18, 21, 25, 30]
['胡须', '牙齿', '头发', '皮肤', '脚趾', '膝盖', '头', '耳朵', '手指', '关节', '脚后跟', '眉毛', '头皮', '眼睛', '拇指']
人造物
Number of Nodes: 540
[1, 2, 3, 5, 7, 10, 14, 19, 27, 38, 53, 73, 102, 143, 199, 278, 387, 539]
['水壶', '风衣', '枕头', '羽绒服', '衬衫', '盘子', '杯子', '牙签', '杯垫', '礼服', '护手霜', '皮带', '剃须刀', '风扇', '洗手液', '净化器', '刀', '短笛']
植物
Number of Nodes: 190
[1, 2, 3, 5, 6, 9, 11, 15, 20, 27, 36, 47, 62, 82, 108, 143, 188]
['红薯', '黄瓜', '西红柿', '胡萝卜', '西兰花', '莲藕', '蚕豆', '苹果', '海带', '番茄', '卷心菜', '土豆', '杨桃', '豌豆', '四季豆', '牡丹', '杉木']
自然物
Number of Nodes: 10
['石头', '冰块', '宝石', '钻石', '珍珠', '岩石', '水晶', '大理石', '白银', '沙粒']
交通工具
Number of Nodes: 46
[1, 2, 3, 4, 6, 7, 9, 11, 13, 16, 20, 24, 30, 36, 44]
['汽车', '自行车', '飞机', '房车', '马车', '飞船', '直升机', '三轮车', '游艇', '手推车', '轿车', '消防车', '碰碰车', '航母', '皮艇']
食物
Number of Nodes: 178
[1, 2, 3, 5, 6, 8, 11, 15, 20, 26, 34, 45, 59, 78, 102, 134, 176]
['红酒', '鸡肉', '火腿肠', '番茄酱', '香肠', '调料', '果酱', '红茶', '红糖', '红烧肉', '冰红茶'

In [6]:
# 完整的词汇列表
concepts = []
maincategorys = []
subcategorys = []

for i in list(G_MasterConceptNetwork.nodes()):
    concepts.append(i)
    maincategorys.append(G_MasterConceptNetwork.nodes()[i]['maincategory'])
    subcategorys.append(G_MasterConceptNetwork.nodes()[i]['subcategory'])

df_nodes = pd.DataFrame(concepts,columns=["concept"])
df_nodes['maincategory'] = maincategorys
df_nodes['subcategory'] = subcategorys
df_nodes = df_nodes.sort_values(['maincategory','subcategory'])
# df_nodes.to_csv('02_Graph/G_MasterConceptNetwork_nodes.csv',index=False)

In [34]:
list_full_current = ['蕨菜',
 '青菜',
 '橄榄',
 '生菜',
 '番茄',
 '卷心菜',
 '花菜',
 '大头菜',
 '西兰花',
 '蓝莓',
 '茼蒿',
 '红薯',
 '花',
 '笋',
 '草',
 '仙人掌',
 '油麦菜',
 '莴笋',
 '草莓',
 '浮萍',
 '煎蛋卷',
 '奶酪',
 '牛奶',
 '肉丸',
 '饼干',
 '油饼',
 '蛋卷',
 '火腿肠',
 '奶油',
 '清汤',
 '火腿',
 '香肠',
 '甜点',
 '巧克力牛奶',
 '棉花糖',
 '粥',
 '馒头',
 '腊肠',
 '酸奶',
 '烤排骨',
 '刺猬',
 '鲤鱼',
 '猪',
 '海豹',
 '豚鼠',
 '狐狸',
 '鲸鱼',
 '海狮',
 '松鼠',
 '狼',
 '犀牛',
 '鲶鱼',
 '美洲狮',
 '猴子',
 '老虎',
 '羊驼',
 '网纹蟒',
 '熊',
 '鲫鱼',
 '黄鼠狼',
 '便盆',
 '漏斗',
 '清洁球',
 '针线篮',
 '桶',
 '陀螺',
 '扫帚',
 '猫砂',
 '簸箕',
 '管道',
 '储蓄罐',
 '沙漏',
 '万花筒',
 '骨灰盒',
 '积木',
 '地图',
 '滑轮',
 '排水管',
 '哑铃',
 '壁画',
 '汽车',
 '卡车',
 '吉普车',
 '赛车',
 '轿车',
 '跑车',
 '车辆',
 '豪华轿车',
 '公交车',
 '摩托车',
 '电动汽车',
 '卡丁车',
 '出租车',
 '三轮车',
 '巴士',
 '敞篷跑车',
 '房车',
 '自行车',
 '行李车',
 '手推车',
 '沟渠',
 '岩石',
 '水晶',
 '宝石',
 '月亮',
 '沙坑',
 '煤田',
 '大理石',
 '钻石',
 '雪花',
 '煤炭',
 '流星',
 '白银',
 '珍珠',
 '冰雹',
 '星星',
 '露珠',
 '沙尘暴'
 '石头',
 '雪球',
 '臀部',
 '肩膀',
 '脚趾',
 '脚背',
 '嘴唇',
 '拇指',
 '膝盖',
 '大腿',
 '心脏',
 '腿',
 '关节',
 '手臂',
 '脚后跟',
 '血管',
 '手指',
 '脚',
 '牙齿',
 '喉',
 '手掌',
 '睫毛']

 # 得到反馈要删除的词汇

list_delete_current = ['蕨菜',
 '橄榄',
 '生菜',
 '大头菜',
 '茼蒿',
 '花',
 '油麦菜',
 '浮萍',
 '煎蛋卷',
 '油饼',
 '奶油',
 '清汤',""
 '火腿',
 '甜点',
 '巧克力牛奶',
 '腊肠',
 '烤排骨',
 '鲤鱼',
 '豚鼠',
 '海狮',
 '鲶鱼',
 '羊驼',
 '网纹蟒',
 '鲫鱼',
 '黄鼠狼',
 '便盆',
 '清洁球',
 '猫砂',
 '簸箕',
 '管道',
 '万花筒',
 '骨灰盒',
 '地图',
 '滑轮',
 '排水管',
 '壁画',
 '汽车',
 '跑车',
 '车辆',
 '豪华轿车',
 '电动汽车',
 '卡丁车',
 '敞篷跑车',
 '房车',
 '行李车',
 '沟渠',
 '宝石',
 '沙坑',
 '煤田',
 '煤炭',
 '臀部',
 '肩膀',
 '脚趾',
 '脚背',
 '大腿',
 '关节',
 '脚后跟',
 '手指',
 '睫毛']

In [41]:
list_valid_current = list(set(list_full_current) ^ set(list_delete_current))
list_valid_next = list(set(concepts).intersection(set(list_valid_current)))
list(set(list_valid_next).intersection(set(cue_words)))

In [74]:
list_checklist_next = set(cue_words).union(set(list_valid_next)).difference(set(list_delete_current))

In [75]:
df_nodes[df_nodes['concept'].isin(list_checklist_next)].to_csv("02_VASystemData/check_list.csv",index=False)

## 生成命名测试用数据 Picture Naming Test Data Generation

In [18]:
with open('02_VASystemData/cue_words.csv', newline='') as f:
    reader = csv.reader(f)
    cue_words = [x.replace('\ufeff','') for x in sum(list(reader),[])]
# print(data)

FileNotFoundError: [Errno 2] No such file or directory: '02_VASystemData/cue_words.csv'

In [19]:
# 读取基线测试词汇列表
df = pd.read_excel('02_VASystemData/Baseline_CueWords.xlsx').dropna()
cue_words = list(df.concept.unique())

In [20]:
cue_words

['帆船',
 '火车',
 '出租车',
 '地铁',
 '公交车',
 '吉普车',
 '轿车',
 '卡车',
 '摩托车',
 '碰碰车',
 '赛车',
 '三轮车',
 '手推车',
 '消防车',
 '自行车',
 '救护车',
 '航母',
 '马车',
 '挖掘机',
 '婴儿车',
 '沙发',
 '镜子',
 '椅子',
 '漏斗',
 '扫帚',
 '沙漏',
 '桶',
 '钢笔',
 '皮帽',
 '拖鞋',
 '运动装',
 '盾牌',
 '枪',
 '陀螺',
 '电池',
 '餐桌',
 '窗帘',
 '地毯',
 '洗衣机',
 '筷子',
 '竹子',
 '草莓',
 '蓝莓',
 '脐橙',
 '桑葚',
 '山楂',
 '荷花',
 '狗尾草',
 '菊花',
 '仙人掌',
 '番茄',
 '红薯',
 '卷心菜',
 '萝卜',
 '青菜',
 '笋',
 '莴笋',
 '西兰花',
 '开心果',
 '甘蔗',
 '大理石',
 '露珠',
 '水晶',
 '雪花',
 '岩石',
 '珍珠',
 '钻石',
 '大脑',
 '鼻子',
 '胡须',
 '脚',
 '泪珠',
 '眉毛',
 '拇指',
 '睫毛',
 '手臂',
 '头',
 '腿',
 '膝盖',
 '心脏',
 '血管',
 '牙齿',
 '指甲',
 '嘴唇',
 '头发',
 '手掌',
 '耳朵',
 '冰棍',
 '饼干',
 '蛋卷',
 '馒头',
 '棉花糖',
 '奶酪',
 '草莓酱',
 '白酒',
 '白水',
 '豆浆',
 '果汁',
 '牛奶',
 '葡萄酒',
 '汽水',
 '爆米花',
 '大米',
 '蛋糕',
 '方便面',
 '饺子',
 '粥']

In [35]:
## Input 
# word_list = ['豆浆', '菠萝汁', '咖啡']
# word_list = cue_words

data = {
    'currentStep': 0,
    'steps': []
}

for word in word_list:
    new_step = {
        'name':word,
        'image':'/test_images/'+word+'.jpeg',
        'countdown':20,
        'result':'fail',
        'status':'unchecked',
    }
    data['steps'].append(new_step)

with open("02_VASystemData/test_picture-naming_new.json", "w") as f: 
    json.dump(data, f, ensure_ascii=False)

In [33]:
word_list = ["帆船","自行车","马车","摩托车","三轮车","火车","手推车","公交车","婴儿车","轿车","挖掘机","出租车","消防车","卡车","救护车","碰碰车","地铁","飞机","直升机","航母","钢琴","鼓","创可贴","体温计","枕头","沙发","窗帘","餐桌","书包","钢笔","镜子","手表","降落伞","扫帚","救生圈","指南针","车库","电话亭","羽绒服","衬衫","运动装","拖鞋","枪","拼图","摩天轮","洗衣机","电池","电视","鼠标","复印机","篮球","羽毛球拍","水壶","杯子","筷子","毛毛虫","蚂蚁","蜗牛","萤火虫","螳螂","鱼","乌龟","小龙虾","螃蟹","水母","狮子","猴子","大象","刺猬","兔","鸭子","企鹅","猫头鹰","鸵鸟","燕子","松树","枫树","竹子","蓝莓","苹果","草莓","山楂","脐橙","甘蔗","仙人掌","荷花","向日葵","菊花","梅花","红薯","西兰花","萝卜","番茄","卷心菜","笋","钻石","岩石","水晶","沙粒","露珠","胡须","牙齿","头发","膝盖","头","耳朵","脚","手指","泪珠","眉毛","腿","嘴唇","眼睛","鼻子","手臂","手掌","拇指","大脑","心脏","血管","米饭","饺子","方便面","蛋糕","三明治","奶酪","棉花糖","鸡肉","火腿肠","红烧肉","火腿","草莓酱","醋","冰糖","牛奶","果汁","葡萄酒","白酒","粥","汽水"]

In [36]:
len(word_list)

140

## 生成语义特征分析训练用数据 SFA Data Generation

In [7]:
 #列出共有某个语义特征的同类概念

def List_Related_Concepts(concept,feature):
    main_category = df_complete[df_complete['Concept'] == concept]['上级类别'].unique()[0]
    related_concepts = list(df_complete[(df_complete['上级类别'] == main_category) & (df_complete['Feature'] == feature)]['Concept'].unique())
    related_concepts = list(filter(lambda x: x != concept, related_concepts))

    return related_concepts

# List_Related_Concepts('安全带','可以-保护人')


# 易混淆的语义特征
# 给到（概念，语义特征）
#   选出所有的同类概念，选出所有同类语义特征，计算每个同类语义特征在该类概念中的线索度
#     按照线索度高低排序
#         如果该特征不为该概念所有
#             则作为混淆项

def List_Confused_Feature(concept, feature):
    # 改成同Subcategory
    sub_category = df_complete[df_complete['Concept'] == concept]['下级类别'].unique()[0]
    related_concepts = list(df_complete[df_complete['下级类别'] == sub_category]['Concept'].unique())
    related_concepts = list(filter(lambda x: x != concept, related_concepts))

    # related_concepts = List_Related_Concepts(concept,feature)
    df_related = df_complete[(df_complete['Concept'].isin(related_concepts))]

    relationship = df_complete[(df_complete['Concept'] == concept) & (df_complete['Feature'] == feature)]['Rel'].unique()[0]
    confused_feature_list = list(df_related[df_related['Rel'] == relationship].sort_values(by=['categorical_distinctiveness','cue_validity'],ascending=False)['Feature'].unique())
    confused_feature_list = list(filter(lambda x: x not in list(df_complete[df_complete['Concept'] == concept]['Feature']), confused_feature_list))

    return confused_feature_list

# List_Confused_Feature('安全带','可以-保护人')[0:10]


In [9]:
## Input 
word_list = ['轿车',
  '摩托车',
  '消防车',
  '消防栓',
  '水',
  '冰红茶',
  '谷子',
  '薄荷',
  '白菜']
# word_list = cue_words


data = {
 "steps": []
}
for word in word_list:
    new_step = {
        'result':'',
        'status':'unchecked',
        "graph": {
            "id": "root",
            "label": word,
            'img':'/test_images/'+word+'.jpeg',
            # 'img':'/test_images/'+word+'.jpeg',

            "children": []
        },
        "mention": {
            "属于":[],
            "用于":[],
            "做":[],
            "有":[],
            "在":[],
            "联想到":[],
        }
    }


    concept = word
    maincategory = df_complete[df_complete['Concept'] == concept]['上级类别'].unique()[0]
    mention_belongto = list(df_complete[df_complete['上级类别'] == maincategory]['下级类别'].unique())[0:3]
    # rel_list = list(df_complete['Rel'].unique())

    feature_function = df_complete[(df_complete['Concept']==concept)&(df_complete['Rel']=='可以')].sort_values(by=['cue_validity'],ascending=False)['Feature'].unique()[0]
    mention_function = List_Confused_Feature(concept,feature_function)[0:3]
    mention_function.append(feature_function)

    feature_need = df_complete[(df_complete['Concept']==concept)&(df_complete['Rel']=='需要')].sort_values(by=['cue_validity'],ascending=False)['Feature'].unique()[0]
    mention_need = List_Confused_Feature(concept,feature_need)[0:3]
    mention_need.append(feature_need)

    feature_have = df_complete[(df_complete['Concept']==concept)&(df_complete['Rel']=='有')].sort_values(by=['cue_validity'],ascending=False)['Feature'].unique()[0]
    mention_have = List_Confused_Feature(concept,feature_have)[0:3]
    mention_have.append(feature_have)

    feature_similar = df_complete[(df_complete['Concept']==concept)&(df_complete['Rel']=='像')].sort_values(by=['cue_validity'],ascending=False)['Feature'].unique()[0]
    mention_similar = List_Confused_Feature(concept,feature_similar)
    if '像-'+concept in mention_similar:
        mention_similar.remove('像-'+concept)
    mention_similar=mention_similar[0:3]
    mention_similar.append(feature_similar)

    new_step['mention']['属于'] = mention_belongto
    new_step['mention']['用于'] = [i.replace('可以-','')  for i in mention_function]
    new_step['mention']['做'] = [i.replace('需要-','')  for i in mention_need]
    new_step['mention']['有'] = [i.replace('有-','')  for i in mention_have]
    new_step['mention']['联想到'] = [i.replace('像-','')  for i in mention_similar]
    
    data['steps'].append(new_step)


with open("02_VASystemData/test_SFA.json", "w") as f: 
    json.dump(data, f, ensure_ascii=False)