In [18]:
import networkx as nx
import matplotlib.pyplot as plt
import time, json, re
import numpy as np
from scipy.stats import powerlaw
import community  # python-louvain 库
from tqdm import tqdm
from itertools import combinations

In [19]:
'''读取MetaData'''
MetaData_path = r"D:\vscode_workspace\database\MetaData.json"
with open(MetaData_path, 'r', encoding='utf-8') as f:
    MetaData = json.load(f)

In [20]:
authors_primitive = set({})
affiliations_primitive = set({})
for uid in tqdm(MetaData[0]):
    for author in MetaData[1][uid]['authors']:
        authors_primitive.add(author)
    for affiliation in MetaData[1][uid]['affiliations']:
        affiliations_primitive.add(affiliation)

100%|██████████| 445165/445165 [00:01<00:00, 274189.43it/s]


In [21]:
'''获取作者name-id映射关系'''
def name_initialize(author):
    pattern = re.compile(r'[^\u4e00-\u9fff]')
    author = author.replace("，", "").replace(" ", "")
    author = re.sub(pattern, '', author)
    if len(author)<2:
        return None
    return author

authors = []
for author in list(authors_primitive):
    author = name_initialize(author)
    if author:
        authors.append(author)
name2id = {element: index for index, element in enumerate(authors)}
id2name = {index: element for index, element in enumerate(authors)}

In [22]:
def affiliation_cleaning(affiliation:list)->list:
    if 'Kyungpook' in affiliation:
        return ['北京药理毒理研究所', '《中国药理学与毒理学杂志》编辑部', '江苏康缘药业股份有限公司', '江西中医药大学', '国家卫生和计划生育委员会科教司重大专项处', '山西省中医药研究院', \
            'Department of Pharmacology School of Medicine Kyungpook National University', '浙江大学药学院', '南京中医药大学', '浙江中医药大学现代中药与健康产品研究所', \
                '广东药科大学中药学院', '中国医学科学院药用植物研究所', 'University de Strasbourg', 'Spedding Research Solutions SAS', 'Robert Gordon University', \
                    '中国医学科学院药物研究所', '中国药科大学新药安全评价研究中心', '南方医科大学', \
                    '河北科技大学', '北京大学基础医学院']
    if len(affiliation)==1 and affiliation[0]=='浙江中医药大学第一临床医学院':
        return ['浙江中医药大学第一临床医学院', '浙江医院神经内科']
    if len(affiliation)==6 and affiliation[-1]=='三骨科保髋区':
        return ['广州中医药大学第一临床医学院', '广州中医药大学第一附属医院医学检验科','广州中医药大学第一附属医院三骨科保髋区','广州中医药大学髋关节研究中心']
    if len(affiliation)>0 and affiliation[0] == '美国中医学院':
        return ['美国中医学院', '成都中医药大学', '加拿大汉博学院', '赫华德休斯医学研究院', '美国中医药针灸学会', '纽约妙手针灸理疗中心', '美京中医中心', \
                '北京大学神经科学研究所', '波士顿整合健康中心', '旧金山湾区自然疗法中心', '香港国际激痛点中心', '美国纽约中医学院', '温州医科大学中美针灸康复研究所', '诺克斯中医中心']
    if len(affiliation)>2 and affiliation[0] == '云南中医学院天然药物生物转化研究中心' and affiliation[1] == '山东大学药学院天然药物化学研究所':
        return ['云南中医学院天然药物生物转化研究中心', '山东大学药学院天然药物化学研究所', 'Laboratory of Pharm acognosy and Phytochem istry', 'School of Pharm aceutical Sciences', 'University of Geneva', 'University of Lausanne']
    if affiliation == ['北京中医药大学中医学养生学研究所', '北京中医药大学针灸推拿学院', '北京中', '']:
        return ['北京中医药大学中医学养生学研究所','北京中医药大学针灸推拿学院','北京中医药大学附属护国寺中医医院']
    if affiliation == ['浙江大学医学院附属杭州市第一人民医院', '上海中医药大学附属龙华医院', '浙江中医药大学附属第一医院', '31021部队', '上海中医药大学附属曙光医院']:
        return ['浙江大学医学院附属杭州市第一人民医院', '上海中医药大学附属龙华医院', '浙江中医药大学附属第一医院', '31021部队', '上海中医药大学附属曙光医院']
    if affiliation == ['天津中医药大学', '', '天津', '', '（300193）', '']:
        return ['天津中医药大学', '天津中医药大学第一附属医院']
    # if affiliation == :
    #     return ['山东中医药大学临床学院', '山东中医药大学附属医院']
    affiliation = [i.strip() for i in affiliation if i.strip() != '' and i[0]!='（' and i[0]!='(']
    eng_before = False
    affiliation0 = affiliation
    affiliation = []
    for i in range(len(affiliation0)):
        if affiliation0[i] == '复旦大学附属华东医院17河北省中医院':
            affiliation.append('复旦大学附属华东医院')
            affiliation.append('河北省中医院')
        # 合并英文单词
        if re.match("^[a-zA-Z]+$", affiliation0[i]):
            if eng_before:
                affiliation[-1] = affiliation[-1] + ' ' + affiliation0[i]
            else:
                affiliation.append(affiliation0[i])
                eng_before = True
        # 去除省份（一般字数小于等于3的都是省份）
        elif len(re.sub(r'[^\u4e00-\u9fff]', '', affiliation0[i]))>3:
            affiliation.append(affiliation0[i])
            eng_before = False
    # affiliation[i] = re.sub(r'[^\u4e00-\u9fff]', '', affiliation[i])
    # affiliation = [i for i in affiliation if len(re.sub(r'[^\u4e00-\u9fff]', '', i))>3]
    return affiliation

def name_cleaning(names:list)->list:
    # 去除<sup>和</sup>，去除“综述”，去除英文逗号
    names = [i.replace("<sup>", "").replace("</sup>", "").replace("综述", "").replace("指导", "").replace(",", "") for i in names]
    # 去除括号内的内容
    pattern = r"\([^()]*\)|（[^（）]*）"
    names = [re.sub(pattern, "", i) for i in names]
    # 去除数字
    names = [i for i in names if i.isdigit()==False]
    # 去除空字符串，去除空格
    names = [i.strip() for i in names if i.strip() != '']
    res = []
    if len(names)==0:
        return res
    suffix = names[0][-1].isdigit()
    # 合并作者以及单位编号
    for i in range(len(names)):
        if (not suffix) and names[i][0].isdigit() and len(res)>0:
            res[-1] = res[-1]+names[i]
        else:
            res.append(names[i])
    return res

def name_affiliation_dict(names:list, affiliations:list)->dict:
    def extract_numbers(string):
        pattern = r'\d+'
        numbers = re.findall(pattern, string)
        numbers = [int(num) for num in numbers]
        return numbers
    N_A_dict = {}
    suffix = names[0][-1].isdigit()
    index = 0
    for name in names:
        if name not in N_A_dict:
            N_A_dict[name] = []
        if suffix:
            nums = extract_numbers(name)
            # print(affiliations)
            for num in nums:
                try:
                    N_A_dict[name].append(affiliations[num-1])
                except:
                    # TODO
                    # return {'error': '机构数量少于作者suffix'}
                    pass
        else:
            if len(affiliations)==0:
                pass
            elif len(affiliations)==1 or len(affiliations)<len(names):
                N_A_dict[name].append(affiliations[0])
            else:
                N_A_dict[name].append(affiliations[index])
                index+=1
    return N_A_dict

# count = 0
# for uid in tqdm(MetaData[0]):
#     count+=1
#     if count>10:
#         break
#     names = name_cleaning(MetaData[1][uid]['authors'])
#     affiliations = affiliation_cleaning(MetaData[1][uid]['affiliations'])
#     if len(names)==0:
#         continue
#     tmp = name_affiliation_dict(names, affiliations)
#     # print([re.sub(r'[^\u4e00-\u9fff]', '', i) for i in names])
#     # print(tmp, '\n')
#     # if tmp == {'error': '机构数量少于作者suffix'}:
#     #     print(MetaData[1][uid])
#     #     print(MetaData[1][uid]['authors'])
#     #     print(names)
#     #     print(MetaData[1][uid]['affiliations'])
#     #     print(affiliations)
#     #     print()

In [23]:
'''创建总图G'''
'''
node属性:
    affiliation_list
    paper_num
    citation
    years
edge属性:
    papers = {uid: {'year':, 'citation':, 'name_id_list':, 'name_id_len':}}
    paper_num_weight
    citation_weight
'''
G = nx.Graph()

# 作者名字列表
name_list = []
# 作者名字-机构列表
name_dict_list = []
# 作者名字-作者id列表
name_dict_id_list = {}
# 机构集合
affiliation_set = set()
# uid-作者id列表
uid_nameId_list = {}

for uid in tqdm(MetaData[0]):
    # 去除年份不合规范的文章
    if MetaData[1][uid]['year']<1980 or MetaData[1][uid]['year']>2023:
        continue
    names = name_cleaning(MetaData[1][uid]['authors'])
    affiliations = affiliation_cleaning(MetaData[1][uid]['affiliations'])
    # 去除没有名字或者没有机构的文章
    if len(names)==0 or len(affiliations)==0:
        continue
    N_A_dict = name_affiliation_dict(names, affiliations)
    for name in N_A_dict.keys():
        for affiliation in N_A_dict[name]:
            affiliation_set.add(affiliation)
    name_id_list = []
    for name in N_A_dict.keys():
        name_pure = re.sub(r'\d+', '', name) # 去除了数字
        if name_pure not in name_dict_id_list:
            # 新建node
            name_list.append(name_pure)
            name_dict_list.append(N_A_dict[name])
            name_id_list.append(len(name_list)-1)
            name_dict_id_list[name_pure] = [len(name_list)-1]
            G.add_node(len(name_list)-1, name=name_pure, affiliation_list=N_A_dict[name], paper_num=1, citation=MetaData[1][uid]['citation'], years={MetaData[1][uid]['year']})
        else:
            indices = name_dict_id_list[name_pure]
            tmp_set = set(N_A_dict[name])
            intersection = False
            for index in indices:
                if tmp_set.intersection(set(name_dict_list[index])):
                    # 维护node
                    intersection = True
                    name_dict_list[index] = list(set(name_dict_list[index]).union(tmp_set))
                    name_id_list.append(index)
                    G.nodes[index]['affiliation_list'] = name_dict_list[index]
                    G.nodes[index]['paper_num'] += 1
                    G.nodes[index]['citation'] += MetaData[1][uid]['citation']
                    G.nodes[index]['years'].add(MetaData[1][uid]['year'])
                    break
            if not intersection:
                # 新建node
                name_list.append(name_pure)
                name_dict_list.append(N_A_dict[name])
                name_id_list = [len(name_list)-1]
                name_dict_id_list[name_pure].append(len(name_list)-1)
                G.add_node(len(name_list)-1, name=name_pure, affiliation_list=N_A_dict[name], paper_num=1, citation=MetaData[1][uid]['citation'], years={MetaData[1][uid]['year']})
    for id_pair in combinations(name_id_list, 2):
        uid_dict = {'year':MetaData[1][uid]['year'], \
                    'citation':MetaData[1][uid]['citation'], \
                    'name_id_list':name_id_list, \
                    'name_id_len':len(name_id_list)}
        if G.has_edge(id_pair[0], id_pair[1]):
            # 维护edge
            G[id_pair[0]][id_pair[1]]['papers'][uid] = uid_dict
            G[id_pair[0]][id_pair[1]]['paper_num_weight'] += 1
            G[id_pair[0]][id_pair[1]]['citation_weight'] += MetaData[1][uid]['citation']
        else:
            # 新建edge
            G.add_edge(id_pair[0], id_pair[1], papers={uid:uid_dict}, paper_num_weight=1, citation_weight=MetaData[1][uid]['citation'])
    # 维护 uid_nameId_list
    uid_nameId_list[uid] = name_id_list

100%|██████████| 445165/445165 [00:37<00:00, 11963.72it/s]


In [24]:
'''创建不同年份的分图'''
graphs_per_year = [nx.Graph() for i in range(2024-1980)]

for uid in tqdm(MetaData[0]):
    year = MetaData[1][uid]['year']
    if (uid not in uid_nameId_list) or (year<1980 or year>2023):
        continue
    for name_id in uid_nameId_list[uid]:
        if graphs_per_year[year-1980].has_node(name_id):
            graphs_per_year[year-1980].nodes[name_id]['affiliation_list'] = name_dict_list[name_id]
            graphs_per_year[year-1980].nodes[name_id]['paper_num'] += 1
            graphs_per_year[year-1980].nodes[name_id]['citation'] += MetaData[1][uid]['citation']
            graphs_per_year[year-1980].nodes[name_id]['years'].add(year)
        else:
            graphs_per_year[year-1980].add_node(name_id, name=name_list[name_id], affiliation_list=name_dict_list[name_id], paper_num=1, citation=MetaData[1][uid]['citation'], years={year})

    for id_pair in combinations(uid_nameId_list[uid], 2):
        if graphs_per_year[year-1980].has_edge(id_pair[0], id_pair[1]):
            graphs_per_year[year-1980][id_pair[0]][id_pair[1]]['papers'][uid] = G[id_pair[0]][id_pair[1]]['papers'][uid]
            graphs_per_year[year-1980][id_pair[0]][id_pair[1]]['paper_num_weight'] += 1
            graphs_per_year[year-1980][id_pair[0]][id_pair[1]]['citation_weight'] += MetaData[1][uid]['citation']
        else:
            graphs_per_year[year-1980].add_edge(id_pair[0], id_pair[1], papers={uid:G[id_pair[0]][id_pair[1]]['papers'][uid]}, paper_num_weight=1, citation_weight=MetaData[1][uid]['citation'])

100%|██████████| 445165/445165 [00:17<00:00, 25834.40it/s]


In [25]:
'''机构id，以及机构包含的作者id列表'''
affiliation_dict = {affiliation: index for index, affiliation in enumerate(affiliation_set)}
affiliation_list = list(affiliation_set)
affiliation_id_list = [[] for _ in range(len(affiliation_set))]
for id in tqdm(range(len(name_dict_list))):
    for affiliation in name_dict_list[id]:
        affiliation_id_list[affiliation_dict[affiliation]].append(id)

100%|██████████| 796781/796781 [00:00<00:00, 1198715.80it/s]


In [26]:
'''保存graph和各种list'''
from networkx.readwrite import json_graph
import pickle

save_path = r"D:\vscode_workspace\ZhongYiPapers\network\data"

# # 将数据保存到JSON文件
# data = json_graph.node_link_data(G)
# with open(save_path + r"\data.json", "w") as file:
#     json.dump(data, file)

# 保存图
with open(save_path + r'\G.pkl', 'wb') as f:
    pickle.dump(G, f)
with open(save_path + r'\graphs_per_year.pkl', 'wb') as f:
    pickle.dump(graphs_per_year, f)

# nx.write_gml(G, save_path + r'\data.gml')

with open(save_path + r'\name_list.pkl', 'wb') as f:
    pickle.dump(name_list, f)
with open(save_path + r'\name_dict_list.pkl', 'wb') as f:
    pickle.dump(name_dict_list, f)
with open(save_path + r'\name_dict_id_list.pkl', 'wb') as f:
    pickle.dump(name_dict_id_list, f)
with open(save_path + r'\affiliation_set.pkl', 'wb') as f:
    pickle.dump(affiliation_set, f)
with open(save_path + r'\uid_nameId_list.pkl', 'wb') as f:
    pickle.dump(uid_nameId_list, f)
with open(save_path + r'\affiliation_dict.pkl', 'wb') as f:
    pickle.dump(affiliation_dict, f)
with open(save_path + r'\affiliation_list.pkl', 'wb') as f:
    pickle.dump(affiliation_list, f)
with open(save_path + r'\affiliation_id_list.pkl', 'wb') as f:
    pickle.dump(affiliation_id_list, f)

In [27]:
# G
# name_list
# name_dict_list
# name_dict_id_list

In [28]:
'''创建作者合作网络'''
# 创建无向图
G = nx.Graph()

# 添加节点和边
for author in tqdm(authors):
    G.add_node(name2id[author])

for paper in tqdm(MetaData[0]):
    author_list = []
    for author in MetaData[1][paper]['authors']:
        author = name_initialize(author)
        if author:
            author_list.append(name2id[author])
    author_pairs = list(combinations(author_list, 2))
    for pair in author_pairs:
        if G.has_edge(pair[0], pair[1]):
            G[pair[0]][pair[1]]['weight'] += 1
        else:
            G.add_edge(pair[0], pair[1], weight=1)


100%|██████████| 516997/516997 [00:00<00:00, 730828.71it/s] 
100%|██████████| 445165/445165 [00:09<00:00, 45653.31it/s]


In [29]:
'''不同weight的边的数量'''
# Get the weight histogram
weight_hist = nx.degree_histogram(G)

# Print the number of edges with each weight
for weight, count in enumerate(weight_hist):
    print(f"Weight {weight}: {count} edges")


Weight 0: 6564 edges
Weight 1: 20590 edges
Weight 2: 21278 edges
Weight 3: 22430 edges
Weight 4: 25463 edges
Weight 5: 24767 edges
Weight 6: 20390 edges
Weight 7: 16853 edges
Weight 8: 13025 edges
Weight 9: 10635 edges
Weight 10: 8890 edges
Weight 11: 7314 edges
Weight 12: 6345 edges
Weight 13: 5492 edges
Weight 14: 4938 edges
Weight 15: 4348 edges
Weight 16: 3793 edges
Weight 17: 3382 edges
Weight 18: 3051 edges
Weight 19: 2702 edges
Weight 20: 2484 edges
Weight 21: 2287 edges
Weight 22: 2032 edges
Weight 23: 1864 edges
Weight 24: 1715 edges
Weight 25: 1590 edges
Weight 26: 1488 edges
Weight 27: 1288 edges
Weight 28: 1227 edges
Weight 29: 1129 edges
Weight 30: 1064 edges
Weight 31: 1016 edges
Weight 32: 927 edges
Weight 33: 864 edges
Weight 34: 797 edges
Weight 35: 812 edges
Weight 36: 730 edges
Weight 37: 721 edges
Weight 38: 662 edges
Weight 39: 639 edges
Weight 40: 598 edges
Weight 41: 631 edges
Weight 42: 550 edges
Weight 43: 534 edges
Weight 44: 537 edges
Weight 45: 517 edges
Wei

In [30]:
# Get the edges of the graph
edges = G.edges(data=True)

# Sort the edges based on their weight in descending order
sorted_edges = sorted(edges, key=lambda x: x[2]['weight'], reverse=True)

# Get the top k edges with the highest weight
k = 10  # Replace with the desired value of k
top_k_edges = sorted_edges[:k]

# Get the names of the nodes connected by each edge
top_k_nodes_names = [(id2name[edge[0]], id2name[edge[1]], edge[2]['weight']) for edge in top_k_edges]

# Print the names of the nodes connected by each edge
for nodes in top_k_nodes_names:
    print(f"Nodes: {nodes[0]}, {nodes[1]}, weight: {nodes[2]}")


Nodes: 刘密, 常小荣, weight: 235
Nodes: 彭清华, 彭俊, weight: 229
Nodes: 孟宪生, 包永睿, weight: 218
Nodes: 林志健, 张冰, weight: 208
Nodes: 陈素红, 吕圭源, weight: 202
Nodes: 张哲, 杨关林, weight: 200
Nodes: 吕晓东, 庞立健, weight: 198
Nodes: 陆兔林, 毛春芹, weight: 189
Nodes: 石岩, 杨宇峰, weight: 185
Nodes: 王帅, 包永睿, weight: 184


In [31]:
'''社区划分'''
partition=community.best_partition(G)
for node,community_index in partition.items():
    G.nodes[node]['label']=community_index

In [None]:
'''社区的人数'''
from collections import Counter

# Get the labels of the nodes in G
labels = nx.get_node_attributes(G, 'label')

# Count the number of nodes in each label group
label_counts = Counter(labels.values())

# Print the number of nodes in each label group
for label, count in label_counts.items():
    print(f"Label {label}: {count} nodes")


In [None]:
'''节点的度分布'''
# Get the degree histogram
degree_hist = nx.degree_histogram(G)

# Plot the distribution
plt.bar(range(len(degree_hist)), degree_hist)
plt.xlabel('Weight')
plt.ylabel('Frequency')
plt.title('Distribution of Weight')
plt.show()


In [None]:
'''度最大的k个节点'''
# Get the degrees of all nodes in the graph
degrees = G.degree()

# Sort the nodes based on their degree in descending order
sorted_nodes = sorted(degrees, key=lambda x: x[1], reverse=True)

# Get the top k nodes with the highest degree
k = 50  # Replace with the desired value of k
top_k_nodes = sorted_nodes[:k]

# Get the names and degrees of the top k nodes
top_k_names_degrees = [(id2name[node[0]], node[1]) for node in top_k_nodes]

# Print the names and degrees of the top k nodes
for name, degree in top_k_names_degrees:
    print(f"Node: {name}, Degree: {degree}")
