In [1]:
#数据加载与处理
import metaknowledge as mk
import pandas as pd
import cntext as ct

#统计图表绘制
import matplotlib.pyplot as plt
import seaborn as sns

#图形内嵌到Notebook中
%matplotlib inline


#交互式图表绘制
import chart_studio.plotly as py
import plotly.graph_objs as go


#英文为罗马字体并显示负号，图形分辨率为140
plt.rcParams['font.family'] = 'sans-serif'
plt.rcParams['font.sans-serif'] = ['Times New Roman']
plt.rcParams['axes.unicode_minus'] = False 
plt.rcParams['figure.dpi'] = 140

In [2]:
RC = mk.RecordCollection(r'D:\python科学计量可视化\数据\Demo data\Python-PubMed',cached=True)
df = pd.DataFrame(RC.makeDict())
df.head(2)

Unnamed: 0,PMID,OWN,STAT,DCOM,LR,IS,VI,IP,DP,TI,...,IR,UIN,SI,PMCR,RIN,RPI,num-Authors,num-Male,num-Female,num-Unknown
0,PMID:25052820,[NLM],[MEDLINE],[20151117],[20211217],"[1751-7176 (Electronic), 1524-6175 (Print), 15...",16,[9],2014 Sep,Brachial pulse pressure and cardiovascular or ...,...,,,,,,,6,0,0,6
1,PMID:33249865,[NLM],[MEDLINE],[20210706],[20210706],"[1524-4563 (Electronic), 0194-911X (Print), 01...",77,[1],2021 Jan,Plasma Trough Concentrations of Antihypertensi...,...,,,,,,,6,2,2,2


In [3]:
df.columns

Index(['PMID', 'OWN', 'STAT', 'DCOM', 'LR', 'IS', 'VI', 'IP', 'DP', 'TI', 'PG',
       'LID', 'AB', 'CI', 'FAU', 'AU', 'AD', 'LA', 'PT', 'DEP', 'TA', 'JT',
       'JID', 'SB', 'MH', 'PMC', 'EDAT', 'MHDA', 'CRDT', 'PHST', 'AID', 'PST',
       'SO', 'RN', 'OTO', 'OT', 'GR', 'PL', 'TT', 'COIS', 'CIN', 'AUID', 'EIN',
       'UOF', 'MID', 'OID', 'CN', 'OAB', 'OABL', 'FIR', 'IR', 'UIN', 'SI',
       'PMCR', 'RIN', 'RPI', 'num-Authors', 'num-Male', 'num-Female',
       'num-Unknown'],
      dtype='object')

为了避免引入噪音数据，使用split之后的列表进行计数，而不是直接对字符串数据进行计数

In [4]:
text_TI = ''
filter_ls = ['\n',',','.',':','=','(',')','[',']']
for i in df.TI:
    for j in i:
        if j in filter_ls:
            i = i.replace(j,'')
    text_TI+=f'{i} '
text_TI

'Brachial pulse pressure and cardiovascular or all-cause mortality in the general population a meta-analysis of prospective observational studies Plasma Trough Concentrations of Antihypertensive Drugs for the Assessment of Treatment Adherence A Meta-Analysis Time course for blood pressure lowering of dihydropyridine calcium channel blockers The effect of antihypertensive treatment on patients with diabetes depends on the values of blood pressure a systematic survey and meta-analyses The A allele of the rs1990760 polymorphism in the IFIH1 gene is associated with protection for arterial hypertension in type 1 diabetic patients and with expression of this gene in human mononuclear cells Blood pressure lowering efficacy of nonselective beta-blockers for primary hypertension Which cuff should I use? Indirect blood pressure measurement for the diagnosis of hypertension in patients with obesity a diagnostic accuracy review Effects of sodium-glucose co-transporter 2 inhibitors on blood pressur

In [5]:
from collections import Counter
Counter(text_TI.split()).most_common()

[('of', 2096),
 ('and', 2047),
 ('meta-analysis', 1157),
 ('in', 992),
 ('a', 832),
 ('hypertension', 721),
 ('review', 713),
 ('systematic', 666),
 ('A', 620),
 ('with', 503),
 ('the', 460),
 ('pressure', 414),
 ('for', 411),
 ('on', 380),
 ('blood', 377),
 ('risk', 281),
 ('patients', 263),
 ('Systematic', 236),
 ('Meta-Analysis', 222),
 ('trials', 220),
 ('Review', 212),
 ('Hypertension', 175),
 ('The', 174),
 ('Blood', 173),
 ('randomized', 167),
 ('studies', 142),
 ('to', 139),
 ('pulmonary', 135),
 ('Meta-analysis', 134),
 ('controlled', 131),
 ('between', 122),
 ('Pressure', 119),
 ('disease', 116),
 ('Association', 114),
 ('hypertensive', 114),
 ('Effects', 111),
 ('cardiovascular', 107),
 ('Effect', 103),
 ('treatment', 100),
 ('diabetes', 92),
 ('pregnancy', 92),
 ('Risk', 87),
 ('Patients', 83),
 ('factors', 79),
 ('gene', 78),
 ('arterial', 74),
 ('essential', 72),
 ('from', 71),
 ('With', 70),
 ('association', 69),
 ('therapy', 68),
 ('clinical', 66),
 ('polymorphism', 64)

In [6]:
ct.term_freq(text=text_TI, lang='english').most_common(200)

[('meta-analysis', 1512),
 ('review', 925),
 ('systematic', 897),
 ('hypertension', 896),
 ('blood', 550),
 ('pressure', 533),
 ('risk', 369),
 ('patients', 346),
 ('trials', 278),
 ('randomized', 217),
 ('association', 183),
 ('studies', 179),
 ('pulmonary', 178),
 ('controlled', 170),
 ('effect', 165),
 ('effects', 163),
 ('cardiovascular', 156),
 ('disease', 154),
 ('hypertensive', 151),
 ('treatment', 123),
 ('diabetes', 114),
 ('pregnancy', 108),
 ('antihypertensive', 106),
 ('clinical', 102),
 ('factors', 96),
 ('arterial', 91),
 ('gene', 90),
 ('efficacy', 90),
 ('prevalence', 89),
 ('therapy', 83),
 ('essential', 80),
 ('outcomes', 79),
 ('chronic', 71),
 ('adults', 70),
 ('associated', 69),
 ('polymorphism', 67),
 ('receptor', 67),
 ('renal', 67),
 ('disorders', 64),
 ('versus', 62),
 ('mortality', 60),
 ('analysis', 60),
 ('cohort', 59),
 ('prospective', 58),
 ('type', 57),
 ('2', 56),
 ('blockers', 54),
 ('safety', 54),
 ('chinese', 54),
 ('inhibitors', 53),
 ('angiotensin',

#### 承接7.2.1标题及摘要分词字数与频数统计（上）的内容

进行单词单复数计数

In [54]:
result = ct.term_freq(text=text_TI, lang='english')
result.get('effect')

165

In [55]:
result.get('effects')

163

In [56]:
result.get('review')

925

In [57]:
result.get('reviews')

7

In [58]:
for i in df['DP']:
    print(i)

2014 Sep
2021 Jan
2014 Aug 31
2016 Apr
2013
2014 Feb 28
2016 Nov 3
2014 Apr
2015
2018 Nov/Dec
2020 Apr 7
2012 Sep 26
2020 Aug 28
2015 Sep
2017 Sep
2018 Jul 19
2017 Jan
2020 Jan 21
2013 Nov
2017 Aug
2019 Apr 27
2013 Apr 3
2015 Jul
2012 Sep 25
2015 Feb
2020 Apr
2011 Mar 14
2012 Feb
2019 Feb
2013 Dec
2020 Jul 24
2014 Dec
2020
2021 Feb
2020 May 28
2020
2019 Jul
2017 Nov
2017 Nov
2014
2013 Apr
2016 Aug
2020 Feb
2020 Jan 22
2013 Jul
2012 Nov 14
2016 Dec 22
2012 Oct
2019 May 1
2012 Apr
2016 Dec
2017 Jul 9
2014 Oct
2019 Jul
2013 Sep
2013 Jul
2012 Jan
2019 Dec
2018 Dec
2020 Nov
2020 Mar
2017 Mar 16
2014
2014 Sep
2020 Apr
2014 May
2018 Oct 15
2017 Apr
2021 Jun
2011 Nov
2020 Oct
2020 Feb
2017 Jun 27
2019 Jan 5
2018 Feb 9
2013 Feb
2020 Jan
2015 Jul 23
2011
2017 Aug
2020 Sep-Oct
2019 Sep 15
2015 Jun
2020 Jul
2017 Oct 11
2020 Jan-Dec
2020 Oct
2016 May 15
2018 Jul 20
2020 May
2016 Aug
2014 Dec
2018 Feb
2015 Feb 3
2020 Aug 21
2013 Oct 29
2019 Dec
2013 May
2020 Oct
2017 Aug
2019 Mar
2013 Aug 2
2014
202

In [59]:
df['DP'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 2000 entries, 0 to 1999
Series name: DP
Non-Null Count  Dtype 
--------------  ----- 
2000 non-null   object
dtypes: object(1)
memory usage: 15.8+ KB


In [60]:
df['DP'].apply(lambda x:int(str(x)[:4]))
#需要提前前四个字符

0       2014
1       2021
2       2014
3       2016
4       2013
        ... 
1995    2021
1996    2020
1997    2016
1998    2013
1999    2018
Name: DP, Length: 2000, dtype: int64

In [61]:
def term_freq_by_text(df,year='all',mode = 'TI',
                      lang = 'english',most_common_num=15,
                     output_figure = False):
    
    
    if isinstance(year,int): 
        df['DP'] = df['DP'].apply(lambda x:int(str(x)[:4]))
        df_year = df[df['DP']==year]
    else:
        df_year = df
    
    if mode in ['TI+AB','AB+TI']:
        df_year[mode] = df_year['AB'].map(str) + ' ' + df_year['TI'].map(str)
        
    text = '' 
    filter_ls = ['\n',',','.',':','=','(',')','[',']']
    for i in df_year[mode].dropna():
        for j in i:
            if j in filter_ls:
                i = i.replace(j,'')
        text+=f'{i} '
    words = ct.term_freq(text=text, lang=lang).most_common(most_common_num) 
    
    if output_figure:
        from pyecharts import options as opts
        from pyecharts.charts import WordCloud
        from pyecharts.globals import SymbolType

        c = (
            WordCloud()
            .add(f"{mode}", words, word_size_range=[20, 100], shape=SymbolType.DIAMOND)
            .set_global_opts(title_opts=opts.TitleOpts(title=f"{mode}",pos_left='center'))
        )
        return words,c
    else:
        return words  

In [62]:
term_freq_by_text(df)
#对标题进行频次统计

[('meta-analysis', 1512),
 ('review', 925),
 ('systematic', 897),
 ('hypertension', 896),
 ('blood', 550),
 ('pressure', 533),
 ('risk', 369),
 ('patients', 346),
 ('trials', 278),
 ('randomized', 217),
 ('association', 183),
 ('studies', 179),
 ('pulmonary', 178),
 ('controlled', 170),
 ('effect', 165)]

In [63]:
term_freq_by_text(df,mode = 'AB')
#对摘要进行频次统计 

[('95%', 5258),
 ('studies', 4863),
 ('hypertension', 4567),
 ('ci', 4510),
 ('risk', 3775),
 ('patients', 3263),
 ('blood', 2946),
 ('pressure', 2923),
 ('trials', 2473),
 ('meta-analysis', 2438),
 ('bp', 2353),
 ('results', 2223),
 ('included', 1897),
 ('', 1813),
 ('data', 1671)]

In [64]:
term_freq_by_text(df,mode = 'AB+TI')
#对摘要和关键词进行频数统计

[('hypertension', 5463),
 ('95%', 5258),
 ('studies', 5042),
 ('ci', 4510),
 ('risk', 4144),
 ('meta-analysis', 3950),
 ('patients', 3609),
 ('blood', 3496),
 ('pressure', 3456),
 ('trials', 2751),
 ('bp', 2357),
 ('results', 2235),
 ('review', 2218),
 ('included', 1898),
 ('systematic', 1850)]

In [65]:
term_freq_by_text(df,year=2021,most_common_num=20)

[('meta-analysis', 42),
 ('systematic', 29),
 ('review', 29),
 ('hypertension', 19),
 ('pressure', 16),
 ('blood', 15),
 ('risk', 10),
 ('patients', 9),
 ('pulmonary', 7),
 ('association', 6),
 ('clinical', 6),
 ('randomized', 6),
 ('trials', 6),
 ('studies', 5),
 ('outcomes', 5),
 ('mortality', 5),
 ('covid-19', 5),
 ('effects', 5),
 ('chronic', 4),
 ('disease', 4)]

In [66]:
term_freq_by_text(df,year=2021,mode='AB',most_common_num=20,output_figure=True)[0]

[('studies', 130),
 ('hypertension', 111),
 ('patients', 106),
 ('95%', 106),
 ('ci', 99),
 ('risk', 84),
 ('blood', 75),
 ('pressure', 75),
 ('results', 65),
 ('meta-analysis', 64),
 ('included', 50),
 ('bp', 49),
 ('covid-19', 46),
 ('study', 42),
 ('pooled', 40),
 ('analysis', 37),
 ('trials', 37),
 ('effects', 36),
 ('clinical', 36),
 ('significant', 34)]

In [67]:
term_freq_by_text(df,year=2021,mode='AB',most_common_num=40,output_figure=True)[1].render_notebook()

## （2）CSSCI文献数据分析

In [68]:
import pandas as pd
from sqlalchemy import create_engine
connect = create_engine('mysql+pymysql://root:123@localhost:3306/')
df_cssci = pd.read_sql_table('cssci',connect,schema='Scientometrics')
df_cssci.head(2)

Unnamed: 0,index,来源篇名,英文篇名,来源作者,基 金,期 刊,第一机构,机构名称,第一作者,中图类号,年代卷期,关 键 词,基金类别,参考文献
0,0,人工智能时代背景下的国家安全治理:应用范式、风险识别与路径选择,National Security Governance in the Era of Art...,阙天舒/张纪腾,海国图智研究院研究基金,国际安全研究,华东政法大学,[阙天舒]华东政法大学.中国法治战略研究中心/[张纪腾]华东政法大学.政治学研究院,阙天舒,D815.5,"2020,38(010):4-38",人工智能/国家安全/应用范式/安全悖论/路径选择,,"\n1.Jervis,Robert.Cooperation under the Securi..."
1,1,“总体国家安全观”思想对情报方法研究的影响,"The Influence of \\\""A Holistic View of Nation...",杨建林,2017年度国家社会科学基金重大项目(17ZDA291)/2018年度国家社会科学基金重点项...,现代情报,南京大学,[杨建林]南京大学.信息管理学院,杨建林,G250.2,"2020,40(030):3-13,37",情报学/国家安全/总体国家安全观/情报方法/情报工作/技术方法,,\n1..习近平谈治国理政.北京:外文出版社\n2.杨建林.情报学学科建设面临的主要问题与发...


In [69]:
term_freq_by_text(df_cssci,mode = '来源篇名',lang='chinese',most_common_num=30,output_figure=True)[0]

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\86177\AppData\Local\Temp\jieba.cache
Loading model cost 0.650 seconds.
Prefix dict has been built successfully.


[('国家', 850),
 ('安全', 777),
 ('战略', 152),
 ('中国', 145),
 ('美国', 140),
 ('安全观', 131),
 ('总体', 95),
 ('研究', 90),
 ('新', 62),
 ('问题', 57),
 ('视角', 52),
 ('审查', 52),
 ('影响', 48),
 ('分析', 45),
 ('治理', 43),
 ('我国', 43),
 ('制度', 43),
 ('体系', 40),
 ('发展', 36),
 ('国际', 35),
 ('情报', 33),
 ('时代', 32),
 ('政策', 31),
 ('理论', 28),
 ('思想', 27),
 ('建设', 25),
 ('教育', 24),
 ('思考', 23),
 ('经济', 23),
 ('利益', 22)]

In [70]:
term_freq_by_text(df_cssci,mode = '来源篇名',lang='chinese',most_common_num=30,output_figure=True)[1].render_notebook()

In [71]:
df_cnki = pd.read_sql_table('cnki',connect,schema='Scientometrics')
df_cnki.head(2)

Unnamed: 0,index,Reference Type,Auhor,Author Address,Title,Journal Name,Keywords,Abstract,Pages,ISBN/ISSN,Notes,URL,DOI,Database Provider,Year,Number (Issue),Volume
0,0,Journal Article,"张正媚,高宏杰,王梅杰,程亚清,曲海顺,张献之,廖春满,李靖",北京中医药大学东直门医院;中国中医科学院中医药信息研究所;北京中医药大学东方医院;广西壮族自...,基于VOSviewer和CiteSpace的白芍总苷研究热点可视化分析,中国中医药信息杂志,白芍总苷;VOSviewer;CiteSpace;可视化分析,目的分析白芍总苷研究现状和热点，为白芍总苷研究与应用提供参考。方法计算机检索中国知识资源总库...,1-7,1005-5304,11-3519/R,https://kns.cnki.net/kcms/detail/11.3519.R.202...,10.19879/j.cnki.1005-5304.202108578,CNKI,,,
1,1,Journal Article,"党真,杨明义,张加琼",中国科学院水利部水土保持研究所黄土高原土壤侵蚀与旱地农业国家重点实验室;中国科学院大学;西北...,基于文献计量学分析泥沙来源研究进展与热点,水土保持研究,泥沙来源;土壤侵蚀;可视化分析;CiteSpace;复合指纹识别,明确流域或区域泥沙来源对水土保持措施科学布局有重要意义。为了更好地掌握泥沙来源研究的发展动态...,1-6,1005-3409,61-1272/P,https://kns.cnki.net/kcms/detail/61.1272.P.202...,10.13869/j.cnki.rswc.20211217.001,CNKI,,,


In [72]:
term_freq_by_text(df_cnki,mode = 'Abstract',lang='chinese',most_common_num=30,output_figure=True)[0]

[('研究', 14569),
 ('文献', 3960),
 ('分析', 3485),
 ('领域', 3188),
 ('热点', 2774),
 ('进行', 2255),
 ('发展', 2095),
 ('机构', 1608),
 ('可视化', 1519),
 ('相关', 1451),
 ('主要', 1448),
 ('图谱', 1406),
 ('关键词', 1390),
 ('中国', 1342),
 ('知识', 1319),
 ('方法', 1290),
 ('发文', 1198),
 ('作者', 1191),
 ('核心', 1134),
 ('我国', 1111),
 ('数据库', 1103),
 ('合作', 1088),
 ('软件', 1075),
 ('前沿', 1052),
 ('期刊', 1041),
 ('趋势', 1031),
 ('篇', 990),
 ('方面', 980),
 ('国内', 933),
 ('主题', 914)]

In [73]:
#如果要统计中文中摘要和标题中的关键词频，只需要将摘要和标题的字段名称修改一下即可
df_cnki['AB'] = df_cnki['Abstract']
df_cnki['TI'] = df_cnki['Title']

In [74]:
term_freq_by_text(df_cnki,mode = 'AB+TI',lang='chinese',most_common_num=30,output_figure=True)[1].render_notebook()