In [1]:
import pandas as pd
import numpy as np
import os
from matplotlib import pyplot as plt
from sklearn.metrics import pairwise_distances
import sklearn.metrics.pairwise as pairwise
import scipy
from tqdm import tqdm
import json

In [2]:
# Run configuration.
# `data_dir` is a date-like tag that is interpolated into the names of the
# input and output files used by this notebook (it is not used as a directory).
data_dir = '2021-07-19'
# `method` selects which text representation is loaded below:
# 'scibert', 'tfidf' or 'glove'. Exactly one assignment should be active.
# method = 'scibert'
# method = 'tfidf'
method = 'glove'

In [3]:
# Citation counts of papers: one TSV per month, collected into a dict that is
# keyed by the month token taken from the file name.
cited_rank_dir = '../scibert编码文章/cited_rank'
cited_rank_suffix = f'_{data_dir}.tsv'
cited_rank_papers_dict = {}
for tsv_name in os.listdir(cited_rank_dir):
    # Only files belonging to the configured snapshot are of interest.
    if not tsv_name.endswith(cited_rank_suffix):
        continue
    # Month key = second-to-last '_'-separated token of the text before the first '.'.
    month_key = str(tsv_name.split('.')[0].split('_')[-2])
    cited_rank_papers_dict[month_key] = pd.read_csv(
        os.path.join(cited_rank_dir, tsv_name), sep='\t')

In [4]:
# Highly cited papers: one TSV per month, collected into a dict that is keyed
# by the month token taken from the file name.
top_cited_dir = '../scibert编码文章/top_cited'
top_cited_suffix = f'_{data_dir}.tsv'
top_cited_papers_dict = {}
for tsv_name in os.listdir(top_cited_dir):
    # Only files belonging to the configured snapshot are of interest.
    if not tsv_name.endswith(top_cited_suffix):
        continue
    # Month key = second-to-last '_'-separated token of the text before the first '.'.
    month_key = str(tsv_name.split('.')[0].split('_')[-2])
    top_cited_papers_dict[month_key] = pd.read_csv(
        os.path.join(top_cited_dir, tsv_name), sep='\t')

In [5]:
# Load the title / abstract embeddings for the representation chosen by `method`.
# Every branch defines two DataFrames, `title_embs_df` and `abstract_embs_df`,
# whose index is the paper id used for `.loc[...]` lookups further down.
# An unknown `method` now fails here instead of surfacing later as a NameError.
if method == 'scibert':
    # SciBERT embeddings
    print('scibert')
    title_embs_df = pd.read_csv(f'../scibert编码文章/title_embs_df_{data_dir}.csv',index_col=0)
    abstract_embs_df = pd.read_csv(f'../scibert编码文章/abstract_embs_df_{data_dir}.csv',index_col=0)
    # Keep only the first row for any paper id that appears more than once.
    title_embs_df = title_embs_df[~title_embs_df.index.duplicated(keep='first')]
    abstract_embs_df = abstract_embs_df[~abstract_embs_df.index.duplicated(keep='first')]
elif method == 'glove':
    # GloVe embeddings
    print('glove')
    title_embs_df = pd.read_csv(f'../title_embs_df_glove_{data_dir}.csv',index_col=0)
    abstract_embs_df = pd.read_csv(f'../abstract_embs_df_glove_{data_dir}.csv',index_col=0)
    # Keep only the first row for any paper id that appears more than once.
    title_embs_df = title_embs_df[~title_embs_df.index.duplicated(keep='first')]
    abstract_embs_df = abstract_embs_df[~abstract_embs_df.index.duplicated(keep='first')]
elif method == 'tfidf':
    # TF-IDF vectors, stored as a sparse matrix plus JSON row / column labels.
    print('tfidf')
    # Import the submodule explicitly: a bare `import scipy` is not guaranteed
    # to expose `scipy.sparse` as an attribute on every SciPy version.
    import scipy.sparse
    values = scipy.sparse.load_npz(f'../tf-idf-results/values_{data_dir}_scibert_token.npz')
    # Context managers so the label files are closed right after reading.
    with open(f"../tf-idf-results/index_{data_dir}_scibert_token",'r',encoding='utf-8') as index_file:
        index = json.load(index_file)
    with open(f"../tf-idf-results/column_{data_dir}_scibert_token",'r',encoding='utf-8') as column_file:
        column = json.load(column_file)
    df = pd.DataFrame.sparse.from_spmatrix(values,columns=column,index=index)
    # The split at column 10000 carries no meaning: the downstream code
    # concatenates a "title" part and an "abstract" part again, so the single
    # TF-IDF matrix is cut in two only to fit that interface.
    title_embs_df, abstract_embs_df = df.iloc[:,:10000],df.iloc[:,10000:]
else:
    raise ValueError(f"unknown method {method!r}; expected 'scibert', 'glove' or 'tfidf'")

glove


In [6]:
# Paper metadata for the configured snapshot (original note: "all papers of 2020").
# The file is tab-separated despite its .csv extension; `publish_time` is parsed
# strictly as YYYY-MM-DD.
pmc_metadata = pd.read_csv(
    f'../scibert编码文章/valid_metadata_{data_dir}.csv', sep='\t'
).assign(
    publish_time=lambda frame: pd.to_datetime(frame['publish_time'], format="%Y-%m-%d")
)

In [8]:
# Quick size check of the metadata table as (rows, columns).
pmc_metadata.shape

(47869, 19)

In [7]:
# Keep only the columns used by the rest of the analysis.
kept_columns = ['cord_uid','title','publish_time']
pmc_metadata = pmc_metadata.loc[:, kept_columns]

In [8]:
# Remove fully duplicated rows (first occurrence wins), then rows without a
# publication date, and renumber the remaining rows from 0.
pmc_metadata = (
    pmc_metadata
    .drop_duplicates(keep='first')
    .dropna(subset=['publish_time'])
    .reset_index(drop=True)
)

In [9]:
# Publication month as a 'YYYY-MM' string, the same format as the entries of
# `year_months` that it is compared against later.
pmc_metadata['citing_year_month'] = pmc_metadata['publish_time'].dt.strftime('%Y-%m')

In [10]:
# Per-paper result columns, filled in month by month by the scoring loop.
# They start as NaN rather than 0.: a paper that is never scored (for example
# one published in the first month, which has no preceding month to compare
# against) must not be exported with a cosine distance of 0, which would read
# as "identical to a highly cited paper".
pmc_metadata['min_cos_distance'] = np.nan
pmc_metadata['mean_cos_distance'] = np.nan
# Index by paper id so single cells can be written with `.at[paper_id, column]`.
pmc_metadata = pmc_metadata.set_index('cord_uid')

In [11]:
# The paper title is exported under the name 'citing_paper'.
pmc_metadata = pmc_metadata.rename({'title': 'citing_paper'}, axis='columns')

In [12]:
# Month-by-month change of the minimum and mean distance values

In [13]:
# For every month after the first, compare each paper published in that month
# with the highly cited papers of the previous month, using
# cosine distance = 1 - cosine similarity on the concatenated
# [title embedding, abstract embedding] vector.
#
# Per paper: the mean and the minimum distance to the previous month's top
# papers are written into `pmc_metadata`.
# Per month: max / mean / min of those per-paper values are appended to the six
# lists below, named month_<aggregate over papers>_<per-paper statistic>.
# Each list gets exactly one entry per scored month, so position i corresponds
# to year_months[i + 1].
month_max_mean = []
month_mean_mean = []
month_min_mean = []
month_max_min = []
month_mean_min = []
month_min_min = []
year_months = ['2020-01','2020-02','2020-03','2020-04','2020-05','2020-06','2020-07','2020-08','2020-09','2020-10',
                 '2020-11','2020-12','2021-01','2021-02','2021-03','2021-04','2021-05','2021-06']
# Pair each month with its predecessor; the first month has none and is skipped.
for prev_year_month, cur_year_month in zip(year_months, year_months[1:]):
    # Papers published in the current month.
    cur_month_papers_uids = pmc_metadata.loc[pmc_metadata['citing_year_month']==(cur_year_month)].index
    # Highly cited papers of the previous month (rows without a uid are ignored).
    top_papers = top_cited_papers_dict[prev_year_month]['uid'].dropna().tolist()

    if len(cur_month_papers_uids) == 0 or not top_papers:
        # Nothing to compare: record NaN so the lists stay aligned with the
        # months instead of crashing in np.vstack([]) / np.max([]).
        for month_stats in (month_max_mean, month_mean_mean, month_min_mean,
                            month_max_min, month_mean_min, month_min_min):
            month_stats.append(np.nan)
        continue

    # One row per top paper: title embedding followed by abstract embedding.
    all_top = np.vstack([
        np.hstack([title_embs_df.loc[p].values,abstract_embs_df.loc[p].values])
        for p in top_papers
    ])
    # Distances of every paper of the current month to last month's top papers.
    cos_dist_min = []
    cos_dist_mean = []
    for paper_id in tqdm(cur_month_papers_uids):
        paper_emb = np.hstack([title_embs_df.loc[paper_id].values,abstract_embs_df.loc[paper_id].values]).reshape(1,-1)
        # Compute the distance row once and derive both statistics from it.
        cos_dist = 1-pairwise.cosine_similarity(paper_emb,all_top)
        # Mean distance of this paper to the previous month's top papers.
        cos_dist_mean_ = cos_dist.mean()
        cos_dist_mean.append(cos_dist_mean_)
        pmc_metadata.at[paper_id,'mean_cos_distance'] = cos_dist_mean_
        # Distance of this paper to its closest top paper of the previous month.
        cos_dist_min_ = cos_dist.min()
        cos_dist_min.append(cos_dist_min_)
        pmc_metadata.at[paper_id,'min_cos_distance'] = cos_dist_min_

    # Max / mean / min over all papers of the month, for both statistics.
    month_max_mean.append(np.max(cos_dist_mean))
    month_mean_mean.append(np.mean(cos_dist_mean))
    month_min_mean.append(np.min(cos_dist_mean))
    month_max_min.append(np.max(cos_dist_min))
    month_mean_min.append(np.mean(cos_dist_min))
    month_min_min.append(np.min(cos_dist_min))

100%|██████████████████████████████████████████████████████████████████████████████| 374/374 [00:00<00:00, 1806.78it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 615/615 [00:00<00:00, 1993.89it/s]
100%|████████████████████████████████████████████████████████████████████████████| 1736/1736 [00:00<00:00, 1843.51it/s]
100%|████████████████████████████████████████████████████████████████████████████| 2789/2789 [00:02<00:00, 1177.82it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 2921/2921 [00:03<00:00, 956.66it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 2473/2473 [00:03<00:00, 816.76it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 2613/2613 [00:03<00:00, 816.91it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 3233/3233 [00:04<00:00, 649.33it/s]
100%|███████████████████████████████████

In [14]:
# Write the per-month summary statistics, one CSV row per scored month.
monthly_stat_lists = (month_max_mean, month_mean_mean, month_min_mean,
                      month_max_min, month_mean_min, month_min_min)
with open(f'global_info_{data_dir}_{method}.csv','w') as fw:
    fw.write('year_month,max_mean,mean_mean,min_mean,max_min,mean_min,min_min\n')
    # The first month is never scored, so list position i belongs to year_months[i + 1].
    for stat_pos, scored_month in enumerate(year_months[1:]):
        row_fields = [scored_month] + [f'{stats[stat_pos]}' for stats in monthly_stat_lists]
        fw.write(','.join(row_fields) + '\n')

In [15]:
# Show the metadata table with the two per-paper distance columns.
pmc_metadata

Unnamed: 0_level_0,citing_paper,publish_time,citing_year_month,min_cos_distance,mean_cos_distance
cord_uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
6jbjwl8j,Hypoxia-induced amniotic fluid stem cell secre...,2021-01-08,2021-01,0.102433,0.254304
yz3od5m7,Impfbereitschaft unter intensivmedizinischem P...,2021-02-19,2021-02,0.302829,0.578760
8rdo9wgc,Use of machine learning and artificial intelli...,2020-06-16,2020-06,0.038311,0.161213
npcz4qr2,Emerging COVID‐19 vaccines: A rheumatology per...,2021-02-01,2021-02,0.155061,0.424857
k0xi1agw,Scientific societies fostering inclusivity thr...,2020-11-01,2020-11,0.048567,0.217095
...,...,...,...,...,...
ir1s5yfp,Impact of COVID‐19 on dental education: How co...,2020-08-16,2020-08,0.078021,0.341866
9m3efjpx,Improved Glycemic Control With a Digital Healt...,2021-06-02,2021-06,0.039122,0.156415
tlxews76,Novel therapeutics for the treatment of hypert...,2021-03-17,2021-03,0.044856,0.136940
kg1daxuu,The impact of policy mixes on new energy vehic...,2021-02-22,2021-02,0.045703,0.161242


In [16]:
# Save the per-paper distances; the file name carries the snapshot tag and the method.
paper_info_path = f'paper_self_info_{data_dir}_{method}.csv'
pmc_metadata.to_csv(paper_info_path)

In [17]:
# Show the table once more after it has been written to disk.
pmc_metadata

Unnamed: 0_level_0,citing_paper,publish_time,citing_year_month,min_cos_distance,mean_cos_distance
cord_uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
6jbjwl8j,Hypoxia-induced amniotic fluid stem cell secre...,2021-01-08,2021-01,0.102433,0.254304
yz3od5m7,Impfbereitschaft unter intensivmedizinischem P...,2021-02-19,2021-02,0.302829,0.578760
8rdo9wgc,Use of machine learning and artificial intelli...,2020-06-16,2020-06,0.038311,0.161213
npcz4qr2,Emerging COVID‐19 vaccines: A rheumatology per...,2021-02-01,2021-02,0.155061,0.424857
k0xi1agw,Scientific societies fostering inclusivity thr...,2020-11-01,2020-11,0.048567,0.217095
...,...,...,...,...,...
ir1s5yfp,Impact of COVID‐19 on dental education: How co...,2020-08-16,2020-08,0.078021,0.341866
9m3efjpx,Improved Glycemic Control With a Digital Healt...,2021-06-02,2021-06,0.039122,0.156415
tlxews76,Novel therapeutics for the treatment of hypert...,2021-03-17,2021-03,0.044856,0.136940
kg1daxuu,The impact of policy mixes on new energy vehic...,2021-02-22,2021-02,0.045703,0.161242
