In [1]:
import pandas as pd
import numpy as np
import os
from matplotlib import pyplot as plt
from sklearn.metrics import pairwise_distances
import sklearn.metrics.pairwise as pairwise
import scipy
from tqdm import tqdm
import json

In [2]:
# Snapshot tag; it is interpolated into every input and output file name in this notebook.
data_dir = '2022-06-02'
# Embedding variant to load; the alternatives are kept commented out for quick switching.
#method = 'scibert'
#method = 'tfidf'
# method = 'glove'
# NOTE(review): 'biotert' looks like a typo for 'biobert', but the embedding-loading cell
# compares against this exact spelling and the output file names embed it, so renaming it
# has to be done everywhere at once.
method = 'biotert'

In [3]:
# Citation-count rankings: one TSV per month, collected into a dict keyed by month.
cited_rank_dir = './scibert_encoded_papers/cited_rank'
cited_rank_papers_dict = {}
for fname in os.listdir(cited_rank_dir):
    # Only files belonging to the configured snapshot are loaded.
    if not fname.endswith(f'_{data_dir}.tsv'):
        continue
    # The month is the second-to-last '_'-separated token of the name before its first '.'.
    name_tokens = fname.split('.')[0].split('_')
    month_key = str(name_tokens[-2])
    cited_rank_papers_dict[month_key] = pd.read_csv(os.path.join(cited_rank_dir, fname), sep='\t')

In [4]:
# High-impact (top-cited) papers: one TSV per month, collected into a dict keyed by month.
top_cited_dir = './scibert_encoded_papers/top_cited'
top_cited_papers_dict = {}
for fname in os.listdir(top_cited_dir):
    # Only files belonging to the configured snapshot are loaded.
    if not fname.endswith(f'_{data_dir}.tsv'):
        continue
    # The month is the second-to-last '_'-separated token of the name before its first '.'.
    name_tokens = fname.split('.')[0].split('_')
    month_key = str(name_tokens[-2])
    top_cited_papers_dict[month_key] = pd.read_csv(os.path.join(top_cited_dir, fname), sep='\t')

In [6]:
def _load_dense_embeddings(title_csv, abstract_csv):
    """Load the title and abstract embedding tables from two CSV files.

    Parameters
    ----------
    title_csv, abstract_csv : str
        Paths of CSV files whose first column is used as the row index.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(title_embs, abstract_embs)``. Where an index label occurs more than
        once, only its first row is kept.
    """
    loaded = []
    for csv_path in (title_csv, abstract_csv):
        embs = pd.read_csv(csv_path, index_col=0)
        loaded.append(embs[~embs.index.duplicated(keep='first')])
    return loaded[0], loaded[1]


def _read_json(path):
    """Parse a UTF-8 encoded JSON file, closing the handle once it has been read."""
    with open(path, 'r', encoding='utf-8') as fr:
        return json.load(fr)


# Dense embedding variants -> (title CSV, abstract CSV).
# NOTE(review): 'biotert' points at the very same *_glove_* files as 'glove', which looks
# like a copy-paste leftover. The paths are kept as they were because the intended
# BioBERT file names are not visible here -- confirm and update.
dense_embedding_files = {
    'scibert': (f'./scibert_encoded_papers/title_embs_df_{data_dir}.csv',
                f'./scibert_encoded_papers/abstract_embs_df_{data_dir}.csv'),
    'biotert': (f'./title_embs_df_glove_{data_dir}.csv',
                f'./abstract_embs_df_glove_{data_dir}.csv'),
    'glove': (f'./title_embs_df_glove_{data_dir}.csv',
              f'./abstract_embs_df_glove_{data_dir}.csv'),
}

if method in dense_embedding_files:
    print(method)
    title_embs_df, abstract_embs_df = _load_dense_embeddings(*dense_embedding_files[method])
elif method == 'tfidf':
    print('tfidf')
    values = scipy.sparse.load_npz(f'./tf-idf-results/values_{data_dir}_scibert_token.npz')
    index = _read_json(f"./tf-idf-results/index_{data_dir}_scibert_token")
    column = _read_json(f"./tf-idf-results/column_{data_dir}_scibert_token")
    df = pd.DataFrame.sparse.from_spmatrix(values, columns=column, index=index)
    # The later cells expect two frames and concatenate them again per paper, so the
    # split position (column 10000) is arbitrary and only exists to fit that code.
    title_embs_df, abstract_embs_df = df.iloc[:, :10000], df.iloc[:, 10000:]
else:
    # Fail here instead of leaving title_embs_df / abstract_embs_df undefined (or stale
    # from a previous run) and crashing much later in the notebook.
    raise ValueError(f'unknown embedding method: {method!r}')

biotert


In [7]:
# Paper metadata for the configured snapshot (tab-separated despite the .csv extension).
metadata_path = f'./scibert_encoded_papers/valid_metadata_{data_dir}.csv'
pmc_metadata = pd.read_csv(metadata_path, sep='\t')
# Parse the publication date strictly as YYYY-MM-DD.
publish_time_parsed = pd.to_datetime(pmc_metadata['publish_time'], format="%Y-%m-%d")
pmc_metadata['publish_time'] = publish_time_parsed

In [8]:
# Sanity check: (rows, columns) of the metadata table right after loading.
pmc_metadata.shape

(152164, 19)

In [9]:
# Keep only the identifier, title and publication date; the other columns are not used below.
kept_columns = ['cord_uid', 'title', 'publish_time']
pmc_metadata = pmc_metadata.loc[:, kept_columns]

In [10]:
# Remove fully identical rows (first occurrence wins), then discard papers without a
# publication date, and renumber the rows from 0.
deduplicated = pmc_metadata.drop_duplicates(keep='first')
has_publish_time = deduplicated['publish_time'].notna()
pmc_metadata = deduplicated[has_publish_time].reset_index(drop=True)

In [11]:
# Month bucket of each paper as a 'YYYY-MM' string (month zero-padded to two digits).
pmc_metadata['citing_year_month'] = [
    f"{stamp.year}-{stamp.month:02d}" for stamp in pmc_metadata['publish_time']
]

In [12]:
# Add the two per-paper distance columns (initialised to 0.0, filled in by the monthly
# loop) and index the table by cord_uid so papers can be addressed by id.
pmc_metadata = (
    pmc_metadata
    .assign(min_cos_distance=0., mean_cos_distance=0.)
    .set_index('cord_uid')
)

In [13]:
# The title column is stored under the name 'citing_paper' from here on.
column_renames = {'title': 'citing_paper'}
pmc_metadata = pmc_metadata.rename(columns=column_renames)

In [14]:
# Display the loaded title embeddings (rows indexed by cord_uid, columns emb0, emb1, ...).
# NOTE(review): the saved output ends at emb299, i.e. presumably 300 dimensions --
# confirm this matches the embedding model selected by `method`.
title_embs_df

Unnamed: 0_level_0,emb0,emb1,emb2,emb3,emb4,emb5,emb6,emb7,emb8,emb9,...,emb290,emb291,emb292,emb293,emb294,emb295,emb296,emb297,emb298,emb299
cord_uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
bufbjdmw,0.143879,0.086925,-0.134158,0.209102,-0.043729,0.007200,-2.501030,0.477032,-0.063554,0.387861,...,0.169964,-0.318902,0.054195,0.159778,-0.113306,-0.438461,-0.001894,0.061043,-0.034218,0.319471
a564l6vs,-0.070781,-0.331083,-0.077161,-0.068825,0.097242,-0.242156,-2.923097,0.310597,0.091399,0.104764,...,-0.066512,-0.030937,-0.030437,0.256698,-0.239533,0.110826,0.037702,-0.081857,-0.010339,0.110707
y86m85pe,-0.040309,-0.286443,0.122344,0.021420,0.085995,0.218103,-2.219017,0.057577,0.071140,0.212667,...,0.051367,0.084611,-0.025061,0.207049,-0.044242,0.035877,-0.039462,-0.477077,-0.158963,0.001546
6fw4thkq,-0.007261,-0.096704,0.027877,-0.101949,0.044087,-0.017849,-2.155464,0.667624,0.033105,0.129223,...,0.080665,-0.315238,0.010727,0.226187,-0.118834,-0.308326,0.083231,-0.142933,0.046738,-0.063914
zaeg1ujv,-0.053632,0.012062,0.017653,-0.094258,-0.027011,0.002073,-2.372232,0.424586,0.122673,0.035330,...,0.055477,-0.313407,-0.000244,0.097170,-0.037736,0.105899,0.142581,-0.168838,0.015142,0.063750
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3b63pnxq,0.016952,-0.045539,0.045284,-0.076458,-0.084646,-0.075598,-2.596671,0.453632,-0.008521,-0.012598,...,0.110303,-0.003807,-0.146222,0.070687,0.006368,-0.153386,-0.073581,-0.099559,-0.014522,0.035987
3x1mm80o,0.245648,-0.082950,-0.019357,0.001547,-0.080289,-0.133065,-2.377775,0.606762,-0.024630,-0.051796,...,0.243647,-0.017300,-0.316128,-0.074905,0.019364,-0.300043,-0.078664,-0.076680,-0.111530,0.122431
aeuy92bx,-0.063190,0.053873,0.225955,-0.180395,0.092646,0.010669,-1.680442,0.656217,-0.025914,-0.038311,...,-0.079485,-0.220916,-0.328277,-0.168963,-0.048497,0.066669,-0.058134,-0.065406,-0.053725,0.071831
05dxn54t,0.136512,0.039701,-0.223225,0.006171,0.025650,-0.189124,-2.784173,0.745896,-0.096982,-0.155317,...,-0.077740,-0.115208,-0.114855,-0.058503,-0.053875,-0.081492,-0.094155,0.088063,-0.001340,0.012734


In [15]:
# Monthly pooled statistics, one entry per month starting at year_months[1].
# Naming: month_<pooling over the month's papers>_<per-paper statistic>, e.g.
# month_max_mean is the largest per-paper *mean* distance observed in that month.
month_max_mean = []
month_mean_mean = []
month_min_mean = []
month_max_min = []
month_mean_min = []
month_min_min = []
year_months = [
    '2020-01', '2020-02', '2020-03', '2020-04', '2020-05', '2020-06',
    '2020-07', '2020-08', '2020-09', '2020-10', '2020-11', '2020-12',
    '2021-01', '2021-02', '2021-03', '2021-04', '2021-05', '2021-06',
    '2021-07', '2021-08', '2021-09', '2021-10', '2021-11', '2021-12',
    '2022-01', '2022-02', '2022-03', '2022-04',
]


def _combined_embedding(uid):
    """Return the title and abstract embedding rows of ``uid`` joined into one 1-D array."""
    return np.hstack([title_embs_df.loc[uid].values, abstract_embs_df.loc[uid].values])


# The first month has no predecessor, so the comparison starts at the second entry:
# every paper of a month is compared with the top-cited papers of the month before.
for prev_year_month, cur_year_month in zip(year_months, year_months[1:]):
    # papers published in the current month
    cur_month_papers_uids = pmc_metadata.loc[pmc_metadata['citing_year_month'] == cur_year_month].index
    # high-impact papers of the previous month (rows without a uid are skipped)
    top_papers = top_cited_papers_dict[prev_year_month]['uid'].dropna().tolist()

    # per-paper cosine distances (1 - cosine similarity) to the previous month's top papers
    cos_dist_min = []
    cos_dist_mean = []
    if top_papers:
        all_top = np.vstack([_combined_embedding(p) for p in top_papers])
        for paper_id in tqdm(cur_month_papers_uids):
            paper_emb = _combined_embedding(paper_id).reshape(1, -1)
            # Computed once per paper; both statistics are taken from the same matrix.
            cos_dist = 1 - pairwise.cosine_similarity(paper_emb, all_top)
            cos_dist_mean_ = cos_dist.mean()
            cos_dist_min_ = cos_dist.min()
            cos_dist_mean.append(cos_dist_mean_)
            cos_dist_min.append(cos_dist_min_)
            # store the per-paper values on the metadata table
            pmc_metadata.at[paper_id, 'mean_cos_distance'] = cos_dist_mean_
            pmc_metadata.at[paper_id, 'min_cos_distance'] = cos_dist_min_

    # Pool over the month. A month without papers or without reference papers gets NaN
    # so the six lists stay aligned with year_months[1:] instead of raising on empty input.
    if cos_dist_mean:
        month_max_mean.append(np.max(cos_dist_mean))
        month_mean_mean.append(np.mean(cos_dist_mean))
        month_min_mean.append(np.min(cos_dist_mean))
        month_max_min.append(np.max(cos_dist_min))
        month_mean_min.append(np.mean(cos_dist_min))
        month_min_min.append(np.min(cos_dist_min))
    else:
        for pooled in (month_max_mean, month_mean_mean, month_min_mean,
                       month_max_min, month_mean_min, month_min_min):
            pooled.append(np.nan)

100%|██████████| 404/404 [00:00<00:00, 1551.12it/s]
100%|██████████| 605/605 [00:00<00:00, 1795.87it/s]
100%|██████████| 1758/1758 [00:01<00:00, 1551.06it/s]
100%|██████████| 2814/2814 [00:03<00:00, 821.48it/s]
100%|██████████| 2974/2974 [00:04<00:00, 729.91it/s]
100%|██████████| 2556/2556 [00:03<00:00, 655.20it/s]
100%|██████████| 2694/2694 [00:04<00:00, 669.83it/s]
100%|██████████| 3316/3316 [00:05<00:00, 558.84it/s]
100%|██████████| 3535/3535 [00:05<00:00, 618.78it/s]
100%|██████████| 2905/2905 [00:04<00:00, 600.94it/s]
100%|██████████| 2746/2746 [00:05<00:00, 539.38it/s]
100%|██████████| 3861/3861 [00:06<00:00, 612.88it/s]
100%|██████████| 3674/3674 [00:09<00:00, 388.60it/s]
100%|██████████| 4511/4511 [00:13<00:00, 342.30it/s]
100%|██████████| 4072/4072 [00:10<00:00, 385.68it/s]
100%|██████████| 4478/4478 [00:14<00:00, 304.74it/s]
100%|██████████| 6273/6273 [00:16<00:00, 384.41it/s]
100%|██████████| 9977/9977 [00:49<00:00, 200.46it/s]
100%|██████████| 9597/9597 [01:08<00:00, 140.28

In [16]:
# Write the monthly pooled statistics as CSV, one row per month from year_months[1] on.
pooled_columns = (month_max_mean, month_mean_mean, month_min_mean,
                  month_max_min, month_mean_min, month_min_min)
with open(f'global_info_{data_dir}_{method}.csv','w') as fw:
    fw.write('year_month,max_mean,mean_mean,min_mean,max_min,mean_min,min_min\n')
    for month_idx in range(1, len(year_months)):
        # The pooled lists have no entry for the first month, hence the shift by one.
        row = [year_months[month_idx]] + [pooled[month_idx - 1] for pooled in pooled_columns]
        fw.write(','.join(f'{field}' for field in row) + '\n')

In [47]:
# Display the per-paper table, including the min_cos_distance / mean_cos_distance columns.
# NOTE(review): this cell's execution count is out of sequence and its saved output shows
# different distance values than the later display of the same frame -- presumably left
# over from another run; re-run the notebook top to bottom to refresh it.
pmc_metadata

Unnamed: 0_level_0,citing_paper,publish_time,citing_year_month,min_cos_distance,mean_cos_distance
cord_uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
kby4wprm,Deservingness: migration and health in social ...,2021-04-07,2021-04,0.194475,0.312436
6pso5sa4,"Editorial, special issue on Advances in Robus...",2021-06-28,2021-06,0.182984,0.304553
r1wh54q1,COVID-19–Associated Hospitalizations Among Adu...,2022-03-25,2022-03,0.101844,0.245925
ji9pc0fz,Changes in activity and content of messages of...,2021-09-01,2021-09,0.110365,0.263062
cgrs0yqy,Measuring depression and anxiety prevalence am...,2021-05-07,2021-05,0.164819,0.284729
...,...,...,...,...,...
h4ojr2pp,Denture Acrylic Resin Material with Antibacter...,2022-01-07,2022-01,0.167859,0.272415
3b63pnxq,2022 ACC Expert Consensus Decision Pathway on ...,2022-03-16,2022-03,0.044524,0.388097
3x1mm80o,Chronic granulomatous invasive fungal rhinosin...,2021-12-02,2021-12,0.152131,0.258908
05dxn54t,A Patient With Bilateral Conjunctivitis Positi...,2020-07-02,2020-07,0.132122,0.257045


In [19]:
# Preview the base name (without the .csv extension) of the per-paper output file.
f'paper_self_info_{data_dir}_{method}'

'paper_self_info_2022-06-02_biotert'

In [17]:
# Persist the per-paper table (indexed by cord_uid) for this snapshot and embedding method.
paper_info_path = f'paper_self_info_{data_dir}_{method}.csv'
pmc_metadata.to_csv(paper_info_path)

In [21]:
# Display the per-paper table once more.
# NOTE(review): the distance values in this saved output differ from the earlier display
# of the same frame and the execution counts are out of order, so the two outputs
# presumably come from different runs -- re-run top to bottom before relying on either.
pmc_metadata

Unnamed: 0_level_0,citing_paper,publish_time,citing_year_month,min_cos_distance,mean_cos_distance
cord_uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
kby4wprm,Deservingness: migration and health in social ...,2021-04-07,2021-04,0.055330,0.172040
6pso5sa4,"Editorial, special issue on Advances in Robus...",2021-06-28,2021-06,0.058322,0.183652
r1wh54q1,COVID-19–Associated Hospitalizations Among Adu...,2022-03-25,2022-03,0.044342,0.163410
ji9pc0fz,Changes in activity and content of messages of...,2021-09-01,2021-09,0.029872,0.140341
cgrs0yqy,Measuring depression and anxiety prevalence am...,2021-05-07,2021-05,0.040264,0.162328
...,...,...,...,...,...
h4ojr2pp,Denture Acrylic Resin Material with Antibacter...,2022-01-07,2022-01,0.076649,0.190241
3b63pnxq,2022 ACC Expert Consensus Decision Pathway on ...,2022-03-16,2022-03,0.044330,0.351436
3x1mm80o,Chronic granulomatous invasive fungal rhinosin...,2021-12-02,2021-12,0.056136,0.178038
05dxn54t,A Patient With Bilateral Conjunctivitis Positi...,2020-07-02,2020-07,0.069447,0.178595
