In [1]:
import pandas as pd
import json
import os
from tqdm import tqdm

#### paper dist to higly-impact papers (in last month)

In [2]:
data_dir = '2021-07-19'
# method = 'scibert'
method = 'tfidf'
# method = 'glove'
# citiing_window = 1
citiing_window = 6

In [3]:
paper_self_info = pd.read_csv(f'paper_self_info_{data_dir}_{method}.csv',
           dtype={'citing_paper':'object','cord_uid':'object',
#                   'citing_year':'object','citing_month':'object','citing_date':'object',
                  'citing_year_month':'object',
                  'min_cos_distance':'float','mean_cos_distance':'float',
                })[['cord_uid','citing_paper','citing_year_month','min_cos_distance','mean_cos_distance']]

In [4]:
paper_self_info = paper_self_info.rename(columns={'cord_uid':'uid','citing_paper':'title','citing_year_month':'publish_year_month'})

In [5]:
paper_self_info.head(5)

Unnamed: 0,uid,title,publish_year_month,min_cos_distance,mean_cos_distance
0,6jbjwl8j,Hypoxia-induced amniotic fluid stem cell secre...,2021-01,0.855313,0.952014
1,yz3od5m7,Impfbereitschaft unter intensivmedizinischem P...,2021-02,0.950205,0.997864
2,8rdo9wgc,Use of machine learning and artificial intelli...,2020-06,0.757355,0.866406
3,npcz4qr2,Emerging COVID‐19 vaccines: A rheumatology per...,2021-02,0.843224,0.96862
4,k0xi1agw,Scientific societies fostering inclusivity thr...,2020-11,0.905278,0.96272


In [6]:
# paper_self_info.set_index('uid',inplace=True)

#### global info of each month

In [7]:
domain_global_info = pd.read_csv(f'global_info_{data_dir}_{method}.csv')

In [8]:
domain_global_info.set_index('year_month',inplace=True)

#### paper being cited count each month

In [9]:
cited_rank_papers_dict = {}
for file in os.listdir('../scibert_encoded_papers/cited_rank'):
    if file.endswith(f'_{data_dir}.tsv'):
        month = file.split('.')[0].split('_')[-2]
        df = pd.read_csv(os.path.join('../scibert_encoded_papers/cited_rank',file),sep='\t')
        cited_rank_papers_dict[str(month)] = df.loc[~df['uid'].isna()].set_index('uid')

In [10]:
# cited_rank_papers_dict

#### concatenate

In [11]:
valid_samples = []
year_months = ['2020-01','2020-02','2020-03','2020-04','2020-05','2020-06','2020-07','2020-08',
               '2020-09','2020-10','2020-11','2020-12','2021-01','2021-02','2021-03','2021-04','2021-05','2021-06']
paper_self_info = paper_self_info.loc[paper_self_info['publish_year_month'].isin(year_months)]

for row_idx,paper_info in tqdm(paper_self_info.iterrows()):
#     print(uid,paper_info)
#     break
    year_month = paper_info['publish_year_month']
    year_month_idx = year_months.index(year_month)
    if year_month_idx <= 1 or year_month_idx > len(year_months)-citiing_window-1:
        continue
    # 上个月
    prev_year_month = year_months[year_month_idx-1]
    # 上个月的global info
    global_info = domain_global_info.loc[prev_year_month]
    
    # citing window cnt
    cnt = 0
    cited_count_next_month = None
    for offset in range(1,citiing_window+1):
        next_year_month = year_months[year_month_idx+offset]
        # count of being cited next month
         # may be zero, resulting "key not found"
        try:
            cited_count_next_month = cited_rank_papers_dict[next_year_month].loc[paper_info['uid']]
            cnt+=cited_count_next_month['count']
            cited_count_next_month['count'] = cnt
        except:
            continue
    if cited_count_next_month is not None:
#         print(type(cited_count_next_month))
#         print(cited_count_next_month)
        valid_samples.append(pd.concat([paper_info,global_info,cited_count_next_month]))
    else:
        valid_samples.append(pd.concat([paper_info,global_info,pd.Series(data=[paper_info['title'],0],index=['cited_paper','count'])]))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
47130it [00:17, 2710.36it/s]


In [12]:
valid_samples = pd.concat(valid_samples,axis=1).transpose()

In [13]:
valid_samples['count'].max()

174.0

In [14]:
valid_samples = valid_samples[['uid','title', 'publish_year_month', 'min_cos_distance', 'mean_cos_distance',
       'max_mean', 'mean_mean', 'min_mean', 'max_min', 'mean_min', 'min_min','count']].rename(columns={'title':'paper_name'})

In [15]:
# valid_samples.loc[valid_samples['count']>0].shape

In [16]:
valid_samples.to_csv(f'valid_samples_{data_dir}_{method}_timewindow_{citiing_window}.csv',sep='\t',index=False)