In [None]:
import os
import time
import json
import numpy as np
import pandas as pd
from tqdm import tqdm

In [None]:
# Directory that holds the OAG publication shards read further down
# (placeholder value -- point it at the real dataset before running).
oag_path = 'path_to_OAG_dataset'
# Directory that holds the citation-context CSV files loaded in the next cell
# (placeholder value).
feat_path = 'path_to_context_feature'

In [None]:
# Load the train and test citation-context tables from `feat_path`.
df_train_context = pd.read_csv(os.path.join(feat_path, 'train_context_filled.csv'))
df_test_pub_gen_context = pd.read_csv(os.path.join(feat_path, 'test_pub_context_gen_title.csv'))

In [None]:
# `ref_title` of every row whose `ref_pid` is missing, for train and test.
train_null_title = df_train_context.loc[df_train_context['ref_pid'].isna(), 'ref_title']
test_pub_gen_null_title = df_test_pub_gen_context.loc[
    df_test_pub_gen_context['ref_pid'].isna(), 'ref_title'
]

In [None]:
# Match the reference titles that have no `ref_pid` against the lower-cased
# titles of every OAG publication shard (`v3.1_oag_publication_<j>_0<i>.gz`).
#
# Results:
#   df_oag_title_train        -- matches for the train titles
#   df_oag_title_test_pub_gen -- matches for the test titles
# Both carry `ref_title` plus the columns of the OAG shard (including `id`, `title`).
match_columns = ['ref_title', 'id', 'title']

# De-duplicate the lookup titles once instead of on every shard.
train_titles = train_null_title.drop_duplicates()
test_titles = test_pub_gen_null_title.drop_duplicates()

# Collect the per-shard matches and concatenate once at the end; growing a
# DataFrame with `concat` inside the loop copies all previous rows each time.
train_matches = []
test_matches = []

for j in range(1, 14):
    for i in range(10):
        print(j, i)
        shard_name = "v3.1_oag_publication_" + str(j) + "_0" + str(i)
        out_file = shard_name + ".gz"
        start_time = time.time()

        # Not every (j, i) combination exists on disk. Only a missing file is
        # skipped; any other error (bad column, typo, ...) must surface rather
        # than be reported as "Not exist".
        try:
            df_oag = pd.read_csv(os.path.join(oag_path, out_file))
        except FileNotFoundError:
            print('Not exist', out_file)
            continue

        df_oag['title'] = df_oag['title'].str.lower()
        print('Finish loading', time.time() - start_time)

        # The train matches must come from the train titles.
        train_matches.append(
            pd.merge(train_titles, df_oag, left_on='ref_title', right_on='title')
        )
        test_matches.append(
            pd.merge(test_titles, df_oag, left_on='ref_title', right_on='title')
        )

        print('Finish extracting', time.time() - start_time)

# Fall back to an empty frame with the expected columns when nothing was read.
df_oag_title_train = (
    pd.concat(train_matches) if train_matches else pd.DataFrame([], columns=match_columns)
)
df_oag_title_test_pub_gen = (
    pd.concat(test_matches) if test_matches else pd.DataFrame([], columns=match_columns)
)


In [None]:
# Unique (id, title) pairs of every OAG publication matched for train or test.
id_title_frames = [
    frame[['id', 'title']]
    for frame in (df_oag_title_train, df_oag_title_test_pub_gen)
]
df_oag_id_title = pd.concat(id_title_frames).drop_duplicates()
df_oag_id_title

In [None]:
# Pull the full OAG record (one JSON object per line) for every publication id
# collected in `df_oag_id_title`.
#
# Result: `df_OAG_extracted`, with the columns of `oag_columns` first (in that
# order), followed by any additional key found in the matched records.
oag_columns = ['id', 'title', 'abstract', 'keywords', 'year', 'authors',
               'references', 'doi', 'venue_id', 'n_citation', 'venue']

# Set membership keeps the per-line check cheap.
wanted_ids = set(df_oag_id_title['id'])
matched_records = []

for j in range(1, 14):
    for i in range(10):
        print(j, i)
        shard_name = "v3.1_oag_publication_" + str(j) + "_0" + str(i)
        src_file = shard_name + ".json"
        start_time = time.time()

        # Not every (j, i) shard exists. Only a missing file is skipped; other
        # errors (malformed JSON, ...) are left to surface.
        try:
            # Stream the shard and keep only the wanted records, so the whole
            # file is never held in memory; `with` closes the handle.
            with open(os.path.join(oag_path, src_file), 'r', encoding='utf-8') as shard:
                for line in shard:
                    if not line.strip():
                        continue  # tolerate blank lines
                    record = json.loads(line)
                    if record.get('id') in wanted_ids:
                        matched_records.append(record)
        except FileNotFoundError:
            print('Not exist', src_file)
            continue

        print('Finish extracting', time.time() - start_time)

# Build the frame once. `reindex` guarantees the expected columns exist and
# come first even when no record (or no record with a given key) was found.
df_OAG_extracted = pd.DataFrame(matched_records)
extra_columns = [column for column in df_OAG_extracted.columns if column not in oag_columns]
df_OAG_extracted = (
    df_OAG_extracted
    .reindex(columns=oag_columns + extra_columns)
    .reset_index(drop=True)
)


In [None]:
# Lower-case the extracted OAG titles.
df_OAG_extracted['title'] = df_OAG_extracted['title'].str.lower()

In [None]:
# Display the extracted OAG records.
df_OAG_extracted

In [None]:
# df_OAG_extracted.to_csv('df_OAG_extracted_test.csv',index=False)

In [None]:
# Order the rows by title, then by descending citation count, then by
# descending year.
df_OAG_sort = df_OAG_extracted.sort_values(
    by=['title', 'n_citation', 'year'],
    ascending=[True, False, False],
)

In [None]:
# Keep one row per title: `drop_duplicates` retains the first occurrence by
# default, and `ignore_index=True` renumbers the result from 0.
df_OAG_sort = df_OAG_sort.drop_duplicates(subset='title', ignore_index=True)
df_OAG_sort

In [None]:
# Rename the OAG fields that are copied into the context tables to their
# `ref_*` names. Renaming by name (instead of assigning a positional list to
# `.columns`) cannot mislabel a column if the order differs, does not fail when
# an extra column is present, and is a no-op when the cell is re-run.
df_OAG_sort = df_OAG_sort.rename(columns={
    'id': 'ref_pid',
    'title': 'ref_title',
    'abstract': 'ref_abstract',
    'keywords': 'ref_keywords',
    'year': 'ref_year',
    'doi': 'ref_doi',
    'venue': 'ref_venue',
})

In [None]:
# Fill train rows that have no `ref_pid` with the OAG record whose title equals
# the row's `ref_title`. Rows that already have a `ref_pid`, or whose title has
# no OAG match, are copied unchanged.
#
# Result: `train_context_filled`, a list of row-value lists in the column order
# of `df_train_context`.

# Fields taken from the matched OAG record.
oag_fill_columns = ['ref_pid', 'ref_abstract', 'ref_keywords', 'ref_year', 'ref_venue', 'ref_doi']

# title -> {field: value}, built once so each row is a dict lookup instead of a
# scan over the whole OAG table. The first row per title wins, which is the row
# `.values[0]` would pick from a title filter.
oag_by_title = (
    df_OAG_sort
    .dropna(subset=['ref_title'])
    .drop_duplicates(subset='ref_title')
    .set_index('ref_title')[oag_fill_columns]
    .to_dict('index')
)

# Position of every filled field inside a row, looked up by name so the code
# does not depend on a hand-maintained column order.
fill_positions = {
    column: df_train_context.columns.get_loc(column) for column in oag_fill_columns
}

train_context_filled = []

for i, row in tqdm(df_train_context.iterrows(), total=df_train_context.shape[0]):
    match = oag_by_title.get(row['ref_title']) if pd.isna(row['ref_pid']) else None
    values = list(row.values)

    if match is not None:
        for column, position in fill_positions.items():
            value = match[column]
            if column == 'ref_year':
                # The OAG record may lack a year; keep the row's own value
                # rather than crash on int(NaN).
                if pd.isna(value):
                    continue
                value = int(value)
            values[position] = value

    train_context_filled.append(values)


In [None]:
# The rows were built from `df_train_context`, so label them with its columns.
# (`df_test_pub_context` is never defined in this notebook and raised NameError.)
df_train_context_filled = pd.DataFrame(train_context_filled, columns=df_train_context.columns)

In [None]:
# Preview the first rows of the filled train table.
df_train_context_filled.head()

In [None]:
# Write the filled train table to the working directory (without the index).
df_train_context_filled.to_csv('train_context_filled.csv', index=False)

In [None]:
# Fill test rows that have no `ref_pid` with the OAG record whose title equals
# the row's `ref_title`. Rows that already have a `ref_pid`, or whose title has
# no OAG match, are copied unchanged.
#
# Result: `test_context_filled`, a list of row-value lists in the column order
# of `df_test_pub_gen_context`.

# Fields taken from the matched OAG record.
oag_fill_columns = ['ref_pid', 'ref_abstract', 'ref_keywords', 'ref_year', 'ref_venue', 'ref_doi']

# title -> {field: value}, built once so each row is a dict lookup instead of a
# scan over the whole OAG table. The first row per title wins, which is the row
# `.values[0]` would pick from a title filter.
oag_by_title = (
    df_OAG_sort
    .dropna(subset=['ref_title'])
    .drop_duplicates(subset='ref_title')
    .set_index('ref_title')[oag_fill_columns]
    .to_dict('index')
)

# Position of every filled field inside a row, looked up by name so the code
# does not depend on a hand-maintained column order.
fill_positions = {
    column: df_test_pub_gen_context.columns.get_loc(column) for column in oag_fill_columns
}

test_context_filled = []

for i, row in tqdm(df_test_pub_gen_context.iterrows(), total=df_test_pub_gen_context.shape[0]):
    match = oag_by_title.get(row['ref_title']) if pd.isna(row['ref_pid']) else None
    values = list(row.values)

    if match is not None:
        for column, position in fill_positions.items():
            value = match[column]
            if column == 'ref_year':
                # The OAG record may lack a year; keep the row's own value
                # rather than crash on int(NaN).
                if pd.isna(value):
                    continue
                value = int(value)
            values[position] = value

    test_context_filled.append(values)

In [None]:
# Turn the filled test rows back into a DataFrame.
# NOTE(review): the rows come from `df_test_pub_gen_context` but are labelled
# with the train columns -- this assumes both tables share the same column
# order; confirm.
df_test_context_filled = pd.DataFrame(test_context_filled, columns=df_train_context.columns)

In [None]:
# Write the filled test table to the working directory (without the index).
df_test_context_filled.to_csv('test_pub_gen_context_filled.csv', index=False)

In [None]:
# Reload the filled tables from the working directory.
# Note: this rebinds `df_train_context`, which previously held the table read
# from `feat_path`.
df_train_context = pd.read_csv('train_context_filled.csv')
df_test_context = pd.read_csv('test_pub_gen_context_filled.csv')

In [None]:
# Every non-null paper id that appears as `pid` or `ref_pid` in the train and
# test tables (unique within each column, not across them).
pid_list = np.concatenate([
    frame[column].dropna().unique()
    for frame in (df_train_context, df_test_context)
    for column in ('pid', 'ref_pid')
])

In [None]:
# Preview the first five collected ids.
pid_list[:5]

In [None]:
# Collect `id`, `title` and `n_citation` from the OAG shards (one JSON object
# per line) for every paper id in `pid_list`.
#
# Result: `df_OAG_extracted` with exactly the columns of `citation_columns`
# and at most one row per `id`.
citation_columns = ['id', 'title', 'n_citation']

# Set membership keeps the per-line check cheap.
wanted_pids = set(pid_list)
citation_records = []

for j in range(1, 14):
    for i in range(10):
        print(j, i)
        shard_name = "v3.1_oag_publication_" + str(j) + "_0" + str(i)
        src_file = shard_name + ".json"
        start_time = time.time()

        # Not every (j, i) shard exists. Only a missing file is skipped; other
        # errors (malformed JSON, ...) are left to surface.
        try:
            # Stream the shard and keep only the wanted records, so the whole
            # file is never held in memory; `with` closes the handle.
            with open(os.path.join(oag_path, src_file), 'r', encoding='utf-8') as shard:
                for line in shard:
                    if not line.strip():
                        continue  # tolerate blank lines
                    record = json.loads(line)
                    if record.get('id') in wanted_pids:
                        # `.get` leaves a missing field empty instead of
                        # discarding the record (or the whole shard).
                        citation_records.append(
                            {column: record.get(column) for column in citation_columns}
                        )
        except FileNotFoundError:
            print('Not exist', src_file)
            continue

        print('Finish extracting', time.time() - start_time)

# One row per id, so the left merges on `id` further down cannot multiply the
# rows of the context tables.
df_OAG_extracted = (
    pd.DataFrame(citation_records, columns=citation_columns)
    .drop_duplicates(subset='id')
)

In [None]:
# Attach the citation count of the citing paper (`pid`) to the train table.
df_train_context = df_train_context.merge(
    df_OAG_extracted[['id', 'n_citation']],
    how='left',
    left_on='pid',
    right_on='id',
)

In [None]:
# Attach the citation count of the referenced paper (`ref_pid`), drop the two
# helper `id` columns left by the merges, and give the counts their final names.
df_train_context = (
    df_train_context
    .merge(df_OAG_extracted[['id', 'n_citation']], how='left', left_on='ref_pid', right_on='id')
    .drop(columns=['id_x', 'id_y'])
    .rename(columns={'n_citation_x': 'n_citation', 'n_citation_y': 'ref_n_citation'})
)
df_train_context

In [None]:
# Attach the citation count of the citing paper (`pid`) to the test table.
df_test_context = df_test_context.merge(
    df_OAG_extracted[['id', 'n_citation']],
    how='left',
    left_on='pid',
    right_on='id',
)

In [None]:
# Attach the citation count of the referenced paper (`ref_pid`), drop the two
# helper `id` columns left by the merges, and give the counts their final names.
df_test_context = (
    df_test_context
    .merge(df_OAG_extracted[['id', 'n_citation']], how='left', left_on='ref_pid', right_on='id')
    .drop(columns=['id_x', 'id_y'])
    .rename(columns={'n_citation_x': 'n_citation', 'n_citation_y': 'ref_n_citation'})
)
df_test_context

In [None]:
# Shapes after the citation merges ...
print(df_train_context.shape)
print(df_test_context.shape)

# ... next to the shapes before them; the row counts should be unchanged.
# (`df_test_pub_context` is never defined in this notebook and raised
# NameError, so the filled train table is reported instead.)
print(df_train_context_filled.shape)
print(df_test_pub_gen_context.shape)

In [None]:
# Write the tables with citation counts to the working directory (without the index).
df_train_context.to_csv('train_context_filled_citation.csv',index=False)
df_test_context.to_csv('test_pub_gen_context_filled_citation.csv',index=False)