# Data processing

In [1]:
import os
import subprocess
import gzip
import io
import json
import re
from tqdm import tqdm
import pandas as pd
import numpy as np

In [2]:
# Read metadata and pdf_parse data (JSON Lines: one JSON object per line)

def _read_jsonl(path):
    """Parse a JSON Lines file into a list of decoded records.

    Args:
        path: Location of a text file holding one JSON document per line.

    Returns:
        list: The decoded records, in file order. Blank lines are skipped.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    # Explicit UTF-8 so the result does not depend on the platform's default
    # encoding; the file is streamed line by line rather than read twice.
    with open(path, 'r', encoding='utf-8') as json_file:
        for line in json_file:
            # A trailing or stray empty line would make json.loads raise.
            if line.strip():
                records.append(json.loads(line))
    return records


meta_list = _read_jsonl('metadata/sample.jsonl')
pdf_list = _read_jsonl('pdf_parses/sample.jsonl')

In [3]:
# Metadata table, indexed by paper_id; the last line displays its columns

df_meta = pd.DataFrame(meta_list).set_index('paper_id')
df_meta.columns

Index(['title', 'authors', 'abstract', 'year', 'arxiv_id', 'acl_id', 'pmc_id',
       'pubmed_id', 'doi', 'venue', 'journal', 'mag_id', 'mag_field_of_study',
       'outbound_citations', 'inbound_citations', 'has_outbound_citations',
       'has_inbound_citations', 'has_pdf_parse', 's2_url', 'has_pdf_body_text',
       'has_pdf_parsed_abstract', 'has_pdf_parsed_body_text',
       'has_pdf_parsed_bib_entries', 'has_pdf_parsed_ref_entries'],
      dtype='object')

In [4]:
# PDF-parse table, indexed by paper_id; the last line displays its columns

df_pdf = pd.DataFrame(pdf_list).set_index('paper_id')
df_pdf.columns

Index(['_pdf_hash', 'abstract', 'body_text', 'bib_entries', 'ref_entries'], dtype='object')

In [5]:
# Find sentence-citation pairs that have at least 10 words in the sentence

# Characters treated as the end of a sentence.
_SENTENCE_ENDINGS = ('.', '!', '?')


def _sentence_start(text, scan_from, last_candidate):
    """Locate the index where the sentence around a citation begins.

    Walks left from ``scan_from`` to the closest sentence-ending character
    (or to the beginning of ``text``), then walks right until an ASCII capital
    letter is found, never advancing past ``last_candidate + 1``.
    """
    i = scan_from
    while i >= 0 and text[i] not in _SENTENCE_ENDINGS:
        i -= 1
    # No earlier sentence end was found: start at 0. Leaving i at -1 would make
    # text[i] wrap around and inspect the last character of the paragraph.
    i = max(i, 0)
    while i <= last_candidate and not ('A' <= text[i] <= 'Z'):
        i += 1
    return i


def find_pair(textl, bibl, min_words=10):
    """Map cited paper ids to the sentence that cites them.

    Two layouts are recognised for each citation span:
      * the span follows a finished sentence (the character two positions
        before the span is '.', '!' or '?'); the sentence ends just before
        the span;
      * the span sits inside the sentence (the character one position after
        the span's end is '.', '!' or '?'); the sentence includes the span
        and that terminator.
    When both apply, the second one wins, as it is stored last.

    Args:
        textl: Iterable of paragraph dicts with a 'text' string and a
            'cite_spans' list; each span has 'start', 'end' and 'ref_id'.
        bibl: Dict mapping a span's 'ref_id' to a bibliography entry whose
            'link' field is the cited paper id (or None when unresolved).
        min_words: Minimum number of space-separated tokens a sentence must
            have to be kept. Defaults to 10.

    Returns:
        dict: ``{paper_id: sentence}``. A later sentence citing the same paper
        replaces an earlier one.
    """
    pair_dict = {}
    for para in textl:
        text = para['text']
        for cite in para.get('cite_spans') or []:
            start = int(cite['start'])
            end = int(cite['end'])
            if start < 0 or end < start:
                # Malformed span; negative offsets would index from the end.
                continue

            # Spans whose reference is missing or unresolved cannot be paired.
            entry = bibl.get(cite.get('ref_id')) or {}
            paper_id = entry.get('link')
            if paper_id is None:
                continue

            candidates = []

            # If cite_spans out of the sentence
            # (start >= 2 keeps text[start - 2] from wrapping around.)
            if 2 <= start <= len(text) + 1 and text[start - 2] in _SENTENCE_ENDINGS:
                begin = _sentence_start(text, start - 3, start - 2)
                candidates.append(text[begin:start - 1])

            # If cite_spans in the sentence
            if end + 1 < len(text) and text[end + 1] in _SENTENCE_ENDINGS:
                begin = _sentence_start(text, start - 1, start - 2)
                candidates.append(text[begin:end + 2])

            for sentence in candidates:
                if len(sentence.split(' ')) >= min_words:
                    pair_dict[paper_id] = sentence

    return pair_dict

In [6]:
# Build the processed table: title, abstract, full text, related-work text
# and the sentence-citation pairs of the related-work section and of the rest.

# Lower-cased section heading whose paragraphs are treated as related work.
### WARNING: REPLACE 'introduction' with 'related_work'
RELATED_WORK_SECTION = 'introduction'


def _is_related_work(para):
    """Return True if the paragraph's section heading equals RELATED_WORK_SECTION, ignoring case."""
    return (para.get('section') or '').lower() == RELATED_WORK_SECTION


def _join_text(paragraphs):
    """Concatenate the non-empty 'text' fields of the given paragraphs."""
    return ''.join(para['text'] for para in paragraphs if para['text'])


# Collect title and abstract

df = pd.DataFrame(df_meta.loc[:, 'title'])
abstract_by_id = {
    paper_id: paragraphs[0]['text']
    for paper_id, paragraphs in df_pdf['abstract'].items()
    if isinstance(paragraphs, list) and paragraphs  # skip empty parsed abstracts
}
# Index alignment leaves NaN for papers that have no parsed abstract.
df['abstract'] = pd.Series(abstract_by_id, dtype=object)

# Drop papers whose title or abstract is empty or missing.
df = df.replace('', np.nan).dropna()

# Every remaining paper has a PDF parse, because its abstract came from df_pdf.
body_by_id = {paper_id: df_pdf.at[paper_id, 'body_text'] for paper_id in df.index}

# Collect full text

df['full_text'] = [
    abstract + _join_text(body_by_id[paper_id])
    for paper_id, abstract in df['abstract'].items()
]

# Collect related work text

df['related_work_text'] = [
    _join_text(para for para in body_by_id[paper_id] if _is_related_work(para))
    for paper_id in df.index
]

# Remove papers that don't have related work sections
# (plain filtering; an inplace replace on a column selection may act on a copy)
df = df.loc[df['related_work_text'] != ''].copy()

# Collect sentence-citation pairs in related work text and body text.
# The same case-insensitive heading test is used here and for the text above,
# so a paper kept for its related-work text also has its paragraphs split
# consistently.

related_work_pairs = []
body_pairs = []
for paper_id in df.index:
    bibl = df_pdf.at[paper_id, 'bib_entries']
    paragraphs = body_by_id[paper_id]
    related_work_textl = [para for para in paragraphs if _is_related_work(para)]
    body_textl = [para for para in paragraphs if not _is_related_work(para)]
    related_work_pairs.append(find_pair(related_work_textl, bibl))
    body_pairs.append(find_pair(body_textl, bibl))

# Create the dict-valued columns in one step as object Series, so no
# placeholder value has to be written before each dict is stored.
df['related_work_pairs'] = pd.Series(related_work_pairs, index=df.index, dtype=object)
df['body_pairs'] = pd.Series(body_pairs, index=df.index, dtype=object)

# Remove papers that don't have citations in related work sections and body texts
has_related_work_pairs = df['related_work_pairs'].map(bool).astype(bool)
has_body_pairs = df['body_pairs'].map(bool).astype(bool)
data_processed = df.loc[has_related_work_pairs & has_body_pairs]

In [7]:
# Preview the first three processed papers
data_processed.iloc[:3]

Unnamed: 0_level_0,title,abstract,full_text,related_work_text,related_work_pairs,body_pairs
paper_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
94551546,Gd(III) ion-chelated supramolecular assemblies...,Ethanolamine (EA) or ethylenediamine (ED)-func...,Ethanolamine (EA) or ethylenediamine (ED)-func...,Gene therapy holds potential for treating many...,{'4683185': 'The application of traditional vi...,{'206804279': 'The resultant Ad-PGMA polymers ...
18981358,Rubber hand illusion induced by touching the f...,Background: Studies in animals and humans indi...,Background: Studies in animals and humans indi...,Spinal cord injuries (SCI) cause an irreversib...,"{'18136308': 'However, proprioceptive drift wa...",{'36808282': 'The AIS score (ranging from A to...
18982504,ANALYTICAL MODELING AND ANALYSIS OF THROUGH SI...,Abstract-This paper gives a comprehensive stud...,Abstract-This paper gives a comprehensive stud...,Three-dimensional integrated circuits (3D ICs)...,{'5786126': 'The advantages gained by vertical...,"{'14713728': 'Accordingly, an analytical formu..."
