In [1]:
import os
import subprocess
import gzip
import io
import json
import re
from tqdm import tqdm
import pandas as pd
import numpy as np

In [2]:
def _read_jsonl(path):
    """Parse a JSON Lines file into a list of decoded objects.

    Parameters
    ----------
    path : str
        Location of a file holding one JSON document per line.

    Returns
    -------
    list
        One decoded object per non-blank line, in file order.
    """
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(path, 'r', encoding='utf-8') as jsonl_file:
        # Skip blank lines (e.g. a trailing newline) instead of letting
        # json.loads fail on an empty string.
        return [json.loads(line) for line in jsonl_file if line.strip()]


# Paper metadata records and the corresponding PDF parse records.
meta_list = _read_jsonl('metadata/sample.jsonl')
pdf_list = _read_jsonl('pdf_parses/sample.jsonl')

In [3]:
# Metadata records as a frame keyed by paper_id; the last expression lists
# the available fields.
df_meta = pd.DataFrame(meta_list).set_index('paper_id')
df_meta.columns

Index(['title', 'authors', 'abstract', 'year', 'arxiv_id', 'acl_id', 'pmc_id',
       'pubmed_id', 'doi', 'venue', 'journal', 'mag_id', 'mag_field_of_study',
       'outbound_citations', 'inbound_citations', 'has_outbound_citations',
       'has_inbound_citations', 'has_pdf_parse', 's2_url', 'has_pdf_body_text',
       'has_pdf_parsed_abstract', 'has_pdf_parsed_body_text',
       'has_pdf_parsed_bib_entries', 'has_pdf_parsed_ref_entries'],
      dtype='object')

In [4]:
# PDF parse records as a frame keyed by paper_id; the last expression lists
# the available fields.
df_pdf = pd.DataFrame(pdf_list).set_index('paper_id')
df_pdf.columns

Index(['_pdf_hash', 'abstract', 'body_text', 'bib_entries', 'ref_entries'], dtype='object')

In [62]:
# Section heading whose paragraphs are collected as "related work".
# WARNING: this is still the placeholder 'introduction'; replace it with
# 'related_work' once the real section name should be used.
RELATED_WORK_SECTION = 'introduction'


def _join_body_text(paragraphs, section=None):
    """Concatenate the text of body paragraphs.

    Parameters
    ----------
    paragraphs : list of dict
        Entries of a paper's 'body_text'; each has 'text' and 'section' keys.
    section : str, optional
        Lower-case section heading. When given, only paragraphs whose heading
        equals it (case-insensitively) are kept.

    Returns
    -------
    str
        The kept paragraph texts joined without a separator ('' if none).
    """
    parts = []
    for para in paragraphs:
        text = para['text']
        if not text:
            continue
        # Treat a missing (None) heading as "no section" instead of raising.
        if section is not None and (para['section'] or '').lower() != section:
            continue
        parts.append(text)
    return ''.join(parts)


# Collect title and abstract

df = df_meta[['title']].copy()

abstracts = {}
for i in df.index:
    if i in df_pdf.index:
        parsed_abstract = df_pdf.at[i, 'abstract']
        # Only a non-empty list of paragraphs has a first paragraph to take.
        if isinstance(parsed_abstract, list) and parsed_abstract:
            abstracts[i] = parsed_abstract[0]['text']

# One aligned assignment; papers without a parsed abstract get NaN.
df['abstract'] = pd.Series(abstracts, dtype=object)

# Drop papers with an empty or missing title or abstract.
df = df.replace('', np.nan).dropna()

# Collect full text
# Every remaining row got its abstract from df_pdf, so the lookups are safe.

df['full_text'] = [
    abstract + _join_body_text(df_pdf.at[i, 'body_text'])
    for i, abstract in df['abstract'].items()
]

# Collect related work text

df['related_work_text'] = [
    _join_body_text(df_pdf.at[i, 'body_text'], section=RELATED_WORK_SECTION)
    for i in df.index
]

# Remove papers that don't have related work sections.
# Assign the result back: an in-place replace on a selected column does not
# reliably update the parent frame.
df['related_work_text'] = df['related_work_text'].replace('', np.nan)
df = df.dropna()
df.head(3)

Unnamed: 0_level_0,title,abstract,full_text,related_work_text
paper_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
94551546,Gd(III) ion-chelated supramolecular assemblies...,Ethanolamine (EA) or ethylenediamine (ED)-func...,Ethanolamine (EA) or ethylenediamine (ED)-func...,Gene therapy holds potential for treating many...
18981358,Rubber hand illusion induced by touching the f...,Background: Studies in animals and humans indi...,Background: Studies in animals and humans indi...,Spinal cord injuries (SCI) cause an irreversib...
18982127,GRAPHS WITH 4-STEINER CONVEX BALLS,Abstract. Recently a new graph convexity was i...,Abstract. Recently a new graph convexity was i...,The study of abstract convexity began in the e...


In [91]:
# Find sentence-citation pairs whose sentence has at least 10 words

_SENTENCE_END = ('.', '!', '?')


def _sentence_start(text, scan_from, cite_start):
    """Return the index where the sentence around a citation begins.

    Scans backward from ``scan_from`` to the previous sentence terminator
    (or the start of ``text``), then forward to the first ASCII capital
    letter, never going past ``cite_start - 1``.
    """
    i = scan_from
    while i >= 0 and text[i] not in _SENTENCE_END:
        i -= 1
    # No earlier terminator: the sentence starts at the beginning of the text.
    # Clamping keeps the index from wrapping around to the end of the string.
    i = max(i, 0)
    while i <= cite_start - 2 and not 'A' <= text[i] <= 'Z':
        i += 1
    return i


def find_pair(textl, bibl, min_words=10):
    """Map cited paper ids to the sentence that cites them.

    Parameters
    ----------
    textl : list of dict
        Paragraphs, each with a 'text' string and a 'cite_spans' list whose
        entries carry 'start', 'end' and 'ref_id'.
    bibl : dict
        Bibliography entries keyed by ref_id; each entry's 'link' is the
        cited paper's id, or None when it is unknown.
    min_words : int, optional
        Minimum number of space-separated tokens a sentence needs to be kept
        (default 10).

    Returns
    -------
    dict
        ``{paper_id: sentence}``. When several sentences cite the same paper,
        the last one found wins.
    """
    pair_dict = {}
    for para in textl:
        text = para['text']
        for cite in para['cite_spans']:
            start = int(cite['start'])
            end = int(cite['end'])
            # Ignore spans that do not lie inside the paragraph text.
            if not 0 <= start <= end <= len(text):
                continue

            # Citations that do not resolve to a linked paper are skipped
            # (this includes a None or unknown ref_id).
            entry = bibl.get(cite['ref_id'])
            paper_id = entry.get('link') if entry else None
            if paper_id is None:
                continue

            candidates = []

            # Citation placed after the sentence: "<sentence>. [5]"
            if start >= 2 and text[start - 2] in _SENTENCE_END:
                begin = _sentence_start(text, start - 3, start)
                candidates.append(text[begin:start - 1])

            # Citation inside the sentence: "<sentence> [5] ."
            if end + 1 < len(text) and text[end + 1] in _SENTENCE_END:
                begin = _sentence_start(text, start - 1, start)
                candidates.append(text[begin:end + 2])

            for sentence in candidates:
                if len(sentence.split(' ')) >= min_words:
                    pair_dict[paper_id] = sentence

    return pair_dict

In [92]:
# Smoke-test find_pair on the parsed body and bibliography of one paper
test_text, test_bib = (
    df_pdf.at['18982127', column] for column in ('body_text', 'bib_entries')
)
find_pair(test_text, test_bib)

{'6739367': 'For a more extensive study on this topic see [5] .',
 '10259494': 'Such characterizations for the g-and m-convexity are given in [8] , for the g 3 -convexity in [12] and for the m-convexity in [4] .',
 '12220557': 'It is not hard to see that graphs in which N [S] is convex for every convex set S ⊆ V are exactly the graphs in which N j [S] is convex for every convex set S ⊆ V and every j ≥ 1 in terms of any convexity in graphs, see [6] .'}

In [93]:
# Collect sentence-citation pairs in full text
# Compute each paper's pair dict first, then attach the column with a single
# index-aligned assignment. Papers without a PDF parse are left as NaN.
pairs_by_paper = {
    i: find_pair(df_pdf.at[i, 'body_text'], df_pdf.at[i, 'bib_entries'])
    for i in df.index
    if i in df_pdf.index
}
# Pass the dicts as a list of values so that each one is stored as a single
# object-dtype cell.
df['full_text_pairs'] = pd.Series(
    list(pairs_by_paper.values()),
    index=list(pairs_by_paper.keys()),
    dtype=object,
)

In [94]:
# Display the frame, including the full_text_pairs column
df

Unnamed: 0_level_0,title,abstract,full_text,related_work_text,full_text_pairs
paper_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
94551546,Gd(III) ion-chelated supramolecular assemblies...,Ethanolamine (EA) or ethylenediamine (ED)-func...,Ethanolamine (EA) or ethylenediamine (ED)-func...,Gene therapy holds potential for treating many...,{'4683185': 'The application of traditional vi...
18981358,Rubber hand illusion induced by touching the f...,Background: Studies in animals and humans indi...,Background: Studies in animals and humans indi...,Spinal cord injuries (SCI) cause an irreversib...,{'18136308': 'There is a clear difference betw...
18982127,GRAPHS WITH 4-STEINER CONVEX BALLS,Abstract. Recently a new graph convexity was i...,Abstract. Recently a new graph convexity was i...,The study of abstract convexity began in the e...,{'6739367': 'For a more extensive study on thi...
18982460,A novel automatic regulatory device for contin...,Background: Benign prostatic hyperplasia is a ...,Background: Benign prostatic hyperplasia is a ...,Benign prostatic hyperplasia (BPH) is a common...,{'34028337': 'Benign prostatic hyperplasia (BP...
18982504,ANALYTICAL MODELING AND ANALYSIS OF THROUGH SI...,Abstract-This paper gives a comprehensive stud...,Abstract-This paper gives a comprehensive stud...,Three-dimensional integrated circuits (3D ICs)...,{'5786126': 'The advantages gained by vertical...
18982781,The Del1 deposition domain can immobilize 3α-h...,Abstract Developing methods that result in tar...,Abstract Developing methods that result in tar...,Mouse Del1 is an ECM protein secreted by embry...,{'22689447': 'Administration of exogenous DHT ...
18983391,Investigation ofant colonyalgorithm in multipl...,The conventional approaches to routing,The conventional approaches to routing Traffic...,Traffic engineering is one of the active resea...,{'5173944': 'We study the routing and load bal...
18985891,Global Morbidity and Mortality of Leptospirosi...,Background,BackgroundLeptospirosis is a zoonotic bacteria...,Leptospirosis is a zoonotic bacterial disease ...,{'22675707': 'Our study provides a baseline es...
18986705,Duplicated rDNA sequences of variable lengths ...,We describe cloned segments of rDNA that conta...,We describe cloned segments of rDNA that conta...,The rDNA of Drosophila melanogaster can contai...,{}
18986901,HOW TO DEAL WITH THE ARROW OF TIME IN QUANTUM ...,Abstract The formalism of Quantum Mechanics is...,Abstract The formalism of Quantum Mechanics is...,The formalism of Quantum Mechanics (QM) is bas...,{'118117994': 'The formalism for the dho turns...
