In [1]:
import os
import subprocess
import gzip
import io
import json
import re
from tqdm import tqdm
import pandas as pd
import numpy as np

import json

In [4]:
meta_list = []

with open('metadata/sample.jsonl', 'r') as json_file:
    json_list = list(json_file)

for json_str in json_list:
    result = json.loads(json_str)
    meta_list.append(result)
    
pdf_list = []
    
with open('pdf_parses/sample.jsonl', 'r') as json_file:
    json_list = list(json_file)

for json_str in json_list:
    result = json.loads(json_str)
    pdf_list.append(result)

In [5]:
df_meta = pd.DataFrame(meta_list)
df_meta = df_meta.set_index('paper_id')
df_meta.head(3)
df_meta.columns

Index(['title', 'authors', 'abstract', 'year', 'arxiv_id', 'acl_id', 'pmc_id',
       'pubmed_id', 'doi', 'venue', 'journal', 'mag_id', 'mag_field_of_study',
       'outbound_citations', 'inbound_citations', 'has_outbound_citations',
       'has_inbound_citations', 'has_pdf_parse', 's2_url', 'has_pdf_body_text',
       'has_pdf_parsed_abstract', 'has_pdf_parsed_body_text',
       'has_pdf_parsed_bib_entries', 'has_pdf_parsed_ref_entries'],
      dtype='object')

In [6]:
df_pdf = pd.DataFrame(pdf_list)
df_pdf.head(3)
df_pdf = df_pdf.set_index('paper_id')
df_pdf.columns

Index(['_pdf_hash', 'abstract', 'body_text', 'bib_entries', 'ref_entries'], dtype='object')

In [7]:
# Collect title and abstract

df = pd.DataFrame(df_meta.loc[:, 'title'])
for i in df.index:
    if i in df_pdf.index:
        if df_pdf.at[i, 'abstract'] != [] and df_pdf.at[i, 'abstract'] != '':
            df.at[i, 'abstract'] = df_pdf.at[i, 'abstract'][0]['text']

df.replace('', np.nan, inplace=True)
df = df.dropna()
df

Unnamed: 0_level_0,title,abstract
paper_id,Unnamed: 1_level_1,Unnamed: 2_level_1
77499681,Effects of Teriparatide Administration on Frac...,The purpose of this study is to evaluate the e...
94551546,Gd(III) ion-chelated supramolecular assemblies...,Ethanolamine (EA) or ethylenediamine (ED)-func...
159355456,An update on model Ayush wellness clinic at pr...,The Government of India has presented an expan...
18980380,Distributionally Robust Counterpart in Markov ...,This technical note studies Markov decision pr...
18980463,Adult and larval photoreceptors use different ...,Although development of the adult Drosophila c...
...,...,...
118866026,The semi-constrained NMSSM in light of muon g-...,Abstract: The semi-constrained NMSSM (scNMSSM)...
118869358,Approximate quantum state sharing via two priv...,We investigate the approximate quantum state s...
84882469,The Effects of Different Hormones and their do...,"In this research, three different hormones and..."
84887049,Fifty years of devotion to spiders: a concise ...,Th is paper provides an overview of the life a...


In [8]:
# Collect full text

for i in df.index:
    if i in df_pdf.index:
        full_text = ''
        for sec in df_pdf.loc[i,'body_text']:
            if sec['text'] != [] and sec['text'] != '':
                full_text += sec['text']
        df.at[i, 'full_text'] = df.loc[i, 'abstract'] + full_text

In [9]:
# Collect related work text
### WARNING: REPLACE 'introduction' with 'related_work'

for i in df.index:
    if i in df_pdf.index:
        related_work_text = ''
        for sec in df_pdf.loc[i,'body_text']:
            if sec['section'].lower() == 'introduction' and sec['text'] != '' and sec['text'] != []:
                related_work_text += sec['text']
        df.at[i, 'related_work_text'] = related_work_text

In [10]:
df['related_work_text'].replace('', np.nan, inplace=True)
df = df.dropna()
df

Unnamed: 0_level_0,title,abstract,full_text,related_work_text
paper_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
94551546,Gd(III) ion-chelated supramolecular assemblies...,Ethanolamine (EA) or ethylenediamine (ED)-func...,Ethanolamine (EA) or ethylenediamine (ED)-func...,Gene therapy holds potential for treating many...
18981358,Rubber hand illusion induced by touching the f...,Background: Studies in animals and humans indi...,Background: Studies in animals and humans indi...,Spinal cord injuries (SCI) cause an irreversib...
18982127,GRAPHS WITH 4-STEINER CONVEX BALLS,Abstract. Recently a new graph convexity was i...,Abstract. Recently a new graph convexity was i...,The study of abstract convexity began in the e...
18982460,A novel automatic regulatory device for contin...,Background: Benign prostatic hyperplasia is a ...,Background: Benign prostatic hyperplasia is a ...,Benign prostatic hyperplasia (BPH) is a common...
18982504,ANALYTICAL MODELING AND ANALYSIS OF THROUGH SI...,Abstract-This paper gives a comprehensive stud...,Abstract-This paper gives a comprehensive stud...,Three-dimensional integrated circuits (3D ICs)...
18982781,The Del1 deposition domain can immobilize 3α-h...,Abstract Developing methods that result in tar...,Abstract Developing methods that result in tar...,Mouse Del1 is an ECM protein secreted by embry...
18983391,Investigation ofant colonyalgorithm in multipl...,The conventional approaches to routing,The conventional approaches to routing Traffic...,Traffic engineering is one of the active resea...
18985891,Global Morbidity and Mortality of Leptospirosi...,Background,BackgroundLeptospirosis is a zoonotic bacteria...,Leptospirosis is a zoonotic bacterial disease ...
18986705,Duplicated rDNA sequences of variable lengths ...,We describe cloned segments of rDNA that conta...,We describe cloned segments of rDNA that conta...,The rDNA of Drosophila melanogaster can contai...
18986901,HOW TO DEAL WITH THE ARROW OF TIME IN QUANTUM ...,Abstract The formalism of Quantum Mechanics is...,Abstract The formalism of Quantum Mechanics is...,The formalism of Quantum Mechanics (QM) is bas...


In [49]:
df_pdf.head(3)

Unnamed: 0_level_0,_pdf_hash,abstract,body_text,bib_entries,ref_entries
paper_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
77499681,11f281316fe4638843a83cf559ce4f60aade00f8,"[{'section': 'Abstract', 'text': 'The purpose ...","[{'section': '', 'text': 'Values are presented...",{'BIBREF0': {'title': 'Bone health and osteopo...,{'FIGREF0': {'text': '비스포스포네이트를 장기간 복용한 골다공증 환...
94550656,42b3e1bd9c4740192f22d8725d470218e86301c8,[],[],{'BIBREF0': {'title': 'Solving ratio-dependent...,{}
94551239,b355fc0f19e1945bcb585b0f696da8b01aa4578f,[],[],{'BIBREF2': {'title': 'Optical Near Field Reco...,{}


In [64]:
# Get citaion-sentence pair
test_text = df_pdf.at['94551546', 'body_text']
test_bib = df_pdf.at['94551546', 'bib_entries']
test_text

[{'section': 'INTRODUCTION',
  'text': 'Gene therapy holds potential for treating many severe diseases, such as cancer and genetic diseases. 1 Successful gene therapy depends on highefficiency gene delivery processes, in which the gene carriers have an essential role. The application of traditional viral vectors has been a challenge because of their toxicity, immunogenicity and low capability for scaling up. 2 There has long been a scientific demand for developing non-viral gene delivery systems that can overcome the drawbacks of viral vectors. 3 Non-viral gene delivery has been advanced by the rapid development of materials science and technology. Numerous novel gene delivery systems have been proposed based on functional cationic polymers, such as polyethylenimine (PEI), [4] [5] [6] [7] [8] [9] poly(2-(dimethylamino) ethyl methacrylate), 4,10 poly(L-lysine), 11 poly(aspartic acid) 12, 13 and polyamidoamine. 14 However, these non-viral gene carriers still have shortcomings, including 

In [84]:
test_text[0]['text'][0]

'G'

In [65]:
test_bib

{'BIBREF0': {'title': 'Cancer statistics',
  'authors': [{'first': 'R', 'middle': [], 'last': 'Siegel', 'suffix': ''},
   {'first': 'D', 'middle': [], 'last': 'Naishadham', 'suffix': ''},
   {'first': 'A', 'middle': [], 'last': 'Jemal', 'suffix': ''}],
  'year': 2012,
  'venue': 'CA Cancer J. Clin',
  'link': None},
 'BIBREF1': {'title': 'Enzyme-responsive cell-penetrating peptide conjugated mesoporous silica quantum dot nanocarriers for controlled release of nucleus-targeted drug molecules and real-time intracellular fluorescence imaging of tumor cells',
  'authors': [{'first': 'J', 'middle': [], 'last': 'Li', 'suffix': ''},
   {'first': 'F', 'middle': [], 'last': 'Liu', 'suffix': ''},
   {'first': 'Q', 'middle': [], 'last': 'Shao', 'suffix': ''},
   {'first': 'Y', 'middle': [], 'last': 'Min', 'suffix': ''},
   {'first': 'M', 'middle': [], 'last': 'Costa', 'suffix': ''},
   {'first': 'E', 'middle': ['K L'], 'last': 'Yeow', 'suffix': ''},
   {'first': 'B', 'middle': [], 'last': 'Xing',

In [113]:
pair_dict = {}
for para in test_text:
    cite_list = para['cite_spans']
    text = para['text']
    for cite in cite_list:
        start = int(cite['start'])
        if text[start-2] in ['.', '!', '?']:
            i = start-3
            while i >= 0:
                if text[i] in ['.', '!', '?']:
                    break
                i = i - 1
            while i >= 0:
                if re.match("[A-Z]", text[i]) == None:
                    i = i + 1
                else:
                    break
            sentence = text[i:start-1]
            ref_id = cite['ref_id']
            paper_id = test_bib[ref_id]['link']
            if paper_id != None and sentence != '':
                pair_dict[paper_id] = sentence

In [114]:
pair_dict

{'4683185': 'The application of traditional viral vectors has been a challenge because of their toxicity, immunogenicity and low capability for scaling up.',
 '82875295': 'There has long been a scientific demand for developing non-viral gene delivery systems that can overcome the drawbacks of viral vectors.',
 '40686720': 'Numerous novel gene delivery systems have been proposed based on functional cationic polymers, such as polyethylenimine (PEI), [4] [5] [6] [7] [8] [9] poly(2-(dimethylamino) ethyl methacrylate), 4,10 poly(L-lysine), 11 poly(aspartic acid) 12, 13 and polyamidoamine.',
 '20402408': 'The resultant ED-functionalized PGMA (PGED) was prepared using excess ED, as reported previously.',
 '82732655': 'Owing to the dynamically unable ability of supramolecular polymers, the application of supramolecular chemistry for gene delivery has been a hot research topic in the biomedical field.',
 '4805879': 'The construction of supramolecular polycations via host-guest interaction is a 