# Data process

In [1]:
import os
import subprocess
import gzip
import io
import json
import re
from tqdm import tqdm
import pandas as pd
import numpy as np

In [2]:
# Read meta data and pdf_parse data

meta_list = []

with open('metadata/metadata_0.jsonl', 'r') as json_file:
    json_list = list(json_file)

for json_str in json_list:
    result = json.loads(json_str)
    meta_list.append(result)
    
pdf_list = []
    
with open('pdf_parses/pdf_parses_0.jsonl', 'r') as json_file:
    json_list = list(json_file)

for json_str in json_list:
    result = json.loads(json_str)
    pdf_list.append(result)

In [3]:
# Use paper_id as index

df_meta = pd.DataFrame(meta_list)
df_meta = df_meta.set_index('paper_id')
df_meta.columns

Index(['title', 'authors', 'abstract', 'year', 'arxiv_id', 'acl_id', 'pmc_id',
       'pubmed_id', 'doi', 'venue', 'journal', 'has_pdf_body_text', 'mag_id',
       'mag_field_of_study', 'outbound_citations', 'inbound_citations',
       'has_outbound_citations', 'has_inbound_citations', 'has_pdf_parse',
       'has_pdf_parsed_abstract', 'has_pdf_parsed_body_text',
       'has_pdf_parsed_bib_entries', 'has_pdf_parsed_ref_entries', 's2_url'],
      dtype='object')

In [4]:
# Use paper_id as index

df_pdf = pd.DataFrame(pdf_list)
df_pdf = df_pdf.set_index('paper_id')
df_pdf.columns

Index(['_pdf_hash', 'abstract', 'body_text', 'bib_entries', 'ref_entries'], dtype='object')

In [5]:
# Find sentence-citataion pairs that have more than 10 words in the sentence

def find_pair(textl, bibl):
    pair_dict = {}
    for para in textl:
        cite_list = para['cite_spans']
        text = para['text']
        for cite in cite_list:
            start = int(cite['start'])
            end = int(cite['end'])
            # If cite_spans out of the sentence
            if text[start-2] in ['.', '!', '?']:
                i = start-3
                while i >= 0:
                    if text[i] in ['.', '!', '?']:
                        break
                    i = i - 1
                while i <= start-2:
                    if re.match("[A-Z]", text[i]) == None:
                        i = i + 1
                    else:
                        break
                sentence = text[i:start-1]
                sentence_len = len(sentence.split(' '))
                ref_id = cite['ref_id']
                paper_id = bibl[ref_id]['link']
                if paper_id != None and sentence_len >= 10:
                    pair_dict[paper_id] = sentence
                    
            # If cite_spans in the sentence
            if end + 1 < len(text):
                if text[end+1] in ['.', '!', '?']:
                    i = start-1
                    while i >= 0:
                        if text[i] in ['.', '!', '?']:
                            break
                        i = i - 1
                    while i <= start-2:
                        if re.match("[A-Z]", text[i]) == None:
                            i = i + 1
                        else:
                            break
                    sentence = text[i:end+2]
                    sentence_len = len(sentence.split(' '))
                    ref_id = cite['ref_id']
                    paper_id = bibl[ref_id]['link']
                    if paper_id != None and sentence_len >= 10:
                        pair_dict[paper_id] = sentence
    
    return pair_dict

In [19]:
# Collect title and abstract

df = pd.DataFrame(df_meta.loc[:, 'title'])
for i in df.index:
    if i in df_pdf.index:
        if df_pdf.at[i, 'abstract'] != [] and df_pdf.at[i, 'abstract'] != '':
            df.at[i, 'abstract'] = df_pdf.at[i, 'abstract'][0]['text']

df.replace('', np.nan, inplace=True)
df = df.dropna()

# Collect full text

for i in df.index:
    if i in df_pdf.index:
        full_text = ''
        for sec in df_pdf.loc[i,'body_text']:
            if sec['text'] != [] and sec['text'] != '':
                full_text += sec['text']
        df.at[i, 'full_text'] = df.loc[i, 'abstract'] + full_text
        
# Collect related work text
### WARNING: REPLACE 'introduction' with 'related_work'

for i in df.index:
    if i in df_pdf.index:
        related_work_text = ''
        for sec in df_pdf.loc[i,'body_text']:
            if sec['section'].lower() == 'related work' and sec['text'] != '' and sec['text'] != []:
                related_work_text += sec['text']
        df.at[i, 'related_work_text'] = related_work_text
        
# Remove papers that don't have related work sections  
df['related_work_text'].replace('', np.nan, inplace=True)
df = df.dropna()

# Collect sentence-citation pairs in related work text and body text
### WARNING: REPLACE 'introduction' with 'related_work'

for i in df.index:
    if i in df_pdf.index:
        full_textl = df_pdf.at[i, 'body_text']
        related_work_textl = []
        body_textl = []
        bibl = df_pdf.at[i, 'bib_entries']
        for t in full_textl:
            if t['section'].lower() == 'related work':
                related_work_textl.append(t)
            else:
                body_textl.append(t)
        related_work_pairs = find_pair(related_work_textl, bibl)
        body_pairs = find_pair(body_textl, bibl)
        df.at[i, 'related_work_pairs'] = i # without this line, there will be an error. But I'm not sure about the reason
        df.at[i, 'related_work_pairs'] = related_work_pairs
        
        df.at[i, 'body_pairs'] = i # without this line, there will be an error. But I'm not sure about the reason
        df.at[i, 'body_pairs'] = body_pairs

# Remove papers that don't have citations in related work sections and body texts        
data_processed = df.loc[(df.body_pairs != {}) & (df.related_work_pairs != {})]

In [20]:
data_processed.head(3)

Unnamed: 0_level_0,title,abstract,full_text,related_work_text,related_work_pairs,body_pairs
paper_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
207957044,Maintainer Perspectives on Data-driven Transpo...,In this paper we present fieldwork findings fr...,In this paper we present fieldwork findings fr...,"Emerging from work involving energy advisors, ...","{'52080384': 'In particular, our contribution ...",{'54867745': 'Inefficient urban surface-water ...
8432521,Is Anyone Looking? Mitigating Shoulder Surfing...,"Displays are growing in size, and are increasi...","Displays are growing in size, and are increasi...",We are not the first to consider shoulder surf...,"{'7876575': 'As mentioned, this can be done by...","{'17032627': 'For example, if they glance at a..."
12148403,Business Process Management: Saving the Planet?,Organisational and government concerns about e...,Organisational and government concerns about e...,A systematic literature review is essential to...,{'10686726': 'A systematic literature review i...,"{'109322554': 'Initially, sustainability was d..."


In [8]:
import nltk.data

for i in data_processed.index:
    for col in ['abstract', 'full_text', 'related_work_text']:
        data_processed.at[i, col] = nltk.tokenize.sent_tokenize(data_processed.loc[i, col])
data_processed.head(3)

Unnamed: 0_level_0,title,abstract,full_text,related_work_text,related_work_pairs,body_pairs
paper_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
55856311,Applications of Mathematical Modeling for Sens...,[ABSTRACT],[ABSTRACTThe supply chain sustainability and s...,[The supply chain sustainability and strategic...,{'119524': 'It is also recognized the necessit...,{'154208356': 'When multiple suppliers are ava...
167930514,To pay more or use less: China’s urban water p...,[Abstract: China is the most populous country ...,[Abstract: China is the most populous country ...,[The sustainable use of China's limited water ...,"{'140535492': 'China's water crisis, built up ...",{'140535492': 'When exploring water shortages ...
24698940,Can We Trust Social Media Data?: Social Networ...,[The size of a social media account's audience...,[The size of a social media account's audience...,[Online social networks (OSN) are primary outl...,"{'168073699': 'To increase brand recognition, ...","{'8956432': 'Although not insurmountable, this..."


In [22]:
for p in df_pdf.loc[:,'body_text']:
    for sec in p:
        if sec['section'].lower() == 'related work':
            print(sec)

{'section': 'RELATED WORK', 'text': 'Emerging from work involving energy advisors, data work [12] a nascent area of CSCW research that attempts to broaden the understanding of "social practices in and through which IoT data is accountably collected, used and acted upon" [11] provides a suitable lens in which to focus our research. Therefore, we align with Fischer et al\'s motivations in attempting to address the "arguably underdeveloped" [11] perspective of data work but situated within the highly cooperative work of drainage surface-water management. Although, here we attempt to apply a much broader investigative lens, that encompasses engagements with a diverse range of actors across two sectors. In particular, our contribution stands apart, both in terms of the work context, where prior CSCW research, with the exception of Liu et al. [39] , appears to have bypassed cooperative work concerns of drainage maintenance, and approach, where we attempt to understand the present and future 