# Data processing

In [1]:
import os
import subprocess
import gzip
import io
import json
import re
from tqdm import tqdm
import pandas as pd
import numpy as np
import nltk.data

In [2]:
# Find sentence-citation pairs whose sentence has at least 10 words

_SENTENCE_ENDINGS = ('.', '!', '?')


def _sentence_start(text, scan_from, scan_limit):
    """Return the index at which the sentence tied to a citation begins.

    Scans backwards from ``scan_from`` to the previous sentence-ending
    punctuation mark (or the start of ``text``), then forwards to the first
    ASCII capital letter, stopping at ``scan_limit + 1`` if none is found.
    """
    i = scan_from
    while i >= 0 and text[i] not in _SENTENCE_ENDINGS:
        i -= 1
    # No earlier punctuation means the sentence opens the paragraph. Clamp to
    # 0 so the forward scan never reads text[-1] (the paragraph's LAST
    # character), which used to yield an empty sentence.
    i = max(i, 0)
    while i <= scan_limit and re.match("[A-Z]", text[i]) is None:
        i += 1
    return i


def find_pair(textl, bibl, min_words=10):
    """Map cited paper ids to the sentence that cites them.

    Parameters
    ----------
    textl : iterable of dict
        Paragraphs; each needs a ``'text'`` string and a ``'cite_spans'`` list
        whose items carry ``'start'``, ``'end'`` and ``'ref_id'``.
    bibl : dict
        Bibliography keyed by ``ref_id``; each entry's ``'link'`` is the id of
        the cited paper (``None`` when it could not be resolved).
    min_words : int, optional
        Minimum number of space-separated tokens a sentence must have to be
        kept. Defaults to 10.

    Returns
    -------
    dict
        ``{paper_id: sentence}``. When a paper is cited several times, the
        last qualifying sentence wins.

    Notes
    -----
    Two citation layouts are recognised:

    * the citation follows the sentence (``"... end. [1]"``): the character
      two positions before ``start`` is sentence-ending punctuation;
    * the citation is inside the sentence: ``text[end + 1]`` is
      sentence-ending punctuation.

    NOTE(review): the second check inspects ``text[end + 1]``, not
    ``text[end]``. If ``end`` is an exclusive offset this skips the character
    directly after the citation -- confirm against the dataset's span
    convention before changing it.

    Citations whose ``ref_id`` is missing from ``bibl`` or whose entry has no
    link are skipped instead of raising ``KeyError``.
    """
    pair_dict = {}
    for para in textl:
        text = para['text']
        for cite in para['cite_spans']:
            # Resolve the cited paper first: unlinked citations can never
            # contribute a pair, so there is no point scanning the text.
            entry = bibl.get(cite.get('ref_id'))
            paper_id = entry.get('link') if entry else None
            if paper_id is None:
                continue

            start = int(cite['start'])
            end = int(cite['end'])
            candidates = []

            # Citation placed after the sentence. The lower bound stops
            # start - 2 from wrapping around to the end of the paragraph.
            if 0 <= start - 2 < len(text) and text[start - 2] in _SENTENCE_ENDINGS:
                begin = _sentence_start(text, start - 3, start - 2)
                candidates.append(text[begin:start - 1])

            # Citation placed inside the sentence.
            if end + 1 < len(text) and text[end + 1] in _SENTENCE_ENDINGS:
                begin = _sentence_start(text, start - 1, start - 2)
                candidates.append(text[begin:end + 2])

            # Same precedence as before: the in-sentence match, if any,
            # overwrites the after-sentence match.
            for sentence in candidates:
                if len(sentence.split(' ')) >= min_words:
                    pair_dict[paper_id] = sentence

    return pair_dict

In [3]:
def data_preprocessor(df_meta, df_pdf):
    """Build one row per usable paper from a metadata shard and a pdf-parse shard.

    Parameters
    ----------
    df_meta : pandas.DataFrame
        Metadata indexed by paper id; only the ``'title'`` column is read.
    df_pdf : pandas.DataFrame
        Parsed PDFs indexed by paper id, with ``'abstract'``, ``'body_text'``
        and ``'bib_entries'`` columns. ``'abstract'`` and ``'body_text'`` are
        lists of paragraph dicts carrying ``'text'`` (body paragraphs also
        carry ``'section'``).

    Returns
    -------
    pandas.DataFrame
        Indexed like ``df_meta`` with the columns ``title``, ``abstract``,
        ``full_text``, ``related_work_text``, ``related_work_pairs``,
        ``body_pairs`` and a sentence-tokenised ``*_split`` version of the
        three text columns.

    A paper is kept only if it has a non-empty title, a non-empty abstract,
    a non-empty "related work" section, and at least one sentence-citation
    pair in both the related-work section and the rest of the body.

    Rows are collected as plain records and the frame is built once at the
    end. This avoids cell-by-cell enlargement of the frame (which needed
    placeholder assignments to store dicts), and the function no longer
    changes the global ``pd.options.mode.chained_assignment`` setting.
    """
    columns = [
        'title', 'abstract', 'full_text', 'related_work_text',
        'related_work_pairs', 'body_pairs',
        'abstract_split', 'full_text_split', 'related_work_text_split',
    ]
    kept_ids = []
    rows = []

    for paper_id, title in df_meta['title'].items():
        # Title must be present and non-empty.
        if not isinstance(title, str) or not title:
            continue
        if paper_id not in df_pdf.index:
            continue

        # The abstract is the text of the first abstract paragraph.
        abstract_paras = df_pdf.at[paper_id, 'abstract']
        if not isinstance(abstract_paras, list) or not abstract_paras:
            continue
        abstract = abstract_paras[0]['text']
        if not isinstance(abstract, str) or not abstract:
            continue

        # Split body paragraphs into the related-work section and the rest.
        sections = df_pdf.at[paper_id, 'body_text']
        related_secs = []
        body_secs = []
        for sec in sections:
            if (sec.get('section') or '').lower() == 'related work':
                related_secs.append(sec)
            else:
                body_secs.append(sec)

        related_work_text = ''.join(sec['text'] for sec in related_secs if sec['text'])
        if not related_work_text:
            # Paper has no related-work section.
            continue

        full_text = abstract + ''.join(sec['text'] for sec in sections if sec['text'])

        # Sentence-citation pairs for the related-work section and the body.
        bibl = df_pdf.at[paper_id, 'bib_entries']
        related_work_pairs = find_pair(related_secs, bibl)
        body_pairs = find_pair(body_secs, bibl)
        if not related_work_pairs or not body_pairs:
            continue

        kept_ids.append(paper_id)
        rows.append({
            'title': title,
            'abstract': abstract,
            'full_text': full_text,
            'related_work_text': related_work_text,
            'related_work_pairs': related_work_pairs,
            'body_pairs': body_pairs,
            'abstract_split': nltk.tokenize.sent_tokenize(abstract),
            'full_text_split': nltk.tokenize.sent_tokenize(full_text),
            'related_work_text_split': nltk.tokenize.sent_tokenize(related_work_text),
        })

    index = pd.Index(kept_ids, name=df_meta.index.name)
    return pd.DataFrame(rows, index=index, columns=columns)

In [4]:
meta_dir = '../dataset/SS/metadata/business/'
pdf_dir = '../dataset/SS/pdf_parses/business/'

# os.listdir returns entries in arbitrary order, so sort both listings before
# pairing them by position. This assumes the two directories use parallel
# shard names (same numbering after a fixed prefix) -- TODO confirm.
meta_files = sorted(os.listdir(meta_dir))
pdf_files = sorted(os.listdir(pdf_dir))

if len(meta_files) != len(pdf_files):
    raise ValueError(
        'metadata and pdf_parses directories hold a different number of files '
        '({} vs {}); shards cannot be paired'.format(len(meta_files), len(pdf_files))
    )


def _read_jsonl(path):
    """Parse a JSON-lines file into a list of records, skipping blank lines."""
    with open(path, 'r', encoding='utf-8') as json_file:
        return [json.loads(line) for line in json_file if line.strip()]


# Collect the per-shard results and concatenate once at the end; growing a
# DataFrame with DataFrame.append inside the loop is quadratic and that method
# was removed in pandas 2.0.
processed_frames = []

for i, (meta_name, pdf_name) in enumerate(zip(meta_files, pdf_files)):
    df_meta = pd.DataFrame(_read_jsonl(os.path.join(meta_dir, meta_name)))
    df_meta = df_meta.set_index('paper_id')

    df_pdf = pd.DataFrame(_read_jsonl(os.path.join(pdf_dir, pdf_name)))
    df_pdf = df_pdf.set_index('paper_id')

    data_processed = data_preprocessor(df_meta, df_pdf)
    processed_frames.append(data_processed)
    print(pdf_name, 'filtered')

# Combined result over all shards (empty frame when there are no shards).
df = pd.concat(processed_frames) if processed_frames else pd.DataFrame()

pdf_parses_0.jsonl filtered
pdf_parses_1.jsonl filtered
pdf_parses_10.jsonl filtered
pdf_parses_11.jsonl filtered
pdf_parses_12.jsonl filtered
pdf_parses_13.jsonl filtered
pdf_parses_14.jsonl filtered
pdf_parses_15.jsonl filtered
pdf_parses_16.jsonl filtered
pdf_parses_17.jsonl filtered
pdf_parses_18.jsonl filtered
pdf_parses_19.jsonl filtered
pdf_parses_2.jsonl filtered
pdf_parses_20.jsonl filtered
pdf_parses_21.jsonl filtered
pdf_parses_22.jsonl filtered
pdf_parses_23.jsonl filtered
pdf_parses_24.jsonl filtered
pdf_parses_25.jsonl filtered
pdf_parses_26.jsonl filtered
pdf_parses_27.jsonl filtered
pdf_parses_28.jsonl filtered
pdf_parses_29.jsonl filtered
pdf_parses_3.jsonl filtered
pdf_parses_30.jsonl filtered
pdf_parses_31.jsonl filtered
pdf_parses_32.jsonl filtered
pdf_parses_33.jsonl filtered
pdf_parses_34.jsonl filtered
pdf_parses_35.jsonl filtered
pdf_parses_36.jsonl filtered
pdf_parses_37.jsonl filtered
pdf_parses_38.jsonl filtered
pdf_parses_39.jsonl filtered
pdf_parses_4.jsonl

In [5]:
# Preview the first rows of the filtered DataFrame.
df.head()

Unnamed: 0_level_0,title,abstract,full_text,related_work_text,related_work_pairs,body_pairs,abstract_split,full_text_split,related_work_text_split
paper_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
207957044,Maintainer Perspectives on Data-driven Transpo...,In this paper we present fieldwork findings fr...,In this paper we present fieldwork findings fr...,"Emerging from work involving energy advisors, ...","{'52080384': 'In particular, our contribution ...",{'54867745': 'Inefficient urban surface-water ...,[In this paper we present fieldwork findings f...,[In this paper we present fieldwork findings f...,"[Emerging from work involving energy advisors,..."
8432521,Is Anyone Looking? Mitigating Shoulder Surfing...,"Displays are growing in size, and are increasi...","Displays are growing in size, and are increasi...",We are not the first to consider shoulder surf...,"{'7876575': 'As mentioned, this can be done by...","{'17032627': 'For example, if they glance at a...","[Displays are growing in size, and are increas...","[Displays are growing in size, and are increas...",[We are not the first to consider shoulder sur...
12148403,Business Process Management: Saving the Planet?,Organisational and government concerns about e...,Organisational and government concerns about e...,A systematic literature review is essential to...,{'10686726': 'A systematic literature review i...,"{'109322554': 'Initially, sustainability was d...",[Organisational and government concerns about ...,[Organisational and government concerns about ...,[A systematic literature review is essential t...
6940735,A Middleware Model in Alloy for Supply Chain-W...,Abstract. To support the complex coordination ...,Abstract. To support the complex coordination ...,The approach presented in this paper intersect...,{'10536760': 'A number of approaches propose m...,{'3182275': 'Due to company-specific restricti...,"[Abstract., To support the complex coordinatio...","[Abstract., To support the complex coordinatio...",[The approach presented in this paper intersec...
31613778,Decision Support for Structured Energy Procure...,Infrastructure operators in Germany such as ai...,Infrastructure operators in Germany such as ai...,"In this section, we examine the areas of energ...","{'33876989': 'In this section, we examine the ...",{'1138948': 'On the other hand energy provider...,[Infrastructure operators in Germany such as a...,[Infrastructure operators in Germany such as a...,"[In this section, we examine the areas of ener..."


In [6]:
# Number of rows (papers) that remain after filtering.
len(df)

377