In [34]:
import os
import pandas as pd
import re
from pypdf import PdfReader

# Relative path to the article PDF whose reference list is parsed below
pdf = os.path.join('..', 'paper', 's40798-019-0202-3.pdf')

# Function to extract references from a PDF file using a context-aware pattern
def extract_incremental_references(pdf_path):
    """Extract sequentially numbered references from a PDF.

    Pages are scanned in order. Scanning starts after the first occurrence
    of the word "References" on a page; from there on, every
    ``<number>. <text>.`` match is considered, but it is kept only when its
    number is exactly one greater than the last accepted number. This
    filters out stray numbers (years, page ranges, volumes) that happen to
    match the pattern.

    Args:
        pdf_path: Path of the PDF file, passed as-is to ``PdfReader``.

    Returns:
        A list of ``(number, text)`` tuples in the order found. Both items
        are strings, exactly as captured by the regular expression. The
        captured text runs from the number up to (not including) the second
        period that follows it, so it is usually a prefix of the full
        reference rather than the whole entry.
    """
    reader = PdfReader(pdf_path)
    # Compiled once here instead of being re-resolved for every page.
    reference_pattern = re.compile(r'(\d+)\. (.*?\..*?)\.', re.DOTALL)
    references = []
    in_references_section = False
    last_ref_num = 0  # Number of the most recently accepted reference

    for page in reader.pages:
        text = page.extract_text()
        if not text:
            continue

        # Look for the section heading only while no reference has been
        # accepted yet. Once collection is under way, the word "References"
        # inside a cited title must not truncate the page: dropping the text
        # before it would lose entries and break the sequence check for
        # everything that follows.
        if last_ref_num == 0 and "References" in text:
            in_references_section = True
            text = text.split("References", 1)[1]

        if not in_references_section:
            continue

        for match in reference_pattern.finditer(text):
            current_ref_num = int(match.group(1))
            # Accept only the next number in sequence.
            if current_ref_num == last_ref_num + 1:
                references.append(match.groups())
                last_ref_num = current_ref_num

    return references

# Extract references using the context-aware pattern from the uploaded PDF file
extracted_incremental_references = extract_incremental_references(pdf)

# Build a two-column table with one row per extracted (number, text) tuple
df_incremental_references = pd.DataFrame(
    extracted_incremental_references,
    columns=['Reference Number', 'Reference Text'],
)

# Bare expression so the notebook renders the table as the cell output
df_incremental_references

Unnamed: 0,Reference Number,Reference Text
0,1,"Russell S, Norvig P. Artificial Intelligence: ..."
1,2,"Witten IH, Frank E, Hall MA, et al. Data Minin..."
2,3,"Zaki MJ, Meira Jr, W. Data Mining and analysis..."
3,4,"Passfield L, Hopker JG. A mine of information:..."
4,5,"Rein R, Memmert D. Big data and tactical analy..."
...,...,...
98,99,"Dalton-Barron NE, McLaren SJ, Black CJ, et al...."
99,100,"McLaren SJ, Weston M, Smith A, et al. Variabil..."
100,101,"Oliveira WK, Jesus K, Andrade AD, et al. Monit..."
101,102,"Düking P, Achtzehn S, Holmberg HC, Sperlich B...."
