In [1]:
from pypdf import PdfReader
import subprocess
import os
import uuid
import re
import json
from utils import clean_text
import pathlib

ANYSTYPE_PATH="/home/viet/.local/share/gem/ruby/3.2.0/bin/anystyle"

def extract_references_and_citations(text, name=None):
    
    # Find the reference section
    text = clean_text(text)
    reference_section = re.search(r'References([\s\S]*)', text, re.IGNORECASE)
    if reference_section:
        references = reference_section.group(1)
    else:
        return "References section not found."
    
    # Extract individual references
    reference_list = re.findall(r'\[\d+\].*?(?=\[\d+\]|\Z)', references, re.DOTALL)
    
    # Find in-text citations
    citations = re.findall(r'\[(\d+)\]', text)

    reference_list = [ref.replace('\n','') for ref in reference_list]

    if name:
        json.dump({
            "references": reference_list,
            "citations": citations,
        }, open(f"tmp/{name}_references.json", "w"), indent=4)
    
    return reference_list, citations

def parse_pdf2text(pdf_path):
    reader = PdfReader(pdf_path)
    number_of_pages = len(reader.pages)
    text = ""
    for page in reader.pages:
        text += page.extract_text() + "\n"

    return text

def parse_references(references, work_dir):
    file_path = f"{work_dir}/refs.txt"
    with open(file_path, "w") as f:
        for ref in references:
            f.write(ref[4:] + '\n')

    process = subprocess.Popen([ANYSTYPE_PATH, 'parse', file_path], 
                           stdout=subprocess.PIPE, 
                           stderr=subprocess.PIPE,
                           text=True)

    stdout, stderr = process.communicate()

    if stderr:
        raise NameError(stderr)

    parsed_refs = json.loads(stdout)

    for i, ref in enumerate(references):
        parsed_refs[i]["cite_id"] = ref[1:2]

    return parsed_refs

def make_working_dir():
    paper_uuid = uuid.uuid4()
    working_dir = f"tmp/{paper_uuid}/"
    pathlib.Path(working_dir).mkdir(parents=True, exist_ok=True)
    return working_dir


In [31]:
import requests
import requests
from arxvi_utils import parse_arxiv_response
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

def tfidf_similarity(text1, text2):
    # Create TfidfVectorizer object
    vectorizer = TfidfVectorizer()
    
    # Fit and transform the texts
    tfidf_matrix = vectorizer.fit_transform([text1, text2])
    
    # Calculate cosine similarity
    similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])
    
    return similarity[0][0]

def crossref_search(reference):
    params = {
        'query': reference
    }
    r = requests.get("https://api.crossref.org/works", params= params)
    
    return r.json()['message']['items'][0]

def get_paper_link(doi):
    URL = f"https://doi.org/{doi}" # Specify the DOI here
    r = requests.get(URL,allow_redirects=True) # Redirects help follow to the actual domain
    return r.url

def arxiv_search(reference):
    params = {
        "search_query": reference,
        "start": 0,
        "max_results": 1,
    }

    response = requests.get("http://export.arxiv.org/api/query", params=params) 
    parsed_res = parse_arxiv_response(response.text)
    if len(parsed_res) != 0:
        return parsed_res[0]
    else:
        return {
            "title": "",
            "id": ""
        }

def retrieve_from_crossref(parsed_ref):
    search_results = {
        'tf-idf_score': [],
        'ref_title': [],
        'res_title': [],
        'res_DOI': [],
        'paper_link': []
    }

    for ref in tqdm(parsed_ref):
        ref_text_title = ref["title"][0]
        search_result = crossref_search(ref_text_title)
        search_result_title = search_result['title'][0]
        doi = search_result['DOI']
        paper_link = get_paper_link(doi)
        sim_score = tfidf_similarity(ref_text_title, search_result_title)

        search_results['ref_title'].append(ref_text_title)
        search_results['res_title'].append(search_result_title)
        search_results['res_DOI'].append(doi)
        search_results['tf-idf_score'].append(sim_score)
        search_results['paper_link'].append(paper_link)
        
        # print(f"{sim_score} | {ref_text_title} | {search_result_title}")


    retrieve_df = pd.DataFrame(search_results)
    # retrieve_df.to_csv("tmp/retrieve_table.csv")
    return retrieve_df

def retrieve_from_arxiv(parsed_ref):
    arvix_search_results = {
        'cite_id': [],
        'tf-idf_score': [],
        'ref_title': [],
        'res_title': [],
        'arxiv_id': []
    }

    for ref in tqdm(parsed_ref):
        ref_text_title = ref["title"][0]
        search_result = arxiv_search(ref_text_title)
        search_result_title = search_result['title']
        search_result_id = search_result['id'].split("/")[-1]
        sim_score = tfidf_similarity(ref_text_title, search_result_title)
        
        arvix_search_results['cite_id'].append(ref['cite_id'])
        arvix_search_results['ref_title'].append(ref_text_title)
        arvix_search_results['res_title'].append(search_result_title)
        arvix_search_results['tf-idf_score'].append(sim_score)
        arvix_search_results['arxiv_id'].append(search_result_id)

    retrieve_df = pd.DataFrame(arvix_search_results)
    # retrieve_df.to_csv("tmp/arvix_retrieve_table.csv")

    return retrieve_df

def retrieve_from_crossref(parsed_ref):
    arvix_search_results = {
        'cite_id': [],
        'tf-idf_score': [],
        'ref_title': [],
        'res_title': [],
        'URL': []
    }

    for ref in tqdm(parsed_ref):
        ref_text_title = ref["title"][0]
        search_result = crossref_search(ref_text_title)
        search_result_title = search_result['title'][0]
        search_result_url = search_result['URL']
        sim_score = tfidf_similarity(ref_text_title, search_result_title)
        
        arvix_search_results['cite_id'].append(ref['cite_id'])
        arvix_search_results['ref_title'].append(ref_text_title)
        arvix_search_results['res_title'].append(search_result_title)
        arvix_search_results['tf-idf_score'].append(sim_score)
        arvix_search_results['URL'].append(search_result_url)

    retrieve_df = pd.DataFrame(arvix_search_results)

    return retrieve_df

In [3]:
ARXIV_PDF_URL = "http://export.arxiv.org/pdf/"

def download_papers(retrieve_df, working_dir):
    download_path = f"{working_dir}/papers"
    mapper = {}
    pathlib.Path(download_path).mkdir(exist_ok=True)
    for index, row in tqdm(retrieve_df.iterrows()):
        arxiv_id = row["arxiv_id"]
        cite_id = row["cite_id"]
        pdf_url = f"{ARXIV_PDF_URL}{arxiv_id}"
        response = requests.get(pdf_url)

        save_path = f"{download_path}/{arxiv_id}.pdf"
        with open(save_path, "wb") as f:
            f.write(response.content) 

        mapper[cite_id] = save_path

    return mapper

In [4]:
from pdfminer.high_level import extract_pages
import re
from pdfminer.layout import LTTextBoxHorizontal, LTTextBox, LTTextLine, LTChar
from utils import clean_text

def is_reference_header(text):
    """Check if the given text is likely to be a reference section header."""
    reference_headers = ['references', 'bibliography', 'works cited', 'literature cited']
    return any(header in text.lower() for header in reference_headers)

def extract_citation(text, element_info):
    citation_pattern = r'\[(\d+(?:,\s*\d+)*)\]'

    citations = re.findall(citation_pattern, text)
    citation_sentences = []
    if citations:
        # Split text into sentences (simple split on period)
        sentences = text.split('.')
        for sentence in sentences:
            if re.search(citation_pattern, sentence):
                # cleaned_text = clean_text(sentence.strip())
                # cleaned_text = cleaned_text.replace('\n', ' ').replace('- ', '')
                cleaned_text = sentence
                citation_sentences.append(cleaned_text)

        citations = []
        citation_sentences = []

        for sentence in citation_sentences:
            cites = re.findall(citation_pattern, sentence)
            for citation in cites:
                splitted_cites = citation.split(', ')
                for cite in splitted_cites:
                    citations.append(int(cite))
                    citation_sentences.append(sentence)

        element_info["citations"] = citations
        element_info["citation_sentences"] = citation_sentences
    
    # return element_info

def extract_element_boxes(pdf_path, get_citation=False):
    
    # Extract pages using PDFMiner
    pages = list(extract_pages(pdf_path))
    
    # Initialize list to store all text elements
    all_elements = []
    
    for page_num, page in enumerate(pages, start=1):
        
        # Get page dimensions
        pdf_width = page.width
        pdf_height = page.height
        
        # Iterate through layout objects on the page
        for element in page:
            if hasattr(element, 'get_text'): 
                # isinstance(element, (LTTextBoxHorizontal, LTTextBox, LTTextLine, LTChar)):
                # Get coordinates
                x0, y0, x1, y1 = element.bbox
                
                # Normalize coordinates
                x0_norm = x0 / pdf_width
                x1_norm = x1 / pdf_width
                y0_norm = y0 / pdf_height
                y1_norm = y1 / pdf_height
                
                # Add text with element type
                element_type = type(element).__name__
                
                # Extract text content
                text = ""
                text = element.get_text().strip()

                if is_reference_header(text):
                    continue

                element_info = {
                    "type": element_type,
                    "page": page_num,
                    "bbox": [x0_norm, y0_norm, x1_norm, y1_norm],
                    "text": text,
                }

                if get_citation:                
                    extract_citation(text, element_info)                   

                all_elements.append(element_info)
    
    return all_elements

In [5]:
paper_path = "papers/1403.6382v3.pdf"

working_dir = make_working_dir()

In [32]:
text = parse_pdf2text(paper_path)
reference_list, citations = extract_references_and_citations(text)
parsed_refs = parse_references(reference_list, working_dir)

# retrieve_df = retrieve_from_arxiv(parsed_refs)
retrieve_df = retrieve_from_crossref(parsed_refs)
retrieve_df = retrieve_df[retrieve_df["tf-idf_score"] > 0.7]

100%|██████████| 52/52 [04:32<00:00,  5.24s/it]


In [33]:
retrieve_df

Unnamed: 0,cite_id,tf-idf_score,ref_title,res_title,URL
0,1,0.776515,Imagenet large scale visual recognition challe...,ImageNet Large Scale Visual Recognition Challenge,http://dx.doi.org/10.1007/s11263-015-0816-y
1,2,1.0,Efficient object detection and segmentation fo...,Efficient Object Detection and Segmentation fo...,http://dx.doi.org/10.1109/cvpr.2013.110
3,4,1.0,All about VLAD,All About VLAD,http://dx.doi.org/10.1109/cvpr.2013.207
4,5,0.711559,Poof: Part-based one-vs.-onefeatures for fine-...,POOF: Part-Based One-vs.-One Features for Fine...,http://dx.doi.org/10.1109/cvpr.2013.128
5,6,0.779915,Describing people: Aposelet-based approach to ...,Describing people: A poselet-based approach to...,http://dx.doi.org/10.1109/iccv.2011.6126413
6,7,0.706078,Bicos: A bilevel co-segmentation method for im...,BiCoS: A Bi-level co-segmentation method for i...,http://dx.doi.org/10.1109/iccv.2011.6126546
7,8,0.895532,Hierarchical matching with side information fo...,Hierarchical matching with side information fo...,http://dx.doi.org/10.1109/cvpr.2012.6248083
10,1,0.81818,Yan.Subcategory-aware object classification,Subcategory-Aware Object Classification,http://dx.doi.org/10.1109/cvpr.2013.112
12,1,1.0,Describing objects by their attributes,Describing objects by their attributes,http://dx.doi.org/10.1109/cvprw.2009.5206772
17,1,0.826728,Negative evidences and cooccurences in image r...,Negative Evidences and Co-occurences in Image ...,http://dx.doi.org/10.1007/978-3-642-33709-3_55


In [7]:
reference_pdf = download_papers(retrieve_df, working_dir)
reference_pdf

0it [00:00, ?it/s]

3it [00:19,  6.47s/it]


{'1': 'tmp/627ea2fa-d138-4da2-b063-ea483ab9d399//papers/1310.1531v1.pdf',
 '4': 'tmp/627ea2fa-d138-4da2-b063-ea483ab9d399//papers/1312.4659v3.pdf',
 '5': 'tmp/627ea2fa-d138-4da2-b063-ea483ab9d399//papers/1311.5591v2.pdf'}

In [8]:
element_boxes = extract_element_boxes(paper_path, get_citation=True)

cite_boxes = []
for i, element_info in enumerate(element_boxes):
    if "citations" in element_info.keys():
        cite_boxes.append(element_info)

In [26]:
parsed_refs[0]
results = crossref_search(parsed_refs[0]["title"])

In [27]:
results.keys()

dict_keys(['indexed', 'reference-count', 'publisher', 'issue', 'license', 'content-domain', 'short-container-title', 'published-print', 'DOI', 'type', 'created', 'page', 'update-policy', 'source', 'is-referenced-by-count', 'title', 'prefix', 'volume', 'author', 'member', 'published-online', 'reference', 'container-title', 'language', 'link', 'deposited', 'score', 'resource', 'issued', 'references-count', 'journal-issue', 'alternative-id', 'URL', 'ISSN', 'issn-type', 'published'])

In [30]:
results

{'indexed': {'date-parts': [[2024, 8, 22]],
  'date-time': '2024-08-22T12:44:16Z',
  'timestamp': 1724330656384},
 'reference-count': 102,
 'publisher': 'Springer Science and Business Media LLC',
 'issue': '3',
 'license': [{'start': {'date-parts': [[2015, 4, 11]],
    'date-time': '2015-04-11T00:00:00Z',
    'timestamp': 1428710400000},
   'content-version': 'tdm',
   'delay-in-days': 0,
   'URL': 'http://www.springer.com/tdm'}],
 'content-domain': {'domain': ['link.springer.com'],
  'crossmark-restriction': False},
 'short-container-title': ['Int J Comput Vis'],
 'published-print': {'date-parts': [[2015, 12]]},
 'DOI': '10.1007/s11263-015-0816-y',
 'type': 'journal-article',
 'created': {'date-parts': [[2015, 4, 10]],
  'date-time': '2015-04-10T04:47:45Z',
  'timestamp': 1428641265000},
 'page': '211-252',
 'update-policy': 'http://dx.doi.org/10.1007/springer_crossmark_policy',
 'source': 'Crossref',
 'is-referenced-by-count': 24474,
 'title': ['ImageNet Large Scale Visual Recognitio