# ***Libraries & Tools***

In [11]:
import fitz  # PyMuPDF
import requests
import io
import re
from tqdm import tqdm  # Ensure proper import
from datetime import datetime
from concurrent.futures import ProcessPoolExecutor

# ***PDF Extraction & Analysis***

## ***Serial***

In [None]:
def download_pdf(paper_id, timeout=30):
    """Download the arXiv PDF for *paper_id* into an in-memory stream.

    Args:
        paper_id: arXiv identifier, e.g. ``"0704.1274"``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        An ``io.BytesIO`` holding the PDF bytes, or ``None`` when the request
        fails or the server answers with a status code other than 200.
    """
    url = f"https://arxiv.org/pdf/{paper_id}"
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Network-level failure (DNS error, reset connection, timeout, ...).
        return None

    if response.status_code != 200:
        # Callers test the result for falsiness, so signal failure with None
        # rather than raising.
        return None
    return io.BytesIO(response.content)

def extract_text_from_pdf(pdf_stream):
    """Extract the plain text of every page of an in-memory PDF.

    Args:
        pdf_stream: PDF content to hand to PyMuPDF (for example an
            ``io.BytesIO``), or a falsy value when there is nothing to read.

    Returns:
        The text of all pages, each page followed by a newline, or ``None``
        when *pdf_stream* is falsy, the document cannot be opened or read, or
        the extracted text is only whitespace.
    """
    if not pdf_stream:
        return None  # Nothing was downloaded

    try:
        doc = fitz.open("pdf", pdf_stream)
        try:
            text = "".join(page.get_text("text") + "\n" for page in doc)
        finally:
            # Release the document even if a page fails to render.
            doc.close()
    except Exception:
        # Best-effort: a corrupt or unsupported PDF counts as "no text".
        # KeyboardInterrupt/SystemExit are deliberately not swallowed.
        return None

    return text if text.strip() else None

def extract_introduction(text, word_limit=400):
    """Return the first *word_limit* words that follow the Introduction heading.

    The heading must sit alone on a line and be numbered with digits, a Roman
    numeral or a single capital letter (optionally followed by a dot), e.g.
    ``"1 Introduction"`` or ``"I. INTRODUCTION"``.

    Args:
        text: Full text of the paper; lines are separated by ``"\\n"``.
        word_limit: Maximum number of whitespace-separated words to keep.

    Returns:
        The collected lines joined by single spaces, with words that were
        hyphenated across a line break re-joined, or ``None`` when *text* is
        empty/``None``, no heading is found, or nothing follows the heading.
    """
    if not text:
        return None

    heading = re.compile(r'^\s*(\d+\.?|[IVXLCDM]+\.?|[A-Z]\.?)\s+(INTRODUCTION|Introduction)\s*$')
    remaining_lines = iter(text.split("\n"))

    # Consume lines up to and including the heading; whatever is left on the
    # iterator afterwards is the body of the introduction.
    for candidate in remaining_lines:
        if heading.match(candidate):
            break

    collected = []
    budget = word_limit  # words we may still take
    for line in remaining_lines:
        words = line.split()
        if len(words) > budget:
            # This line would overshoot: keep only what still fits and stop.
            collected.append(" ".join(words[:budget]))
            break
        collected.append(line)
        budget -= len(words)

    # "learn-" + "ing" split across two lines becomes "learning".
    cleaned = re.sub(r'(\w+)-\s+(\w+)', r'\1\2', " ".join(collected)).strip()
    return cleaned or None



def process_pdf(paper_id):
    """Fetch one arXiv paper and return the opening text of its introduction.

    Runs download, text extraction and introduction detection in sequence and
    prints a short diagnostic naming the stage that failed.

    Args:
        paper_id: arXiv identifier of the paper to process.

    Returns:
        The introduction text, or ``None`` if any stage fails or raises.
    """
    try:
        pdf_stream = download_pdf(paper_id)
        if not pdf_stream:
            print(f"{paper_id} failed to download.")
            return None

        text = extract_text_from_pdf(pdf_stream)
        if text is None:
            print(f"Failed to extract text from {paper_id}.")
            return None

        introduction_text = extract_introduction(text)
        if introduction_text is None:
            print(f"Failed to extract text from introduction sec. for {paper_id}.")
            return None

        return introduction_text
    except Exception as e:
        # Best-effort: one bad paper must not stop a batch, but report why it
        # failed instead of dropping the error silently.
        print(f"Error while processing {paper_id}: {e}")
        return None

In [None]:
def update_output_file(output_file, successful_retries):
    """Overwrite selected lines of *output_file* with newly extracted texts.

    Args:
        output_file: Path of a UTF-8 text file holding one record per line.
        successful_retries: Iterable of ``(index, new_text)`` pairs, where
            ``index`` is the zero-based line number to replace. Pairs whose
            index is outside the file are ignored.
    """
    with open(output_file, "r", encoding="utf-8") as f:
        lines = f.readlines()

    for index, new_text in successful_retries:
        if 0 <= index < len(lines):
            # Exactly one trailing newline: an extra blank line would shift
            # every following record and break index-based lookups.
            lines[index] = new_text.strip() + "\n"

    with open(output_file, "w", encoding="utf-8") as f:
        f.writelines(lines)


In [None]:
paper_ids = ["0704.1274", "0704.1028", "0704.0954", "0704.1308"]
successful_retries = []

# Banner printed above and below each extracted introduction.
separator = "=================================================="

# To retry earlier failures instead, iterate over (idx, paper_id) pairs such
# as failed_papers[0] and record each success in successful_retries.
for paper_id in paper_ids:
    intro_text = process_pdf(paper_id)
    if not intro_text:
        continue
    print("\n" + separator)
    print(intro_text)
    print(separator + "\n")
    # successful_retries.append((idx, intro_text))

## ***Parallel***

In [None]:
def download_pdf(paper_id, timeout=30):
    """Download the arXiv PDF for *paper_id* into an in-memory stream.

    Args:
        paper_id: arXiv identifier, e.g. ``"0704.1274"``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        An ``io.BytesIO`` holding the PDF bytes, or ``None`` when the request
        fails or the server answers with a status code other than 200.
    """
    url = f"https://arxiv.org/pdf/{paper_id}"
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # A network error on one paper must not abort a whole batch; report it
        # to the caller as an ordinary download failure.
        return None

    if response.status_code != 200:
        return None  # Indicate failure
    return io.BytesIO(response.content)

def extract_text_from_pdf(pdf_stream):
    """Extract the plain text of every page of an in-memory PDF.

    Args:
        pdf_stream: PDF content to hand to PyMuPDF (for example an
            ``io.BytesIO``), or a falsy value when there is nothing to read.

    Returns:
        The text of all pages, each page followed by a newline, or ``None``
        when *pdf_stream* is falsy, the document cannot be opened or read, or
        the extracted text is only whitespace.
    """
    if not pdf_stream:
        return None  # Nothing was downloaded

    try:
        doc = fitz.open("pdf", pdf_stream)
        try:
            text = "".join(page.get_text("text") + "\n" for page in doc)
        finally:
            # Release the document even if a page fails to render.
            doc.close()
    except Exception:
        # Best-effort: a corrupt or unsupported PDF counts as "no text".
        # KeyboardInterrupt/SystemExit are deliberately not swallowed.
        return None

    return text if text.strip() else None

def extract_introduction(text, word_limit=500):
    """Return the first *word_limit* words that follow the Introduction heading.

    The heading must sit alone on a line and be numbered with digits, a Roman
    numeral or a single capital letter (optionally followed by a dot), e.g.
    ``"1 Introduction"`` or ``"I. INTRODUCTION"``.

    Args:
        text: Full text of the paper; lines are separated by ``"\\n"``.
        word_limit: Maximum number of whitespace-separated words to keep.

    Returns:
        The collected lines joined by single spaces, with words that were
        hyphenated across a line break re-joined, or ``None`` when *text* is
        empty/``None``, no heading is found, or nothing follows the heading.
    """
    if not text:
        return None

    heading = re.compile(r'^\s*(\d+\.?|[IVXLCDM]+\.?|[A-Z]\.?)\s+(INTRODUCTION|Introduction)\s*$')
    remaining_lines = iter(text.split("\n"))

    # Consume lines up to and including the heading; whatever is left on the
    # iterator afterwards is the body of the introduction.
    for candidate in remaining_lines:
        if heading.match(candidate):
            break

    collected = []
    budget = word_limit  # words we may still take
    for line in remaining_lines:
        words = line.split()
        if len(words) > budget:
            # This line would overshoot: keep only what still fits and stop.
            collected.append(" ".join(words[:budget]))
            break
        collected.append(line)
        budget -= len(words)

    # "learn-" + "ing" split across two lines becomes "learning".
    cleaned = re.sub(r'(\w+)-\s+(\w+)', r'\1\2', " ".join(collected)).strip()
    return cleaned or None

def process_range(paper_ids, start, end):
    """Run the extraction pipeline over one contiguous slice of paper ids.

    Args:
        paper_ids: The ids belonging to this slice, in order.
        start: Global index of the first id in *paper_ids*.
        end: Global index one past the last id of the slice.

    Returns:
        A ``(results, failed_papers)`` pair. ``results`` holds one
        ``(global_index, text)`` tuple per index in ``range(start, end)``,
        with ``"FAILED"`` as the text wherever no introduction was obtained.
        ``failed_papers`` is a list of three lists of
        ``(global_index, paper_id)`` tuples: download failures, text
        extraction failures and introduction detection failures.
    """
    download_failures = []
    extraction_failures = []
    introduction_failures = []
    outcomes = [(position, "FAILED") for position in range(start, end)]

    progress = tqdm(paper_ids, total=len(paper_ids), desc="Processing Papers")
    for offset, paper_id in enumerate(progress):
        global_index = start + offset

        pdf_stream = download_pdf(paper_id)
        if not pdf_stream:
            download_failures.append((global_index, paper_id))
            continue

        text = extract_text_from_pdf(pdf_stream)
        if text is None:
            extraction_failures.append((global_index, paper_id))
            continue

        introduction_text = extract_introduction(text)
        if introduction_text is None:
            introduction_failures.append((global_index, paper_id))
            continue

        # Every stage succeeded: replace the "FAILED" placeholder.
        outcomes[offset] = (global_index, introduction_text)

    return outcomes, [download_failures, extraction_failures, introduction_failures]

def process_papers_parallel(id_file, ranges, output_file, max_workers=5):
    """Process slices of the id list in worker processes and merge the results.

    Args:
        id_file: Path of a text file with one paper id per line; blank lines
            are skipped.
        ranges: Sequence of ``[start, end]`` index pairs, each handed to one
            ``process_range`` call.
        output_file: Accepted for the caller's convenience; this function
            does not read or write it.
        max_workers: Size of the process pool.

    Returns:
        A ``(failed_papers, results)`` pair. ``failed_papers`` is a list of
        three lists (download, extraction, introduction failures) merged from
        all slices. ``results`` has one entry per id: the extracted text, or
        ``"FAILED"``.
    """
    with open(id_file, "r") as handle:
        stripped_lines = (raw.strip() for raw in handle)
        paper_ids = [paper_id for paper_id in stripped_lines if paper_id]

    results = ["FAILED" for _ in paper_ids]
    failed_papers = [[], [], []]  # [download, extraction, introduction]

    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        # Submit every slice first so the workers run concurrently.
        pending = []
        for bounds in ranges:
            low, high = bounds[0], bounds[1]
            pending.append(executor.submit(process_range, paper_ids[low:high], low, high))

        # Collect in submission order; result() blocks until that slice is done.
        for job in pending:
            range_results, range_failures = job.result()
            for position, text in range_results:
                results[position] = text
            for slot, bucket in enumerate(failed_papers):
                bucket.extend(range_failures[slot])

    return failed_papers, results

In [17]:
num_ids = 52596
batch_size = num_ids // 10

# Consecutive [start, end) index windows of `batch_size` ids each; the final
# window is clipped at num_ids.
ranges = [
    [start, min(start + batch_size, num_ids)]
    for start in range(0, num_ids, batch_size)
]

# Fold the last window into the one before it.
tail = ranges.pop()
ranges[-1][1] = tail[1]
print(ranges)

[[0, 5259], [5259, 10518], [10518, 15777], [15777, 21036], [21036, 26295], [26295, 31554], [31554, 36813], [36813, 42072], [42072, 47331], [47331, 52596]]


In [None]:
# Input: text file containing one paper ID per line.
id_file = "graph-v2/Node_IDs.txt"  # File containing one ID per line

# Destination path for the extracted introductions; it is passed to
# process_papers_parallel here.
output_file = "graph-v2/Additional_Node_Content.txt"
# Returns the per-stage failure lists and one result entry per paper ID.
failed_papers, results = process_papers_parallel(id_file, ranges, output_file)

In [None]:
# One result per line, in the same order as the paper ids.
with open(output_file, "w", encoding='utf-8') as results_out:
    results_out.writelines(text + "\n" for text in results)

# Section headers of the failure report, in the same order as the three
# lists inside failed_papers.
section_headers = (
    "Failed Downloads:\n",
    "\nFailed Extraction:\n",
    "\nFailed Introduction Detection:\n",
)

with open("graph-v2/Failed_papers.txt", "w") as report:
    for slot, header in enumerate(section_headers):
        report.write(header)
        for idx, pid in failed_papers[slot]:
            report.write(f"{idx}: {pid}\n")

The function below concatenates the corresponding lines (rows) of two files.


In [None]:
def concatenate_files(file1, file2, output_file):
    """Join two UTF-8 text files line by line into *output_file*.

    Each output line is ``"<line from file1>. <line from file2>"``, with
    surrounding whitespace stripped from both parts. Pairing stops at the end
    of the shorter file.

    Args:
        file1: Path of the file supplying the first part of each line.
        file2: Path of the file supplying the second part of each line.
        output_file: Path of the file to create or overwrite.
    """
    with open(file1, 'r', encoding='utf-8') as left:
        with open(file2, 'r', encoding='utf-8') as right:
            with open(output_file, 'w', encoding='utf-8') as merged:
                merged.writelines(
                    f"{first.strip()}. {second.strip()}\n"
                    for first, second in zip(left, right)
                )

# Example usage: append each line of Additional_Node_Content.txt to the
# matching line of data-v2.txt and write the combined lines to data-v4.txt.
concatenate_files("graph-v2/data-v2.txt", "graph-v2/Additional_Node_Content.txt", "graph-v2/data-v4.txt")