# ***Libraries & Tools***

In [None]:
import fitz  # PyMuPDF
import requests
import io
import re
import os
from tqdm import tqdm 
from datetime import datetime
from concurrent.futures import ProcessPoolExecutor

# ***PDF Extraction & Analysis***

## ***Helper Functions & General Variables***

In [None]:
def download_pdf(paper_id, timeout=30):
    """Download the arXiv PDF for ``paper_id`` into memory.

    Args:
        paper_id: arXiv identifier appended to ``https://arxiv.org/pdf/``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` may block indefinitely on a stalled connection.

    Returns:
        An ``io.BytesIO`` holding the response body, or ``None`` when the
        request fails (connection error, timeout, ...) or the HTTP status
        code is not 200.
    """
    try:
        response = requests.get(f"https://arxiv.org/pdf/{paper_id}", timeout=timeout)
    except requests.RequestException:
        # A transport-level failure is just another failed download; returning
        # None keeps a single bad request from aborting a whole batch.
        return None

    if response.status_code != 200:
        return None
    return io.BytesIO(response.content)

def extract_text_from_pdf(pdf):
    """Extract the plain text of every page of a PDF.

    Args:
        pdf: Either a filesystem path (``str`` / ``os.PathLike``) or in-memory
            PDF data (``bytes``, ``bytearray`` or ``io.BytesIO``).

    Returns:
        The text of all pages, each followed by a newline, or ``None`` when
        the document contains no text or could not be opened/read.
    """
    try:
        if isinstance(pdf, (str, os.PathLike)):
            # Paths must be opened as files; passing them as ``stream`` fails.
            document = fitz.open(os.fspath(pdf))
        else:
            document = fitz.open(stream=pdf, filetype="pdf")

        # The context manager closes the document even if a page fails.
        with document as doc:
            text = "".join(page.get_text("text") + "\n" for page in doc)
    except Exception as e:
        print(f"❌ Failed to extract text from {pdf}: {e}")
        return None  # Failed to extract text

    return text if text.strip() else None  # Return None if empty

def extract_introduction(text, word_limit=250):
    """Return the first ``word_limit`` words that follow the Introduction heading.

    A heading is a line consisting of a section marker (digits, a Roman
    numeral or a single capital letter, optionally followed by a dot) and
    the word ``Introduction`` / ``INTRODUCTION``. Lines after the heading are
    collected verbatim until the word budget would be exceeded; the line that
    crosses the budget is cut down to its leading words. Words split by a
    hyphen followed by whitespace are re-joined.

    Returns ``None`` when no heading is found or nothing follows it.
    """
    heading_re = re.compile(r'^\s*(\d+\.?|[IVXLCDM]+\.?|[A-Z]\.?)\s+(INTRODUCTION|Introduction)\s*$')
    line_iter = iter(text.split("\n"))

    # Consume everything up to and including the heading line.
    for candidate in line_iter:
        if heading_re.match(candidate):
            break

    pieces = []
    remaining = word_limit
    for body_line in line_iter:
        tokens = re.findall(r"\b\w+\b", body_line)
        if len(tokens) > remaining:
            # Budget exhausted on this line: keep only the words that still fit.
            pieces.append(" ".join(tokens[:remaining]))
            break
        pieces.append(body_line)
        remaining -= len(tokens)

    # Undo hyphenation introduced by line breaks ("trans- formed" -> "transformed").
    joined = re.sub(r'(\w+)-\s+(\w+)', r'\1\2', " ".join(pieces)).strip()
    return joined if joined else None  # None if intro not found


def update_output_file(output_file, successful_retries):
    """Overwrite selected lines of ``output_file`` with new text, in place.

    Args:
        output_file: Path of the line-oriented text file to patch.
        successful_retries: Iterable of ``(index, new_text)`` pairs, where
            ``index`` is the zero-based line number to replace. Pairs whose
            index falls outside the file are ignored.
    """
    # The output file is written as UTF-8 elsewhere in this file, so it must be
    # read and rewritten with the same encoding instead of the platform default.
    with open(output_file, "r", encoding="utf-8") as f:
        lines = f.readlines()

    for index, new_text in successful_retries:
        if 0 <= index < len(lines):  # Ensure index is valid
            lines[index] = new_text.strip() + "\n"

    with open(output_file, "w", encoding="utf-8") as f:
        f.writelines(lines)


def concatenate_files(file1, file2, output_file):
    """Merge two line-aligned text files into ``output_file``.

    Line *k* of the output is ``"<file1 line k>. <file2 line k>"``. When the
    line in ``file2`` is the ``FAILED`` sentinel, only the ``file1`` line is
    written. Pairing stops at the end of the shorter file.

    Args:
        file1: Path of the primary UTF-8 text file.
        file2: Path of the UTF-8 text file with additional content or the
            ``FAILED`` sentinel, one entry per line.
        output_file: Path of the UTF-8 file to (over)write.
    """
    with open(file1, 'r', encoding='utf-8') as f1, \
            open(file2, 'r', encoding='utf-8') as f2, \
            open(output_file, 'w', encoding='utf-8') as out:
        for line1, line2 in zip(f1, f2):
            base = line1.strip()
            extra = line2.strip()
            # Compare the whole line: a substring test would also discard
            # genuine text that merely contains the word "FAILED".
            if extra == "FAILED":
                out.write(f"{base}\n")
            else:
                out.write(f"{base}. {extra}\n")

In [None]:
# Input: list of paper IDs that the extraction cells below iterate over.
id_file = "graph-v2/Node_IDs.txt"  # File containing one ID per line

# Output: the extracted introduction texts (or the "FAILED" sentinel), written one per line.
output_file = "graph-v2/Additional_Node_Content.txt"

## ***Serial***

In [None]:
# Function to process PDFs stored in directories
def process_pdfs(pdf_dirs, results):
    """Extract the introduction of every ``*.pdf`` file found in ``pdf_dirs``.

    The paper ID is derived from the file name by turning ``_`` into ``/``
    and removing ``.pdf``.

    Args:
        pdf_dirs: Iterable of directory paths to scan (non-recursively).
        results: Mapping updated in place with ``paper_id -> introduction``.

    Returns:
        ``(results, successes, failures)``: the same mapping, the number of
        introductions extracted, and the list of paper IDs that failed.
    """
    failures = []
    successes = 0

    for directory in pdf_dirs:
        for filename in os.listdir(directory):
            if not filename.endswith(".pdf"):
                continue

            paper_id = filename.replace("_", "/").replace(".pdf", "")

            text = extract_text_from_pdf(os.path.join(directory, filename))
            if text is None:
                print(f"Skipping {filename}: Failed to extract text")
                failures.append(paper_id)
                continue

            intro_text = extract_introduction(text)
            if intro_text is None:
                print(f"Skipping {filename}: No introduction found")
                failures.append(paper_id)
                continue

            results[paper_id] = intro_text
            successes += 1

    return results, successes, failures



# Function to process individual papers
def process_pdf(paper_id):
    """Download one paper and return the start of its Introduction section.

    Args:
        paper_id: arXiv identifier of the paper to fetch.

    Returns:
        The extracted introduction text, or ``None`` if the download, the
        text extraction or the introduction detection fails. The reason is
        printed in every case; this function never raises.
    """
    try:
        pdf_stream = download_pdf(paper_id)
        if not pdf_stream:
            print(f"{paper_id} failed to download")
            return None

        text = extract_text_from_pdf(pdf_stream)
        if text is None:
            print(f"Failed to extract text from {paper_id}")
            return None

        introduction_text = extract_introduction(text)
        if introduction_text is None:
            print(f"Failed to extract text from introduction section for {paper_id}")
            return None

        return introduction_text
    except Exception as e:
        # Best-effort boundary: one bad paper must not stop the caller's loop,
        # but the cause is reported instead of being silently discarded.
        print(f"Unexpected error while processing {paper_id}: {e}")
        return None


In [None]:
# Spot-check a handful of selected papers and print each extracted introduction.
# To retry failed downloads instead, iterate `failed_papers[0]` as (idx, paper_id)
# pairs and append (idx, intro_text) to `successful_retries`.
paper_ids = ["0704.1274", "0704.1028", "0704.0954", "0704.1308"]
successful_retries = []

for paper_id in paper_ids:
    intro_text = process_pdf(paper_id)
    if not intro_text:
        continue
    print(
        "\n==================================================",
        intro_text,
        "==================================================\n",
        sep="\n",
    )

In [None]:
pdf_dirs = ['Papers 1', 'Papers 2', 'Papers 3', 'Papers 4', 'Papers 5', 'Papers 6']

# Keep the IDs in file order (a set would discard it): the output file is
# line-aligned with the ID file, so entry k must belong to the k-th ID.
# Blank lines are skipped so they do not become an empty ID.
with open(id_file, "r") as f:
    paper_ids = [line.strip() for line in f if line.strip()]

# Every paper starts as "FAILED"; successful extractions overwrite the entry.
# dict preserves insertion order, so iterating `results` follows the ID file.
results = dict.fromkeys(paper_ids, "FAILED")

In [None]:
# Inspect the loaded paper IDs (a bare expression is shown as the notebook cell's output).
paper_ids

In [None]:
# Inspect the initial results mapping (a bare expression is shown as the notebook cell's output).
results

In [None]:
# Pass the pre-populated `results` dict: process_pdfs assigns into it by paper
# ID, which fails with a TypeError if a path string is passed instead.
results, successes, failures = process_pdfs(pdf_dirs, results)

print(f'Successes: {successes}. Failures: {len(failures)}. Node IDs: {len(paper_ids)}.')

# One line per entry, in the insertion order of `results`.
with open(output_file, "w", encoding='utf-8') as f:
    for text in results.values():
        f.write(text + "\n")

In [None]:
# Show the paper IDs for which text extraction or introduction detection failed.
print(failures)

## ***Parallel***

In [None]:
def process_range(paper_ids, start, end):
    """Download and extract the introductions for one batch of papers.

    Args:
        paper_ids: The IDs of this batch, i.e. the slice ``all_ids[start:end]``.
        start: Global index of the first ID in ``paper_ids``.
        end: Global end index of the batch. Kept for interface compatibility;
            the batch size is taken from ``paper_ids`` itself so that a slice
            shorter than ``end - start`` cannot yield out-of-range indices.

    Returns:
        ``(results, failed_papers)`` where ``results`` is a list of
        ``(global_index, text)`` tuples (``"FAILED"`` when no introduction
        was obtained) and ``failed_papers`` holds three lists of
        ``(global_index, paper_id)``: download, extraction and introduction
        detection failures.
    """
    results = [(start + offset, "FAILED") for offset in range(len(paper_ids))]
    failed_papers = [[] for _ in range(3)]  # Track failures: [download, extraction, introduction]

    progress = tqdm(paper_ids, total=len(paper_ids), desc="Processing Papers")
    for i, paper_id in enumerate(progress, start=start):

        pdf_stream = download_pdf(paper_id)
        if not pdf_stream:
            failed_papers[0].append((i, paper_id))
            continue

        text = extract_text_from_pdf(pdf_stream)
        if text is None:
            failed_papers[1].append((i, paper_id))
            continue

        introduction_text = extract_introduction(text)
        if introduction_text is None:
            failed_papers[2].append((i, paper_id))
            continue

        # `i` is the global index; the batch-local slot is offset by `start`.
        results[i - start] = (i, introduction_text)

    return results, failed_papers

def process_papers_parallel(id_file, ranges, output_file, max_workers=5):
    """Process batches of paper IDs in worker processes and merge the outcome.

    Args:
        id_file: Path of the text file with one paper ID per line (blank
            lines are skipped).
        ranges: Iterable of ``[start, end]`` index pairs into the ID list;
            each pair is handled by one ``process_range`` task.
        output_file: Not used by this function; nothing is written here.
        max_workers: Size of the process pool.

    Returns:
        ``(failed_papers, results)``: three lists of failures (download,
        extraction, introduction) and one text per paper ID, ``"FAILED"``
        where no introduction was obtained.
    """
    with open(id_file, "r") as handle:
        paper_ids = [entry for entry in map(str.strip, handle) if entry]

    results = ["FAILED"] * len(paper_ids)  # Initialize output list with "FAILED"
    failed_papers = [[], [], []]  # Track failures: [download, extraction, introduction]

    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        future_to_range = {}
        for r in ranges:
            task = executor.submit(process_range, paper_ids[r[0]:r[1]], r[0], r[1])
            future_to_range[task] = r

        # Collect in submission order; result() blocks until the batch is done.
        for future in future_to_range:
            range_results, range_failures = future.result()
            for position, text in range_results:
                results[position] = text
            for j, bucket in enumerate(failed_papers):
                bucket.extend(range_failures[j])

    return failed_papers, results

In [None]:
# Create ranges: split the indices [0, num_ids) into contiguous [start, end) batches.
num_ids = 52596  # NOTE(review): hard-coded; should match the number of IDs in the ID file
batch_size = max(1, num_ids // 10)  # never 0, otherwise the loop below could not advance

ranges = []

start = 0
while start < num_ids:
    end = min(start + batch_size, num_ids)
    ranges.append([start, end])
    start = end

# Fold a short trailing remainder into the previous batch. Only do so when
# there really is a remainder and a previous batch to merge it into.
if len(ranges) > 1 and ranges[-1][1] - ranges[-1][0] < batch_size:
    ranges[-2][1] = ranges[-1][1]
    del ranges[-1]
print(ranges)

[[0, 5259], [5259, 10518], [10518, 15777], [15777, 21036], [21036, 26295], [26295, 31554], [31554, 36813], [36813, 42072], [42072, 47331], [47331, 52596]]


In [None]:
# Run the parallel pipeline over all ranges. `results` holds one text per paper ID
# ("FAILED" where no introduction was obtained); `failed_papers` holds the
# download / extraction / introduction failure lists. Nothing is written to
# `output_file` by this call itself.
failed_papers, results = process_papers_parallel(id_file, ranges, output_file)

In [None]:
# Persist the extracted texts, one per line, in paper-ID order.
with open(output_file, "w", encoding='utf-8') as f:
    f.writelines(text + "\n" for text in results)

# Persist the failure report, grouped by the stage that failed.
with open("graph-v2/Failed_papers.txt", "w") as f:
    sections = (
        ("Failed Downloads:\n", failed_papers[0]),
        ("\nFailed Extraction:\n", failed_papers[1]),
        ("\nFailed Introduction Detection:\n", failed_papers[2]),
    )
    for header, entries in sections:
        f.write(header)
        for idx, pid in entries:
            f.write(f"{idx}: {pid}\n")