# ***Libraries & Tools***

In [None]:
import fitz  # PyMuPDF
import requests
import io
import re
from datetime import datetime
from concurrent.futures import ProcessPoolExecutor

# ***PDF Extraction & Analysis***

## ***Serial***

In [None]:
def download_pdf(url, timeout=30):
    """Download a PDF and return its content as an in-memory byte stream.

    Args:
        url: Address of the PDF to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the caller forever.

    Returns:
        An ``io.BytesIO`` holding the raw bytes of the response body.

    Raises:
        Exception: If the server answers with a status code other than 200.
        requests.RequestException: If the request itself fails (connection
            error, timeout, ...).
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f"Failed to download PDF. Status code: {response.status_code}")
    return io.BytesIO(response.content)

def extract_text_from_pdf(pdf_stream):
    """Return the plain text of every page of a PDF held in memory.

    Args:
        pdf_stream: In-memory PDF data, as produced by ``download_pdf``.

    Returns:
        The text of all pages, in page order, with a newline appended after
        each page.
    """
    # The context manager closes the document even if a page fails to parse.
    with fitz.open("pdf", pdf_stream) as doc:  # Open PDF from memory
        return "".join(page.get_text("text") + "\n" for page in doc)

def extract_introduction(text, word_limit=400):
    """Extract the first words that follow a numbered "Introduction" heading.

    The heading must sit alone on a line and be numbered with digits, Roman
    numerals or a single capital letter (e.g. ``1. Introduction``,
    ``I. INTRODUCTION``, ``A. Introduction``).

    Args:
        text: Full document text, one line per ``"\\n"``.
        word_limit: Maximum number of whitespace-separated words to keep.

    Returns:
        The extracted words joined into a single line, with words that were
        hyphenated across a break re-joined. If no heading is found, or
        nothing but whitespace follows it, the string
        ``"Introduction section not found."`` is returned.
    """
    # Pattern for section title with numbering
    section_pattern = re.compile(r'^\s*(\d+\.|[IVXLCDM]+\.|[A-Z]\.)\s+(INTRODUCTION|Introduction)\s*$')

    introduction_found = False
    extracted_text = []
    word_count = 0

    for line in text.split("\n"):
        if not introduction_found:
            # Still searching for the heading; the heading line itself is not kept.
            introduction_found = section_pattern.match(line) is not None
            continue

        words = line.split()
        remaining = word_limit - word_count
        if len(words) > remaining:
            # Take only the words needed to reach the limit, then stop.
            extracted_text.append(" ".join(words[:remaining]))
            break
        extracted_text.append(line)
        word_count += len(words)

    # Fix inline hyphenated words (e.g., "poten- tially" -> "potentially")
    cleaned_text = re.sub(r'(\w+)-\s+(\w+)', r'\1\2', " ".join(extracted_text))

    # Strip BEFORE testing for emptiness: blank lines after the heading join
    # into a whitespace-only string, which must count as "not found" too.
    cleaned_text = cleaned_text.strip()
    return cleaned_text if cleaned_text else "Introduction section not found."


def process_pdf(url):
    """Download a PDF, pull out its text and return the introduction.

    Any exception raised along the way is caught and its message is returned
    in place of the introduction, so this function always returns a string
    produced by one of the two paths.
    """
    try:
        stream = download_pdf(url)
        full_text = extract_text_from_pdf(stream)
        return extract_introduction(full_text)
    except Exception as error:
        return str(error)

In [None]:
# Run the serial pipeline on a single paper and show the result under a rule.
pdf_url = "https://arxiv.org/pdf/0704.1274"  # Replace with your actual PDF URL
intro_text = process_pdf(pdf_url)
print("==================================================", intro_text, sep="\n")

## ***Parallel***

In [None]:
def download_pdf(paper_id, timeout=30):
    """Download the arXiv PDF for ``paper_id`` into memory.

    Args:
        paper_id: Identifier appended to ``https://arxiv.org/pdf/``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        An ``io.BytesIO`` with the PDF bytes, or ``None`` when the download
        failed (non-200 status, connection error or timeout).
    """
    url = f"https://arxiv.org/pdf/{paper_id}"
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Network-level failures are reported the same way as HTTP errors so
        # that one unreachable paper cannot abort the caller's whole batch.
        return None  # Indicate failure
    if response.status_code != 200:
        return None  # Indicate failure
    return io.BytesIO(response.content)

def extract_text_from_pdf(pdf_stream):
    """Return the plain text of an in-memory PDF, or ``None`` on failure.

    Args:
        pdf_stream: In-memory PDF data, or a falsy value when the download
            failed.

    Returns:
        The text of all pages (a newline appended after each page), or
        ``None`` if ``pdf_stream`` is falsy, the PDF cannot be read, or the
        extracted text is empty / whitespace only.
    """
    if not pdf_stream:
        return None  # Failed to download

    try:
        # The context manager closes the document even when a page fails.
        with fitz.open("pdf", pdf_stream) as doc:
            text = "".join(page.get_text("text") + "\n" for page in doc)
    except Exception:
        # Deliberately broad (any parsing problem counts as a failed
        # extraction) but, unlike a bare ``except:``, it lets
        # KeyboardInterrupt and SystemExit through.
        return None  # Failed to extract text
    return text if text.strip() else None  # Return None if empty

def extract_introduction(text, word_limit=400):
    """Return up to ``word_limit`` words following a numbered Introduction heading.

    The heading has to occupy a line of its own and carry a numeric, Roman
    or single-capital-letter label followed by a dot. Lines after it are
    collected until the word budget is used up, joined with spaces, and
    words split as "xxx- yyy" are glued back together.

    Returns ``None`` when no heading is found or nothing but whitespace
    follows it.
    """
    heading = re.compile(r'^\s*(\d+\.|[IVXLCDM]+\.|[A-Z]\.)\s+(INTRODUCTION|Introduction)\s*$')
    remaining_lines = iter(text.split("\n"))

    # Consume lines up to and including the heading; if there is none the
    # iterator is simply exhausted and nothing gets collected below.
    for candidate in remaining_lines:
        if heading.match(candidate):
            break

    pieces = []
    budget = word_limit  # words still allowed
    for body_line in remaining_lines:
        tokens = body_line.split()
        if len(tokens) > budget:
            # This line would overshoot: keep only what still fits and stop.
            pieces.append(" ".join(tokens[:budget]))
            break
        pieces.append(body_line)
        budget -= len(tokens)

    merged = re.sub(r'(\w+)-\s+(\w+)', r'\1\2', " ".join(pieces)).strip()
    if merged:
        return merged
    return None  # None if intro not found

def process_range(paper_ids, start, end):
    results = [(i, "FAILED") for i in range(start, end)]
    failed_papers = [[] for _ in range(3)]  # Track failures: [download, extraction, introduction]
    
    for i, paper_id in enumerate(paper_ids, start=start):
        pdf_stream = download_pdf(paper_id)
        if not pdf_stream:
            failed_papers[0].append((i, paper_id))
            continue
        
        text = extract_text_from_pdf(pdf_stream)
        if text is None:
            failed_papers[1].append((i, paper_id))
            continue
        
        introduction_text = extract_introduction(text)
        if introduction_text is None:
            failed_papers[2].append((i, paper_id))
            continue
        
        results[i - start] = (i, introduction_text)
    
    return results, failed_papers

def process_papers_parallel(id_file, ranges, output_file, max_workers=15,
                            failed_file="Failed_papers.txt"):
    """Extract introductions for many papers using a pool of worker processes.

    Args:
        id_file: Text file with one paper ID per line; blank lines are
            ignored.
        ranges: Iterable of ``[start, end)`` index pairs into the list of
            IDs; each pair is handled by one ``process_range`` call.
        output_file: Destination file. Line ``i`` receives the introduction
            of paper ``i``, or ``FAILED`` if it could not be produced (or
            was not covered by any range).
        max_workers: Size of the process pool.
        failed_file: Destination of the failure report.

    Returns:
        A list of three lists of ``(index, paper_id)`` pairs: failed
        downloads, failed text extractions, and papers whose introduction
        was not detected.

    Raises:
        Exception: Whatever a worker raised is re-raised here by
            ``future.result()``.
    """
    with open(id_file, "r", encoding="utf-8") as f:
        paper_ids = [line.strip() for line in f if line.strip()]

    total = len(paper_ids)
    failed_papers = [[] for _ in range(3)]  # Track failures: [download, extraction, introduction]
    results = ["FAILED"] * total  # Initialize output list with "FAILED"

    # Clamp every range to the IDs that actually exist. Otherwise a range
    # reaching past the end of the file makes process_range report indices
    # that do not fit into `results`, crashing only after all the work is done.
    jobs = []
    for r in ranges:
        lo = max(0, min(r[0], total))
        hi = max(lo, min(r[1], total))
        if lo < hi:
            jobs.append((lo, hi))

    if jobs:  # Do not spin up a pool when there is nothing to do
        with ProcessPoolExecutor(max_workers=max_workers) as executor:
            futures = [
                executor.submit(process_range, paper_ids[lo:hi], lo, hi)
                for lo, hi in jobs
            ]
            for future in futures:
                range_results, range_failures = future.result()
                for i, text in range_results:
                    results[i] = text
                for bucket, new_failures in zip(failed_papers, range_failures):
                    bucket.extend(new_failures)

    # Extracted PDF text is routinely non-ASCII, so write UTF-8 explicitly
    # instead of depending on the platform's default encoding.
    with open(output_file, "w", encoding="utf-8") as f:
        f.writelines(text + "\n" for text in results)

    report_sections = (
        "Failed Downloads:\n",
        "\nFailed Extraction:\n",
        "\nFailed Introduction Detection:\n",
    )
    with open(failed_file, "w", encoding="utf-8") as f:
        for header, entries in zip(report_sections, failed_papers):
            f.write(header)
            f.writelines(f"{idx}: {pid}\n" for idx, pid in entries)

    return failed_papers

In [None]:
num_ids = 43000
num_workers = 15  # Same value as the default max_workers of process_papers_parallel


def make_ranges(num_ids, num_batches):
    """Split ``range(num_ids)`` into at most ``num_batches`` contiguous ranges.

    Every range has ``num_ids // num_batches`` elements, except the last one,
    which also absorbs the remainder. When there are fewer IDs than batches,
    one single-element range per ID is produced.

    Args:
        num_ids: Total number of items to cover.
        num_batches: Desired number of ranges.

    Returns:
        A list of ``[start, end)`` pairs covering ``0 .. num_ids`` without
        gaps or overlap; empty when ``num_ids`` is not positive.
    """
    if num_ids <= 0:
        return []
    # Never create more batches than items, which would give a batch size of 0.
    num_batches = max(1, min(num_batches, num_ids))
    size = num_ids // num_batches
    result = [[k * size, (k + 1) * size] for k in range(num_batches)]
    result[-1][1] = num_ids  # The last batch takes the leftover items
    return result


batch_size = num_ids // num_workers
ranges = make_ranges(num_ids, num_workers)
print(ranges)

In [None]:
# Inputs and outputs of the parallel run.
id_file = "Node_IDs.txt"  # File containing one ID per line
output_file = "graph-v3/Additional_Text.txt"

# Process every batch in `ranges` and keep the per-stage failure lists.
failed_papers = process_papers_parallel(
    id_file=id_file,
    ranges=ranges,
    output_file=output_file,
)

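The function below merges two files row by row: line *i* of the output is line *i* of the first file, followed by a period and a space, followed by line *i* of the second file.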


In [None]:
def concatenate_files(file1, file2, output_file):
    """Merge two UTF-8 text files line by line into ``output_file``.

    Output line ``i`` is the stripped line ``i`` of ``file1``, then ``". "``,
    then the stripped line ``i`` of ``file2``. Merging stops at the end of
    the shorter input.
    """
    with open(file1, 'r', encoding='utf-8') as left:
        with open(file2, 'r', encoding='utf-8') as right:
            with open(output_file, 'w', encoding='utf-8') as merged:
                merged.writelines(
                    f"{first.strip()}. {second.strip()}\n"
                    for first, second in zip(left, right)
                )

# Example usage: append the extracted introductions to the existing rows.
concatenate_files(
    file1="graph-v3/data-v2.txt",
    file2="graph-v3/Additional_Text.txt",
    output_file="graph-v3/data-v3.txt",
)