TEXT AND TABLE EXTRACTION FROM PDF INTO JSON FILES

In [4]:
import fitz  # PyMuPDF
import os
import json
import re

# Section-heading vocabulary: maps a snake_case section key to a regular
# expression listing the heading texts that identify that section.
#
# extract_pdf_to_json_with_sections() wraps every pattern as
# rf"(?im)^\s*({pattern})\s*$", so a pattern only fires when it matches a
# whole line (apart from surrounding whitespace), case-insensitively. The
# snake_case key is turned into the output key by capitalizing each word
# (e.g. "literature_review" -> "Literature Review").
#
# NOTE: some heading texts are claimed by more than one entry, so a single
# heading line can be reported under two section names:
#   - "related work"              -> literature_review and related_work
#   - "findings"                  -> results and findings
#   - "analysis"                  -> discussion and analysis
#   - "experimental setup"        -> experiments and experimental_setup
#   - "supplementary information" -> supplementary_material and
#                                    supplementary_information
sections = {
    "title": r"\btitle\b",
    "abstract": r"\babstract\b",
    "keywords": r"\bkeywords\b|\bkey\s*words\b",
    "introduction": r"\bintroduction\b|\bbackground\b",
    "literature_review": r"\bliterature\s+review\b|\brelated\s+work\b|\bprior\s+work\b|\bstate\s+of\s+the\s+art\b",
    "theoretical_framework": r"\btheoretical\s+framework\b",
    "methodology": r"\bmethodology\b|\bmethods\b|\bmaterials\s+and\s+methods\b|\bexperimental\s+procedures\b|\bmethod\b",
    "data_collection": r"\bdata\s+collection\b",
    "data_analysis": r"\bdata\s+analysis\b",
    "experiments": r"\bexperiments?\b|\bexperimental\s+(setup|design|results)\b",
    "results": r"\bresults?\b|\bfindings\b",
    "discussion": r"\bdiscussion\b|\banalysis\b|\binterpretation\b|\bresults\s+and\s+discussion\b",
    "conclusion": r"\bconclusion\b|\bsummary\b|\bconcluding\s+remarks\b",
    "future_work": r"\bfuture\s+work\b|\bfuture\s+directions\b|\bfurther\s+research\b",
    # Matches the spelling with an "e" ("acknowledgement(s)"); the spelling
    # without it is covered by the separate "acknowledgments" entry below.
    "acknowledgements": r"\backnowledgements?\b|\bthanks\b",
    "references": r"\breferences\b|\bbibliography\b|\bworks\s+cited\b",
    "appendix": r"\bappendix\b|\bappendices\b",
    "supplementary_material": r"\bsupplementary\s+(material|information)\b",
    "author_information": r"\bauthor\s+(information|details|biography|bios)\b",
    "conflict_of_interest": r"\bconflicts?\s+of\s+interest\b|\bcompeting\s+interests?\b",
    "funding": r"\bfunding\b|\bfunding\s+sources\b",
    "ethics_statement": r"\bethics\s+statement\b|\bethical\s+approval\b",
    "abbreviations": r"\babbreviations\b|\blist\s+of\s+abbreviations\b",
    "glossary": r"\bglossary\b",
    "implementation": r"\bimplementation\b",
    "evaluation": r"\bevaluation\b",
    "validation": r"\bvalidation\b",
    "limitations": r"\blimitations\b|\bscope\s+and\s+limitations\b",
    "related_work": r"\brelated\s+work\b|\brelated\s+studies\b",
    "study_design": r"\bstudy\s+design\b|\bresearch\s+design\b",
    "objectives": r"\bobjectives?\b|\bgoals?\b|\baims?\b",
    "hypotheses": r"\bhypotheses?\b",
    "problem_statement": r"\bproblem\s+statement\b|\bresearch\s+problem\b",
    "data_and_methods": r"\bdata\s+and\s+methods\b",
    "experimental_setup": r"\bexperimental\s+setup\b|\bexperiment\s+setup\b",
    "findings": r"\bfindings\b",
    "analysis": r"\banalysis\b",
    "author_contributions": r"\bauthor\s+contributions?\b",
    "data_availability": r"\bdata\s+availability\b|\bavailability\s+of\s+data\b",
    "supplementary_information": r"\bsupplementary\s+information\b",
    "notes": r"\bnotes\b",
    "footnotes": r"\bfootnotes\b",
    "preface": r"\bpreface\b",
    "foreword": r"\bforeword\b",
    "prologue": r"\bprologue\b",
    "epilogue": r"\bepilogue\b",
    "afterword": r"\bafterword\b",
    "about_the_author": r"\babout\s+the\s+author\b|\bauthor\s+bio\b",
    "index": r"\bindex\b",
    "list_of_figures": r"\blist\s+of\s+figures\b",
    "list_of_tables": r"\blist\s+of\s+tables\b",
    "executive_summary": r"\bexecutive\s+summary\b",
    "acknowledgments": r"\backnowledgments\b",
    "copyright": r"\bcopyright\b",
    "patents": r"\bpatents\b",
    "sponsors": r"\bsponsors?\b",
    "disclaimer": r"\bdisclaimer\b",
    "dedication": r"\bdedication\b",
    "transparency_statement": r"\btransparency\s+statement\b",
    "statistical_analysis": r"\bstatistical\s+analysis\b",
    "methodological_approach": r"\bmethodological\s+approach\b",
    "experimental_methods": r"\bexperimental\s+methods\b",
    "design_and_methods": r"\bdesign\s+and\s+methods\b",
    "case_study": r"\bcase\s+study\b|\bcase\s+studies\b",
    "materials": r"\bmaterials\b",
    "supplementary_results": r"\bsupplementary\s+results\b",
    "recommendations": r"\brecommendations\b",
    "policy_implications": r"\bpolicy\s+implications\b",
    "theoretical_implications": r"\btheoretical\s+implications\b",
    "practical_implications": r"\bpractical\s+implications\b",
    "abstract_figure": r"\babstract\s+figure\b",
    "graphical_abstract": r"\bgraphical\s+abstract\b",
    "main_text": r"\bmain\s+text\b",
    "body": r"\bbody\b",
    "review_of_literature": r"\breview\s+of\s+literature\b",
}

def _split_into_sections(content):
    """Split *content* into sections keyed by readable section name.

    Every entry of the module-level ``sections`` table is compiled into a
    whole-line, case-insensitive heading pattern. Text before the first
    heading (or the entire text when no heading is found) is stored under
    ``'Title and Authors'``. Each section's text starts at its heading and
    runs up to the next heading, or to the end of *content*.
    """
    heading_patterns = {
        # "literature_review" -> "Literature Review"
        ' '.join(word.capitalize() for word in key.split('_')):
            re.compile(rf"(?im)^\s*({pattern})\s*$")
        for key, pattern in sections.items()
    }

    hits = {
        (match.start(), name)
        for name, heading in heading_patterns.items()
        for match in heading.finditer(content)
    }

    # Several patterns can match the very same heading line (e.g. "Analysis"
    # is claimed by both "Discussion" and "Analysis"). Keeping every hit
    # would give all but the last name an empty slice, so keep exactly one
    # name per position: the last in sorted order.
    name_at = {}
    for position, name in sorted(hits):
        name_at[position] = name
    ordered = sorted(name_at.items())

    data = {}
    if ordered:
        preamble = content[:ordered[0][0]].strip()
        if preamble:
            data['Title and Authors'] = preamble
    else:
        # No recognizable headings: keep the whole text under one key.
        data['Title and Authors'] = content.strip()

    # End of section i is the start of section i + 1 (or the end of the text).
    ends = [position for position, _ in ordered[1:]] + [len(content)]
    for (start, name), end in zip(ordered, ends):
        text = content[start:end].strip()
        if name in data:
            # The same heading occurred again further down: append instead of
            # silently discarding the text collected earlier.
            data[name] = f"{data[name]}\n\n{text}"
        else:
            data[name] = text
    return data


def _extract_tables(content):
    """Return one ``{'Table Title', 'Table Description'}`` dict per "Table N" mention in *content*."""
    table_pattern = re.compile(r'(?i)(Table\s*\d+[\.\:]*\s*)(.*?)\n', re.DOTALL)
    return [
        {
            'Table Title': match.group(1).strip(),
            'Table Description': match.group(2).strip()
        }
        for match in table_pattern.finditer(content)
    ]


def extract_pdf_to_json_with_sections(pdf_path, output_path):
    """Extract section text and table captions from a PDF and save them as JSON.

    Args:
        pdf_path: Path of the PDF to read with PyMuPDF.
        output_path: Path of the JSON file to write (UTF-8, non-ASCII kept).

    Returns:
        None. Progress is reported with ``print``. Any exception raised while
        processing is caught and printed, so one bad PDF does not stop a
        batch run.
    """
    try:
        # The context manager closes the document even when extraction fails.
        with fitz.open(pdf_path) as doc:
            content = "".join(page.get_text("text") + "\n" for page in doc)

        data = _split_into_sections(content)

        tables = _extract_tables(content)
        if tables:
            data['Tables'] = tables

        # DEBUG: Print extracted sections
        print(f"Extracted sections for {pdf_path}: {list(data.keys())}\n")

        # Save to JSON if data is not empty
        if data:
            with open(output_path, "w", encoding="utf-8") as json_file:
                json.dump(data, json_file, indent=4, ensure_ascii=False)
            print(f"JSON saved to {output_path}")
        else:
            print(f"No sections found for {pdf_path}. Skipping...")

    except Exception as e:
        print(f"Failed to process {pdf_path}: {e}")

if __name__ == "__main__":
    # Folder holding the source PDFs and folder receiving one JSON per PDF.
    pdf_directory = "/Users/kuntal/Documents/Github/arxiv scraper/pdfs/2024"
    output_directory = "/Users/kuntal/Documents/Github/arxiv scraper/json_files-4"
    os.makedirs(output_directory, exist_ok=True)

    for pdf_file in os.listdir(pdf_directory):
        # Only names ending in ".pdf" (case-sensitive) are processed.
        if not pdf_file.endswith(".pdf"):
            continue
        stem = os.path.splitext(pdf_file)[0]
        pdf_path = os.path.join(pdf_directory, pdf_file)
        output_path = os.path.join(output_directory, stem + ".json")
        extract_pdf_to_json_with_sections(pdf_path, output_path)


Extracted sections for /Users/kuntal/Documents/Github/arxiv scraper/pdfs/2024/1807.05385.pdf: ['Title and Authors', 'Abstract', 'Acknowledgments', 'References']

JSON saved to /Users/kuntal/Documents/Github/arxiv scraper/json_files-4/1807.05385.json
Extracted sections for /Users/kuntal/Documents/Github/arxiv scraper/pdfs/2024/2004.06795.pdf: ['Title and Authors', 'Abstract', 'Tables']

JSON saved to /Users/kuntal/Documents/Github/arxiv scraper/json_files-4/2004.06795.json
Extracted sections for /Users/kuntal/Documents/Github/arxiv scraper/pdfs/2024/2102.09342.pdf: ['Title and Authors', 'References']

JSON saved to /Users/kuntal/Documents/Github/arxiv scraper/json_files-4/2102.09342.json
Extracted sections for /Users/kuntal/Documents/Github/arxiv scraper/pdfs/2024/2103.08413.pdf: ['Title and Authors', 'References']

JSON saved to /Users/kuntal/Documents/Github/arxiv scraper/json_files-4/2103.08413.json
Extracted sections for /Users/kuntal/Documents/Github/arxiv scraper/pdfs/2024/1810.00

IMAGE EXTRACTION 

In [None]:
import fitz  # PyMuPDF
import os

# Define the input and output directories
pdf_dir = "/Users/kuntal/Documents/Github/arxiv scraper/pdfs/2024/"
output_base_dir = "/Users/kuntal/Documents/Github/arxiv scraper/images-3"

# Ensure the output base directory exists
os.makedirs(output_base_dir, exist_ok=True)


def _save_raster_images(pdf_document, page, page_label, pdf_file, output_folder_path):
    """Write every embedded raster image referenced by *page* into *output_folder_path*.

    *page_label* is the 1-based page number used in file names and messages.
    A failure on one image is reported and does not stop the remaining ones.
    """
    image_list = page.get_images(full=True)
    if not image_list:
        return

    print(f"[+] Found {len(image_list)} raster image(s) on page {page_label} of {pdf_file}")
    for image_index, img in enumerate(image_list, start=1):
        try:
            # First item of each image entry is the xref of the image object.
            base_image = pdf_document.extract_image(img[0])
            image_filename = os.path.join(
                output_folder_path,
                f"page_{page_label}_raster_image_{image_index}.{base_image['ext']}",
            )
            with open(image_filename, "wb") as img_file:
                img_file.write(base_image["image"])
            print(f"[+] Raster image saved as {image_filename}")
        except Exception as e:
            print(f"[!] Could not process raster image {image_index} on page {page_label} of {pdf_file}: {e}")


def _render_page_if_vector(page, page_label, pdf_file, output_folder_path):
    """Render *page* to a 2x-scaled PNG when it contains vector drawings.

    Vector graphics are not listed by ``get_images()``; item 7 of an image
    entry is the image's name string, so comparing it with 0 can never
    detect them. ``page.get_drawings()`` returns the page's vector paths and
    is used as the presence check instead.
    """
    try:
        vector_image_present = bool(page.get_drawings())
    except Exception as e:
        print(f"[!] Error checking for vector images: {e}")
        vector_image_present = False

    # Render and save full page ONLY if vector images are present
    if not vector_image_present:
        return

    try:
        pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))  # Scale by 2x for better quality
        rendered_image_path = os.path.join(output_folder_path, f"page_{page_label}_vector_rendered.png")
        pix.save(rendered_image_path)
        print(f"[+] Rendered vector graphics page saved as {rendered_image_path}")
    except Exception as e:
        print(f"[!] Could not render page {page_label} of {pdf_file}: {e}")


# Iterate over each file in the PDF directory
for pdf_file in os.listdir(pdf_dir):
    if not pdf_file.endswith(".pdf"):
        continue

    pdf_path = os.path.join(pdf_dir, pdf_file)

    # Images of "<name>.pdf" go into "<output_base_dir>/<name>/"
    folder_name = os.path.splitext(pdf_file)[0]
    output_folder_path = os.path.join(output_base_dir, folder_name)
    os.makedirs(output_folder_path, exist_ok=True)

    try:
        pdf_document = fitz.open(pdf_path)
    except Exception as e:
        # One unreadable PDF must not abort the whole batch.
        print(f"[!] Could not open {pdf_file}: {e}")
        continue

    # The context manager closes the document even if a page fails unexpectedly.
    with pdf_document:
        for page_number in range(len(pdf_document)):
            page = pdf_document.load_page(page_number)
            _save_raster_images(pdf_document, page, page_number + 1, pdf_file, output_folder_path)
            _render_page_if_vector(page, page_number + 1, pdf_file, output_folder_path)

METADATA CREATION:

In [None]:
import os
import json
import uuid

# Root of the scraper project; every folder used below lives under it.
base_dir = '/Users/kuntal/Documents/Github/arxiv scraper'

# Inputs: per-paper text JSON, per-paper equation JSON, per-paper image folders.
# NOTE(review): the extraction cells in this notebook write to 'json_files-4'
# and 'images-3'; confirm those outputs were renamed/copied to the names below.
json_files_dir, equation_json_dir, images_dir = (
    os.path.join(base_dir, folder)
    for folder in ('json_files', 'equation_json', 'images')
)

# Output folder for the generated metadata files; created on first use.
output_dir = os.path.join(base_dir, 'metadata_output')
os.makedirs(output_dir, exist_ok=True)

# Helper that mints identifiers for papers, equations and tables
def generate_unique_id():
    """Return a newly generated random (version 4) UUID as a string."""
    fresh_uuid = uuid.uuid4()
    return f"{fresh_uuid}"

# Build one "<paper_id>_metadata.json" for every paper JSON in json_files_dir.
# Each metadata file lists the paper's images, equations and tables.
for paper_json_file in os.listdir(json_files_dir):
    if not paper_json_file.endswith('.json'):
        continue

    paper_id = os.path.splitext(paper_json_file)[0]
    paper_json_path = os.path.join(json_files_dir, paper_json_file)

    # The extraction step writes these files as UTF-8 with non-ASCII characters
    # kept verbatim, so read them back as UTF-8 regardless of the platform's
    # default encoding.
    with open(paper_json_path, 'r', encoding='utf-8') as f:
        paper_data = json.load(f)

    # Initialize the metadata dictionary
    metadata = {
        "paper_id": paper_data.get("paper_id", paper_id),
        "unique_id": generate_unique_id(),
        "text_file": f"{paper_id}.json",  # Direct path to the text file
        "images": [],
        "equations": [],  # Changed from 'formulas' to 'equations'
        "tables": []  # Direct table data is now here
    }

    # Process images: one entry per PNG/JPEG found in the paper's image folder.
    # isdir (rather than exists) avoids listing a path that is a plain file.
    paper_images_dir = os.path.join(images_dir, paper_id)
    if os.path.isdir(paper_images_dir):
        for image_file in os.listdir(paper_images_dir):
            if image_file.lower().endswith(('.png', '.jpg', '.jpeg')):
                # The image id is the file name without its extension.
                metadata["images"].append({
                    "image_id": os.path.splitext(image_file)[0],
                })

    # Process equations: only a JSON object mapping key -> LaTeX is used;
    # any other top-level JSON type is ignored.
    equation_json_file = os.path.join(equation_json_dir, f'{paper_id}.json')
    if os.path.exists(equation_json_file):
        with open(equation_json_file, 'r', encoding='utf-8') as eq_f:
            equations = json.load(eq_f)
        if isinstance(equations, dict):
            for eq_key, eq_latex in equations.items():
                metadata["equations"].append({
                    "equation_id": generate_unique_id(),
                    "equation_key": eq_key,  # Include the key as metadata
                    "latex": eq_latex  # Store the actual LaTeX code
                })

    # Process tables directly from JSON file
    for table in paper_data.get("Tables", []):
        metadata["tables"].append({
            "table_id": generate_unique_id(),
            "title": table.get("Table Title", ""),
            "description": table.get("Table Description", ""),
        })

    # Write the metadata to a JSON file
    output_file = os.path.join(output_dir, f'{paper_id}_metadata.json')
    with open(output_file, 'w', encoding='utf-8') as out_f:
        json.dump(metadata, out_f, indent=4)

    print(f"Metadata for paper {paper_id} has been created.")

FINAL FOLDER CREATION WITH PROPER FORMAT:


In [None]:
import os
import json
import shutil

# Root of the scraper project; every folder used below lives under it.
base_dir = '/Users/kuntal/Documents/Github/arxiv scraper'

# Inputs: text JSON, image folders, equation JSON and generated metadata.
# NOTE(review): the extraction cells in this notebook write to 'json_files-4'
# and 'images-3'; confirm those outputs were renamed/copied to the names below.
json_files_dir, images_dir, equation_json_dir, metadata_output_dir = (
    os.path.join(base_dir, folder)
    for folder in ('json_files', 'images', 'equation_json', 'metadata_output')
)

# Destination that receives one sub-folder per paper; created on first use.
output_dir = os.path.join(base_dir, '2024_arxic_papers_CS')
os.makedirs(output_dir, exist_ok=True)

def _first_value(record, *keys):
    """Return the first non-empty value found in *record* under *keys*, else ''."""
    for key in keys:
        value = record.get(key)
        if value:
            return value
    return ""


# Assemble one folder per paper under output_dir containing:
#   metadata.json, <paper_id>.json (text + tables), images/ and equations/.
for paper_json_file in os.listdir(json_files_dir):
    if not paper_json_file.endswith('.json'):
        continue

    paper_id = os.path.splitext(paper_json_file)[0]
    paper_json_path = os.path.join(json_files_dir, paper_json_file)

    # Create paper-specific folder structure
    paper_folder = os.path.join(output_dir, paper_id)
    os.makedirs(os.path.join(paper_folder, 'images'), exist_ok=True)
    os.makedirs(os.path.join(paper_folder, 'equations'), exist_ok=True)

    # Copy metadata from metadata_output if available; otherwise the paper's
    # own JSON is used as the starting point for metadata.json.
    metadata_file_path = os.path.join(metadata_output_dir, f'{paper_id}_metadata.json')
    metadata_path = os.path.join(paper_folder, 'metadata.json')
    if os.path.exists(metadata_file_path):
        shutil.copy(metadata_file_path, metadata_path)
    else:
        shutil.copy(paper_json_path, metadata_path)

    # Create paper_name.json with only text and tables.
    # The extraction step stores sections under capitalized keys ("Abstract",
    # "Introduction", "Title and Authors"), so look those up as well; reading
    # only the lowercase keys always produced empty title/abstract fields.
    with open(paper_json_path, 'r', encoding='utf-8') as f:
        paper_data = json.load(f)
    text_content = {
        "title": _first_value(paper_data, "title", "Title", "Title and Authors"),
        "abstract": _first_value(paper_data, "abstract", "Abstract"),
        "introduction": _first_value(paper_data, "Introduction", "introduction"),
        "tables": paper_data.get("Tables", [])
    }
    text_file_path = os.path.join(paper_folder, f'{paper_id}.json')
    with open(text_file_path, 'w', encoding='utf-8') as text_file:
        json.dump(text_content, text_file, indent=4)

    # Update metadata to include path to text file
    with open(metadata_path, 'r', encoding='utf-8') as metadata_file:
        metadata = json.load(metadata_file)
    metadata["text_file"] = os.path.join(paper_id, f'{paper_id}.json')
    with open(metadata_path, 'w', encoding='utf-8') as metadata_file:
        json.dump(metadata, metadata_file, indent=4)

    # Process images
    paper_images_dir = os.path.join(images_dir, paper_id)
    if os.path.isdir(paper_images_dir):
        for image_file in os.listdir(paper_images_dir):
            if image_file.lower().endswith(('.png', '.jpg', '.jpeg')):
                shutil.copy(
                    os.path.join(paper_images_dir, image_file),
                    os.path.join(paper_folder, 'images', image_file),
                )

    # Process equations (formerly formulas)
    equation_json_path = os.path.join(equation_json_dir, f'{paper_id}.json')
    if os.path.exists(equation_json_path):
        with open(equation_json_path, 'r', encoding='utf-8') as eq_f:
            equations = json.load(eq_f)

        # The metadata step treats this file as a {key: latex} mapping.
        # Iterating a dict yields its keys, so take the values to write the
        # LaTeX itself; a plain list is used as-is.
        if isinstance(equations, dict):
            equation_entries = list(equations.values())
        else:
            equation_entries = equations

        for idx, equation in enumerate(equation_entries, start=1):
            # NOTE(review): the "page_" part of the name is the running index,
            # not a real page number; kept to preserve existing file names.
            equation_filename = f'page_{idx}_equation_{idx}.txt'
            equation_path = os.path.join(paper_folder, 'equations', equation_filename)
            if not isinstance(equation, str):
                # write() needs text; serialize structured entries as JSON.
                equation = json.dumps(equation, ensure_ascii=False)
            with open(equation_path, 'w', encoding='utf-8') as equation_file:
                equation_file.write(equation)

    print(f"Organized data for paper {paper_id} into {paper_folder}")