# Demo: Parallel Data Processing Pipeline

This notebook demonstrates how to use the multi-threaded capability of the processing pipeline to speed up data extraction from LaTeX papers.

In [1]:
import sys
import os

# Make the `src` package importable. It lives under '23127011', a sibling of
# the 'notebooks' folder, so this relies on the kernel's working directory
# being 'Milestone2_Project/notebooks'.
current_dir = os.getcwd()
project_root = os.path.split(current_dir)[0]  # parent folder, i.e. Milestone2_Project
src_parent_dir = os.path.join(project_root, '23127011')

# Register the folder only once so re-running this cell does not keep
# growing sys.path with duplicates.
if sys.path.count(src_parent_dir) == 0:
    sys.path.append(src_parent_dir)

# This import only resolves after the sys.path tweak above.
from src.pipeline import run_processing_pipeline

print(f"Project Root: {project_root}")
print("Successfully imported `run_processing_pipeline`.")

Project Root: d:\Coding\School\Y3-K1\Intro2DS\DS - LAB 2\Milestone2_Project
Successfully imported `run_processing_pipeline`.


In [2]:
# Configuration for this demo.
# Input is read from the 'data_raw_real' folder under the project root;
# results go to a separate 'data_output_v2' folder.
DATA_RAW = os.path.join(project_root, 'data_raw_real')
DATA_OUTPUT = os.path.join(project_root, 'data_output_v2')

# os.cpu_count() may return None when the count cannot be determined;
# fall back to 4 workers in that case.
MAX_WORKERS = os.cpu_count() or 4

# Echo the effective settings, one per line.
print(
    f"Data Raw Path: {DATA_RAW}",
    f"Data Output Path: {DATA_OUTPUT}",
    f"Max Workers: {MAX_WORKERS}",
    sep="\n",
)

Data Raw Path: d:\Coding\School\Y3-K1\Intro2DS\DS - LAB 2\Milestone2_Project\data_raw_real
Data Output Path: d:\Coding\School\Y3-K1\Intro2DS\DS - LAB 2\Milestone2_Project\data_output_v2
Max Workers: 20


In [None]:
# Launch the processing pipeline in parallel mode with MAX_WORKERS workers.
# NOTE(review): the message below assumes the pipeline writes its log to
# 'pipeline.log' inside the output folder — confirm against src.pipeline.
pipeline_log_path = os.path.join(DATA_OUTPUT, 'pipeline.log')
print(f"Starting parallel execution... Logs will be written to {pipeline_log_path}")

run_processing_pipeline(
    DATA_RAW,
    DATA_OUTPUT,
    parallel=True,
    max_workers=MAX_WORKERS,
)

print("Execution finished.")

Starting parallel execution... Logs will be written to d:\Coding\School\Y3-K1\Intro2DS\DS - LAB 2\Milestone2_Project\data_output_v2\pipeline.log


In [None]:
# Verify outputs: report how many per-paper folders exist in DATA_OUTPUT and
# list the contents of one of them as a spot check.
if os.path.isdir(DATA_OUTPUT):
    # Keep only sub-directories. The output folder can also hold plain files
    # (the run cell announces a 'pipeline.log' there); counting those would
    # inflate the total, and picking one as the sample would make the
    # os.listdir() call below fail. Sorting makes the sample deterministic.
    output_folders = sorted(
        entry
        for entry in os.listdir(DATA_OUTPUT)
        if os.path.isdir(os.path.join(DATA_OUTPUT, entry))
    )
    print(f"Total papers processed: {len(output_folders)}")
    if output_folders:
        print(f"Sample output: {output_folders[:5]}")

        # Check contents of one folder
        sample_paper = output_folders[0]
        sample_path = os.path.join(DATA_OUTPUT, sample_paper)
        print(f"\nContents of {sample_paper}:")
        for f in sorted(os.listdir(sample_path)):
            print(f" - {f}")
else:
    print("Output directory was not created.")