## Example - Simple Station Distance Calculation

A clean, simple example that simulates your bike rental scenario:
- **Reader**: Loads data files and creates station lookup table
- **Workers**: Calculate distances between stations with caching
- **Single lock**: Simple approach for shared data access

In [11]:
import multiprocessing as mp
import random
import time

def data_reader(queue, lookup_table, lock, num_pairs=30, total_files=3):
    """Produce station pairs for the workers, one simulated data file at a time.

    For file number ``f`` (1-based) the station ids ``1 .. 5 + f`` are
    registered in ``lookup_table`` with a random coordinate; ids that are
    already present keep their coordinate.  Then ``num_pairs`` random
    (start, end) pairs are drawn and every pair whose two stations differ is
    put on ``queue``.  After the last file a single ``None`` is queued as the
    stop signal for the workers.

    Args:
        queue: queue receiving ``(start_station, end_station)`` tuples.
        lookup_table: mapping ``station_id -> coordinate``, updated in place.
        lock: context manager held while ``lookup_table`` is updated.
        num_pairs: number of pairs drawn per file.
        total_files: number of simulated files.
    """
    print("Reader: Starting...")

    for file_no in range(1, total_files + 1):
        max_station = 5 + file_no
        print(f"Reader: Processing file {file_no}/{total_files}")

        # Register the stations this file introduces.
        with lock:
            missing = [
                sid for sid in range(1, max_station + 1)
                if sid not in lookup_table
            ]
            for sid in missing:
                # Simulated station coordinate
                lookup_table[sid] = random.uniform(10.0, 100.0)

        # Emit the trips of this file.
        for _ in range(num_pairs):
            pair = (random.randint(1, max_station), random.randint(1, max_station))
            if pair[0] != pair[1]:
                queue.put(pair)
            time.sleep(0.1)  # simulate file reading

        time.sleep(0.5)  # simulate file processing

    # Tell the workers there is nothing more to come.
    queue.put(None)
    print("Reader: Finished")

def distance_worker(worker_id, queue, lookup_table, distance_cache, results, lock):
    """Consume station pairs from ``queue`` and fill ``distance_cache``.

    Each item is a ``(start_station, end_station)`` tuple.  If the pair is
    already a key of ``distance_cache`` it only counts as processed.
    Otherwise the two coordinates are read from ``lookup_table``, the
    distance ``abs(coord1 - coord2)`` is computed after a simulated delay,
    and the result is stored in the cache.  A ``None`` item stops the worker.

    When the worker stops it appends
    ``{"worker_id", "processed", "calculated"}`` to ``results``.

    Args:
        worker_id: identifier used in log lines and in the stats record.
        queue: queue of ``(start, end)`` tuples terminated by ``None``.
        lookup_table: mapping ``station_id -> coordinate``.
        distance_cache: mapping ``(start, end) -> distance``, updated in place.
        results: list receiving this worker's stats record.
        lock: context manager guarding every access to the shared containers.

    Raises:
        KeyError: if a station of an uncached pair is not in ``lookup_table``.
    """
    print(f"Worker {worker_id}: Starting...")
    processed = 0
    calculated = 0

    while True:
        item = queue.get()

        if item is None:
            queue.put(None)  # pass shutdown signal to other workers
            break

        start_station, end_station = item
        cache_key = (start_station, end_station)  # directional: (1,2) != (2,1)

        # Touch the shared containers under the lock, but only as long as
        # needed: one membership test and, on a miss, the two coordinates.
        with lock:
            cached = cache_key in distance_cache
            if not cached:
                coord1 = lookup_table[start_station]
                coord2 = lookup_table[end_station]

        if cached:
            # The cached value itself is not needed here, so it is not read.
            processed += 1
            continue

        # Simulate slow distance calculation (like OSM routing).  The lock is
        # deliberately not held here so other workers can proceed.
        time.sleep(0.5)
        distance = abs(coord1 - coord2)  # simple distance

        # Save to cache
        with lock:
            distance_cache[cache_key] = distance

        processed += 1
        calculated += 1

    with lock:
        results.append(
            {"worker_id": worker_id, "processed": processed, "calculated": calculated}
        )

    print(f"Worker {worker_id}: Finished with {processed} processed, {calculated} calculated distances")

In [12]:
import multiprocessing as mp
import threading
import time

# Shared state is created through one Manager; the processes below receive
# these objects as arguments.
manager = mp.Manager()
queue = manager.Queue()          # (start_station, end_station) work items
lookup_table = manager.dict()    # station_id -> coordinate
distance_cache = manager.dict()  # (start_station, end_station) -> distance
results = manager.list()         # one stats dict per worker
lock = manager.Lock()            # single lock for all shared data

# Configuration
num_workers = 3
total_files = 3

# One producer: 20 pairs per file, total_files files.
reader_process = mp.Process(
    target=data_reader,
    args=(queue, lookup_table, lock, 20, total_files),
)

# num_workers consumers; they differ only in their id.
shared_worker_args = (queue, lookup_table, distance_cache, results, lock)
worker_processes = [
    mp.Process(target=distance_worker, args=(worker_id, *shared_worker_args))
    for worker_id in range(num_workers)
]

print(f"Starting 1 reader and {num_workers} workers...")
start_time = time.time()

# Launch the reader first, then the workers.
reader_process.start()
for worker in worker_processes:
    worker.start()

# Block until everything has exited.
reader_process.join()
for worker in worker_processes:
    worker.join()

end_time = time.time()
print(f"\n✅ Completed in {end_time - start_time:.2f} seconds")

Starting 1 reader and 3 workers...
Reader: Starting...
Reader: Processing file 1/3
Worker 0: Starting...
Worker 1: Starting...


Worker 2: Starting...
Reader: Processing file 2/3
Reader: Processing file 3/3
Reader: FinishedWorker 2: Finished with 19 processed, 10 calculated distancesWorker 1: Finished with 15 processed, 11 calculated distances

Worker 0: Finished with 17 processed, 11 calculated distances


✅ Completed in 7.70 seconds


In [14]:
import pandas as pd
# Build the per-worker stats table.  The columns are named explicitly so the
# frame still has "worker_id" / "processed" / "calculated" when `results` is
# empty; without them an empty list produces a frame with no columns at all.
workers_results = pd.DataFrame(
    list(results),
    columns=["worker_id", "processed", "calculated"],
)
workers_results

Unnamed: 0,worker_id,processed,calculated
0,2,19,10
1,1,15,11
2,0,17,11


In [18]:
# Analyze results: sizes of the shared tables first.
print("\n📊 Results Summary:")
print(f"Stations in lookup table: {len(lookup_table)}")
print(f"Cached routes: {len(distance_cache)}")

# Totals over all workers.  An item that was processed but not calculated
# was answered from the cache.
total_processed = workers_results['processed'].sum()
total_calculated = workers_results['calculated'].sum()
cached_count = total_processed - total_calculated

# Guarded so the hit rate is never computed as a division by zero.
if total_processed > 0:
    print(f"Total items processed: {total_processed}")
    print(f"Calculated (new): {total_calculated}")
    print(f"Cached (reused): {cached_count}")
    print(f"Cache hit rate: {cached_count/total_processed*100:.1f}%")

# Show worker performance, one line per stats record.
print("\n👥 Worker Performance:")
for result in workers_results.to_dict('records'):
    worker_id = result['worker_id']
    processed = result['processed']
    calculated = result['calculated']
    cached = processed - calculated
    print(f"  Worker {worker_id}: {processed} total ({calculated} calculated, {cached} cached)")

# Show sample cache entries; keys are (start, end) tuples.
print("\n🗂️ Sample cached routes (first 8):")
for (start, end), distance in list(distance_cache.items())[:8]:
    print(f"  Route {start}→{end}: Distance = {distance:.2f}")


📊 Results Summary:
Stations in lookup table: 8
Cached routes: 30
Total items processed: 51
Calculated (new): 32
Cached (reused): 19
Cache hit rate: 37.3%

👥 Worker Performance:
  Worker 2: 19 total (10 calculated, 9 cached)
  Worker 1: 15 total (11 calculated, 4 cached)
  Worker 0: 17 total (11 calculated, 6 cached)

🗂️ Sample cached routes (first 8):
  Route 2→4: Distance = 13.14
  Route 1→5: Distance = 55.21
  Route 4→3: Distance = 50.38
  Route 5→1: Distance = 55.21
  Route 3→1: Distance = 12.87
  Route 1→2: Distance = 50.65
  Route 4→2: Distance = 13.14
  Route 5→4: Distance = 17.70


In [19]:
# List every known station with its coordinate, in ascending station-id order.
station_rows = sorted(lookup_table.items())
for station_id, coord in station_rows:
    print(f"  Station {station_id}: Coordinate {coord:.2f}")

  Station 1: Coordinate 74.77
  Station 2: Coordinate 24.12
  Station 3: Coordinate 87.64
  Station 4: Coordinate 37.26
  Station 5: Coordinate 19.56
  Station 6: Coordinate 98.48
  Station 7: Coordinate 95.07
  Station 8: Coordinate 28.38


## Example of a process with dashboard

In [29]:
import multiprocessing as mp
import time, random, math
from queue import Empty

# ── shared counters ──────────────────────────────
# Four independent integer counters ('i' typecode), all starting at 0:
#   files_done      - files processed
#   items_in_q      - queue length
#   items_done      - total pairs processed
#   workers_waiting - idle workers
files_done, items_in_q, items_done, workers_waiting = (
    mp.Value('i', 0) for _ in range(4)
)

def inc(var, n=1):
    """Add ``n`` (default 1, may be negative) to ``var.value`` under ``var``'s lock."""
    guard = var.get_lock()
    with guard:
        var.value = var.value + n

# ── producer ─────────────────────────────────────
def reader(q, total_files=3, pairs_per_file=20):
    """Feed ``q`` with random integer pairs, then a ``(None, None)`` poison pill.

    For each of ``total_files`` simulated files, ``pairs_per_file`` pairs of
    integers in 1..99 are queued; ``items_in_q`` is bumped per pair and
    ``files_done`` per file.
    """
    for _file in range(total_files):
        for _pair in range(pairs_per_file):
            first = random.randint(1, 99)
            second = random.randint(1, 99)
            q.put((first, second))
            inc(items_in_q)
        inc(files_done)
        time.sleep(0.5)  # simulate file load

    # poison pill: tells the consumers to stop
    q.put((None, None))

# ── consumer ─────────────────────────────────────
def worker(wid, q):
    """Consume integer pairs from ``q`` until the ``(None, None)`` poison pill.

    While waiting on the queue the worker is counted in ``workers_waiting``.
    Each real pair triggers a dummy ``math.gcd`` plus a 0.1 s sleep, after
    which ``items_in_q`` is decremented and ``items_done`` incremented.  The
    poison pill is put back on the queue before the worker stops.

    Args:
        wid: worker id (not used by the body).
        q: queue of ``(a, b)`` tuples terminated by ``(None, None)``.
    """
    while True:
        inc(workers_waiting)         # about to block
        try:
            a, b = q.get(timeout=1)
        except Empty:
            continue                 # nothing within 1 s: poll again
        finally:
            # Runs on every way out of the try (item, timeout, or any other
            # exception), so the idle counter cannot be left incremented.
            inc(workers_waiting, -1)

        if a is None:
            q.put((None, None))      # pass poison pill
            break

        math.gcd(a, b)               # dummy work
        time.sleep(0.1)
        inc(items_in_q, -1)
        inc(items_done)

# ── tiny dashboard (parent process) ─────────────
def monitor(total_files, nworkers, procs):
    """Show a one-line dashboard until every process in ``procs`` has exited.

    The line is rewritten in place (leading ``\\r``) every 0.2 s from the
    module-level counters ``files_done``, ``items_in_q``, ``workers_waiting``
    and ``items_done``.  After the last process exits the line is drawn once
    more so the final numbers are shown, not a snapshot up to 0.2 s old.

    Args:
        total_files: denominator shown next to the files counter.
        nworkers: denominator shown next to the waiting counter.
        procs: objects exposing ``is_alive()``.
    """
    def render():
        print(f"\rFiles {files_done.value}/{total_files} | "
              f"Queue {items_in_q.value} | "
              f"Waiting {workers_waiting.value}/{nworkers} | "
              f"Processed {items_done.value}", end="", flush=True)

    while any(p.is_alive() for p in procs):
        render()
        time.sleep(0.2)

    render()  # final snapshot after all processes have finished
    print()   # newline when done

# ── bootstrap ───────────────────────────────────
if __name__ == '__main__':
    total_files = 3
    nworkers = 3
    q = mp.Queue()

    # The producer goes first in the list, followed by the consumers.
    procs = [
        mp.Process(target=reader, args=(q, total_files)),
        *(mp.Process(target=worker, args=(i, q)) for i in range(nworkers)),
    ]

    for p in procs:
        p.start()

    # Runs in the parent and returns once no process is alive any more.
    monitor(total_files, nworkers, procs)

    for p in procs:
        p.join()


Files 1/3 | Queue 20 | Waiting 0/3 | Processed 0

Files 3/3 | Queue 1 | Waiting 0/3 | Processed 597


## Bike trip with dashboard

In [30]:
"""
Compact bike-trip example with a live one-line dashboard.

‣ One reader process:
    – adds new “station” coordinates,
    – emits <start,end> pairs for each fake CSV file.

‣ N worker processes:
    – pull pairs, look up / cache a distance, simulate a slow call.

Shared counters give the dashboard:
    Files 1/3 | Queue 14 | Waiting 2/3 | Processed 26
"""

import multiprocessing as mp, time, random, math
from queue import Empty          # non-blocking Queue ops

# ───────────────────────── helpers ──────────────────────────────
def inc(val, n=1):
    """Add ``n`` to ``val.value`` while holding the lock from ``val.get_lock()``."""
    with val.get_lock():
        val.value = val.value + n

# ──────────────────────── producer ──────────────────────────────
def data_reader(queue,
                lookup_table, lock,
                total_files, pairs_per_file,
                items_in_q, files_done):
    """Produce station pairs for ``total_files`` simulated files.

    For file number ``f`` (1-based), stations ``1 .. 5 + f`` get a random
    coordinate in ``lookup_table`` unless they already have one.  Then
    ``pairs_per_file`` random pairs are drawn; a pair of two different
    stations is queued and counted in ``items_in_q``, a pair of identical
    stations is dropped without any delay.  ``files_done`` is bumped after
    each file, and ``(None, None)`` is queued at the very end.
    """
    for file_no in range(1, total_files + 1):
        top_station = 5 + file_no

        # new stations for this "file"
        with lock:
            for station in range(1, top_station + 1):
                lookup_table.setdefault(station, random.uniform(10.0, 100.0))

        # emit trip pairs
        for _ in range(pairs_per_file):
            start = random.randint(1, top_station)
            end = random.randint(1, top_station)
            if start != end:
                queue.put((start, end))
                inc(items_in_q)
                time.sleep(0.3)      # simulate reading delay

        inc(files_done)              # one file done
        time.sleep(1.5)              # simulate file processing

    # poison pill
    queue.put((None, None))

# ──────────────────────── consumer ─────────────────────────────
def distance_worker(wid, queue,
                    lookup_table, distance_cache, results, lock,
                    items_in_q, items_done, workers_waiting):
    """Consume ``(a, b)`` station pairs until the ``(None, None)`` pill arrives.

    For each pair the distance is taken from ``distance_cache`` or, on a
    miss, computed as ``abs(lookup_table[a] - lookup_table[b])`` and stored.
    Lookup and insert happen under one ``lock`` acquisition.  Every pair,
    cached or not, is followed by a 0.5 s simulated slow call before the
    dashboard counters are updated.

    On shutdown the pill is put back on the queue and a record
    ``{"worker", "processed", "calculated"}`` is appended to ``results``.

    Args:
        wid: worker id stored in the stats record.
        queue: queue of ``(a, b)`` tuples terminated by ``(None, None)``.
        lookup_table: mapping ``station_id -> coordinate``.
        distance_cache: mapping ``(a, b) -> distance``, updated in place.
        results: list receiving this worker's stats record.
        lock: context manager guarding the shared containers.
        items_in_q: counter decremented once per finished pair.
        items_done: counter incremented once per finished pair.
        workers_waiting: counter held at +1 while this worker waits on the queue.
    """
    processed = calculated = 0

    while True:
        inc(workers_waiting)         # going to block
        try:
            a, b = queue.get(timeout=1)
        except Empty:                # nothing to do
            continue
        finally:
            # Runs on every way out of the try (item, timeout, or any other
            # exception), so the idle counter cannot be left incremented.
            inc(workers_waiting, -1)

        if a is None:                # shutdown signal
            queue.put((None, None))
            break

        key = (a, b)
        with lock:                   # cache lookup / insert
            dist = distance_cache.get(key)
            if dist is None:
                dist = abs(lookup_table[a] - lookup_table[b])
                distance_cache[key] = dist
                calculated += 1

        time.sleep(0.5)              # simulate "slow" call
        processed += 1
        inc(items_in_q, -1)
        inc(items_done)

    with lock:
        results.append({"worker": wid,
                        "processed": processed,
                        "calculated": calculated})

# ────────────────── tiny dashboard in parent ───────────────────
def monitor(total_files, nworkers,
            files_done, items_in_q, workers_waiting, items_done,
            procs):
    """Show a one-line dashboard until every process in ``procs`` has exited.

    The line is rewritten in place (leading ``\\r``) every 0.2 s from the
    ``.value`` of the four counters.  After the last process exits the line
    is drawn once more so the final numbers are shown, not a snapshot up to
    0.2 s old; a newline then ends the dashboard line.

    Args:
        total_files: denominator shown next to the files counter.
        nworkers: denominator shown next to the waiting counter.
        files_done, items_in_q, workers_waiting, items_done: objects whose
            ``.value`` attribute is displayed.
        procs: objects exposing ``is_alive()``.
    """
    def render():
        print(f"\rFiles {files_done.value}/{total_files} | "
              f"Queue {items_in_q.value} | "
              f"Waiting {workers_waiting.value}/{nworkers} | "
              f"Processed {items_done.value}",
              end="", flush=True)

    while any(p.is_alive() for p in procs):
        render()
        time.sleep(0.2)

    render()                         # final snapshot after everything exited
    print()                          # newline when finished

In [31]:
# ──────────────────────── bootstrap ────────────────────────────
# mp.set_start_method("spawn")     # safe on all OSes

TOTAL_FILES = 3
PAIRS_PER_FILE = 20
NWORKERS = 3

# Work queue plus manager-backed containers handed to every process.
queue = mp.Queue()
manager = mp.Manager()
lookup_table   = manager.dict()      # station_id → coord
distance_cache = manager.dict()      # (a,b)     → dist
results        = manager.list()      # one stats dict per worker
lock           = manager.Lock()

# shared dashboard counters
files_done      = mp.Value('i', 0)
items_in_q      = mp.Value('i', 0)
items_done      = mp.Value('i', 0)
workers_waiting = mp.Value('i', 0)

# NOTE(review): this rebinds `reader`, which the earlier dashboard example
# defines as a function; confirm that example is not re-run after this cell.
reader = mp.Process(
    target=data_reader,
    args=(queue, lookup_table, lock,
          TOTAL_FILES, PAIRS_PER_FILE,
          items_in_q, files_done),
)

workers = [
    mp.Process(
        target=distance_worker,
        args=(wid, queue,
              lookup_table, distance_cache, results, lock,
              items_in_q, items_done, workers_waiting),
    )
    for wid in range(NWORKERS)
]

procs = [reader] + workers
for p in procs:
    p.start()

# Dashboard runs in the parent until no process is alive any more.
monitor(TOTAL_FILES, NWORKERS,
        files_done, items_in_q, workers_waiting, items_done,
        procs)

for p in procs:
    p.join()

print("\nPer-worker stats:", list(results))
print("Cached distances :", len(distance_cache))


Files 3/3 | Queue 0 | Waiting 0/3 | Processed 55

Per-worker stats: [{'worker': 2, 'processed': 18, 'calculated': 9}, {'worker': 1, 'processed': 18, 'calculated': 10}, {'worker': 0, 'processed': 19, 'calculated': 11}]
Cached distances : 30
