In [3]:
# ├── 00.pre-process.ipynb
# ├── 01.clustering.ipynb
# ├── 02.training.ipynb
# ├── 03.inference-preprocess.ipynb
# ├── 04.inference.ipynb
# ├── 05.post-processing.ipynb
# ├── 06.evaluation.ipynb

In [13]:
import os

# Directories that must exist before the pipeline notebooks are run.
paths = [
    "data/input",
    "data/intermediate",
    "artifacts",
    "data/output",
    "checkpoints",
]

# exist_ok=True keeps this cell safe to re-run: existing directories are left alone.
for path in paths:
    os.makedirs(path, exist_ok=True)
    print(f"✅ Ensured directory exists: {path}")

✅ Ensured directory exists: data/input
✅ Ensured directory exists: data/intermediate
✅ Ensured directory exists: artifacts
✅ Ensured directory exists: data/output
✅ Ensured directory exists: checkpoints


In [14]:
import os
import sys
import time

import papermill as pm


def run_notebook(input_notebook, output_notebook=None, parameters=None):
    """Execute a notebook with papermill and report how long it took.

    Args:
        input_notebook (str): Path of the notebook to execute.
        output_notebook (str, optional): Path where the executed copy is
            written. When omitted (or empty), ``.executed`` is inserted before
            the input's file extension, e.g. ``nb.ipynb`` becomes
            ``nb.executed.ipynb``.
        parameters (dict, optional): Parameters forwarded to papermill;
            an empty dict is used when omitted.

    Raises:
        Exception: Any error raised during execution is reported on stdout
            and then re-raised unchanged.
    """
    # Resolve the destination once so the path handed to papermill and the
    # path shown in the "Done" message can never disagree.
    if output_notebook:
        output_path = output_notebook
    else:
        # splitext only touches the final extension; str.replace would also
        # rewrite ".ipynb" inside a directory name, and would leave the path
        # unchanged (overwriting the input) when the extension is missing.
        root, ext = os.path.splitext(input_notebook)
        output_path = f"{root}.executed{ext}"

    try:
        print(f"\n📓 Running notebook: {input_notebook}")
        # perf_counter is monotonic, so the measured duration is not affected
        # by system clock adjustments.
        start_time = time.perf_counter()

        pm.execute_notebook(
            input_path=input_notebook,
            output_path=output_path,
            parameters=parameters or {},
            log_output=True,
            autosave_cell_every=60,
            stdout_file=sys.stdout,
            stderr_file=sys.stderr,
        )

        duration = time.perf_counter() - start_time
        print(f"✅ Done: {output_path} (⏱️ {duration:.2f} seconds)")
    except Exception as e:
        print(f"❌ Error in {input_notebook}: {e}")
        raise


def main():
    """Run the pipeline notebooks one after another and print the total time.

    Each executed copy is written under ``executed/`` using the same file
    name as its source notebook. Nothing is caught here, so an exception
    from any notebook run stops the remaining ones.
    """
    pipeline = (
        "00.pre-process.ipynb",
        "02.training.ipynb",
        "03.inference-preprocess.ipynb",
        "04.inference.ipynb",
        "05.post-processing.ipynb",
        "06.evaluation.ipynb",
    )

    os.makedirs("executed", exist_ok=True)
    began = time.time()

    # Strictly sequential, in the order listed above.
    for notebook in pipeline:
        run_notebook(notebook, f"executed/{notebook}")

    elapsed = time.time() - began
    print(f"\n✅ Flow has completed successfully in {elapsed:.2f} seconds!")


# Entry-point guard: start the pipeline only when this code runs as the main
# module (the recorded output below shows it did when the cell was executed),
# not when it is imported from elsewhere.
if __name__ == "__main__":
    main()


📓 Running notebook: 00.pre-process.ipynb


Executing:   0%|          | 0/3 [00:00<?, ?cell/s]

  freq = pd.tseries.frequencies.to_offset(class_group.freq)

  freq = pd.tseries.frequencies.to_offset(class_group.freq)


Using device: cuda


Scaling series:   0%|          | 0/1428 [00:00<?, ?it/s]
Scaling series:   0%|          | 1/1428 [00:00<03:11,  7.46it/s]
Scaling series:  32%|███▏      | 453/1428 [00:00<00:00, 2333.18it/s]
Scaling series:  64%|██████▎   | 908/1428 [00:00<00:00, 3270.63it/s]
Scaling series:  88%|████████▊ | 1260/1428 [00:00<00:00, 3127.67it/s]
Scaling series: 100%|██████████| 1428/1428 [00:00<00:00, 2880.93it/s]
Scaling series: 100%|██████████| 1428/1428 [00:00<00:00, 2880.93it/s]




✅ Scaled data saved to: data/intermediate/m3-monthly_scaled.parquet
✅ Scaler stats saved to: artifacts/m3-monthly_scalers.json
✅ Done: executed/00.pre-process.ipynb (⏱️ 3.71 seconds)

📓 Running notebook: 02.training.ipynb


Executing:   0%|          | 0/6 [00:00<?, ?cell/s]

Using 16bit Automatic Mixed Precision (AMP)

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores

TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

HPU available: False, using: 0 HPUs
/home/pranav-pc/projects/ts/.venv/lib/python3.12/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /home/pranav-pc/projects/ts/nbs/pipeline/checkpoints exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

/home/pranav-pc/projects/ts/.venv/lib/python3.12/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /home/pranav-pc/projects/ts/nbs/pipeline/checkpoints exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the conf

[{'test_loss': 0.013208459131419659, 'test_smape': 0.27940425276756287, 'test_mase': 0.0001561021344969049, 'test_owa': 0.7268368005752563}]


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



[{'val_loss': 0.01312445942312479, 'val_smape': 0.2746376097202301, 'val_mase': 0.0001552953035570681, 'val_owa': 0.6890231966972351}]
✅ Done: executed/02.training.ipynb (⏱️ 92.80 seconds)

📓 Running notebook: 03.inference-preprocess.ipynb


Executing:   0%|          | 0/5 [00:00<?, ?cell/s]

  infer_df = df.groupby("unique_id").apply(extract_series).dropna()

  infer_df = df.groupby("unique_id").apply(extract_series).dropna()


✅ Done: executed/03.inference-preprocess.ipynb (⏱️ 2.29 seconds)

📓 Running notebook: 04.inference.ipynb


Executing:   0%|          | 0/5 [00:00<?, ?cell/s]

✅ Done: executed/04.inference.ipynb (⏱️ 4.28 seconds)

📓 Running notebook: 05.post-processing.ipynb


Executing:   0%|          | 0/4 [00:00<?, ?cell/s]

Inverting:   0%|          | 0/1108 [00:00<?, ?it/s]
Inverting:   5%|▍         | 55/1108 [00:00<00:01, 549.73it/s]
Inverting:  35%|███▌      | 388/1108 [00:00<00:00, 2184.29it/s]
Inverting:  65%|██████▌   | 721/1108 [00:00<00:00, 2705.56it/s]
Inverting:  95%|█████████▌| 1054/1108 [00:00<00:00, 2951.77it/s]
Inverting: 100%|██████████| 1108/1108 [00:00<00:00, 2658.56it/s]
Inverting: 100%|██████████| 1108/1108 [00:00<00:00, 2658.56it/s]




✅ Done: executed/05.post-processing.ipynb (⏱️ 2.29 seconds)

📓 Running notebook: 06.evaluation.ipynb


Executing:   0%|          | 0/6 [00:00<?, ?cell/s]

✅ Done: executed/06.evaluation.ipynb (⏱️ 1.92 seconds)

✅ Flow has completed successfully in 107.28 seconds!


In [13]:
## Parallel execution

In [12]:
# import papermill as pm
# from concurrent.futures import ThreadPoolExecutor, as_completed

# def run_notebook(input_notebook, parameters=None):
#     """
#     Executes a notebook in place: the executed output overwrites the input notebook.

#     Args:
#         input_notebook (str): Path to the input notebook.
#         parameters (dict, optional): Parameters to pass into the notebook.
#     """
#     try:
#         print(f"Running notebook: {input_notebook}")
#         pm.execute_notebook(input_notebook, input_notebook, parameters=parameters or {})
#         print(f"✅ Done: {input_notebook}")
#     except Exception as e:
#         print(f"Error occurred during notebook execution: {str(e)}")
#         raise e

# def main():
#     # List of notebooks to run
#     notebooks = [
#         '00.pre-process.ipynb',
#         # '01.feature-engineering.ipynb',
#         # '02.model-training.ipynb',
#         '03.post-processing.ipynb'
#     ]

#     # Using ThreadPoolExecutor to run notebooks in parallel
#     with ThreadPoolExecutor(max_workers=4) as executor:
#         futures = [executor.submit(run_notebook, notebook) for notebook in notebooks]

#         # Wait for all futures to complete
#         for future in as_completed(futures):
#             future.result()  # If any exceptions were raised, it will be propagated here

#     print("Flow has completed successfully!")

# if __name__ == "__main__":
#     main()

Running notebook: 00.pre-process.ipynbRunning notebook: 03.post-processing.ipynb



Executing:   0%|          | 0/3 [00:00<?, ?cell/s]

Executing:   0%|          | 0/3 [00:00<?, ?cell/s]

✅ Done: 03.post-processing.ipynb
✅ Done: 00.pre-process.ipynb
Flow has completed successfully!
