# Baseline Control Pipeline Notebook
This notebook orchestrates the full training pipeline for the Siamese baseline model:
1. Split metadata into train/test sets
2. Generate positive/negative pairs
3. Train the baseline model


In [1]:
import os
import sys

# Compute directories.
# NOTE: repo_root is taken to be the parent of the current working directory,
# so the kernel must be started from a direct subdirectory of the repository.
cwd       = os.getcwd()
repo_root = os.path.abspath(os.path.join(cwd, '..'))
utils_dir = os.path.join(repo_root, 'utils')
train_dir = os.path.join(repo_root, 'train')

# Add to Python path.
# Any earlier entry is removed before inserting, so re-running this cell keeps
# both directories at the front of sys.path without piling up duplicates.
# train_dir is inserted last and therefore ends up ahead of utils_dir.
for module_dir in (utils_dir, train_dir):
    while module_dir in sys.path:
        sys.path.remove(module_dir)
    sys.path.insert(0, module_dir)

# Imports
from sklearn.model_selection import train_test_split
from pair_generator import PairGen          # in utils/
from data_loader import ShapePairDataset    # in utils/
import train_baseline                       # in train/

print("Imports succeeded!")


Imports succeeded!


In [2]:
# Parameters

# --- Data locations ---
# Path to the raw metadata JSONL file containing view records
metadata = 'data/raw/metadata.jsonl'
# Directory containing raw rendered images
raw_dir = 'data/raw'
# Directory where processed metadata and pairs will be saved
work_dir = 'data/processed'

# --- Dataset construction ---
# Fraction of data to reserve for the test set
test_size = 0.2
# Number of negative samples to generate per positive sample
neg_ratio = 1

# --- Training ---
# Batch size for training and evaluation
batch_size = 32
# Number of training epochs to run
epochs = 10
# Random seed for reproducibility
seed = 42


In [3]:
# Split metadata into train/test
import json
from pathlib import Path

# Compute paths: `metadata` is resolved against the parent of the working directory
repo_root = Path(os.getcwd()).parent
meta_path = repo_root / metadata

# Load all view records (one JSON document per line).
# JSON text is UTF-8, so decode explicitly instead of relying on the platform
# default encoding, and skip blank lines (e.g. a trailing newline at end of
# file) that would otherwise make json.loads raise.
with open(meta_path, 'r', encoding='utf-8') as f:
    records = [json.loads(line) for line in f if line.strip()]

# Perform the split.
# NOTE(review): records are shuffled and split individually. If several records
# describe views of the same underlying object, those views can land on both
# sides of the split -- confirm whether a group-aware split is needed here.
train_records, test_records = train_test_split(
    records,
    test_size=test_size,
    random_state=seed,
    shuffle=True
)

# Helper to write out JSONL files
def write_jsonl(path, data):
    """Write every item of `data` to `path` as one JSON document per line.

    Args:
        path: Destination file (string or Path). Missing parent directories
            are created; an existing file is overwritten.
        data: Iterable of JSON-serializable objects.
    """
    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)
    with target.open('w') as sink:
        sink.writelines(f"{json.dumps(item)}\n" for item in data)

# Destination files for the two splits, both under <repo_root>/<work_dir>
train_meta_out = Path(repo_root, work_dir, 'metadata_train.jsonl')
test_meta_out  = Path(repo_root, work_dir, 'metadata_test.jsonl')

# Persist each split next to its destination path (train first, then test)
for _split_path, _split_records in (
    (train_meta_out, train_records),
    (test_meta_out, test_records),
):
    write_jsonl(_split_path, _split_records)

# Report the size of each split
print(f"Split metadata: {len(train_records)} train, {len(test_records)} test")


Split metadata: 14400 train, 3600 test


In [5]:
# Generate positive/negative pairs
import os
from pathlib import Path

# Resolve the processed-data directory under the parent of the working directory
repo_root = Path(os.getcwd()).parent
work_dir_path = repo_root / work_dir

# Output files for the generated pairs, one per split
train_pairs_out = work_dir_path / 'pairs_train.jsonl'
test_pairs_out  = work_dir_path / 'pairs_test.jsonl'

# Both splits go through PairGen with the same negative-sampling ratio;
# only the progress message and the input/output files differ.
pair_jobs = (
    ("Generating training pairs...", train_meta_out, train_pairs_out),
    ("Generating test pairs...",     test_meta_out,  test_pairs_out),
)
for banner, meta_file, pairs_file in pair_jobs:
    print(banner)
    generator = PairGen(
        input_file=str(meta_file),
        output_file=str(pairs_file),
        neg_samples_per_pos=neg_ratio,
    )
    generator.run()

# Report where the pairs ended up, relative to the repository root
rel_train = train_pairs_out.relative_to(repo_root)
rel_test  = test_pairs_out.relative_to(repo_root)
print("Pairs written to:")
for rel_path in (rel_train, rel_test):
    print(f"  {rel_path}")


Generating training pairs...
Reading metadata from ..\data\processed\metadata_train.jsonl...
Found 14400 metadata entries.
Generated 46096 positive pairs.
Generated 46096 negative pairs.
Saving 92192 pairs to ..\data\processed\pairs_train.jsonl...
Done.
Generating test pairs...
Reading metadata from ..\data\processed\metadata_test.jsonl...
Found 3600 metadata entries.
Generated 2896 positive pairs.
Generated 2896 negative pairs.
Saving 5792 pairs to ..\data\processed\pairs_test.jsonl...
Done.
Pairs written to:
  data\processed\pairs_train.jsonl
  data\processed\pairs_test.jsonl
