In [None]:
# ========================================
# LIGN 167 Final Project: Spatial Navigation Dataset Generator
# Modeling Place vs. Heading Representations in Language Models
# ========================================

import pandas as pd
import numpy as np
import random
from itertools import product
import json
from collections import defaultdict

# Set seeds for reproducibility. Both global RNGs are seeded because both are
# used later in this notebook: the stdlib `random` module drives the
# combination shuffle and every per-scene attribute draw, while
# `DataFrame.sample` (called without a random_state when pairing chambers)
# falls back to NumPy's global RNG.
random.seed(42)
np.random.seed(42)

print("✓ Imports complete")

✓ Imports complete


In [None]:
# ========================================
# 1. DEFINE ATTRIBUTE SPACE
# ========================================

# Geometry options
GEOMETRIES = ['square', 'rectangular', 'circular']

# Non-geometric cues (place identity)
WALL_COLORS = ['red', 'blue', 'green', 'yellow']
FLOOR_TEXTURES = ['wooden', 'tiled', 'carpeted']
OBJECTS = ['lamp', 'table', 'chair', 'painting']

# Spatial layout (heading identity)
CARDINAL_DIRECTIONS = ['north', 'south', 'east', 'west']

# The object and the door each sit on one cardinal wall, but the dataset
# generator never places them on the same wall, so there are n * (n - 1)
# ordered (object_side, door_side) pairs rather than n * n.
N_SIDE_PAIRS = len(CARDINAL_DIRECTIONS) * (len(CARDINAL_DIRECTIONS) - 1)

# Number of distinct valid scenes (ignoring which text template renders them).
TOTAL_SCENE_COMBINATIONS = (
    len(GEOMETRIES)
    * len(WALL_COLORS)
    * len(FLOOR_TEXTURES)
    * len(OBJECTS)
    * N_SIDE_PAIRS
)

print("Attribute Space:")
print(f"  Geometries: {len(GEOMETRIES)}")
print(f"  Wall Colors: {len(WALL_COLORS)}")
print(f"  Floor Textures: {len(FLOOR_TEXTURES)}")
print(f"  Objects: {len(OBJECTS)}")
print(f"  Directions: {len(CARDINAL_DIRECTIONS)}")
print(f"\nTotal possible combinations: {TOTAL_SCENE_COMBINATIONS}")

Attribute Space:
  Geometries: 3
  Wall Colors: 4
  Floor Textures: 3
  Objects: 4
  Directions: 4

Total possible combinations: 2304


In [None]:
# ========================================
# 2. TEXT GENERATION TEMPLATES
# ========================================

def generate_room_description(geometry, wall_color, floor_texture,
                              object_type, object_side, door_side,
                              template_id=1):
    """
    Render a room's attributes as a short English description.

    Four alternative phrasings (template_id 1-4) express the same six
    attributes, so a model cannot rely on one fixed surface pattern.

    Args:
        geometry, wall_color, floor_texture, object_type: Room attributes
            inserted verbatim into the text.
        object_side, door_side: Wall/side names for the object and the door.
        template_id: Which phrasing to use (1-4).

    Returns:
        The formatted description string.

    Raises:
        KeyError: If template_id is not one of the defined phrasings.
    """
    # NOTE(review): phrasing 4 opens its second sentence with the colour word
    # exactly as passed in (e.g. lowercase "red walls surround you"). Kept
    # as-is so generated text is unchanged.
    phrasings = {
        1: ("You are in a {geometry} room with {wall_color} walls and a "
            "{floor_texture} floor. There is a {object_type} on the "
            "{object_side} wall and a door on the {door_side} wall."),
        2: ("This is a {geometry} chamber. The walls are {wall_color} and the "
            "floor is {floor_texture}. A {object_type} is positioned on the "
            "{object_side} side, and you can see a door on the {door_side} "
            "side."),
        3: ("The {geometry} room has {wall_color} walls with a "
            "{floor_texture} floor. On the {object_side} wall is a "
            "{object_type}. The door is located on the {door_side} wall."),
        4: ("You find yourself in a {geometry} space. {wall_color} walls "
            "surround you, and the floor beneath is {floor_texture}. A "
            "{object_type} sits on the {object_side} wall, while the exit "
            "door is on the {door_side} wall."),
    }

    chosen = phrasings[template_id]
    return chosen.format(
        geometry=geometry,
        wall_color=wall_color,
        floor_texture=floor_texture,
        object_type=object_type,
        object_side=object_side,
        door_side=door_side,
    )

# Smoke-test the generator with the first phrasing
test_desc = generate_room_description(
    geometry='square',
    wall_color='red',
    floor_texture='wooden',
    object_type='lamp',
    object_side='north',
    door_side='east',
    template_id=1,
)
print("Example description:", test_desc, sep="\n")


Example description:
You are in a square room with red walls and a wooden floor. There is a lamp on the north wall and a door on the east wall.


In [None]:
# ========================================
# 3. TRAIN/TEST SPLIT STRATEGY (OPTION A)
# ========================================

def create_train_test_splits(wall_colors=None, geometries=None,
                             train_frac=0.7, max_attempts=1000):
    """
    Option A: Held-out color+geometry combinations

    Strategy:
    - TRAIN: Most color+geometry combinations
    - TEST: Held-out combinations never seen during training

    This forces compositional generalization and prevents
    lexical overlap heuristics.

    For the held-out test to measure *compositional* generalization, every
    individual color and every individual geometry must still occur in at
    least one train combination. A plain random split does not guarantee
    that, so the shuffle is repeated until the train side covers all atomic
    values. The first shuffle is used whenever it already satisfies this, so
    results for a seed whose first shuffle was valid are unchanged.

    Args:
        wall_colors: Colors to combine; defaults to the module-level
            WALL_COLORS.
        geometries: Geometries to combine; defaults to the module-level
            GEOMETRIES.
        train_frac: Fraction of combinations assigned to train (floored).
        max_attempts: Maximum number of shuffles to try before giving up.

    Returns:
        (train_combos, test_combos): two disjoint lists of
        (wall_color, geometry) tuples.

    Raises:
        ValueError: If no split covering every color and geometry in train
            is found within max_attempts shuffles (e.g. train_frac too small).
    """
    colors = WALL_COLORS if wall_colors is None else list(wall_colors)
    geoms = GEOMETRIES if geometries is None else list(geometries)

    # All possible color+geometry combinations
    all_combinations = list(product(colors, geoms))

    # train_frac train, remainder test (held-out combinations)
    split_idx = int(len(all_combinations) * train_frac)

    required_colors = set(colors)
    required_geoms = set(geoms)

    for _attempt in range(max_attempts):
        # Shuffle and split
        random.shuffle(all_combinations)
        train_combos = all_combinations[:split_idx]
        test_combos = all_combinations[split_idx:]

        train_colors = {color for color, _geom in train_combos}
        train_geoms = {geom for _color, geom in train_combos}
        if train_colors == required_colors and train_geoms == required_geoms:
            break
    else:
        raise ValueError(
            "Could not find a train/test split in which every color and "
            f"geometry appears in train after {max_attempts} attempts "
            f"(train_frac={train_frac})."
        )

    print(f"Total color+geometry combinations: {len(all_combinations)}")
    print(f"Train combinations: {len(train_combos)}")
    print(f"Test combinations (held-out): {len(test_combos)}")
    print(f"\nExample train combos: {train_combos[:3]}")
    print(f"Example test combos: {test_combos[:3]}")

    return train_combos, test_combos

# Module-level split consumed by generate_dataset: train/val scenes are drawn
# from TRAIN_COMBOS, test scenes from TEST_COMBOS.
TRAIN_COMBOS, TEST_COMBOS = create_train_test_splits()

Total color+geometry combinations: 12
Train combinations: 8
Test combinations (held-out): 4

Example train combos: [('green', 'rectangular'), ('blue', 'circular'), ('red', 'circular')]
Example test combos: [('blue', 'rectangular'), ('red', 'square'), ('red', 'rectangular')]


In [None]:
# ========================================
# 4. DATASET GENERATION
# ========================================

def _sample_scene(combos, scene_index, split, split_condition):
    """
    Draw one random scene and return it as a record dict.

    Args:
        combos: Pool of (wall_color, geometry) tuples to sample from.
        scene_index: Running integer used to build the zero-padded scene_id.
        split: Value stored in the record's 'split' field.
        split_condition: Value stored in the record's 'split_condition' field.

    Returns:
        Dict with the scene text, its attributes, place/heading IDs and
        split metadata.
    """
    # The order of the random.choice calls below determines which values a
    # given seed produces, so it must not be rearranged.
    wall_color, geometry = random.choice(combos)
    floor_texture = random.choice(FLOOR_TEXTURES)
    object_type = random.choice(OBJECTS)
    object_side = random.choice(CARDINAL_DIRECTIONS)

    # Ensure door is on different wall than object
    available_door_sides = [d for d in CARDINAL_DIRECTIONS if d != object_side]
    door_side = random.choice(available_door_sides)

    # Random template for variation
    template_id = random.choice([1, 2, 3, 4])

    text = generate_room_description(
        geometry, wall_color, floor_texture,
        object_type, object_side, door_side,
        template_id
    )

    return {
        'scene_id': f'scene_{scene_index:04d}',
        'text': text,
        'geometry': geometry,
        'wall_color': wall_color,
        'floor_texture': floor_texture,
        'object_type': object_type,
        'object_side': object_side,
        'door_side': door_side,
        # Place identity: the non-geometric cues plus where the object sits
        'place_id': f"{wall_color}-{floor_texture}-{object_type}-{object_side}",
        # Heading identity: room geometry plus where the door is
        'heading_id': f"{geometry}-{door_side}",
        'split': split,
        'split_condition': split_condition,
        'template_id': template_id
    }


def generate_dataset(n_samples=2000, val_split=0.15, train_frac=0.7):
    """
    Generate synthetic spatial navigation dataset.

    Train and validation scenes are sampled from TRAIN_COMBOS; test scenes
    are sampled from the held-out TEST_COMBOS. Scenes are independent random
    draws, so the same scene (and text) can occur more than once.

    NOTE(review): because draws are with replacement, an identical text can
    land in both 'train' and 'val'. Deduplicate downstream if that matters.

    Args:
        n_samples: Total number of scenes to generate
        val_split: Proportion of train data to use for validation
        train_frac: Proportion of n_samples drawn from the train
            combinations (train + val); the remainder is test.

    Returns:
        DataFrame with all scenes and metadata

    Raises:
        ValueError: If n_samples is negative or a fraction is outside [0, 1].
    """
    if n_samples < 0:
        raise ValueError(f"n_samples must be non-negative, got {n_samples}")
    if not 0 <= val_split <= 1:
        raise ValueError(f"val_split must be in [0, 1], got {val_split}")
    if not 0 <= train_frac <= 1:
        raise ValueError(f"train_frac must be in [0, 1], got {train_frac}")

    # Calculate target samples per split
    n_train_val = int(n_samples * train_frac)
    n_test = n_samples - n_train_val
    n_val = int(n_train_val * val_split)
    n_train = n_train_val - n_val

    def _pct(count):
        # Avoid ZeroDivisionError when an empty dataset is requested
        return count / n_samples * 100 if n_samples else 0.0

    print("Target distribution:")
    print(f"  Train: {n_train} ({_pct(n_train):.1f}%)")
    print(f"  Val: {n_val} ({_pct(n_val):.1f}%)")
    print(f"  Test: {n_test} ({_pct(n_test):.1f}%)")
    print()

    data = []

    # Generate train/val samples: the first n_train draws are 'train',
    # the remaining n_val draws are 'val'
    for i in range(n_train_val):
        split = 'train' if i < n_train else 'val'
        data.append(
            _sample_scene(TRAIN_COMBOS, len(data), split, 'seen_combination')
        )

        # Progress indicator
        if (i + 1) % 200 == 0:
            print(f"  Generated {i + 1}/{n_train_val} train/val samples...")

    # Generate test samples (HELD-OUT combinations)
    for i in range(n_test):
        data.append(
            _sample_scene(TEST_COMBOS, len(data), 'test', 'held_out_color_geometry')
        )

        # Progress indicator
        if (i + 1) % 100 == 0:
            print(f"  Generated {i + 1}/{n_test} test samples...")

    df = pd.DataFrame(data)
    print(f"\n✓ Generated {len(df)} total scenes")
    return df

# Generate the dataset. `df` is the module-level frame that the statistics,
# sample-display, pairing and validation cells below all read from.
print("Generating dataset...\n")
df = generate_dataset(n_samples=2000)


Generating dataset...

Target distribution:
  Train: 1190 (59.5%)
  Val: 210 (10.5%)
  Test: 600 (30.0%)

  Generated 200/1400 train/val samples...
  Generated 400/1400 train/val samples...
  Generated 600/1400 train/val samples...
  Generated 800/1400 train/val samples...
  Generated 1000/1400 train/val samples...
  Generated 1200/1400 train/val samples...
  Generated 1400/1400 train/val samples...
  Generated 100/600 test samples...
  Generated 200/600 test samples...
  Generated 300/600 test samples...
  Generated 400/600 test samples...
  Generated 500/600 test samples...
  Generated 600/600 test samples...

✓ Generated 2000 total scenes


In [None]:
# ========================================
# 5. DATASET STATISTICS & VALIDATION
# ========================================

print("="*50)
print("DATASET STATISTICS")
print("="*50)

print("\n1. Split Distribution:")
print(df['split'].value_counts().sort_index())
print(f"\nPercentages:")
print(df['split'].value_counts(normalize=True).sort_index() * 100)

print("\n2. Unique IDs:")
print(f"  Unique place_ids: {df['place_id'].nunique()}")
print(f"  Unique heading_ids: {df['heading_id'].nunique()}")

print("\n3. Attribute Coverage:")
for col in ['geometry', 'wall_color', 'floor_texture', 'object_type']:
    print(f"  {col}: {df[col].nunique()} unique values")

print("\n4. Split Condition Verification:")
print(df.groupby('split')['split_condition'].value_counts())

# Verify held-out combinations. The validation split is drawn from the same
# pool as train, so it is checked against test as well: a combination that
# shows up in val and test would be just as much of a leak.
print("\n5. Color+Geometry Combination Check:")
combos_by_split = {
    split_name: set(zip(split_rows['wall_color'], split_rows['geometry']))
    for split_name, split_rows in df.groupby('split')
}
train_color_geo = combos_by_split.get('train', set())
val_color_geo = combos_by_split.get('val', set())
test_color_geo = combos_by_split.get('test', set())
overlap = (train_color_geo | val_color_geo).intersection(test_color_geo)
print(f"  Train color+geo combinations: {len(train_color_geo)}")
print(f"  Val color+geo combinations: {len(val_color_geo)}")
print(f"  Test color+geo combinations: {len(test_color_geo)}")
print(f"  Overlap (should be 0): {len(overlap)}")
if len(overlap) == 0:
    print("  ✓ PASS: No overlap between train and test combinations!")
else:
    print(f"  ✗ FAIL: Found overlap: {overlap}")

DATASET STATISTICS

1. Split Distribution:
split
test      600
train    1190
val       210
Name: count, dtype: int64

Percentages:
split
test     30.0
train    59.5
val      10.5
Name: proportion, dtype: float64

2. Unique IDs:
  Unique place_ids: 192
  Unique heading_ids: 12

3. Attribute Coverage:
  geometry: 3 unique values
  wall_color: 4 unique values
  floor_texture: 3 unique values
  object_type: 4 unique values

4. Split Condition Verification:
split  split_condition        
test   held_out_color_geometry     600
train  seen_combination           1190
val    seen_combination            210
Name: count, dtype: int64

5. Color+Geometry Combination Check:
  Train color+geo combinations: 8
  Test color+geo combinations: 4
  Overlap (should be 0): 0
  ✓ PASS: No overlap between train and test combinations!


In [None]:
# ========================================
# 6. DISPLAY SAMPLE SCENES
# ========================================

banner_rule = "=" * 50
print(banner_rule)
print("SAMPLE SCENES")
print(banner_rule)

# Show the first three scenes of each split with the same set of columns
preview_columns = ['scene_id', 'text', 'place_id', 'heading_id']
preview_sections = [
    ("\nTRAIN examples:", 'train'),
    ("\n\nVAL examples:", 'val'),
    ("\n\nTEST examples (held-out combinations):", 'test'),
]
for section_title, split_name in preview_sections:
    print(section_title)
    preview = df.loc[df['split'] == split_name, preview_columns].head(3)
    print(preview.to_string(index=False))


SAMPLE SCENES

TRAIN examples:
  scene_id                                                                                                                                                             text                      place_id        heading_id
scene_0000                               You are in a rectangular room with green walls and a wooden floor. There is a lamp on the south wall and a door on the north wall.       green-wooden-lamp-south rectangular-north
scene_0001                                  The circular room has green walls with a carpeted floor. On the south wall is a painting. The door is located on the east wall. green-carpeted-painting-south     circular-east
scene_0002 This is a rectangular chamber. The walls are green and the floor is wooden. A painting is positioned on the east side, and you can see a door on the south side.    green-wooden-painting-east rectangular-south


VAL examples:
  scene_id                                                               

In [None]:
# ========================================
# 7. CREATE PAIRED SCENES (JULIAN STYLE)
# ========================================

def create_paired_chambers(df, n_pairs=50):
    """
    Create explicit paired scenes with same heading_id but different place_id
    (mimicking Julian et al.'s "two chambers" paradigm)

    For each pair a heading is chosen at random among headings that occur
    with at least two distinct place_ids. One scene with that heading is
    drawn, then the second scene is drawn only from rows whose place_id
    differs from the first, so place_1 != place_2 is guaranteed.

    Args:
        df: Scene table with 'scene_id', 'heading_id' and 'place_id' columns.
        n_pairs: Number of pairs to create.

    Returns:
        DataFrame with columns pair_id, heading_id, scene_1, place_1,
        scene_2, place_2, same_heading, same_place. Empty (but with these
        columns) when no heading has two or more distinct place_ids.
    """
    pair_columns = ['pair_id', 'heading_id', 'scene_1', 'place_1',
                    'scene_2', 'place_2', 'same_heading', 'same_place']
    pairs = []

    # Get heading_ids with multiple place_ids
    heading_counts = df.groupby('heading_id')['place_id'].nunique()
    valid_headings = heading_counts[heading_counts >= 2].index.tolist()

    # With no eligible heading there is nothing to pair (random.choice would
    # raise on an empty list), so the loop is skipped entirely.
    if valid_headings:
        for _ in range(n_pairs):
            # Pick a random heading
            heading = random.choice(valid_headings)
            candidates = df[df['heading_id'] == heading]

            # Draw the first scene, then restrict the second draw to scenes
            # from a different place. `others` is never empty because the
            # heading has at least two distinct place_ids.
            first = candidates.sample(1).iloc[0]
            others = candidates[candidates['place_id'] != first['place_id']]
            second = others.sample(1).iloc[0]

            pairs.append({
                'pair_id': len(pairs),
                'heading_id': heading,
                'scene_1': first['scene_id'],
                'place_1': first['place_id'],
                'scene_2': second['scene_id'],
                'place_2': second['place_id'],
                'same_heading': True,
                'same_place': first['place_id'] == second['place_id']
            })

    pairs_df = pd.DataFrame(pairs, columns=pair_columns)
    print(f"Created {len(pairs_df)} paired chambers")
    print(f"All pairs have same_heading=True, same_place=False")
    return pairs_df

# Build 100 same-heading pairs from the full scene table and preview them
pairs_df = create_paired_chambers(df, n_pairs=100)
print("\nExample pairs:", pairs_df.head(), sep="\n")


Created 100 paired chambers
All pairs have same_heading=True, same_place=False

Example pairs:
   pair_id         heading_id     scene_1                      place_1  \
0        0  rectangular-south  scene_1676  blue-carpeted-painting-west   
1        1      circular-east  scene_1275  green-wooden-painting-north   
2        2   rectangular-east  scene_0540        green-tiled-lamp-west   
3        3     circular-north  scene_0297    blue-carpeted-table-south   
4        4  rectangular-south  scene_1516   blue-wooden-painting-north   

      scene_2                    place_2  same_heading  same_place  
0  scene_1696      blue-tiled-chair-west          True       False  
1  scene_0978  yellow-wooden-chair-north          True       False  
2  scene_1545    red-carpeted-lamp-south          True       False  
3  scene_0235      blue-tiled-lamp-south          True       False  
4  scene_1722      red-wooden-table-west          True       False  


In [None]:
# ========================================
# 7.5. FINAL VALIDATION BEFORE SAVING
# ========================================

print("="*50)
print("FINAL VALIDATION CHECKS")
print("="*50)

# Collected descriptions of failed checks; decides the closing banner
validation_failures = []

# Check 1: Split sizes
# Expected counts correspond to generate_dataset(n_samples=2000) with its
# default split proportions; update them if those arguments change.
print("\n1. Split Size Check:")
expected = {'train': 1190, 'val': 210, 'test': 600}
actual = df['split'].value_counts().to_dict()
for split, exp_count in expected.items():
    # A split with no rows is absent from value_counts; treat it as 0
    # instead of raising KeyError.
    act_count = actual.get(split, 0)
    size_ok = act_count == exp_count
    status = "✓" if size_ok else "✗"
    if not size_ok:
        validation_failures.append(f"{split} size {act_count} != {exp_count}")
    print(f"  {status} {split}: {act_count} (expected {exp_count})")

# Check 2: No null values
print("\n2. Null Value Check:")
null_counts = df.isnull().sum()
if null_counts.sum() == 0:
    print("  ✓ No null values found")
else:
    validation_failures.append("null values present")
    print(f"  ✗ Found null values:\n{null_counts[null_counts > 0]}")

# Check 3: Text length distribution
# NOTE(review): this adds a 'text_length' column to df itself, so it will be
# included if df is later written to CSV — confirm that is intended.
print("\n3. Text Length Check:")
df['text_length'] = df['text'].str.len()
print(f"  Mean length: {df['text_length'].mean():.1f} chars")
print(f"  Min length: {df['text_length'].min()} chars")
print(f"  Max length: {df['text_length'].max()} chars")

# Check 4: Template distribution
print("\n4. Template Distribution:")
print(df['template_id'].value_counts().sort_index())

# Check 5: Attribute coverage per split (informational only: the held-out
# test combinations need not span every color or geometry)
print("\n5. Attribute Coverage per Split:")
for split in ['train', 'val', 'test']:
    split_df = df[df['split'] == split]
    print(f"\n  {split.upper()}:")
    print(f"    Geometries: {split_df['geometry'].nunique()}/{len(GEOMETRIES)}")
    print(f"    Colors: {split_df['wall_color'].nunique()}/{len(WALL_COLORS)}")
    print(f"    Textures: {split_df['floor_texture'].nunique()}/{len(FLOOR_TEXTURES)}")
    print(f"    Objects: {split_df['object_type'].nunique()}/{len(OBJECTS)}")

print("\n" + "="*50)
if validation_failures:
    # Do not claim readiness when a check above reported a failure
    print("VALIDATION FAILED - " + "; ".join(validation_failures))
else:
    print("VALIDATION COMPLETE - Ready to save!")
print("="*50)

FINAL VALIDATION CHECKS

1. Split Size Check:
  ✓ train: 1190 (expected 1190)
  ✓ val: 210 (expected 210)
  ✓ test: 600 (expected 600)

2. Null Value Check:
  ✓ No null values found

3. Text Length Check:
  Mean length: 144.2 chars
  Min length: 115 chars
  Max length: 178 chars

4. Template Distribution:
template_id
1    485
2    477
3    506
4    532
Name: count, dtype: int64

5. Attribute Coverage per Split:

  TRAIN:
    Geometries: 3/3
    Colors: 4/4
    Textures: 3/3
    Objects: 4/4

  VAL:
    Geometries: 3/3
    Colors: 4/4
    Textures: 3/3
    Objects: 4/4

  TEST:
    Geometries: 2/3
    Colors: 3/4
    Textures: 3/3
    Objects: 4/4

VALIDATION COMPLETE - Ready to save!


In [None]:
# # ========================================
# # 8. SAVE DATASET
# # ========================================

# # Save main dataset
# df.to_csv('spatial_navigation_dataset.csv', index=False)
# print("✓ Saved main dataset to 'spatial_navigation_dataset.csv'")

# # Save paired chambers
# pairs_df.to_csv('paired_chambers.csv', index=False)
# print("✓ Saved paired chambers to 'paired_chambers.csv'")

# # Save train/val/test splits separately (useful for loading later)
# df[df['split']=='train'].to_csv('train.csv', index=False)
# df[df['split']=='val'].to_csv('val.csv', index=False)
# df[df['split']=='test'].to_csv('test.csv', index=False)
# print("✓ Saved individual split files (train.csv, val.csv, test.csv)")

# # Save metadata as JSON
# metadata = {
#     'total_scenes': len(df),
#     'train_count': len(df[df['split']=='train']),
#     'val_count': len(df[df['split']=='val']),
#     'test_count': len(df[df['split']=='test']),
#     'unique_place_ids': int(df['place_id'].nunique()),
#     'unique_heading_ids': int(df['heading_id'].nunique()),
#     'split_strategy': 'held_out_color_geometry_combinations',
#     'geometries': GEOMETRIES,
#     'wall_colors': WALL_COLORS,
#     'floor_textures': FLOOR_TEXTURES,
#     'objects': OBJECTS,
#     'cardinal_directions': CARDINAL_DIRECTIONS,
#     'num_templates': 4,
#     'train_color_geometry_combos': [[c, g] for c, g in TRAIN_COMBOS],
#     'test_color_geometry_combos': [[c, g] for c, g in TEST_COMBOS],
# }

# with open('dataset_metadata.json', 'w') as f:
#     json.dump(metadata, f, indent=2)
# print("✓ Saved metadata to 'dataset_metadata.json'")

# print("\n" + "="*50)
# print("ALL FILES SAVED SUCCESSFULLY!")
# print("="*50)
# print("\nFiles created:")
# print("  1. spatial_navigation_dataset.csv - Full dataset")
# print("  2. paired_chambers.csv - Julian-style paired scenes")
# print("  3. train.csv - Training split only")
# print("  4. val.csv - Validation split only")
# print("  5. test.csv - Test split only")
# print("  6. dataset_metadata.json - Dataset configuration and stats")