In [None]:
# Add parent directory to path to import py222
import sys
from pathlib import Path

# Guard the insert so that re-running this cell does not keep stacking
# duplicate entries onto sys.path.
_parent_dir = str(Path.cwd().parent)
if _parent_dir not in sys.path:
    sys.path.insert(0, _parent_dir)

In [16]:
import pandas as pd
import numpy as np
from pathlib import Path
import json

In [None]:
def load_dataset_split(data_dir: "str | Path", split: str) -> dict:
    """Load the metadata and arrays stored for one dataset split.

    Reads ``dataset.json`` and the five ``all__*.npy`` files found in
    ``data_dir / split``.

    Args:
        data_dir: Directory that contains one sub-directory per split.
            Accepts a ``Path`` or a plain string path.
        split: Name of the split sub-directory (train/test/val).

    Returns:
        A dict holding the parsed JSON under ``'metadata'`` and one NumPy
        array under each of ``'inputs'``, ``'labels'``, ``'group_indices'``,
        ``'puzzle_indices'`` and ``'puzzle_identifiers'``.

    Raises:
        FileNotFoundError: If the split directory does not exist, or if one
            of the expected files inside it is missing.
    """
    # Coerce so that string paths work with the `/` operator below.
    split_dir = Path(data_dir) / split
    if not split_dir.exists():
        raise FileNotFoundError(f"Split {split} not found in {data_dir}")

    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(split_dir / "dataset.json", "r", encoding="utf-8") as f:
        metadata = json.load(f)

    # Each array lives in "all__<name>.npy" and is returned under "<name>".
    array_names = (
        'inputs',
        'labels',
        'group_indices',
        'puzzle_indices',
        'puzzle_identifiers',
    )
    dataset = {'metadata': metadata}
    for name in array_names:
        dataset[name] = np.load(split_dir / f"all__{name}.npy")
    return dataset

In [29]:
# Read the "train" split stored under data/cube-2-by-2-heuristic.
data = load_dataset_split(
    data_dir=Path("data/cube-2-by-2-heuristic"),
    split="train",
)

In [32]:
# One row per example: the whole input row as a Python list, the first
# label column, and the corresponding puzzle index.
df = pd.DataFrame({
    # ndarray.tolist() converts every row in one call; no per-row loop needed.
    'inputs': data['inputs'].tolist(),
    # Only the first label column is kept.
    'labels': data['labels'][:, 0],
    # Drop the last entry so this column has the same length as the others
    # (assumes puzzle_indices carries one trailing extra entry — TODO confirm).
    'puzzle_indices': data['puzzle_indices'][:-1],
})

In [33]:
# Preview the first rows of the assembled frame (inputs, labels, puzzle_indices).
df.head()

Unnamed: 0,inputs,labels,puzzle_indices
0,"[0, 1, 3, 0, 2, 2, 5, 2, 3, 5, 2, 1, 3, 0, 1, ...",9,0
1,"[0, 2, 3, 5, 2, 2, 5, 1, 4, 3, 0, 1, 3, 0, 1, ...",8,1
2,"[2, 5, 0, 3, 0, 1, 2, 2, 5, 1, 4, 3, 3, 0, 1, ...",7,2
3,"[2, 2, 0, 4, 0, 1, 2, 3, 5, 5, 3, 3, 3, 0, 1, ...",6,3
4,"[0, 2, 4, 2, 2, 3, 5, 5, 3, 3, 0, 1, 3, 0, 1, ...",5,4


In [34]:
# Spread each list stored in 'inputs' across columns, one column per list position.
df_inputs = pd.DataFrame(
    data=df['inputs'].tolist(),
)
# Keep the labels as a single-column frame named 'labels'.
df_outputs = df['labels'].to_frame()

In [22]:
# Preview the per-position input columns.
# NOTE(review): the saved output of this cell looks stale — it was executed
# before df_inputs was last rebuilt (execution count 22 vs 34) and shows
# values 1-6 while df.head() shows 0-5 in 'inputs'. Re-run to refresh.
df_inputs.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
0,4,1,2,6,3,1,5,2,5,2,...,2,6,4,3,4,5,3,5,1,6
1,2,4,6,1,5,2,5,2,3,1,...,2,6,4,3,4,5,3,5,1,6
2,2,5,6,6,5,2,5,4,3,4,...,2,3,1,3,2,5,3,4,1,1
3,6,6,5,2,3,4,6,1,5,2,...,2,3,1,3,2,5,3,4,1,1
4,6,6,6,4,3,3,2,6,5,2,...,3,1,2,3,2,5,1,5,1,1


In [23]:
# Summary statistics for every input position.
# NOTE(review): the saved output of this cell looks stale — it reports
# count 4534 while df_outputs.describe() reports 1021937 rows, although both
# frames are built from the same df. Re-run to refresh.
df_inputs.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
count,4534.0,4534.0,4534.0,4534.0,4534.0,4534.0,4534.0,4534.0,4534.0,4534.0,...,4534.0,4534.0,4534.0,4534.0,4534.0,4534.0,4534.0,4534.0,4534.0,4534.0
mean,4.416189,4.046317,3.943317,3.762682,3.597486,3.471769,3.360829,3.458756,3.341861,3.219012,...,3.335465,3.422805,3.207764,3.022717,3.88112,5.0,3.028231,3.425232,1.0,3.019629
std,1.844953,1.851554,1.86557,1.845371,1.41814,1.623094,1.461143,1.579833,1.648196,1.588587,...,1.384637,1.56656,1.687752,1.541446,1.698608,0.0,1.859645,1.833588,0.0,1.832932
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,5.0,1.0,1.0,1.0,1.0
25%,3.0,2.0,2.0,2.0,3.0,2.0,2.0,2.0,2.0,2.0,...,3.0,2.0,2.0,2.0,2.0,5.0,1.0,2.0,1.0,1.0
50%,5.0,4.0,4.0,4.0,4.0,4.0,3.0,3.0,3.0,3.0,...,3.0,3.0,3.0,2.0,4.0,5.0,3.0,3.0,1.0,3.0
75%,6.0,6.0,6.0,6.0,4.0,5.0,4.0,5.0,5.0,5.0,...,4.0,5.0,5.0,4.0,5.0,5.0,5.0,5.0,1.0,5.0
max,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,...,6.0,6.0,6.0,6.0,6.0,5.0,6.0,6.0,1.0,6.0


In [35]:
# Preview the first rows of the single-column label frame.
df_outputs.head()

Unnamed: 0,labels
0,9
1,8
2,7
3,6
4,5


In [36]:
# Summary statistics (count, mean, spread, quartiles, range) of the labels.
df_outputs.describe()

Unnamed: 0,labels
count,1021937.0
mean,4.618137
std,2.951133
min,0.0
25%,2.0
50%,5.0
75%,7.0
max,11.0
