# Predicting the shape of the output grid
In the ARC-AGI-1 and ARC-AGI-2 datasets, many output grid shapes can be predicted at test time based only on the shape of the train input/output grid pair.

In [122]:
from arcexplore import *
import polars as pl

# Creation of Input/Output complete Dataframe
Loading data from python iterator into a polars data frame. The grids will be saved as nested polars lists.

In [176]:
rows = [(id,k,d,tt,it.input.tolist(), it.output.tolist()) for k,d,id,tt,_,it in ARC_DATA.data_iter()]
complete_scheme = ("ID","Source", "Set Type","Task Type","Input","Output")
Source = pl.Categorical(["ARC-AGI-1","ARC-AGI-2"])
SetType = pl.Categorical(["training","evaluation"])
TaskType = pl.Categorical(["train","test"])
complete_frame = pl.DataFrame(rows,
                              schema = complete_scheme,
                              schema_overrides = {
                                  "Source": Source,
                                  "Set Type": SetType,
                                  "Task Type": TaskType,
                                  "Input": pl.List(pl.List(pl.Int8)),
                                  "Output": pl.List(pl.List(pl.Int8))
                              },
                              orient="row")

display(complete_frame)

ID,Source,Set Type,Task Type,Input,Output
str,cat,cat,cat,list[list[i8]],list[list[i8]]
"""007bbfb7""","""ARC-AGI-1""","""training""","""train""","[[0, 7, 7], [7, 7, 7], [0, 7, 7]]","[[0, 0, … 7], [0, 0, … 7], … [0, 0, … 7]]"
"""007bbfb7""","""ARC-AGI-1""","""training""","""train""","[[4, 0, 4], [0, 0, 0], [0, 4, 0]]","[[4, 0, … 4], [0, 0, … 0], … [0, 0, … 0]]"
"""007bbfb7""","""ARC-AGI-1""","""training""","""train""","[[0, 0, 0], [0, 0, 2], [2, 0, 2]]","[[0, 0, … 0], [0, 0, … 0], … [2, 0, … 2]]"
"""007bbfb7""","""ARC-AGI-1""","""training""","""train""","[[6, 6, 0], [6, 0, 0], [0, 6, 6]]","[[6, 6, … 0], [6, 0, … 0], … [0, 0, … 6]]"
"""007bbfb7""","""ARC-AGI-1""","""training""","""train""","[[2, 2, 2], [0, 0, 0], [0, 2, 2]]","[[2, 2, … 2], [0, 0, … 0], … [0, 0, … 2]]"
…,…,…,…,…,…
"""faa9f03d""","""ARC-AGI-2""","""evaluation""","""test""","[[0, 0, … 0], [0, 0, … 0], … [0, 0, … 0]]","[[0, 0, … 0], [0, 0, … 0], … [0, 0, … 0]]"
"""fc7cae8d""","""ARC-AGI-2""","""evaluation""","""train""","[[4, 4, … 4], [6, 0, … 5], … [6, 0, … 5]]","[[1, 1, … 1], [1, 3, … 1], … [1, 1, … 1]]"
"""fc7cae8d""","""ARC-AGI-2""","""evaluation""","""train""","[[5, 5, … 1], [0, 0, … 1], … [3, 3, … 3]]","[[2, 2, … 2], [2, 8, … 2], … [2, 2, … 2]]"
"""fc7cae8d""","""ARC-AGI-2""","""evaluation""","""train""","[[2, 0, … 5], [2, 2, … 0], … [4, 4, … 4]]","[[8, 8, … 0], [8, 8, … 8], … [8, 8, … 8]]"


# Creation of Shape Frame
For this analysis, we are only concerned with the shapes of the input and output grids

In [124]:
grid_shape_cols = (pl.col("Input").list.len().alias("Input Rows"),
                   pl.col("Input").list.get(0).list.len().alias("Input Columns"),
                   pl.col("Output").list.len().alias("Output Rows"),
                   pl.col("Output").list.get(0).list.len().alias("Output Columns"))

shape_frame = (complete_frame.
               with_columns(*grid_shape_cols).
               drop("Input","Output"))
display(shape_frame)

ID,Source,Set Type,Task Type,Input Rows,Input Columns,Output Rows,Output Columns
str,cat,cat,cat,u32,u32,u32,u32
"""007bbfb7""","""ARC-AGI-1""","""training""","""train""",3,3,9,9
"""007bbfb7""","""ARC-AGI-1""","""training""","""train""",3,3,9,9
"""007bbfb7""","""ARC-AGI-1""","""training""","""train""",3,3,9,9
"""007bbfb7""","""ARC-AGI-1""","""training""","""train""",3,3,9,9
"""007bbfb7""","""ARC-AGI-1""","""training""","""train""",3,3,9,9
…,…,…,…,…,…,…,…
"""faa9f03d""","""ARC-AGI-2""","""evaluation""","""test""",12,12,12,12
"""fc7cae8d""","""ARC-AGI-2""","""evaluation""","""train""",22,22,11,15
"""fc7cae8d""","""ARC-AGI-2""","""evaluation""","""train""",22,22,10,10
"""fc7cae8d""","""ARC-AGI-2""","""evaluation""","""train""",22,22,5,5


# Shape Types
There are different ways the output shape could depend on the input shape. Each of these categories can apply to the rows, columns or both
- Fixed: This dimension is a fixed value $\alpha$ for any case $$R_{out} = \alpha$$ $$C_{out} = \alpha$$

In [171]:
rows_fixed = (pl.col("Output Rows").n_unique() == 1).alias("Rows Fixed")
columns_fixed = (pl.col("Output Columns").n_unique() == 1).alias("Columns Fixed")

single_same_factor_expr = lambda a,b: ((a-b*(a//b)).n_unique() == 1)
same_factor_expr = lambda col1, col2: single_same_factor_expr(pl.col(col1),pl.col(col2)) | single_same_factor_expr(pl.col(col2),pl.col(col1)) | (pl.col(col1).n_unique() == 1)
rows_divisible = same_factor_expr("Output Rows", "Input Rows").alias("Rows Divisible")
rows_swapped = same_factor_expr("Output Rows", "Input Columns").alias("Rows Swapped")

cols_divisible = same_factor_expr("Output Columns", "Input Columns").alias("Columns Divisible")
columns_swapped = same_factor_expr("Output Columns", "Input Rows").alias("Columns Swapped")

drop_cols = ("Input Rows", "Output Rows", "Input Columns", "Output Columns")

shape_study_frame = (shape_frame.
                     group_by("Source", "Set Type", "ID")
                     .agg(rows_fixed, rows_swapped,rows_divisible, columns_fixed, columns_swapped, cols_divisible))
display(shape_study_frame)

filter_col = lambda key: pl.col(key).filter(key).len()
col_names = ["Rows Fixed", "Rows Swapped", "Rows Divisible",
             "Columns Fixed","Columns Swapped", "Columns Divisible"]

shape_study_frame.group_by("Source","Set Type").agg(*[filter_col(n) for n in col_names],pl.col("ID").len().alias("Total Tasks"))

Source,Set Type,ID,Rows Fixed,Rows Swapped,Rows Divisible,Columns Fixed,Columns Swapped,Columns Divisible
cat,cat,str,bool,bool,bool,bool,bool,bool
"""ARC-AGI-2""","""training""","""17b80ad2""",false,true,true,false,true,true
"""ARC-AGI-2""","""training""","""b60334d2""",true,true,true,true,true,true
"""ARC-AGI-1""","""evaluation""","""59341089""",true,true,true,true,true,true
"""ARC-AGI-2""","""training""","""760b3cac""",true,true,true,true,true,true
"""ARC-AGI-1""","""training""","""e8dc4411""",false,false,true,false,false,true
…,…,…,…,…,…,…,…,…
"""ARC-AGI-1""","""evaluation""","""712bf12e""",false,false,true,false,false,true
"""ARC-AGI-2""","""training""","""9bebae7a""",false,false,true,false,false,true
"""ARC-AGI-1""","""training""","""8be77c9e""",true,true,true,true,true,true
"""ARC-AGI-1""","""training""","""b9b7f026""",true,true,true,true,true,true


Source,Set Type,Rows Fixed,Rows Swapped,Rows Divisible,Columns Fixed,Columns Swapped,Columns Divisible,Total Tasks
cat,cat,u32,u32,u32,u32,u32,u32,u32
"""ARC-AGI-1""","""evaluation""",163,219,359,166,224,359,400
"""ARC-AGI-2""","""evaluation""",25,49,89,30,53,89,120
"""ARC-AGI-2""","""training""",439,615,892,430,617,886,1000
"""ARC-AGI-1""","""training""",216,273,358,209,274,355,400


# Predict rows and columns
Currently using both test and train. Will need to do with just train

In [172]:
rows_known = (pl.col("Rows Fixed") | pl.col("Rows Swapped") | pl.col("Rows Divisible")).alias("Rows Known")
cols_known = (pl.col("Columns Fixed") | pl.col("Columns Swapped") | pl.col("Columns Divisible")).alias("Columns Known")
shape_known = (pl.col("Rows Known") & pl.col("Columns Known")).alias("Shape Known")
shape_known_frame = (shape_study_frame.
                     with_columns(rows_known, cols_known).
                     with_columns(shape_known).
                     drop(col_names))
known_columns = ("Rows Known", "Columns Known", "Shape Known")
shape_known_frame.group_by("Source","Set Type").agg(*[filter_col(n)/pl.len() for n in known_columns])

Source,Set Type,Rows Known,Columns Known,Shape Known
cat,cat,f64,f64,f64
"""ARC-AGI-2""","""evaluation""",0.75,0.75,0.725
"""ARC-AGI-2""","""training""",0.893,0.892,0.884
"""ARC-AGI-1""","""training""",0.895,0.895,0.8925
"""ARC-AGI-1""","""evaluation""",0.8975,0.9,0.895
