In [1]:
import polars as pl

In [None]:
# Read the three split files, then concatenate them into one frame so that
# statistics can be computed over the whole dataset at once.
df_train, df_val, df_test = map(
    pl.read_csv,
    ("train_segments.csv", "val_segments.csv", "test_segments.csv"),
)
df_merged = pl.concat([df_train, df_val, df_test])

In [9]:
# Preview the first two rows of the concatenated train/val/test frame.
df_merged.head(2)

segment_id,source_file,file_type,tokens,labels,num_tokens,num_transitions,original_text
str,str,str,str,str,i64,i64,str
"""S00271I_SKYES_BU_GSUM_GYIS_NYA…","""S00271I_SKYES_BU_GSUM_GYIS_NYA…","""txt""","""de lta bu'i rim pa 'dis chu ts…","""1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,…",301,1,"""de lta bu'i rim pa 'dis chu ts…"
"""S00271I_SKYES_BU_GSUM_GYIS_NYA…","""S00271I_SKYES_BU_GSUM_GYIS_NYA…","""txt""","""gzhung 'ga' zhig las bden gnyi…","""0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…",367,2,"""gzhung 'ga' zhig las bden gnyi…"


In [25]:
def parse_labels(str_list):
    """Flatten comma-separated label strings into one list of integers.

    Args:
        str_list: Iterable of strings such as ``"0,0,1,2"``. ``None``
            entries (e.g. null cells) are skipped.

    Returns:
        A single flat list of ints containing the labels of every input
        string, in order. Tokens that are not plain non-negative decimal
        integers (empty, signed, non-numeric) are silently dropped.
    """
    parsed = []
    for label_str in str_list:
        if label_str is None:
            continue
        for token in label_str.split(','):
            token = token.strip()
            # isdecimal() only accepts characters int() can parse; isdigit()
            # also accepts e.g. superscript digits, which make int() raise.
            if token.isdecimal():
                parsed.append(int(token))
    return parsed


def count_auto_allo_chunks(labels):
    """
    Measure the contiguous runs of ALLO (0) and AUTO (1) labels.

    Any label other than 0 or 1 ends the run in progress and is not counted
    itself. A switch between 0 and 1 also ends the current run and starts a
    new one.

    Returns a tuple ``(allo_chunks, auto_chunks)``: the lengths (number of
    labels) of the 0-runs and of the 1-runs, each in order of appearance.
    """
    sizes_by_label = {0: [], 1: []}
    if not labels:
        return sizes_by_label[0], sizes_by_label[1]

    run_label = None  # label of the run in progress, None when outside a run
    run_size = 0

    # The trailing None acts as a separator so the final run gets recorded.
    for label in [*labels, None]:
        is_binary = label in (0, 1)
        if is_binary and label == run_label:
            run_size += 1
            continue
        # The run (if any) ends here: record its length.
        if run_label is not None:
            sizes_by_label[run_label].append(run_size)
        run_label, run_size = (label, 1) if is_binary else (None, 0)

    return sizes_by_label[0], sizes_by_label[1]

In [36]:
# Pool the chunk sizes of every segment across all splits.
allo_sizes = []
auto_sizes = []
for labels in df_merged["labels"]:
    # parse_labels takes a list of strings, so wrap the single cell value.
    parsed = parse_labels([labels])
    allo, auto = count_auto_allo_chunks(parsed)
    allo_sizes += allo
    auto_sizes += auto

In [38]:
# Smallest and largest chunk per class; "N/A" when no chunk was found.
print("Minimum ALLO chunk size:", min(allo_sizes, default="N/A"))
print("Minimum AUTO chunk size:", min(auto_sizes, default="N/A"))
print("Maximum ALLO chunk size:", max(allo_sizes, default="N/A"))
print("Maximum AUTO chunk size:", max(auto_sizes, default="N/A"))

Minimum ALLO chunk size: 2
Minimum AUTO chunk size: 4
Maximum ALLO chunk size: 438
Maximum AUTO chunk size: 335


In [4]:
# Shapes of each split followed by the merged frame, one per line.
print(
    df_train.shape,
    df_val.shape,
    df_test.shape,
    df_merged.shape,
    sep="\n",
)

(304, 8)
(60, 8)
(83, 8)
(447, 8)
