In [4]:
import pandas as pd

df = pd.read_csv('../../../data/6_11_2025_tcc_cv/durations_by_directory.txt', sep='\t')
df.head()

Unnamed: 0,Animal names,165.560
0,Childhood 1,165.56
1,Conversation about Sukuma wars,487.447
2,Cooking plans,161.446
3,Cultural change 1,1027.241
4,Daily schedule,161.446


In [5]:
df.columns = ['Title', 'Duration']
df.head()

Unnamed: 0,Title,Duration
0,Childhood 1,165.56
1,Conversation about Sukuma wars,487.447
2,Cooking plans,161.446
3,Cultural change 1,1027.241
4,Daily schedule,161.446


In [6]:
sum(df['Duration'])

59694.72199999998

In [7]:
df

Unnamed: 0,Title,Duration
0,Childhood 1,165.560
1,Conversation about Sukuma wars,487.447
2,Cooking plans,161.446
3,Cultural change 1,1027.241
4,Daily schedule,161.446
...,...,...
82,Verbal inflections and tonal case 1,979.115
83,Verbal inflections and tonal case 2,979.115
84,Walking to Dugwamuhosht,165.560
85,Word order and tonal case 1,1096.689


In [9]:
df.sort_values(by='Duration')

Unnamed: 0,Title,Duration
72,Tribal history 4,0.000
52,Reciprocal project 1,0.000
51,Put project 3,0.000
11,Description of a television show,0.000
40,Marriage 2,0.000
...,...,...
16,Dictionary words 4,2240.848
64,Tonal case 8,2261.315
63,Tonal case 7,2261.315
62,Tonal case 6,2261.315


In [None]:
import os
import random

DURATIONS_PATH = '../../../data/6_11_2025_tcc_cv/durations_by_directory.txt'
OUT_DIR = '../../../data/6_11_2025_tcc_cv/splits'

TEST_FRACTION = 0.15
VALID_FRACTION = 0.15
N_TRIALS = 2000 

random.seed(42)


def load_durations(path):
    dirs = []
    with open(path, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            parts = line.split('\t')
            if len(parts) != 2:
                print("SKIPPING LINE (unexpected format):", line)
                continue
            name, dur_str = parts
            try:
                dur = float(dur_str)
            except ValueError:
                print("SKIPPING LINE (cannot parse duration):", line)
                continue
            dirs.append((name, dur))
    return dirs


def pick_subset_by_duration(items, target_amount, n_trials=1000):
    n = len(items)
    indices_all = list(range(n))

    best_subset = None
    best_diff = float('inf')

    for _ in range(n_trials):
        random.shuffle(indices_all)
        subset = []
        s = 0.0

        for idx in indices_all:
            dur = items[idx][1]
            if s + dur <= target_amount or not subset:
                subset.append(idx)
                s += dur

        diff = abs(target_amount - s)
        if diff < best_diff:
            best_diff = diff
            best_subset = subset

    return best_subset


def main():
    os.makedirs(OUT_DIR, exist_ok=True)

    dirs = load_durations(DURATIONS_PATH)
    if not dirs:
        print("No directories loaded from", DURATIONS_PATH)
        return

    total_duration = sum(d for _, d in dirs)
    print(f"Total duration: {total_duration/3600:.2f} h")

    test_target = total_duration * TEST_FRACTION
    valid_target = total_duration * VALID_FRACTION

    test_indices = pick_subset_by_duration(dirs, test_target, N_TRIALS)
    test_indices_set = set(test_indices)

    remaining_for_valid = [
        (name, dur)
        for i, (name, dur) in enumerate(dirs)
        if i not in test_indices_set
    ]
    remaining_indices_map = [
        i for i in range(len(dirs)) if i not in test_indices_set
    ]

    valid_indices_local = pick_subset_by_duration(
        remaining_for_valid, valid_target, N_TRIALS
    )
    valid_indices = [remaining_indices_map[i] for i in valid_indices_local]
    valid_indices_set = set(valid_indices)

    all_indices = set(range(len(dirs)))
    train_indices = list(all_indices - test_indices_set - valid_indices_set)

    def sum_duration(idxs):
        return sum(dirs[i][1] for i in idxs)

    test_dur = sum_duration(test_indices)
    valid_dur = sum_duration(valid_indices)
    train_dur = sum_duration(train_indices)

    print(f"TEST:  {len(test_indices)} dirs,  {test_dur/3600:.2f} h "
          f"({test_dur/total_duration*100:.1f}%)")
    print(f"VALID: {len(valid_indices)} dirs, {valid_dur/3600:.2f} h "
          f"({valid_dur/total_duration*100:.1f}%)")
    print(f"TRAIN: {len(train_indices)} dirs, {train_dur/3600:.2f} h "
          f"({train_dur/total_duration*100:.1f}%)")

    # --- zapis do plików ---
    test_dirs = [dirs[i][0] for i in sorted(test_indices)]
    valid_dirs = [dirs[i][0] for i in sorted(valid_indices)]
    train_dirs = [dirs[i][0] for i in sorted(train_indices)]

    with open(os.path.join(OUT_DIR, 'test_dirs.txt'), 'w', encoding='utf-8') as f:
        for d in test_dirs:
            f.write(d + '\n')

    with open(os.path.join(OUT_DIR, 'valid_dirs.txt'), 'w', encoding='utf-8') as f:
        for d in valid_dirs:
            f.write(d + '\n')

    with open(os.path.join(OUT_DIR, 'train_dirs.txt'), 'w', encoding='utf-8') as f:
        for d in train_dirs:
            f.write(d + '\n')


if __name__ == '__main__':
    main()


Total duration: 16.63 h
TEST:  23 dirs,  2.49 h (15.0%)
VALID: 9 dirs, 2.49 h (15.0%)
TRAIN: 56 dirs, 11.64 h (70.0%)
