In [1]:
import os
import pandas as pd
from collections import defaultdict

# Relative path of the dataset root folder.
# NOTE(review): the name `dir` shadows Python's built-in dir(); the visible
# cells pass this same path to create_df as a literal rather than via this variable.
dir = "../../../models/DC-TCN/datasets/ouluvs2_words_v2"

def create_df(dir):
    """Count the ``.npz`` files of every word folder under ``dir``.

    The directory is expected to look like ``<dir>/<word>/<split>/<file>.npz``
    with ``<split>`` being ``train``, ``val`` or ``test``. The version label of
    a file is the third ``_``-separated field of its name with ``.npz``
    removed; only the labels ``v1`` .. ``v5`` are reported.

    Args:
        dir: Path of the dataset root. (The parameter name shadows the
            built-in ``dir``; it is kept so existing callers keep working.)

    Returns:
        A DataFrame indexed by word with the columns
        ``train``, ``val``, ``test`` (number of ``.npz`` files per split),
        ``v1`` .. ``v5`` (a dict ``{"train": n, "val": n, "test": n}`` each)
        and ``total`` (``train + val + test``). Words without any split
        directory are left out; entries of ``dir`` that are not directories
        are reported and skipped.
    """
    split_names = ("train", "val", "test")
    version_keys = [f"v{i}" for i in range(1, 6)]

    words = []
    records = []

    for word in os.listdir(dir):
        word_path = os.path.join(dir, word)

        if not os.path.isdir(word_path):
            print(f"Skipping {word}: not a directory.")
            continue

        totals = dict.fromkeys(split_names, 0)
        per_version = {key: dict.fromkeys(split_names, 0) for key in version_keys}
        has_split_dir = False

        for split in split_names:
            split_path = os.path.join(word_path, split)

            if not os.path.isdir(split_path):
                continue
            has_split_dir = True

            # One listing per split directory feeds both the total and the
            # per-version counts.
            for file in os.listdir(split_path):
                if not file.endswith(".npz"):
                    continue
                totals[split] += 1

                try:
                    version = file.split("_")[2].replace(".npz", "")
                except IndexError:
                    # The file still counts towards the split total.
                    print(f"Error processing file: {file} in {split_path}")
                    continue

                if version in per_version:
                    per_version[version][split] += 1

        if has_split_dir:
            words.append(word)
            records.append({**totals, **per_version})

    # Build the frame in one go instead of appending one row at a time.
    df = pd.DataFrame(records, index=words, columns=list(split_names) + version_keys)
    df['total'] = df[['train', 'val', 'test']].sum(axis=1)
    return df

In [None]:
# Build the count table for the dataset folder named below.
dataset_root = "../../../models/DC-TCN/datasets/ouluvs2_words_v2"
df = create_df(dataset_root)

In [3]:
# Heading first, then the text rendering of the whole frame on the next line.
print("\nWord and File Counts per Subfolder and Versions:", df, sep="\n")


Word and File Counts per Subfolder and Versions:
          train  val  test                                 v1  \
GUNPOINT      8    2     5  {'train': 2, 'val': 0, 'test': 1}   
THEIR        24    6     5  {'train': 6, 'val': 0, 'test': 1}   
PROVOKED      3    1     1  {'train': 1, 'val': 0, 'test': 0}   
CLEANERS      3    1     1  {'train': 0, 'val': 1, 'test': 0}   
CHOSE         4    1     5  {'train': 0, 'val': 1, 'test': 1}   
...         ...  ...   ...                                ...   
EARLY        24    6    10  {'train': 4, 'val': 2, 'test': 2}   
BRIGHT        4    1    10  {'train': 1, 'val': 0, 'test': 2}   
SPRING       10    2     3  {'train': 3, 'val': 0, 'test': 0}   
THIS         44   11    15  {'train': 8, 'val': 3, 'test': 3}   
BALLS         3    1     1  {'train': 1, 'val': 0, 'test': 0}   

                                         v2  \
GUNPOINT  {'train': 2, 'val': 0, 'test': 1}   
THEIR     {'train': 4, 'val': 2, 'test': 1}   
PROVOKED  {'train': 0, 'val'

In [4]:
count_cols = ['train', 'val', 'test']

print("\nWord and File Counts per Subfolder:")
# Alphabetical order first, then re-derive the row total and rank by it,
# largest first.
sorted_df = (
    df.sort_index()
    .assign(total=lambda frame: frame[count_cols].sum(axis=1))
    .sort_values(by='total', ascending=False)
)
print(sorted_df[count_cols + ['total']])


Word and File Counts per Subfolder:
          train  val  test  total
YOU        2564  641   935   4140
ARE        1068  268   380   1716
A           732  183   305   1220
THE         768  192   255   1215
TO          720  180   270   1170
...         ...  ...   ...    ...
WILLOWY       3    1     1      5
WINDOW        3    1     1      5
WITHDRAW      3    1     1      5
WOMAN         3    1     1      5
WOOL          3    1     1      5

[1203 rows x 4 columns]


In [5]:
# Column sums over all words, one per split.
total_train, total_val, total_test = (
    df[split_name].sum() for split_name in ('train', 'val', 'test')
)

for line in (
    f"Total train: {total_train}",
    f"Total val: {total_val}",
    f"Total test: {total_test}",
):
    print(line)


Total train: 24001
Total val: 5895
Total test: 9466


In [6]:
# The v1 cell of a row is a dict of per-split counts; add its values up.
gunpoint_v1_total = df.loc['GUNPOINT', 'v1']
total_v1 = 0
for split_count in gunpoint_v1_total.values():
    total_v1 += split_count

print(f"Total count of v1 for 'GUNPOINT': {total_v1}")

Total count of v1 for 'GUNPOINT': 3


In [7]:
def _sum_version_column(column):
    """Add up all values of the dict cells stored in ``df[column]``."""
    return df[column].apply(lambda cell: sum(cell.values())).sum()


total_v1_across_all_words = _sum_version_column('v1')
total_v2_across_all_words = _sum_version_column('v2')
total_v3_across_all_words = _sum_version_column('v3')
total_v4_across_all_words = _sum_version_column('v4')
total_v5_across_all_words = _sum_version_column('v5')

for line in (
    f"Total count of v1 across all words: {total_v1_across_all_words}",
    f"Total count of v2 across all words: {total_v2_across_all_words}",
    f"Total count of v3 across all words: {total_v3_across_all_words}",
    f"Total count of v4 across all words: {total_v4_across_all_words}",
    f"Total count of v5 across all words: {total_v5_across_all_words}",
):
    print(line)

Total count of v1 across all words: 7872
Total count of v2 across all words: 7873
Total count of v3 across all words: 7872
Total count of v4 across all words: 7872
Total count of v5 across all words: 7873


In [8]:
version_columns = ['v1', 'v2', 'v3', 'v4', 'v5']
split_columns = ['train', 'val', 'test']

# For each split, sum that split's entry of every dict cell, version by version.
total_versions_per_split = {}
for split in split_columns:
    per_version = {}
    for version in version_columns:
        per_version[version] = df[version].apply(lambda cell, key=split: cell.get(key, 0)).sum()
    total_versions_per_split[split] = per_version

print("Total counts for each version per split:")
for split in total_versions_per_split:
    versions = total_versions_per_split[split]
    print(f"\n{split.capitalize()}:")
    for version in versions:
        count = versions[version]
        print(f"  {version}: {count}")


Total counts for each version per split:

Train:
  v1: 4728
  v2: 4845
  v3: 4799
  v4: 4810
  v5: 4819

Val:
  v1: 1203
  v2: 1177
  v3: 1186
  v4: 1166
  v5: 1163

Test:
  v1: 1941
  v2: 1851
  v3: 1887
  v4: 1896
  v5: 1891


In [9]:
# Number of words whose count is exactly zero, per split.
total_train_zeros = df['train'].eq(0).sum()
total_val_zeros = df['val'].eq(0).sum()
total_test_zeros = df['test'].eq(0).sum()

for line in (
    f"Total train = 0: {total_train_zeros}",
    f"Total val = 0: {total_val_zeros}",
    f"Total test = 0: {total_test_zeros}",
):
    print(line)

Total train = 0: 0
Total val = 0: 0
Total test = 0: 0


In [18]:
# Select words whose test or val count is at least as large as the train count.
test_not_below_train = df['test'] >= df['train']
val_not_below_train = df['val'] >= df['train']
problematic_words = df[test_not_below_train | val_not_below_train]

print("Problematic words:\n", problematic_words[['train', 'val', 'test']])


Problematic words:
                  train  val  test
CHOSE                4    1     5
PLACED               4    1     5
FEW                  4    1     5
HUGE                 4    1    10
CARTOONS             4    1     5
WASTE                4    1     5
CLAY                 4    1     5
BEANS                4    1     5
THROUGH              8    2    15
PRIORITY             4    1     5
GENERALS             4    1     5
HOURS               12    3    15
ORGANIZATIONS        4    1     5
ILLUMINATING         8    2    15
IDENTICAL            4    1     5
DECORATE             4    1     5
ADDITION             4    1     5
CLASSROOMS           4    1     5
GARDEN              12    3    15
BOX                 12    3    15
CONSUMING            4    1     5
CREATE               8    2    15
KEEP                 8    2    15
COMPILE              8    2    10
BOWL                 4    1    10
MAKES                4    1    15
VEGETABLE            4    1     5
ENTER                4    1 

In [19]:
# Row labels (words) of the selected rows.
problematic_index = problematic_words.index
print(problematic_index)

Index(['CHOSE', 'PLACED', 'FEW', 'HUGE', 'CARTOONS', 'WASTE', 'CLAY', 'BEANS',
       'THROUGH', 'PRIORITY',
       ...
       'ATTENDANCE', 'TOOK', 'FELT', 'HIRES', 'TODD', 'WORD', 'COINS',
       'SUITABLE', 'ANKLE', 'BRIGHT'],
      dtype='object', length=147)


In [20]:
import shutil
source_dir = "../../../models/DC-TCN/datasets/ouluvs2_words_v2"

# For every selected word, move each regular file found in its val and test
# folders into its train folder.
moved = 0
for word in problematic_words.index:
    word_folder = os.path.join(source_dir, word)
    moved_files = 0

    if not os.path.exists(word_folder):
        print(f"Word folder not found: {word_folder}")
        continue

    train_folder = os.path.join(word_folder, 'train')
    val_folder = os.path.join(word_folder, 'val')
    test_folder = os.path.join(word_folder, 'test')

    if not os.path.exists(train_folder):
        os.makedirs(train_folder)

    # val is emptied first, then test.
    for donor_folder in (val_folder, test_folder):
        if not os.path.exists(donor_folder):
            continue
        for filename in os.listdir(donor_folder):
            src_path = os.path.join(donor_folder, filename)
            if not os.path.isfile(src_path):
                continue
            shutil.move(src_path, os.path.join(train_folder, filename))
            moved_files += 1

    if moved_files > 0:
        moved += 1
        print(f"Moved files for word: {word}. Total moved: {moved_files}")

print(f"\nTotal words processed with files moved: {moved}")

Moved files for word: CHOSE. Total moved: 12
Moved files for word: PLACED. Total moved: 12
Moved files for word: FEW. Total moved: 12
Moved files for word: HUGE. Total moved: 22
Moved files for word: CARTOONS. Total moved: 12
Moved files for word: WASTE. Total moved: 12
Moved files for word: CLAY. Total moved: 12
Moved files for word: BEANS. Total moved: 12
Moved files for word: THROUGH. Total moved: 34
Moved files for word: PRIORITY. Total moved: 12
Moved files for word: GENERALS. Total moved: 12
Moved files for word: HOURS. Total moved: 36
Moved files for word: ORGANIZATIONS. Total moved: 12
Moved files for word: ILLUMINATING. Total moved: 34
Moved files for word: IDENTICAL. Total moved: 12
Moved files for word: DECORATE. Total moved: 12
Moved files for word: ADDITION. Total moved: 12
Moved files for word: CLASSROOMS. Total moved: 12
Moved files for word: GARDEN. Total moved: 36
Moved files for word: BOX. Total moved: 36
Moved files for word: CONSUMING. Total moved: 12
Moved files fo

In [23]:
import os
import shutil
from sklearn.model_selection import train_test_split

source_dir = "../../../models/DC-TCN/datasets/ouluvs2_words_v2"

# The held-out 30% is halved into val and test, so it must contain at least
# two files; with test_size=0.3 that requires four or more files in train.
MIN_FILES_FOR_SPLIT = 4

train_count = 0
val_count = 0
test_count = 0
moved = 0
for word in problematic_words.index:
    word_folder = os.path.join(source_dir, word)

    if not os.path.isdir(word_folder):
        continue

    train_folder = os.path.join(word_folder, 'train')
    val_folder = os.path.join(word_folder, 'val')
    test_folder = os.path.join(word_folder, 'test')

    os.makedirs(train_folder, exist_ok=True)
    os.makedirs(val_folder, exist_ok=True)
    os.makedirs(test_folder, exist_ok=True)

    # Sorted so that the seeded split below does not depend on the order in
    # which the file system happens to list the folder.
    video_files = sorted(f for f in os.listdir(train_folder) if f.endswith('.npz'))

    if len(video_files) < MIN_FILES_FOR_SPLIT:
        # Too few files for a three-way split: leave everything in train.
        print(f"Skipping {word_folder}: only {len(video_files)} .npz file(s) in train")
        continue

    # Nominal split: train 70%, val 15%, test 15%.
    train_files, temp_files = train_test_split(video_files, test_size=0.3, random_state=42)
    val_files, test_files = train_test_split(temp_files, test_size=0.5, random_state=42)

    # Files assigned to train are already in place; only val and test files
    # (and the .txt file of the same name, when present) have to move.
    for file_list, dest_folder in ((val_files, val_folder), (test_files, test_folder)):
        for file in file_list:
            shutil.move(os.path.join(train_folder, file), os.path.join(dest_folder, file))

            txt_file = os.path.splitext(file)[0] + '.txt'
            src_txt = os.path.join(train_folder, txt_file)
            if os.path.exists(src_txt):
                shutil.move(src_txt, os.path.join(dest_folder, txt_file))

    train_count += len(train_files)
    val_count += len(val_files)
    test_count += len(test_files)

    print(f"Processed {word_folder} - Train: {len(train_files)}, Val: {len(val_files)}, Test: {len(test_files)}")
    moved += 1

print("✅ Done! Processed: ", moved)
print(f"Files per split - Train: {train_count}, Val: {val_count}, Test: {test_count}")


Processed ../../../models/DC-TCN/datasets/ouluvs2_words_v2/CHOSE - Train: 7, Val: 1, Test: 2
Processed ../../../models/DC-TCN/datasets/ouluvs2_words_v2/PLACED - Train: 7, Val: 1, Test: 2
Processed ../../../models/DC-TCN/datasets/ouluvs2_words_v2/FEW - Train: 7, Val: 1, Test: 2
Processed ../../../models/DC-TCN/datasets/ouluvs2_words_v2/HUGE - Train: 10, Val: 2, Test: 3
Processed ../../../models/DC-TCN/datasets/ouluvs2_words_v2/CARTOONS - Train: 7, Val: 1, Test: 2
Processed ../../../models/DC-TCN/datasets/ouluvs2_words_v2/WASTE - Train: 7, Val: 1, Test: 2
Processed ../../../models/DC-TCN/datasets/ouluvs2_words_v2/CLAY - Train: 7, Val: 1, Test: 2
Processed ../../../models/DC-TCN/datasets/ouluvs2_words_v2/BEANS - Train: 7, Val: 1, Test: 2
Processed ../../../models/DC-TCN/datasets/ouluvs2_words_v2/THROUGH - Train: 17, Val: 4, Test: 4
Processed ../../../models/DC-TCN/datasets/ouluvs2_words_v2/PRIORITY - Train: 7, Val: 1, Test: 2
Processed ../../../models/DC-TCN/datasets/ouluvs2_words_v2/GENE

In [2]:
# Recount the dataset folder named below.
v3_dataset_root = "../../../models/DC-TCN/datasets/ouluvs2_words_v2"
v3_df = create_df(v3_dataset_root)

NameError: name 'create_df' is not defined

In [28]:
# Select words whose test or val count is at least as large as the train count.
v3_test_not_below_train = v3_df['test'] >= v3_df['train']
v3_val_not_below_train = v3_df['val'] >= v3_df['train']
v3_problematic_words = v3_df[v3_test_not_below_train | v3_val_not_below_train]

print("Problematic words:\n", v3_problematic_words[['train', 'val', 'test']])


Problematic words:
 Empty DataFrame
Columns: [train, val, test]
Index: []


In [30]:
# For each split, sum that split's entry of every dict cell, version by version.
v3_total_versions_per_split = {}
for split in split_columns:
    per_version = {}
    for version in version_columns:
        per_version[version] = v3_df[version].apply(lambda cell, key=split: cell.get(key, 0)).sum()
    v3_total_versions_per_split[split] = per_version

print("Total counts for each version per split:")
for split in v3_total_versions_per_split:
    versions = v3_total_versions_per_split[split]
    print(f"\n{split.capitalize()}:")
    for version in versions:
        count = versions[version]
        print(f"  {version}: {count}")


Total counts for each version per split:

Train:
  v1: 4867
  v2: 4955
  v3: 4921
  v4: 4926
  v5: 4959

Val:
  v1: 1211
  v2: 1182
  v3: 1201
  v4: 1189
  v5: 1168

Test:
  v1: 1794
  v2: 1736
  v3: 1750
  v4: 1757
  v5: 1746


In [31]:
# Column sums over all words, one per split.
v3_total_train, v3_total_val, v3_total_test = (
    v3_df[split_name].sum() for split_name in ('train', 'val', 'test')
)

for line in (
    f"Total train: {v3_total_train}",
    f"Total val: {v3_total_val}",
    f"Total test: {v3_total_test}",
):
    print(line)

Total train: 24628
Total val: 5951
Total test: 8783


In [1]:
# Select words whose test count exceeds the val count by more than one.
v3_test_exceeds_val = v3_df['test'] > (v3_df['val'] + 1)
v3_problematic_words = v3_df[v3_test_exceeds_val]

print("Problematic words:\n", v3_problematic_words[['train', 'val', 'test']])


NameError: name 'v3_df' is not defined

In [36]:
# Row labels (words) of the selected rows.
v3_problematic_index = v3_problematic_words.index
print(v3_problematic_index)

Index(['GUNPOINT', 'THAT', 'MOISTURE', 'ASSUME', 'YOUR', 'LARGER', 'DAMAGE',
       'THIN', 'JUICE', 'PLEASE',
       ...
       'SLIPPED', 'HER', 'THIEVES', 'NOTICES', 'INTO', 'IN', 'THOMAS', 'AM',
       'EARLY', 'THIS'],
      dtype='object', length=193)


In [15]:
import os
import shutil
import random

source_dir = "../../../models/DC-TCN/datasets/ouluvs2_words_dupe_v2"

# Dedicated, seeded generator so the choice of files is the same on every run.
sample_rng = random.Random(42)

# For every selected word, move test files into train until the test folder
# holds no more .npz files than the val folder.
# NOTE: v4_problematic_words has to exist in the kernel before this cell runs.
moved = 0
for word in v4_problematic_words.index:
    word_folder = os.path.join(source_dir, word)
    moved_files = 0

    if not os.path.exists(word_folder):
        print(f"Word folder not found: {word_folder}")
        continue

    train_folder = os.path.join(word_folder, 'train')
    val_folder = os.path.join(word_folder, 'val')
    test_folder = os.path.join(word_folder, 'test')

    os.makedirs(train_folder, exist_ok=True)

    # Each folder is listed once; sorting makes the seeded sample independent
    # of the order in which the file system lists the files.
    val_npz = sorted(f for f in os.listdir(val_folder) if f.endswith('.npz')) if os.path.exists(val_folder) else []
    test_npz = sorted(f for f in os.listdir(test_folder) if f.endswith('.npz')) if os.path.exists(test_folder) else []

    files_needed = max(0, len(test_npz) - len(val_npz))

    if files_needed > 0:
        if len(test_npz) > files_needed:
            selected = sample_rng.sample(test_npz, files_needed)
        else:
            selected = test_npz

        for npz_file in selected:
            npz_path = os.path.join(test_folder, npz_file)
            # Swap only the extension; a plain str.replace would also rewrite
            # any other '.npz' occurring in the path.
            txt_path = os.path.splitext(npz_path)[0] + '.txt'

            shutil.move(npz_path, os.path.join(train_folder, npz_file))
            moved_files += 1

            # moved_files counts the .txt file of the same name as well.
            if os.path.exists(txt_path):
                shutil.move(txt_path, os.path.join(train_folder, os.path.basename(txt_path)))
                moved_files += 1

    if moved_files > 0:
        moved += 1
        print(f"Moved {moved_files} files for word: {word}")

print(f"\nTotal words processed with files moved: {moved}")


Moved 6 files for word: GUNPOINT
Moved 26 files for word: THAT
Moved 6 files for word: MOISTURE
Moved 6 files for word: ASSUME
Moved 22 files for word: YOUR
Moved 6 files for word: LARGER
Moved 6 files for word: DAMAGE
Moved 6 files for word: THIN
Moved 4 files for word: JUICE
Moved 14 files for word: PLEASE
Moved 14 files for word: WATCH
Moved 8 files for word: IF
Moved 14 files for word: WHENEVER
Moved 6 files for word: SOYBEANS
Moved 6 files for word: SHOCKED
Moved 4 files for word: THINKS
Moved 6 files for word: JEWELS
Moved 14 files for word: OUR
Moved 4 files for word: ABOUT
Moved 14 files for word: HAND
Moved 6 files for word: TWICE
Moved 16 files for word: IT
Moved 10 files for word: WITH
Moved 4 files for word: TAXICAB
Moved 14 files for word: ACHES
Moved 244 files for word: A
Moved 4 files for word: DROP
Moved 6 files for word: FROM
Moved 4 files for word: TABLE
Moved 6 files for word: ANSWER
Moved 6 files for word: SUGGESTION
Moved 6 files for word: RACCOONS
Moved 10 files f

In [16]:
# Build the count table for the dataset folder named below.
v4_dataset_root = "../../../models/DC-TCN/datasets/ouluvs2_words_dupe_v2"
v4_df = create_df(v4_dataset_root)

In [18]:
# Column sums over all words, one per split.
v4_total_train, v4_total_val, v4_total_test = (
    v4_df[split_name].sum() for split_name in ('train', 'val', 'test')
)

for line in (
    f"Total train: {v4_total_train}",
    f"Total val: {v4_total_val}",
    f"Total test: {v4_total_test}",
):
    print(line)

Total train: 27088
Total val: 5951
Total test: 6323


In [19]:
# Select words whose test count exceeds the val count by more than one.
v4_test_exceeds_val = v4_df['test'] > (v4_df['val'] + 1)
v4_problematic_words = v4_df[v4_test_exceeds_val]

print("Problematic words:\n", v4_problematic_words[['train', 'val', 'test']])


Problematic words:
 Empty DataFrame
Columns: [train, val, test]
Index: []


In [20]:
version_columns = ['v1', 'v2', 'v3', 'v4', 'v5']
split_columns = ['train', 'val', 'test']

# For each split, sum that split's entry of every dict cell, version by version.
v4_total_versions_per_split = {}
for split in split_columns:
    per_version = {}
    for version in version_columns:
        per_version[version] = v4_df[version].apply(lambda cell, key=split: cell.get(key, 0)).sum()
    v4_total_versions_per_split[split] = per_version

print("Total counts for each version per split:")
for split in v4_total_versions_per_split:
    versions = v4_total_versions_per_split[split]
    print(f"\n{split.capitalize()}:")
    for version in versions:
        count = versions[version]
        print(f"  {version}: {count}")


Total counts for each version per split:

Train:
  v1: 5362
  v2: 5450
  v3: 5388
  v4: 5454
  v5: 5434

Val:
  v1: 1211
  v2: 1182
  v3: 1201
  v4: 1189
  v5: 1168

Test:
  v1: 1299
  v2: 1241
  v3: 1283
  v4: 1229
  v5: 1271


In [25]:
# Number of words whose count is exactly zero, per split.
v4_total_train_zeros = v4_df['train'].eq(0).sum()
v4_total_val_zeros = v4_df['val'].eq(0).sum()
v4_total_test_zeros = v4_df['test'].eq(0).sum()

for line in (
    f"\nTotal V4 train = 0: {v4_total_train_zeros}",
    f"Total V4 val = 0: {v4_total_val_zeros}",
    f"Total V4 test = 0: {v4_total_test_zeros}",
):
    print(line)


Total V4 train = 0: 0
Total V4 val = 0: 0
Total V4 test = 0: 0


In [24]:
print("\nWord and File Counts per Subfolder and Versions:")

# Lift the row limit only for this print; option_context restores the previous
# display.max_rows setting afterwards, so other cells keep their normal
# truncated output.
with pd.option_context('display.max_rows', None):
    print(v4_df)


Word and File Counts per Subfolder and Versions:
                 train  val  test                                       v1  \
GUNPOINT            11    2     2        {'train': 3, 'val': 0, 'test': 0}   
THEIR               24    6     5        {'train': 6, 'val': 0, 'test': 1}   
PROVOKED             3    1     1        {'train': 1, 'val': 0, 'test': 0}   
CLEANERS             3    1     1        {'train': 0, 'val': 1, 'test': 0}   
CHOSE                7    1     2        {'train': 1, 'val': 0, 'test': 1}   
UNTIL               10    2     3        {'train': 2, 'val': 0, 'test': 1}   
PLACED               7    1     2        {'train': 2, 'val': 0, 'test': 0}   
BACKED               3    1     1        {'train': 0, 'val': 0, 'test': 1}   
ACCORDING            7    1     2        {'train': 2, 'val': 0, 'test': 0}   
CHIP                10    2     3        {'train': 2, 'val': 0, 'test': 1}   
SPLINTER             3    1     1        {'train': 1, 'val': 0, 'test': 0}   
FREQUENTLY    

In [26]:
# Row count of the frame, taken from its shape.
total_rows = v4_df.shape[0]
print(f"Total rows in v4_df: {total_rows}")


Total rows in v4_df: 1203
