In [1]:
import os
import shutil
import random
import pandas as pd
from collections import defaultdict
from sklearn.model_selection import train_test_split

'''
original_dir = "../../../models/DC-TCN/datasets/lrs3_words_may"
dir = "../../../models/DC-TCN/datasets/lrs3_words_may"
'''
original_dir = "../lrs3_may/preprocessed_videos_all"
dir = original_dir

In [2]:
# Duplicate folder for checkpoint
source_folder = "../../../models/DC-TCN/datasets/lrs3_words_original"
destination_folder = "../../../models/DC-TCN/datasets/lrs3_words_v2_retry"

if not os.path.exists(destination_folder):
    shutil.copytree(source_folder, destination_folder)
    print(f"Copied {source_folder} to {destination_folder}")
else:
    print(f"Destination folder '{destination_folder}' already exists!")


Destination folder '../../../models/DC-TCN/datasets/lrs3_words_v2_retry' already exists!


In [2]:
def create_df(dir):
    word_data_counts = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
    
    for word in os.listdir(dir):
        word_path = os.path.join(dir, word)
    
        if not os.path.isdir(word_path):
            print(f"Skipping {word}: not a directory.")
            continue
    
        for split in ['train', 'val', 'test']:
            split_path = os.path.join(word_path, split)
            
            if not os.path.isdir(split_path):
                continue
        
            file_count = sum(1 for file in os.listdir(split_path) if file.endswith(".mp4"))
            word_data_counts[word][split]["total"] = file_count

    df = pd.DataFrame(columns=["train", "val", "test"])
    
    for word, splits in word_data_counts.items():
        row = {}
        row["train"] = splits["train"]["total"]
        row["val"] = splits["val"]["total"]
        row["test"] = splits["test"]["total"]
        df.loc[word] = row
    
    df['total'] = df[['train', 'val', 'test']].sum(axis=1)
    return df

In [3]:
df = create_df(original_dir)

In [13]:
sorted_df = df.sort_index()
sorted_df['total'] = sorted_df[['train', 'val', 'test']].sum(axis=1)
sorted_df = sorted_df.sort_values(by='total', ascending=False)
print(sorted_df[['train', 'val', 'test', 'total']])

            train   val  test  total
THE         11261  2816   318  14395
TO           9084  2272   332  11688
AND          8692  2173   273  11138
THAT         6824  1706   243   8773
I            6745  1687   283   8715
...           ...   ...   ...    ...
MAXIMIZING      1     0     0      1
MAX'S           1     0     0      1
MAVERICK        1     0     0      1
MAVEN           1     0     0      1
KEITH           1     0     0      1

[15267 rows x 4 columns]


In [14]:
min = print(sorted_df['total'].min())
max = print(sorted_df['total'].max())

1
14395


In [15]:
max_value = sorted_df['total'].max()  # Get the max value from the 'total' column
for i in range(0, max_value + 1):  # Use max_value instead of max
    total = (sorted_df['total'] == i).sum()
    if total != 0:
        print(f"{i} files - {total}")

1 files - 6646
2 files - 2364
3 files - 1148
4 files - 799
5 files - 532
6 files - 422
7 files - 313
8 files - 245
9 files - 225
10 files - 185
11 files - 164
12 files - 124
13 files - 98
14 files - 101
15 files - 83
16 files - 95
17 files - 86
18 files - 77
19 files - 56
20 files - 60
21 files - 53
22 files - 41
23 files - 47
24 files - 48
25 files - 41
26 files - 44
27 files - 29
28 files - 28
29 files - 25
30 files - 21
31 files - 29
32 files - 20
33 files - 16
34 files - 26
35 files - 29
36 files - 13
37 files - 15
38 files - 22
39 files - 21
40 files - 29
41 files - 5
42 files - 19
43 files - 15
44 files - 15
45 files - 11
46 files - 20
47 files - 14
48 files - 12
49 files - 11
50 files - 8
51 files - 14
52 files - 13
53 files - 12
54 files - 11
55 files - 13
56 files - 11
57 files - 5
58 files - 10
59 files - 10
60 files - 8
61 files - 9
62 files - 6
63 files - 7
64 files - 7
65 files - 12
66 files - 10
67 files - 8
68 files - 6
69 files - 6
70 files - 2
71 files - 6
72 files - 4

In [16]:
print(f"Total of words with 0 or 1 or 2 files: {(sorted_df['total'] == 0).sum() + (sorted_df['total'] == 1).sum() +  (sorted_df['total'] == 2).sum()}")

Total of words with 0 or 1 or 2 files: 9010


In [17]:
# drop/delete the word folders that only contain 1 or 2 files
source_dir = "../lrs3_may/preprocessed_videos_all"
target_dir = "../lrs3_may/preprocessed_videos_dropped"

os.makedirs(target_dir, exist_ok=True)

words_to_delete = sorted_df[sorted_df['total'].isin([0, 1, 2])].index
print(words_to_delete)
moved = 0

for word in words_to_delete:
    folder_path = os.path.join(source_dir, word)

    if os.path.exists(folder_path):
        target_folder_path = os.path.join(target_dir, word)
        shutil.move(folder_path, target_folder_path)
        print(f"Moved {word} to {target_folder_path}")
        moved += 1
print(f"total moved: {moved}")

Index(['BIOMIMICRY', 'NEOCORTEX', 'AMANATULLAH', 'IPODS', 'SOFA', 'SLICK',
       'OBSERVATIONAL', 'IPCC', 'ROBOTOSPHERE', 'LOYALTY',
       ...
       'MAZATL', 'BODILY', 'BOGGLING', 'BOGUS', 'BOHR', 'MAXIMIZING', 'MAX'S',
       'MAVERICK', 'MAVEN', 'KEITH'],
      dtype='object', length=9010)
Moved BIOMIMICRY to ../lrs3_may/preprocessed_videos_dropped/BIOMIMICRY
Moved NEOCORTEX to ../lrs3_may/preprocessed_videos_dropped/NEOCORTEX
Moved AMANATULLAH to ../lrs3_may/preprocessed_videos_dropped/AMANATULLAH
Moved IPODS to ../lrs3_may/preprocessed_videos_dropped/IPODS
Moved SOFA to ../lrs3_may/preprocessed_videos_dropped/SOFA
Moved SLICK to ../lrs3_may/preprocessed_videos_dropped/SLICK
Moved OBSERVATIONAL to ../lrs3_may/preprocessed_videos_dropped/OBSERVATIONAL
Moved IPCC to ../lrs3_may/preprocessed_videos_dropped/IPCC
Moved ROBOTOSPHERE to ../lrs3_may/preprocessed_videos_dropped/ROBOTOSPHERE
Moved LOYALTY to ../lrs3_may/preprocessed_videos_dropped/LOYALTY
Moved MUTUALLY to ../lrs3_may/pre

In [18]:
new_dir = "../lrs3_may/preprocessed_videos_all"
v2_df = create_df(new_dir)

In [19]:
# Compare original vs v2

In [20]:
# Train Val Test Counts for Original Dataset
print("\nOriginal Word and File Counts per Subfolder:")
print(sorted_df[['train', 'val', 'test', 'total']])


Original Word and File Counts per Subfolder:
            train   val  test  total
THE         11261  2816   318  14395
TO           9084  2272   332  11688
AND          8692  2173   273  11138
THAT         6824  1706   243   8773
I            6745  1687   283   8715
...           ...   ...   ...    ...
MAXIMIZING      1     0     0      1
MAX'S           1     0     0      1
MAVERICK        1     0     0      1
MAVEN           1     0     0      1
KEITH           1     0     0      1

[15267 rows x 4 columns]


In [21]:
# Train Val Test Counts for V2 Dataset
print("\nV2 Word and File Counts per Subfolder:")
v2_sorted_df = v2_df.sort_index()
v2_sorted_df['total'] = v2_sorted_df[['train', 'val', 'test']].sum(axis=1)
v2_sorted_df = v2_sorted_df.sort_values(by='total', ascending=False)
print(v2_sorted_df[['train', 'val', 'test', 'total']])


V2 Word and File Counts per Subfolder:
           train   val  test  total
THE        11261  2816   318  14395
TO          9084  2272   332  11688
AND         8692  2173   273  11138
THAT        6824  1706   243   8773
I           6745  1687   283   8715
...          ...   ...   ...    ...
CRUSHING       1     1     1      3
FORMATION      2     1     0      3
FORREST        2     1     0      3
SHATTERED      2     1     0      3
COLDEST        1     1     1      3

[6257 rows x 4 columns]


In [22]:
# Total of each set
total_train = sorted_df['train'].sum()
total_val = sorted_df['val'].sum()
total_test = sorted_df['test'].sum()

print(f"Total OG train: {total_train}")
print(f"Total OG val: {total_val}")
print(f"Total OG test: {total_test}")

v2_total_train = v2_sorted_df['train'].sum()
v2_total_val = v2_sorted_df['val'].sum()
v2_total_test = v2_sorted_df['test'].sum()

print(f"\nTotal V2 train: {v2_total_train}")
print(f"Total V2 val: {v2_total_val}")
print(f"Total V2 test: {v2_total_test}")

Total OG train: 283034
Total OG val: 73735
Total OG test: 9890

Total V2 train: 274112
Total V2 val: 71452
Total V2 test: 9721


In [23]:
# The number of words that are empty in a set
total_train_zeros = (sorted_df['train'] == 0).sum()
total_val_zeros = (sorted_df['val'] == 0).sum()
total_test_zeros = (sorted_df['test'] == 0).sum()

print(f"Total OG train = 0: {total_train_zeros}")
print(f"Total OG val = 0: {total_val_zeros}")
print(f"Total OG test = 0: {total_test_zeros}")

v2_total_train_zeros = (v2_sorted_df['train'] == 0).sum()
v2_total_val_zeros = (v2_sorted_df['val'] == 0).sum()
v2_total_test_zeros = (v2_sorted_df['test'] == 0).sum()

print(f"\nTotal V2 train = 0: {v2_total_train_zeros}")
print(f"Total V2 val = 0: {v2_total_val_zeros}")
print(f"Total V2 test = 0: {v2_total_test_zeros}")

Total OG train = 0: 89
Total OG val = 0: 6730
Total OG test = 0: 13397

Total V2 train = 0: 1
Total V2 val = 0: 3
Total V2 test = 0: 4553


In [24]:
# The number of words that have only 1 or 2 files
print(f"Total of words in OG with 1 or 2 files: {(sorted_df['total'] == 1).sum() +  (sorted_df['total'] == 2).sum()}")
print(f"Total of words in v2 with 1 or 2 files: {(v2_sorted_df['total'] == 1).sum() +  (v2_sorted_df['total'] == 2).sum()}")

Total of words in OG with 1 or 2 files: 9010
Total of words in v2 with 1 or 2 files: 0


In [25]:
# Number of words
print(f"Number of words in OG: {sorted_df.shape[0]}")
print(f"Number of words in V2: {v2_sorted_df.shape[0]}")

Number of words in OG: 15267
Number of words in V2: 6257


In [26]:
# Combine trainval again
moved = 0

# Loop through each word
for word in os.listdir(dir):
    # Define folder paths for train and val
    train_folder_path = os.path.join(dir, word, 'train')
    val_folder_path = os.path.join(dir, word, 'val')

    # Check if val folder exists and has files to move
    if os.path.exists(val_folder_path):
        # Check if train folder exists, if not create it
        if not os.path.exists(train_folder_path):
            os.makedirs(train_folder_path)

        # Move all files from val folder to train folder
        for item in os.listdir(val_folder_path):
            val_item_path = os.path.join(val_folder_path, item)
            train_item_path = os.path.join(train_folder_path, item)

            # Move the file
            if os.path.isdir(val_item_path):
                shutil.move(val_item_path, train_item_path)
            else:
                shutil.move(val_item_path, train_item_path)

        print(f"Moved all files from {word}/val to {word}/train")
        moved += 1

print(f"Total words processed: {moved}")


Moved all files from VIEWED/val to VIEWED/train
Moved all files from RECENTLY/val to RECENTLY/train
Moved all files from ARRIVE/val to ARRIVE/train
Moved all files from THEIR/val to THEIR/train
Moved all files from ARABIC/val to ARABIC/train
Moved all files from CHEMOTHERAPY/val to CHEMOTHERAPY/train
Moved all files from BOLD/val to BOLD/train
Moved all files from PASSING/val to PASSING/train
Moved all files from PARTICULAR/val to PARTICULAR/train
Moved all files from BASKET/val to BASKET/train
Moved all files from INFORMED/val to INFORMED/train
Moved all files from ANTHROPOLOGY/val to ANTHROPOLOGY/train
Moved all files from KILL/val to KILL/train
Moved all files from GLAND/val to GLAND/train
Moved all files from SHAKING/val to SHAKING/train
Moved all files from SUM/val to SUM/train
Moved all files from SYRIA/val to SYRIA/train
Moved all files from RESEARCHING/val to RESEARCHING/train
Moved all files from INNOVATOR/val to INNOVATOR/train
Moved all files from SHOCK/val to SHOCK/train
Mo

In [8]:
updated_df = create_df(dir)

In [9]:
# Get the words that have total = 3
# Put all in train
words_with_3 = updated_df[(updated_df['total'] == 3)]
print(len(words_with_3))

1121


In [11]:
import os
import shutil

moved = 0

for word in words_with_3.index:
    train_folder_path = os.path.join(dir, word, 'train')
    val_folder_path = os.path.join(dir, word, 'val')
    test_folder_path = os.path.join(dir, word, 'test')

    os.makedirs(train_folder_path, exist_ok=True)
    os.makedirs(val_folder_path, exist_ok=True)
    os.makedirs(test_folder_path, exist_ok=True)

    total_files = 0
    if os.path.exists(train_folder_path):
        total_files += len([f for f in os.listdir(train_folder_path) if f.endswith('.mp4') and os.path.isfile(os.path.join(train_folder_path, f))])

    if os.path.exists(val_folder_path):
        total_files += len([f for f in os.listdir(val_folder_path) if f.endswith('.mp4') and os.path.isfile(os.path.join(val_folder_path, f))])

    if os.path.exists(test_folder_path):
        total_files += len([f for f in os.listdir(test_folder_path) if f.endswith('.mp4') and os.path.isfile(os.path.join(test_folder_path, f))])

    # If the total number of .npz files across all sets is exactly 3, move everything to the train folder
    if total_files == 3:

        # Move .npz files from val folder to train folder
        if os.path.exists(val_folder_path):
            for item in os.listdir(val_folder_path):
                val_item_path = os.path.join(val_folder_path, item)
                train_item_path = os.path.join(train_folder_path, item)

                if os.path.isfile(val_item_path):
                    shutil.move(val_item_path, train_item_path)
                    print(f"Moved {item} from {word}/val to {word}/train")

        # Move .npz files from test folder to train folder (if any)
        if os.path.exists(test_folder_path):
            for item in os.listdir(test_folder_path):
                test_item_path = os.path.join(test_folder_path, item)
                train_item_path = os.path.join(train_folder_path, item)

                if os.path.isfile(test_item_path):
                    shutil.move(test_item_path, train_item_path)
                    print(f"Moved {item} from {word}/test to {word}/train")

        moved += 1
        print(f"Moved all 3 .npz files from {word}/train, {word}/val, and {word}/test to {word}/train")

print(f"Total words processed: {moved}")


Moved all 3 .npz files from ARABIC/train, ARABIC/val, and ARABIC/test to ARABIC/train
Moved all 3 .npz files from ANTHROPOLOGY/train, ANTHROPOLOGY/val, and ANTHROPOLOGY/test to ANTHROPOLOGY/train
Moved all 3 .npz files from GLAND/train, GLAND/val, and GLAND/test to GLAND/train
Moved all 3 .npz files from SHAKING/train, SHAKING/val, and SHAKING/test to SHAKING/train
Moved all 3 .npz files from SYMPHONY/train, SYMPHONY/val, and SYMPHONY/test to SYMPHONY/train
Moved all 3 .npz files from DECEPTION/train, DECEPTION/val, and DECEPTION/test to DECEPTION/train
Moved all 3 .npz files from ANSWERING/train, ANSWERING/val, and ANSWERING/test to ANSWERING/train
Moved all 3 .npz files from BRIDE/train, BRIDE/val, and BRIDE/test to BRIDE/train
Moved all 3 .npz files from CLAPPING/train, CLAPPING/val, and CLAPPING/test to CLAPPING/train
Moved all 3 .npz files from APPRENTICESHIP/train, APPRENTICESHIP/val, and APPRENTICESHIP/test to APPRENTICESHIP/train
Moved all 3 .npz files from FREQUENTLY/train, FR

In [3]:
updated_df = create_df(dir)

In [4]:
# The words that have an empty set (either train, val, or test)
words_with_zeros = updated_df[(updated_df['train'] == 0) | (updated_df['val'] == 0) | (updated_df['test'] == 0)]
pd.set_option('display.max_rows', None)
print(words_with_zeros)

                  train  val  test  total
VIEWED                5    0     0      5
RECENTLY             31    0     0     31
ARRIVE                8    0     0      8
THEIR               838    0    14    852
ARABIC                3    0     0      3
CHEMOTHERAPY          5    0     1      6
BOLD                  4    0     0      4
PASSING              11    0     0     11
PARTICULAR           34    0     1     35
BASKET                7    0     0      7
INFORMED              9    0     1     10
ANTHROPOLOGY          3    0     0      3
KILL                 32    0     2     34
GLAND                 3    0     0      3
SHAKING               3    0     0      3
SUM                  22    0     0     22
SYRIA                 5    0     0      5
RESEARCHING          10    0     0     10
INNOVATOR             7    0     1      8
SHOCK                 6    0     0      6
SYMPHONY              3    0     0      3
FUNERAL               9    0     0      9
CHOSE                16    0     0

In [5]:
# 7. All the words that have total = 4, and 3 is in train
# make it put 1 from train in the empty set
words_4_files_3_train = words_with_zeros[(words_with_zeros['total'] == 4) & (words_with_zeros['train'] == 3) ]
print(f"Number of words: {len(words_4_files_3_train)}")
print(words_4_files_3_train)

Number of words: 70
               train  val  test  total
LUTHER             3    0     1      4
GEIGER             3    0     1      4
METHANE            3    0     1      4
WEALTHY            3    0     1      4
SICKNESS           3    0     1      4
UNCONDITIONAL      3    0     1      4
UNEXPECTED         3    0     1      4
FOOTPRINT          3    0     1      4
EARNED             3    0     1      4
DICTATOR           3    0     1      4
BEIJING            3    0     1      4
VIETNAM            3    0     1      4
OUTRAGED           3    0     1      4
NONSENSE           3    0     1      4
IVY                3    0     1      4
THIRTY             3    0     1      4
STICKY             3    0     1      4
DAN                3    0     1      4
HEALER             3    0     1      4
COMPLAIN           3    0     1      4
COPYING            3    0     1      4
JUNK               3    0     1      4
DADDY              3    0     1      4
INVADED            3    0     1      4
ANNOY

In [7]:
moved = 0

for word in words_4_files_3_train.index:
    train_path = os.path.join(dir, word, 'train')
    val_path = os.path.join(dir, word, 'val')
    test_path = os.path.join(dir, word, 'test')

    # Determine which set is empty
    target_path = None
    if len(os.listdir(val_path)) == 0:
        target_path = val_path
    elif len(os.listdir(test_path)) == 0:
        target_path = test_path

    if target_path and os.path.exists(train_path):
        train_files = [f for f in os.listdir(train_path) if f.endswith('.mp4')]

        if len(train_files) == 3:
            # Select 1 file to move
            file_to_move = random.choice(train_files)

            src_npz = os.path.join(train_path, file_to_move)
            dst_npz = os.path.join(target_path, file_to_move)
            shutil.move(src_npz, dst_npz)
            # shutil.move(src_npz.replace(".npz", ".txt"), dst_npz.replace(".npz", ".txt"))

            print(f"Moved {file_to_move} from {train_path} to {target_path}")
            moved += 1

print(f"Total files moved: {moved}")


Moved LUTHER_eMdh57x1UBQ_50003.mp4 from ../lrs3_may/preprocessed_videos_all/LUTHER/train to ../lrs3_may/preprocessed_videos_all/LUTHER/val
Moved GEIGER_BoRUrWcdkQ4_50010.mp4 from ../lrs3_may/preprocessed_videos_all/GEIGER/train to ../lrs3_may/preprocessed_videos_all/GEIGER/val
Moved METHANE_wsIrdxDJpSQ_50004.mp4 from ../lrs3_may/preprocessed_videos_all/METHANE/train to ../lrs3_may/preprocessed_videos_all/METHANE/val
Moved WEALTHY_aeiSXpmMQys_50020.mp4 from ../lrs3_may/preprocessed_videos_all/WEALTHY/train to ../lrs3_may/preprocessed_videos_all/WEALTHY/val
Moved SICKNESS_QISHX5UKky0_50003.mp4 from ../lrs3_may/preprocessed_videos_all/SICKNESS/train to ../lrs3_may/preprocessed_videos_all/SICKNESS/val
Moved UNCONDITIONAL_A71OktxTPac_50016.mp4 from ../lrs3_may/preprocessed_videos_all/UNCONDITIONAL/train to ../lrs3_may/preprocessed_videos_all/UNCONDITIONAL/val
Moved UNEXPECTED_lnvkNGc6YYM_50005.mp4 from ../lrs3_may/preprocessed_videos_all/UNEXPECTED/train to ../lrs3_may/preprocessed_videos_a

In [8]:
# Check updated
updated_df = create_df(dir)
updated_words_4_files_3_train = updated_df[(updated_df['total'] == 4) & (updated_df['train'] == 3) ]
print(updated_words_4_files_3_train)
print(f"Number of words: {len(updated_words_4_files_3_train)}")

Empty DataFrame
Columns: [train, val, test, total]
Index: []
Number of words: 0


In [9]:
# 9. All the words that have total = 4, and 2 is in train and 0 in val
# put 1 from test into val
# 10. All the words that have total = 4, and 2 is in train and 0 in test
# put 1 from val into test
words_4_files_2_test_2val = updated_df[(updated_df['total'] == 4) & (updated_df['train'] == 2) & (updated_df['test'] == 0) | (updated_df['total'] == 4) & (updated_df['train'] == 2) & (updated_df['val'] == 0)]
print(words_4_files_2_test_2val)
print(f"Number of words: {len(words_4_files_2_test_2val)}")

              train  val  test  total
ASPIRE            2    0     2      4
UPGRADE           2    0     2      4
JEFFERSON         2    0     2      4
CAPTURING         2    0     2      4
MANIPULATION      2    0     2      4
INTERFACE         2    0     2      4
MOMENTUM          2    0     2      4
HAMILTON          2    0     2      4
Number of words: 8


In [11]:
moved = 0

for word in words_4_files_2_test_2val.index:
    test_path = os.path.join(dir, word, 'test')
    val_path = os.path.join(dir, word, 'val')

    if os.path.exists(test_path) and os.path.exists(val_path):
        test_files = [f for f in os.listdir(test_path) if f.endswith('.mp4')]
        val_files = [f for f in os.listdir(val_path) if f.endswith('.mp4')]

        # Case 1: If train = 2 and val = 0, move 1 file from test to val
        if len(test_files) == 2 and len(val_files) == 0:
            file_to_move = random.choice(test_files)
            
            src_npz = os.path.join(test_path, file_to_move)
            dst_npz = os.path.join(val_path, file_to_move)
            shutil.move(src_npz, dst_npz)
            #shutil.move(src_npz.replace(".npz", ".txt"), dst_npz.replace(".npz", ".txt"))

            print(f"Moved {file_to_move} from {test_path} to {val_path}")
            moved += 1

        # Case 2: If train = 2 and test = 0, move 1 file from val to test
        elif len(val_files) == 2 and len(test_files) == 0:
            file_to_move = random.choice(val_files)
            
            src_npz = os.path.join(val_path, file_to_move)
            dst_npz = os.path.join(test_path, file_to_move)
            shutil.move(src_npz, dst_npz)
            #shutil.move(src_npz.replace(".npz", ".txt"), dst_npz.replace(".npz", ".txt"))

            print(f"Moved {file_to_move} from {val_path} to {test_path}")
            moved += 1

print(f"Total files moved: {moved}")


Moved ASPIRE_YzGjO5aHShQ_00007.mp4 from ../lrs3_may/preprocessed_videos_all/ASPIRE/test to ../lrs3_may/preprocessed_videos_all/ASPIRE/val
Moved UPGRADE_fxbCHn6gE3U_00005.mp4 from ../lrs3_may/preprocessed_videos_all/UPGRADE/test to ../lrs3_may/preprocessed_videos_all/UPGRADE/val
Moved JEFFERSON_3uSQlcGCHUU_00017.mp4 from ../lrs3_may/preprocessed_videos_all/JEFFERSON/test to ../lrs3_may/preprocessed_videos_all/JEFFERSON/val
Moved CAPTURING_mc0vhSseGk4_00001.mp4 from ../lrs3_may/preprocessed_videos_all/CAPTURING/test to ../lrs3_may/preprocessed_videos_all/CAPTURING/val
Moved MANIPULATION_AO4In7d6XSc_00001.mp4 from ../lrs3_may/preprocessed_videos_all/MANIPULATION/test to ../lrs3_may/preprocessed_videos_all/MANIPULATION/val
Moved INTERFACE_uzKBGtf0i0M_00004.mp4 from ../lrs3_may/preprocessed_videos_all/INTERFACE/test to ../lrs3_may/preprocessed_videos_all/INTERFACE/val
Moved MOMENTUM_VIgzTLDyObo_00001.mp4 from ../lrs3_may/preprocessed_videos_all/MOMENTUM/test to ../lrs3_may/preprocessed_vide

In [12]:
# Check updated
updated_df = create_df(dir)
updated_words_4_files_2_test_2val = updated_df[(updated_df['total'] == 4) & (updated_df['train'] == 2) & (updated_df['test'] == 0) | (updated_df['total'] == 4) & (updated_df['train'] == 2) & (updated_df['val'] == 0)]
print(updated_words_4_files_2_test_2val)
print(f"Number of words: {len(updated_words_4_files_2_test_2val)}")

Empty DataFrame
Columns: [train, val, test, total]
Index: []
Number of words: 0


In [13]:
# 11. All the words that have total = 4, and 1 is in train
# move 1 from val to train and 1 to test
words_4_files_1_train = updated_df[(updated_df['total'] == 4) & (updated_df['train'] == 1)]
print(words_4_files_1_train)
print(f"Number of words: {len(words_4_files_1_train)}")

Empty DataFrame
Columns: [train, val, test, total]
Index: []
Number of words: 0


In [None]:
moved = 0

for word in words_4_files_1_train.index:
    val_path = os.path.join(dir, word, 'val')
    train_path = os.path.join(dir, word, 'train')
    test_path = os.path.join(dir, word, 'test')

    if os.path.exists(val_path):
        val_files = [f for f in os.listdir(val_path) if f.endswith('.mp4')]
        
        if len(val_files) >= 2:
            os.makedirs(train_path, exist_ok=True)
            os.makedirs(test_path, exist_ok=True)

            # Move one file to train
            file_to_train = val_files.pop(random.randint(0, len(val_files) - 1))
            shutil.move(os.path.join(val_path, file_to_train), os.path.join(train_path, file_to_train))
            #shutil.move(os.path.join(val_path, file_to_train.replace(".npz", ".txt")),
                        #os.path.join(train_path, file_to_train.replace(".npz", ".txt")))
            
            # Move one file to test
            file_to_test = val_files.pop()
            shutil.move(os.path.join(val_path, file_to_test), os.path.join(test_path, file_to_test))
            #shutil.move(os.path.join(val_path, file_to_test.replace(".npz", ".txt")),
                        #os.path.join(test_path, file_to_test.replace(".npz", ".txt")))

            moved += 1

print(f"Total files moved: {moved}")

In [14]:
# Check updated
updated_df = create_df(dir)
updated_words_4_files_1_train = updated_df[(updated_df['total'] == 4) & (updated_df['train'] == 1)]
print(updated_words_4_files_1_train)
print(f"Number of words: {len(updated_words_4_files_1_train)}")

Empty DataFrame
Columns: [train, val, test, total]
Index: []
Number of words: 0


In [15]:
# 12. All the words that have total = 5, and 4 is in train
# put 1 from train into the empty set
words_5_files_4_train = words_with_zeros[(words_with_zeros['total'] == 5) & (words_with_zeros['train'] == 4)]
print(words_5_files_4_train)
print(f"Number of words: {len(words_5_files_4_train)}")

                train  val  test  total
STAGGERING          4    0     1      5
HUMANITARIAN        4    0     1      5
DEMANDING           4    0     1      5
MOORE'S             4    0     1      5
COMPARISON          4    0     1      5
HARNESS             4    0     1      5
PROGRESSIVE         4    0     1      5
MISSES              4    0     1      5
BOSNIA              4    0     1      5
EXAMINED            4    0     1      5
TANK                4    0     1      5
AUDIO               4    0     1      5
ADOPT               4    0     1      5
NAZI                4    0     1      5
CRACKED             4    0     1      5
COW                 4    0     1      5
CRITICALLY          4    0     1      5
SLOGAN              4    0     1      5
LUMP                4    0     1      5
SILICON             4    0     1      5
REPLACEMENT         4    0     1      5
GREENHOUSE          4    0     1      5
PREDATOR            4    0     1      5
DREAMED             4    0     1      5


In [16]:
moved = 0

for word in words_5_files_4_train.index:
    train_path = os.path.join(dir, word, 'train')
    val_path = os.path.join(dir, word, 'val')
    test_path = os.path.join(dir, word, 'test')

    train_files = [f for f in os.listdir(train_path) if f.endswith('.mp4')]
    val_files = [f for f in os.listdir(val_path) if f.endswith('.mp4')]
    test_files = [f for f in os.listdir(test_path) if f.endswith('.mp4')]

    if len(train_files) >= 1:
        file_to_move = random.choice(train_files)

        # Move to the empty set (prioritize val, then test)
        if len(val_files) == 0:
            dst_path = val_path
        elif len(test_files) == 0:
            dst_path = test_path
        else:
            continue  # Skip if both val and test are non-empty

        # Ensure the destination directory exists
        os.makedirs(dst_path, exist_ok=True)

        # Move both .npz and .txt files
        shutil.move(os.path.join(train_path, file_to_move), os.path.join(dst_path, file_to_move))
        #shutil.move(os.path.join(train_path, file_to_move.replace(".npz", ".txt")), 
                    #os.path.join(dst_path, file_to_move.replace(".npz", ".txt")))

        print(f"Moved {file_to_move} from {train_path} to {dst_path}")
        moved += 1

print(f"Total files moved: {moved}")

Moved STAGGERING_BBSVLGf7zPI_50001.mp4 from ../lrs3_may/preprocessed_videos_all/STAGGERING/train to ../lrs3_may/preprocessed_videos_all/STAGGERING/val
Moved HUMANITARIAN_4N5N0pQUFbI_50001.mp4 from ../lrs3_may/preprocessed_videos_all/HUMANITARIAN/train to ../lrs3_may/preprocessed_videos_all/HUMANITARIAN/val
Moved DEMANDING_Df2JBnql8lc_50005.mp4 from ../lrs3_may/preprocessed_videos_all/DEMANDING/train to ../lrs3_may/preprocessed_videos_all/DEMANDING/val
Moved MOORE'S_APgMXDjLcT4_50006_2.mp4 from ../lrs3_may/preprocessed_videos_all/MOORE'S/train to ../lrs3_may/preprocessed_videos_all/MOORE'S/val
Moved COMPARISON_qQDC7c2I8l8_50006.mp4 from ../lrs3_may/preprocessed_videos_all/COMPARISON/train to ../lrs3_may/preprocessed_videos_all/COMPARISON/val
Moved HARNESS_0FQXicAGy5U_50008.mp4 from ../lrs3_may/preprocessed_videos_all/HARNESS/train to ../lrs3_may/preprocessed_videos_all/HARNESS/val
Moved PROGRESSIVE_tTvIEUGCigk_50003.mp4 from ../lrs3_may/preprocessed_videos_all/PROGRESSIVE/train to ../lr

In [17]:
# Check updated
updated_df = create_df(dir)
updated_words_5_files_4_train = updated_df[(updated_df['total'] == 5) & (updated_df['train'] == 4)]
print(updated_words_5_files_4_train)
print(f"Number of words: {len(updated_words_5_files_4_train)}")

Empty DataFrame
Columns: [train, val, test, total]
Index: []
Number of words: 0


In [18]:
# 13. All the words that have total = 5, and 3 is in train and test is empty
# put 1 from val into test
# 14. All the words that have total = 5, and 3 is in train and val is empty
# put 1 from test into val
words_5_files_0_testval = words_with_zeros[(updated_df['total'] == 5) & (updated_df['train'] == 3) & (updated_df['test'] == 0) | (updated_df['total'] == 5) & (updated_df['train'] == 3) & (updated_df['val'] == 0)]
print(words_5_files_0_testval)
print(f"Number of words: {len(words_5_files_0_testval)}")

               train  val  test  total
SUPPORTED          3    0     2      5
WORKFORCE          3    0     2      5
CONTRADICTION      3    0     2      5
Number of words: 3


In [19]:
moved = 0

for word in words_5_files_0_testval.index:
    val_path = os.path.join(dir, word, 'val')
    test_path = os.path.join(dir, word, 'test')

    val_files = [f for f in os.listdir(val_path) if f.endswith('.mp4')]
    test_files = [f for f in os.listdir(test_path) if f.endswith('.mp4')]

    # Case 1: Test is empty → Move 1 file from val to test
    if len(test_files) == 0 and len(val_files) == 2:
        file_to_move = random.choice(val_files)

        os.makedirs(test_path, exist_ok=True)

        shutil.move(os.path.join(val_path, file_to_move), os.path.join(test_path, file_to_move))
        #shutil.move(os.path.join(val_path, file_to_move.replace(".npz", ".txt")), 
                   # os.path.join(test_path, file_to_move.replace(".npz", ".txt")))

        print(f"Moved {file_to_move} from {val_path} to {test_path}")
        moved += 1

    # Case 2: Val is empty → Move 1 file from test to val
    elif len(val_files) == 0 and len(test_files) == 2:
        file_to_move = random.choice(test_files)

        os.makedirs(val_path, exist_ok=True)  # Ensure val folder exists

        shutil.move(os.path.join(test_path, file_to_move), os.path.join(val_path, file_to_move))
        #shutil.move(os.path.join(test_path, file_to_move.replace(".npz", ".txt")), 
                   # os.path.join(val_path, file_to_move.replace(".npz", ".txt")))

        print(f"Moved {file_to_move} from {test_path} to {val_path}")
        moved += 1

print(f"Total files moved: {moved}")


Moved SUPPORTED_87AEeLpodnE_00001.mp4 from ../lrs3_may/preprocessed_videos_all/SUPPORTED/test to ../lrs3_may/preprocessed_videos_all/SUPPORTED/val
Moved WORKFORCE_AegIbt2j0sU_00001.mp4 from ../lrs3_may/preprocessed_videos_all/WORKFORCE/test to ../lrs3_may/preprocessed_videos_all/WORKFORCE/val
Moved CONTRADICTIONS_FxtSMZKMdes_00005.mp4 from ../lrs3_may/preprocessed_videos_all/CONTRADICTION/test to ../lrs3_may/preprocessed_videos_all/CONTRADICTION/val
Total files moved: 3


In [20]:
updated_df = create_df(dir)
updated_words_5_files_0_testval = updated_df[(updated_df['total'] == 5) & (updated_df['train'] == 3) & (updated_df['test'] == 0) | (updated_df['total'] == 5) & (updated_df['train'] == 3) & (updated_df['val'] == 0)]
print(updated_words_5_files_0_testval)
print(f"Number of words: {len(updated_words_5_files_0_testval)}")

Empty DataFrame
Columns: [train, val, test, total]
Index: []
Number of words: 0


In [21]:
# 14. All the words that have total = 5, and 2 is in train
# put 1 from val to train and 1 to test
words_5_files_2_train = updated_df[(updated_df['total'] == 5) & (updated_df['train'] == 2)]
print(f"Number of words: {len(words_5_files_2_train)}")
print(words_5_files_2_train)

Number of words: 0
Empty DataFrame
Columns: [train, val, test, total]
Index: []


In [None]:
moved_to_train = 0
moved_to_test = 0

for word in words_5_files_2_train.index:
    val_path = os.path.join(dir, word, 'val')
    train_path = os.path.join(dir, word, 'train')
    test_path = os.path.join(dir, word, 'test')

    val_files = [f for f in os.listdir(val_path) if f.endswith('.npz')] if os.path.exists(val_path) else []

    # Move 1 file from val → train
    if len(val_files) > 1:  # Ensure at least 2 files in val
        file_to_move = random.choice(val_files)

        os.makedirs(train_path, exist_ok=True)  # Ensure train folder exists

        shutil.move(os.path.join(val_path, file_to_move), os.path.join(train_path, file_to_move))
        shutil.move(os.path.join(val_path, file_to_move.replace(".npz", ".txt")), 
                    os.path.join(train_path, file_to_move.replace(".npz", ".txt")))

        print(f"Moved {file_to_move} from {val_path} to {train_path}")
        moved_to_train += 1

        val_files.remove(file_to_move)  # Update remaining val files

    # Move 1 file from val → test
    if len(val_files) > 0:  # Ensure at least 1 file remains in val
        file_to_move = random.choice(val_files)

        os.makedirs(test_path, exist_ok=True)  # Ensure test folder exists

        shutil.move(os.path.join(val_path, file_to_move), os.path.join(test_path, file_to_move))
        shutil.move(os.path.join(val_path, file_to_move.replace(".npz", ".txt")), 
                    os.path.join(test_path, file_to_move.replace(".npz", ".txt")))

        print(f"Moved {file_to_move} from {val_path} to {test_path}")
        moved_to_test += 1

print(f"Total files moved: {moved_to_train} to train, {moved_to_test} to test")


In [22]:
# Check updated
updated_df = create_df(dir)
updated_words_5_files_2_train = updated_df[(updated_df['total'] == 5) & (updated_df['train'] == 2)]
print(f"Number of words: {len(updated_words_5_files_2_train)}")
print(updated_words_5_files_2_train)

Number of words: 0
Empty DataFrame
Columns: [train, val, test, total]
Index: []


In [23]:
# Check where test set >= train
# Move 1 to train and 1 to val
updated_words_test_higher = updated_df[(updated_df['test'] >= updated_df['train'])]
print(f"Number of words: {len(updated_words_test_higher)}")
print(updated_words_test_higher)

Number of words: 1
         train  val  test  total
TEXTING      3    0     3      6


In [37]:
moved_to_train = 0
moved_to_val = 0

for word in updated_words_test_higher.index:
    val_path = os.path.join(dir, word, 'val')
    train_path = os.path.join(dir, word, 'train')
    test_path = os.path.join(dir, word, 'test')

    test_files = [f for f in os.listdir(test_path) if f.endswith('.mp4')] if os.path.exists(test_path) else []
    '''
    # Move 1 file from test → train
    if len(test_path) > 1:  # Ensure at least 2 files in test
        file_to_move = random.choice(test_files)

        os.makedirs(train_path, exist_ok=True)

        shutil.move(os.path.join(test_path, file_to_move), os.path.join(train_path, file_to_move))
        #shutil.move(os.path.join(test_path, file_to_move.replace(".npz", ".txt")), 
                    #os.path.join(train_path, file_to_move.replace(".npz", ".txt")))

        print(f"Moved {file_to_move} from {test_path} to {train_path}")
        moved_to_train += 1

        test_files.remove(file_to_move)  # Update remaining val files
    '''
    # Move 1 file from test → val
    if len(test_path) > 0:  # Ensure at least 1 file remains in test
        file_to_move = random.choice(test_files)

        os.makedirs(val_path, exist_ok=True)

        shutil.move(os.path.join(test_path, file_to_move), os.path.join(val_path, file_to_move))
        #shutil.move(os.path.join(test_path, file_to_move.replace(".npz", ".txt")), 
                    #os.path.join(val_path, file_to_move.replace(".npz", ".txt")))

        print(f"Moved {file_to_move} from {test_path} to {val_path}")
        test_files += 1
    
print(f"Total files moved: {moved_to_train} to train, {moved_to_val} to test")


IndexError: list index out of range

In [40]:
# Check updated
updated_df = create_df(dir)
updated_words_test_higher = updated_df[(updated_df['test'] >= updated_df['train'])]
print(f"Number of words: {len(updated_words_test_higher)}")
print(updated_words_test_higher)

Number of words: 0
Empty DataFrame
Columns: [train, val, test, total]
Index: []


In [41]:
print(updated_df.loc["TEXTING"])

train    4
val      1
test     1
total    6
Name: TEXTING, dtype: int64


In [43]:
# Duplicate folder for checkpoint
source_folder = "../lrs3_may/preprocessed_videos_all"
destination_folder = "../lrs3_may/preprocessed_videos_all_copy"

if not os.path.exists(destination_folder):
    shutil.copytree(source_folder, destination_folder)
    print(f"Copied {source_folder} to {destination_folder}")
else:
    print(f"Destination folder '{destination_folder}' already exists!")


Copied ../lrs3_may/preprocessed_videos_all to ../lrs3_may/preprocessed_videos_all_copy


In [42]:
# Skip words with proper splits
updated_skip_words = updated_df[
    ((updated_df['total'] == 4) & (updated_df['train'] == 2) & (updated_df['val'] == 1)) |
    ((updated_df['total'] == 5) & (updated_df['train'] == 3) & (updated_df['val'] == 1)) |
    ((updated_df['total'] == 6) & (updated_df['train'] == 4) & (updated_df['val'] == 1))
]
print(updated_skip_words)

                train  val  test  total
STAGGERING          3    1     1      5
HUMANITARIAN        3    1     1      5
TEXTING             4    1     1      6
LUTHER              2    1     1      4
GEIGER              2    1     1      4
DEMANDING           3    1     1      5
METHANE             2    1     1      4
MOORE'S             3    1     1      5
COMPARISON          3    1     1      5
WEALTHY             2    1     1      4
SICKNESS            2    1     1      4
ASPIRE              2    1     1      4
UPGRADE             2    1     1      4
UNCONDITIONAL       2    1     1      4
UNEXPECTED          2    1     1      4
HARNESS             3    1     1      5
FOOTPRINT           2    1     1      4
PROGRESSIVE         3    1     1      5
MISSES              3    1     1      5
EARNED              2    1     1      4
DICTATOR            2    1     1      4
BEIJING             2    1     1      4
VIETNAM             2    1     1      4
BOSNIA              3    1     1      5


In [45]:
for word in os.listdir(dir):  
    if word in updated_skip_words.index:  # Skip words that are in the skip list
        print(f"Skipping {word}, already properly split.")
        continue

    train_folder = os.path.join(dir, word, 'train')
    val_folder = os.path.join(dir, word, 'val')
    test_folder = os.path.join(dir, word, 'test')

    os.makedirs(train_folder, exist_ok=True)
    os.makedirs(val_folder, exist_ok=True)
    os.makedirs(test_folder, exist_ok=True)
    # Collect all .npz files
    train_files = [os.path.join(train_folder, f) for f in os.listdir(train_folder) if f.endswith('.mp4')]
    val_files = [os.path.join(val_folder, f) for f in os.listdir(val_folder) if f.endswith('.mp4')]
    test_files = [os.path.join(test_folder, f) for f in os.listdir(test_folder) if f.endswith('.mp4')]

    # Get total file count
    total_files = len(train_files) + len(val_files) + len(test_files)

    # Calculate the ideal split
    target_train = round(0.7 * total_files)
    target_val = round(0.15 * total_files)
    target_test = total_files - target_train - target_val
    
    if total_files == 3:
        target_train, target_val, target_test = 2, 1, 0
    elif total_files == 4:
        target_train, target_val, target_test = 2, 1, 1
    elif total_files == 5:
        target_train, target_val, target_test = 3, 1, 1
        
    # Shuffle train files for redistribution
    remaining_files = train_files.copy()
    random.shuffle(remaining_files)

    # Adjust the test set first (if needed)
    while len(test_files) < target_test and remaining_files:
        file_to_move = remaining_files.pop()
        #txt_file = file_to_move.replace(".npz", ".txt")
        
        shutil.move(file_to_move, os.path.join(test_folder, os.path.basename(file_to_move)))
        
       #if os.path.exists(txt_file):
            #shutil.move(txt_file, os.path.join(test_folder, os.path.basename(txt_file)))
        
        test_files.append(file_to_move)


    # Assign remaining files to train and val
    new_train_files = remaining_files[:target_train]
    new_val_files = remaining_files[target_train:]
    
    for file in new_train_files:
        #txt_file = file.replace(".npz", ".txt")
        
        shutil.move(file, os.path.join(train_folder, os.path.basename(file)))
        
        #if os.path.exists(txt_file):
            #shutil.move(txt_file, os.path.join(train_folder, os.path.basename(txt_file)))
    for file in new_val_files:
        #txt_file = file.replace(".npz", ".txt")
        
        shutil.move(file, os.path.join(val_folder, os.path.basename(file)))
        
        #if os.path.exists(txt_file):
            #shutil.move(txt_file, os.path.join(val_folder, os.path.basename(txt_file)))
        
        
    print(f"Distributed {total_files} files for {word}:")
    print(f"  Train: {len([f for f in os.listdir(train_folder) if f.endswith('.mp4')])}, " 
      f"Val: {len([f for f in os.listdir(val_folder) if f.endswith('.mp4')])}, "
      f"Test: {len([f for f in os.listdir(test_folder) if f.endswith('.mp4')])}")


Distributed 5 files for VIEWED:
  Train: 3, Val: 1, Test: 1
Distributed 31 files for RECENTLY:
  Train: 22, Val: 5, Test: 4
Distributed 8 files for ARRIVE:
  Train: 6, Val: 1, Test: 1
Distributed 852 files for THEIR:
  Train: 596, Val: 128, Test: 128
Distributed 3 files for ARABIC:
  Train: 2, Val: 1, Test: 0
Distributed 6 files for CHEMOTHERAPY:
  Train: 4, Val: 1, Test: 1
Distributed 4 files for BOLD:
  Train: 2, Val: 1, Test: 1
Distributed 11 files for PASSING:
  Train: 8, Val: 2, Test: 1
Distributed 35 files for PARTICULAR:
  Train: 24, Val: 5, Test: 6
Distributed 7 files for BASKET:
  Train: 5, Val: 1, Test: 1
Distributed 10 files for INFORMED:
  Train: 7, Val: 2, Test: 1
Distributed 3 files for ANTHROPOLOGY:
  Train: 2, Val: 1, Test: 0
Distributed 34 files for KILL:
  Train: 24, Val: 5, Test: 5
Distributed 3 files for GLAND:
  Train: 2, Val: 1, Test: 0
Distributed 3 files for SHAKING:
  Train: 2, Val: 1, Test: 0
Distributed 22 files for SUM:
  Train: 15, Val: 3, Test: 4
Distribut

In [46]:
# Check updated
updated_df = create_df(dir)
print(updated_df)

                  train   val  test  total
VIEWED                3     1     1      5
RECENTLY             22     5     4     31
ARRIVE                6     1     1      8
THEIR               596   128   128    852
ARABIC                2     1     0      3
CHEMOTHERAPY          4     1     1      6
BOLD                  2     1     1      4
PASSING               8     2     1     11
PARTICULAR           24     5     6     35
BASKET                5     1     1      7
INFORMED              7     2     1     10
ANTHROPOLOGY          2     1     0      3
KILL                 24     5     5     34
GLAND                 2     1     0      3
SHAKING               2     1     0      3
SUM                  15     3     4     22
SYRIA                 3     1     1      5
RESEARCHING           7     2     1     10
INNOVATOR             6     1     1      8
SHOCK                 4     1     1      6
SYMPHONY              2     1     0      3
FUNERAL               6     1     2      9
CHOSE      

In [47]:
updated_total_train = updated_df['train'].sum()
updated_total_val = updated_df['val'].sum()
updated_total_test = updated_df['test'].sum()

print(f"\nTotal train: {updated_total_train}")
print(f"Total val: {updated_total_val}")
print(f"Total test: {updated_total_test}")


Total train: 246460
Total val: 54120
Total test: 53000


In [48]:
updated_v2_df = create_df(dir)

In [49]:
print("Number of classes not in test set: ", len(updated_v2_df[updated_v2_df['test'] == 0]), "/", len(updated_v2_df))
print("Number of classes in test set: ", len(updated_v2_df[updated_v2_df['test'] != 0]), "/", len(updated_v2_df))

Number of classes not in test set:  1121 / 6125
Number of classes in test set:  5004 / 6125


In [50]:
updated_df = create_df(dir)

min = print(updated_df['total'].min())
max = print(updated_df['total'].max())
max_value = updated_df['total'].max()  # Get the max value from the 'total' column
for i in range(0, max_value + 1):  # Use max_value instead of max
    total = (updated_df['total'] == i).sum()
    if total != 0:
        print(f"{i} files - {total}")

3
14395
3 files - 1121
4 files - 772
5 files - 525
6 files - 415
7 files - 303
8 files - 238
9 files - 223
10 files - 181
11 files - 161
12 files - 122
13 files - 95
14 files - 99
15 files - 80
16 files - 93
17 files - 85
18 files - 74
19 files - 56
20 files - 60
21 files - 52
22 files - 41
23 files - 45
24 files - 48
25 files - 41
26 files - 41
27 files - 28
28 files - 27
29 files - 25
30 files - 21
31 files - 28
32 files - 20
33 files - 16
34 files - 25
35 files - 28
36 files - 13
37 files - 15
38 files - 22
39 files - 21
40 files - 29
41 files - 5
42 files - 18
43 files - 14
44 files - 15
45 files - 11
46 files - 20
47 files - 14
48 files - 12
49 files - 11
50 files - 7
51 files - 14
52 files - 13
53 files - 11
54 files - 11
55 files - 13
56 files - 10
57 files - 5
58 files - 8
59 files - 10
60 files - 8
61 files - 9
62 files - 6
63 files - 6
64 files - 7
65 files - 12
66 files - 10
67 files - 8
68 files - 6
69 files - 6
70 files - 1
71 files - 6
72 files - 4
73 files - 5
74 files -

In [51]:
total_100 = (updated_df['total'] <= 100).sum()
print(f"<= 100 files - {total_100}")
total_100_high = (updated_df['total'] > 100).sum()
print(f"> 100 files - {total_100_high}")

<= 100 files - 5715
> 100 files - 410


In [5]:
def create_npz_df(dir):
    word_data_counts = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
    
    for word in os.listdir(dir):
        word_path = os.path.join(dir, word)
    
        if not os.path.isdir(word_path):
            print(f"Skipping {word}: not a directory.")
            continue
    
        for split in ['train', 'val', 'test']:
            split_path = os.path.join(word_path, split)
            
            if not os.path.isdir(split_path):
                continue
        
            file_count = sum(1 for file in os.listdir(split_path) if file.endswith(".npz"))
            word_data_counts[word][split]["total"] = file_count

    df = pd.DataFrame(columns=["train", "val", "test"])
    
    for word, splits in word_data_counts.items():
        row = {}
        row["train"] = splits["train"]["total"]
        row["val"] = splits["val"]["total"]
        row["test"] = splits["test"]["total"]
        df.loc[word] = row
    
    df['total'] = df[['train', 'val', 'test']].sum(axis=1)
    return df

In [6]:
# Check updated (after converting npz files)
new_dir = "../../../models/DC-TCN/datasets/lrs3_words_may"
updated_df = create_npz_df(new_dir)
print(updated_df)

updated_total_train = updated_df['train'].sum()
updated_total_val = updated_df['val'].sum()
updated_total_test = updated_df['test'].sum()

print(f"\nTotal train: {updated_total_train}")
print(f"Total val: {updated_total_val}")
print(f"Total test: {updated_total_test}")

             train  val  test  total
VIEWED           2    1     1      4
HOGG             0    0     0      0
CABBAGE          0    0     0      0
RECENTLY        22    5     4     31
SHAMELESSLY      0    0     0      0
...            ...  ...   ...    ...
HUMOR            6    1     1      8
THIS          2421  519   518   3458
MEAT            10    2     2     14
PROVIDER         2    1     1      4
ACTIVE          13    3     2     18

[6775 rows x 4 columns]

Total train: 246003
Total val: 54013
Total test: 52900


In [11]:
# drop/delete the word folders that only contain 0 files (accidentally create duration for the wrong folder)
words_to_delete = updated_df[updated_df['total'].isin([0])].index
print(words_to_delete)
deleted = 0

for word in words_to_delete:
    folder_path = os.path.join(new_dir, word)

    if os.path.exists(folder_path):
        shutil.rmtree(folder_path)
        deleted += 1
print(f"total deleted: {deleted}")

Index(['HOGG', 'CABBAGE', 'SHAMELESSLY', 'FREESTYLE', 'PHILOSOPHERS',
       'WARHEAD', 'GASCOIGNE', 'INVARIABLY', 'HERBAL', 'WELLS',
       ...
       'SMELLS', 'NOBILITY', 'NEEDLEWORK', 'NORTHWEST', 'JAY', 'CUDDLE',
       'CHARGER', 'PRESERVATION', 'MISLEADING', 'FARNESE'],
      dtype='object', length=705)
total deleted: 705


In [12]:
# Check updated (after converting npz files)
new_dir = "../../../models/DC-TCN/datasets/lrs3_words_may"
updated_df = create_npz_df(new_dir)
print(updated_df)

updated_total_train = updated_df['train'].sum()
updated_total_val = updated_df['val'].sum()
updated_total_test = updated_df['test'].sum()

print(f"\nTotal train: {updated_total_train}")
print(f"Total val: {updated_total_val}")
print(f"Total test: {updated_total_test}")

          train  val  test  total
VIEWED        2    1     1      4
RECENTLY     22    5     4     31
ARRIVE        6    1     1      8
THEIR       596  128   128    852
ARABIC        2    1     0      3
...         ...  ...   ...    ...
HUMOR         6    1     1      8
THIS       2421  519   518   3458
MEAT         10    2     2     14
PROVIDER      2    1     1      4
ACTIVE       13    3     2     18

[6070 rows x 4 columns]

Total train: 246003
Total val: 54013
Total test: 52900


In [13]:
print("Number of classes not in test set: ", len(updated_df[updated_df['test'] == 0]), "/", len(updated_df))
print("Number of classes in test set: ", len(updated_df[updated_df['test'] != 0]), "/", len(updated_df))

Number of classes not in test set:  1113 / 6070
Number of classes in test set:  4957 / 6070


In [14]:
total_100 = (updated_df['total'] <= 100).sum()
print(f"<= 100 files - {total_100}")
total_100_high = (updated_df['total'] > 100).sum()
print(f"> 100 files - {total_100_high}")

<= 100 files - 5661
> 100 files - 409


In [8]:
# Check updated (after converting npz files)
new_dir = "../../../models/DC-TCN/datasets/lrs3_words_may"
updated_df = create_npz_df(new_dir)
updated_df_sorted = updated_df.sort_values(by=['total'], ascending=False)
print(updated_df_sorted.head(10))

updated_total_train = updated_df['train'].sum()
updated_total_val = updated_df['val'].sum()
updated_total_test = updated_df['test'].sum()

print(f"\nTotal train: {updated_total_train}")
print(f"Total val: {updated_total_val}")
print(f"Total test: {updated_total_test}")


      train   val  test  total
THE   10076  2159  2160  14395
TO     8182  1753  1753  11688
AND    7797  1671  1670  11138
THAT   6141  1316  1316   8773
I      6100  1307  1308   8715
A      6040  1294  1295   8629
OF     5844  1253  1251   8348
WE     4320   926   925   6171
IN     3958   848   849   5655
YOU    3766   807   807   5380

Total train: 246003
Total val: 54013
Total test: 52900


In [9]:
# drop words that dont exist in the test set
updated_skip_words = updated_df[updated_df['test'] == 0]
print(updated_skip_words)

               train  val  test  total
ARABIC             2    1     0      3
ANTHROPOLOGY       2    1     0      3
GLAND              2    1     0      3
SHAKING            2    1     0      3
SYMPHONY           2    1     0      3
...              ...  ...   ...    ...
MUSTER             2    1     0      3
HEARTBREAKING      2    1     0      3
BOIL               2    1     0      3
ANALYZED           2    1     0      3
BLAMED             2    1     0      3

[1113 rows x 4 columns]


In [12]:
# drop words that dont exist in the test set
source_dir = "../../../models/DC-TCN/datasets/lrs3_words_may_dropped"
target_dir = "../../../models/DC-TCN/datasets/lrs3_words_may"

os.makedirs(target_dir, exist_ok=True)

moved = 0

for word in updated_skip_words.index:
    folder_path = os.path.join(source_dir, word)

    if os.path.exists(folder_path):
        target_folder_path = os.path.join(target_dir, word)
        shutil.move(folder_path, target_folder_path)
        print(f"Moved {word} to {target_folder_path}")
        moved += 1
print(f"total moved: {moved}")

Moved ARABIC to ../../../models/DC-TCN/datasets/lrs3_words_may/ARABIC
Moved ANTHROPOLOGY to ../../../models/DC-TCN/datasets/lrs3_words_may/ANTHROPOLOGY
Moved GLAND to ../../../models/DC-TCN/datasets/lrs3_words_may/GLAND
Moved SHAKING to ../../../models/DC-TCN/datasets/lrs3_words_may/SHAKING
Moved SYMPHONY to ../../../models/DC-TCN/datasets/lrs3_words_may/SYMPHONY
Moved DECEPTION to ../../../models/DC-TCN/datasets/lrs3_words_may/DECEPTION
Moved ANSWERING to ../../../models/DC-TCN/datasets/lrs3_words_may/ANSWERING
Moved BRIDE to ../../../models/DC-TCN/datasets/lrs3_words_may/BRIDE
Moved CLAPPING to ../../../models/DC-TCN/datasets/lrs3_words_may/CLAPPING
Moved APPRENTICESHIP to ../../../models/DC-TCN/datasets/lrs3_words_may/APPRENTICESHIP
Moved FREQUENTLY to ../../../models/DC-TCN/datasets/lrs3_words_may/FREQUENTLY
Moved WANDER to ../../../models/DC-TCN/datasets/lrs3_words_may/WANDER
Moved TOLERANT to ../../../models/DC-TCN/datasets/lrs3_words_may/TOLERANT
Moved EXILE to ../../../models/D

In [6]:
# Check updated (after converting npz files)
new_dir = "../../../models/DC-TCN/datasets/lrs3_words_may"
updated_df = create_npz_df(new_dir)

In [17]:
print(f"Total of words with >= 50 test files: {(updated_df['test'] >= 50).sum()}")
print(f"Total of words with >= 100 test files: {(updated_df['test'] >= 100).sum()}")

words_50 = updated_df[(updated_df['test'] >= 50)]
words_100 = updated_df[(updated_df['test'] >= 100)]

Total of words with >= 50 test files: 146
Total of words with >= 100 test files: 86


In [18]:
# drop words that dont exist in the test set [npz]
source_dir = "../../../models/DC-TCN/datasets/lrs3_words_may"
target_dir = "../../../models/DC-TCN/datasets/lrs3_words_may_100filtered"

os.makedirs(target_dir, exist_ok=True)

moved = 0

for word in words_100.index:
    folder_path = os.path.join(source_dir, word)

    if os.path.exists(folder_path):
        target_folder_path = os.path.join(target_dir, word)
        shutil.move(folder_path, target_folder_path)
        print(f"Moved {word} to {target_folder_path}")
        moved += 1
print(f"total moved: {moved}")

Moved THEIR to ../../../models/DC-TCN/datasets/lrs3_words_may_100filtered/THEIR
Moved THAT to ../../../models/DC-TCN/datasets/lrs3_words_may_100filtered/THAT
Moved YOUR to ../../../models/DC-TCN/datasets/lrs3_words_may_100filtered/YOUR
Moved GET to ../../../models/DC-TCN/datasets/lrs3_words_may_100filtered/GET
Moved BE to ../../../models/DC-TCN/datasets/lrs3_words_may_100filtered/BE
Moved IF to ../../../models/DC-TCN/datasets/lrs3_words_may_100filtered/IF
Moved OUR to ../../../models/DC-TCN/datasets/lrs3_words_may_100filtered/OUR
Moved ABOUT to ../../../models/DC-TCN/datasets/lrs3_words_may_100filtered/ABOUT
Moved THING to ../../../models/DC-TCN/datasets/lrs3_words_may_100filtered/THING
Moved MORE to ../../../models/DC-TCN/datasets/lrs3_words_may_100filtered/MORE
Moved IT to ../../../models/DC-TCN/datasets/lrs3_words_may_100filtered/IT
Moved WITH to ../../../models/DC-TCN/datasets/lrs3_words_may_100filtered/WITH
Moved A to ../../../models/DC-TCN/datasets/lrs3_words_may_100filtered/A
Mo

In [11]:
# drop words that dont exist in the test set [mp4]
cropped_dir = "../../../datasets/lrs3_task4/speech_unit_type/words/words_preprocessed_video/test"
cropped_target = "../../../datasets/lrs3_task4/speech_unit_type/words/words_preprocessed_video/test_filtered"
uncropped_dir = "../../../datasets/lrs3_task4/speech_unit_type/words/uncropped_words_combined/test"
uncropped_target = "../../../datasets/lrs3_task4/speech_unit_type/words/uncropped_words_combined/test_filtered"
word_list_path = "../lrs3_labels.txt"
source_dir = uncropped_dir
target_dir = uncropped_target

# Read words from the text file into a set
with open(word_list_path, 'r') as file:
    start_words = set(line.strip() for line in file if line.strip())

os.makedirs(target_dir, exist_ok=True)

moved = 0

for filename in os.listdir(source_dir):
    if filename.endswith(".mp4"):
        for word in start_words:
            if filename.startswith(word):
                source_file = os.path.join(source_dir, filename)
                target_file = os.path.join(target_dir, filename)
                shutil.move(source_file, target_file)
                print(f"Moved {filename} to {target_file}")
                moved += 1
                break

print(f"Total moved: {moved}")

Moved US_JSSc7hYKstI_00016.mp4 to ../../../datasets/lrs3_task4/speech_unit_type/words/uncropped_words_combined/test_filtered/US_JSSc7hYKstI_00016.mp4
Moved AIRTIME_ONM4JupBzSE_00005.mp4 to ../../../datasets/lrs3_task4/speech_unit_type/words/uncropped_words_combined/test_filtered/AIRTIME_ONM4JupBzSE_00005.mp4
Moved I_SN7wO06Yz1E_00009.mp4 to ../../../datasets/lrs3_task4/speech_unit_type/words/uncropped_words_combined/test_filtered/I_SN7wO06Yz1E_00009.mp4
Moved TOLERATE_ooAIIeo4AJQ_00002.mp4 to ../../../datasets/lrs3_task4/speech_unit_type/words/uncropped_words_combined/test_filtered/TOLERATE_ooAIIeo4AJQ_00002.mp4
Moved THE_JISHzvXk5bk_00001.mp4 to ../../../datasets/lrs3_task4/speech_unit_type/words/uncropped_words_combined/test_filtered/THE_JISHzvXk5bk_00001.mp4
Moved THE_qWNae7vYK6s_00008.mp4 to ../../../datasets/lrs3_task4/speech_unit_type/words/uncropped_words_combined/test_filtered/THE_qWNae7vYK6s_00008.mp4
Moved GOOD_PrK0CifulU0_00004.mp4 to ../../../datasets/lrs3_task4/speech_unit