In [None]:
# Mount Google Drive into the Colab filesystem; the absolute paths used in the
# cells below all live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


 ### Packages

In [None]:
import os
import cv2
import numpy as np
import pandas as pd
import torch
import torchvision.transforms as transforms
from PIL import Image
from tqdm import tqdm
from matplotlib import pyplot as plt

### Functions

In [None]:
from posixpath import basename


100%|██████████| 3081/3081 [00:00<00:00, 317385.07it/s]
100%|██████████| 3081/3081 [22:22<00:00,  2.29it/s]


**"always", "bad", "do", "friend", "good", "how", "never", "time", "where", "who", "why", "you"**

### Compress Each Gloss Folder

In [None]:
import os
import tarfile
from tqdm import tqdm
import time  # For debugging


def compressFolder(base_folder, gloss_names, chunk_size=50):
    """Pack each gloss folder into a series of fixed-size ``.tar.gz`` archives.

    For every name in ``gloss_names`` the regular files directly inside
    ``<base_folder>/ORIGINAL_DATA/<gloss>`` are sorted by name and written, in
    batches of at most ``chunk_size`` files, to
    ``<base_folder>/COMPRESSED_DATA/<gloss>/<gloss>_part<k>.tar.gz`` (``k``
    starts at 1). Progress and a per-gloss report are printed to stdout.

    Args:
        base_folder: Directory that contains ``ORIGINAL_DATA``;
            ``COMPRESSED_DATA`` is created inside it when needed.
        gloss_names: Iterable of gloss sub-folder names to process.
        chunk_size: Maximum number of files per archive. Defaults to 50.

    Returns:
        None. Results are the archives on disk and the printed report.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.

    Notes:
        * A gloss whose source folder does not exist is reported and skipped,
          so one mistyped name does not abort the remaining glosses.
        * If writing an archive fails, the error is printed, the partial
          archive is deleted and processing continues with the next chunk.
        * The final report lists every ``.tar.gz`` in the destination folder,
          including archives left there by earlier runs.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1")

    for gloss_name in gloss_names:
        print(f"///////////////////// Processing {gloss_name} //////////////////// ")
        orig_dir = os.path.join(base_folder, "ORIGINAL_DATA", gloss_name)
        dest_dir = os.path.join(base_folder, "COMPRESSED_DATA", gloss_name)

        # Check the source first so a bad gloss name neither raises nor leaves
        # an empty destination folder behind.
        if not os.path.isdir(orig_dir):
            print(f"! Source folder not found, skipping: {orig_dir}")
            continue

        # Create the destination and open its permissions to everyone
        # (0o777 = rwx for user, group and others). Best effort only: some
        # filesystems do not support chmod.
        os.makedirs(dest_dir, exist_ok=True)
        try:
            os.chmod(dest_dir, 0o777)
        except OSError as e:
            print(f"! Could not change permissions on {dest_dir} - {str(e)}")

        # Only regular files directly inside the gloss folder are archived.
        all_files = sorted(
            f for f in os.listdir(orig_dir)
            if os.path.isfile(os.path.join(orig_dir, f))
        )

        print(f"Found {len(all_files)} files in {orig_dir}")
        print("First 5 files:", all_files[:5])  # Debug file list

        # Ceiling division: an exact multiple of chunk_size must not produce
        # a phantom extra chunk in the "chunk i/N" progress messages.
        total_chunks = (len(all_files) + chunk_size - 1) // chunk_size

        for chunk_idx in range(total_chunks):
            start = chunk_idx * chunk_size
            batch = all_files[start:start + chunk_size]

            tar_filename = f"{gloss_name}_part{chunk_idx+1}.tar.gz"
            tar_path = os.path.join(dest_dir, tar_filename)

            print(f"\nProcessing chunk {chunk_idx+1}/{total_chunks} -> {tar_path}")
            print(f"Files in batch: {len(batch)}")

            try:
                start_time = time.time()

                with tarfile.open(tar_path, mode='w:gz', compresslevel=6) as tar:
                    for fname in tqdm(batch, desc="Adding files"):
                        src_path = os.path.join(orig_dir, fname)
                        if not os.path.exists(src_path):
                            print(f"! Missing: {src_path}")
                            continue

                        # Probe readability up front so one bad file is
                        # skipped instead of failing the whole archive.
                        try:
                            with open(src_path, 'rb') as test_file:
                                test_file.read(1)
                        except OSError as e:
                            print(f"! Unreadable: {src_path} - {str(e)}")
                            continue

                        # Store under the bare file name (flat archive).
                        tar.add(src_path, arcname=fname, recursive=False)

                if not os.path.exists(tar_path):
                    raise RuntimeError("Archive file not created!")

                archive_size = os.path.getsize(tar_path) / (1024*1024)  # in MB
                elapsed = time.time() - start_time
                # A coarse clock can report 0s for tiny batches; dividing by it
                # would raise inside this try and delete a good archive.
                rate = f"{len(batch)/elapsed:.2f}" if elapsed > 0 else "n/a"

                print(f"✓ Success: {tar_filename} ({archive_size:.2f} MB)")
                print(f"  Time: {elapsed:.2f}s | {rate} files/sec")

            except Exception as e:
                print(f"❌ Critical error in chunk {chunk_idx+1}: {str(e)}")
                # Remove the partial archive so it cannot be mistaken for a
                # complete one.
                if os.path.exists(tar_path):
                    os.remove(tar_path)

        # Per-gloss summary of what is now in the destination folder.
        print("\n=== Compression Report ===")
        created_files = [f for f in os.listdir(dest_dir) if f.endswith('.tar.gz')]
        if created_files:
            print(f"Created {len(created_files)} archives:")
            for f in sorted(created_files):
                size = os.path.getsize(os.path.join(dest_dir, f)) / (1024*1024)
                print(f"• {f} ({size:.2f} MB)")
        else:
            print("⚠️ No archives were created!")


# Compress the selected gloss folders under the CONTINUE dataset root.
compressFolder(
    base_folder="/content/drive/MyDrive/Graduation Project (1)/Datasets/Australian Dataset/Data/CONTINUE",
    gloss_names=[
        # Disabled entries; uncomment a line to include that gloss in the run.
        # "yes",
        # "wait",
        # "tomorrow",
        # "yesterday",
        # "sibling",
        # "thanks",
        "angel",
    ],
)

///////////////////// Processing angel //////////////////// 
Found 126 files in /content/drive/MyDrive/Graduation Project (1)/Datasets/Australian Dataset/Data/CONTINUE/ORIGINAL_DATA/angel
First 5 files: ['signer0_sample1109_color.pt', 'signer0_sample1121_color.pt', 'signer0_sample1244_color.pt', 'signer0_sample690_color.pt', 'signer0_sample957_color.pt']

Processing chunk 1/3 -> /content/drive/MyDrive/Graduation Project (1)/Datasets/Australian Dataset/Data/CONTINUE/COMPRESSED_DATA/angel/angel_part1.tar.gz
Files in batch: 50


Adding files: 100%|██████████| 50/50 [08:04<00:00,  9.69s/it]


✓ Success: angel_part1.tar.gz (1258.07 MB)
  Time: 484.62s | 0.10 files/sec

Processing chunk 2/3 -> /content/drive/MyDrive/Graduation Project (1)/Datasets/Australian Dataset/Data/CONTINUE/COMPRESSED_DATA/angel/angel_part2.tar.gz
Files in batch: 50


Adding files: 100%|██████████| 50/50 [08:46<00:00, 10.52s/it]


✓ Success: angel_part2.tar.gz (1264.72 MB)
  Time: 526.10s | 0.10 files/sec

Processing chunk 3/3 -> /content/drive/MyDrive/Graduation Project (1)/Datasets/Australian Dataset/Data/CONTINUE/COMPRESSED_DATA/angel/angel_part3.tar.gz
Files in batch: 26


Adding files: 100%|██████████| 26/26 [04:05<00:00,  9.43s/it]

✓ Success: angel_part3.tar.gz (657.65 MB)
  Time: 245.20s | 0.11 files/sec

=== Compression Report ===
Created 3 archives:
• angel_part1.tar.gz (1258.07 MB)
• angel_part2.tar.gz (1264.72 MB)
• angel_part3.tar.gz (657.65 MB)





In [None]:
import pandas as pd
import os

def getState(input_path='data',output_file_path='data'):
    """Write a per-gloss entry count to ``<output_file_path>/state.xlsx``.

    Every non-hidden sub-directory of ``input_path`` is treated as one gloss.
    For each gloss the number of entries directly inside its folder is
    counted (files and sub-folders alike, as reported by ``os.listdir``).
    The result is saved as a two-column sheet with the header
    ``Gloss`` / ``#Videos``, one row per gloss, sorted by gloss name.

    Args:
        input_path: Directory whose sub-directories are the gloss folders.
        output_file_path: Directory in which ``state.xlsx`` is written; it is
            created if it does not exist.

    Returns:
        None. The only result is the Excel file on disk.
    """
    # Keep real directories only. Plain files (with or without an extension)
    # are ignored, and so are dot-folders such as ``.ipynb_checkpoints``.
    glosses = sorted(
        name for name in os.listdir(input_path)
        if not name.startswith(".")
        and os.path.isdir(os.path.join(input_path, name))
    )
    rows = [
        (gloss, len(os.listdir(os.path.join(input_path, gloss))))
        for gloss in glosses
    ]

    # Use real column labels instead of storing the header as a data row, and
    # drop the index so re-reading the file yields exactly these two columns.
    summary = pd.DataFrame(rows, columns=["Gloss", "#Videos"])

    os.makedirs(output_file_path, exist_ok=True)
    summary.to_excel(os.path.join(output_file_path, "state.xlsx"), index=False)

# Summarise ORIGINAL_DATA into the "state" folder next to it.
getState(
    input_path="/content/drive/MyDrive/Graduation Project (1)/Datasets/Australian Dataset/Data/CONTINUE/ORIGINAL_DATA",
    output_file_path="/content/drive/MyDrive/Graduation Project (1)/Datasets/Australian Dataset/Data/CONTINUE/state",
)

In [None]:
# Load the state workbook from Drive and display it.
df = pd.read_excel(
    io="/content/drive/MyDrive/Graduation Project (1)/Datasets/Australian Dataset/Data/CONTINUE/state/state.xlsx",
)
df

Unnamed: 0.1,Unnamed: 0,0,1
0,0,Gloss,#Videos
1,1,breakfast,125
2,2,difficult,121
3,3,sister,126
4,4,bed,127
5,5,I,127
6,6,police,126
7,7,single,126
8,8,baby,127
9,9,apologize,127


In [None]:
# Entries of COMPRESSED_DATA, minus the names listed in the exclusion set.
data = [
    entry
    for entry in os.listdir(
        "/content/drive/MyDrive/Graduation Project (1)/Datasets/Australian Dataset/Data/CONTINUE/COMPRESSED_DATA"
    )
    if entry not in {
        "sister",
        "promise",
        "same",
        "sibling",
        "single",
        "tea",
        "thanks",
        "tomorrow",
        "wait",
        "where",
        "you",
        "who",
        "why",
        "good",
        "difficult.tar.gz",
        "breakfast.tar.gz",
    }
]

In [None]:
# Show everything after the first 24 (12 * 2) entries.
data[24:]

['eat',
 'far',
 'forbidden',
 'full',
 'get_well',
 'glove',
 'goodbye',
 'hurry',
 'male',
 'female',
 'yes',
 'yesterday']

In [None]:
# Count the gloss names in this hand-written list.
len((
    "sister",
    "promise",
    "same",
    "sibling",
    "single",
    "tea",
    "thanks",
    "tomorrow",
    "wait",
    "where",
    "you",
    "who",
    "why",
    "good",
))

14