### Extract Fullscene

In [2]:
import os
import tensorflow as tf
from utils import extract_fullscene, load_images, load_annotations

# Dataset splits handled by this cell, in processing order
subsets = ["train", "val", "test"]

# Load the annotations once and key them by split name
annotation_train, annotation_valid, annotation_test = load_annotations()
annotations = dict(zip(subsets, (annotation_train, annotation_valid, annotation_test)))

for subset in subsets:
    # Per-split locations: source videos, extracted frames, serialized dataset
    video_dir = os.path.join("dataset", "videos", subset)
    save_dir = os.path.join("dataset", "fullscene", subset)
    save_ds_dir = os.path.join("data", "fullscene", f"{subset}_ds")
    annotation_data = annotations[subset]

    # Extract frames from the videos into save_dir (created on demand)
    os.makedirs(save_dir, exist_ok=True)
    extract_fullscene(video_dir, save_dir, num_images=10, image_size=(224, 224))
    print(f"[INFO] Ekstraksi selesai untuk {subset}. Frame disimpan di: {save_dir}")

    # Load the extracted images together with this split's annotations
    dataset = load_images(save_dir, annotation_data)

    # Persist the dataset in tf.data's own on-disk format
    os.makedirs(save_ds_dir, exist_ok=True)
    dataset.save(save_ds_dir)
    print(f"[INFO] Dataset {subset} disimpan dalam format tf.data di: {save_ds_dir}")


Update annotation path !


100%|██████████| 3/3 [00:05<00:00,  1.88s/it]


[INFO] Ekstraksi selesai untuk train. Frame disimpan di: dataset\fullscene\train


100%|██████████| 3/3 [00:01<00:00,  1.54it/s]


[INFO] Dataset train disimpan dalam format tf.data di: data\fullscene\train_ds


100%|██████████| 1/1 [00:03<00:00,  3.17s/it]


[INFO] Ekstraksi selesai untuk val. Frame disimpan di: dataset\fullscene\val


100%|██████████| 1/1 [00:00<00:00,  1.57it/s]


[INFO] Dataset val disimpan dalam format tf.data di: data\fullscene\val_ds


100%|██████████| 1/1 [00:01<00:00,  1.57s/it]


[INFO] Ekstraksi selesai untuk test. Frame disimpan di: dataset\fullscene\test


100%|██████████| 1/1 [00:00<00:00,  1.98it/s]

[INFO] Dataset test disimpan dalam format tf.data di: data\fullscene\test_ds





### Extract Faces

In [3]:
import os
import sys
import tensorflow as tf
from utils import extract_face, load_images, load_annotations

# Dataset splits handled by this cell, in processing order
subsets = ["train", "val", "test"]

# Load the annotations once and key them by split name
annotation_train, annotation_valid, annotation_test = load_annotations()
annotations = dict(zip(subsets, (annotation_train, annotation_valid, annotation_test)))

for subset in subsets:
    # Per-split locations: full-scene frames (input), face crops, serialized dataset
    image_dir = os.path.join("dataset", "fullscene", subset)
    save_dir = os.path.join("dataset", "faces", subset)
    save_ds_dir = os.path.join("data", "faces", f"{subset}_ds")
    annotation_data = annotations[subset]

    # Run extract_face over the full-scene frames of this split; results go to
    # save_dir (created on demand). The input here is images, not videos.
    os.makedirs(save_dir, exist_ok=True)
    extract_face(image_dir, save_dir, image_size=(224, 224))
    print(f"[INFO] Ekstraksi selesai untuk {subset}. Frame disimpan di: {save_dir}")

    # Load the extracted images together with this split's annotations
    dataset = load_images(save_dir, annotation_data)

    # Persist the dataset in tf.data's own on-disk format
    os.makedirs(save_ds_dir, exist_ok=True)
    dataset.save(save_ds_dir)
    print(f"[INFO] Dataset {subset} disimpan dalam format tf.data di: {save_ds_dir}")


Update annotation path !


100%|██████████| 3/3 [00:17<00:00,  5.83s/it]


[INFO] Ekstraksi selesai untuk train. Frame disimpan di: dataset\faces\train


100%|██████████| 3/3 [00:01<00:00,  2.63it/s]


[INFO] Dataset train disimpan dalam format tf.data di: data\faces\train_ds


100%|██████████| 1/1 [00:06<00:00,  6.06s/it]


[INFO] Ekstraksi selesai untuk val. Frame disimpan di: dataset\faces\val


100%|██████████| 1/1 [00:00<00:00,  3.74it/s]


[INFO] Dataset val disimpan dalam format tf.data di: data\faces\val_ds


100%|██████████| 1/1 [00:04<00:00,  4.18s/it]


[INFO] Ekstraksi selesai untuk test. Frame disimpan di: dataset\faces\test


100%|██████████| 1/1 [00:00<00:00,  3.80it/s]

[INFO] Dataset test disimpan dalam format tf.data di: data\faces\test_ds





### Process Text

In [1]:
import os
import re
import tensorflow as tf
import pandas as pd
import numpy as np
import gensim.downloader as api
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from utils import process_text, preprocess_text, load_annotations, load_transcriptions

# Load annotations and transcriptions for every split
print("[INFO] Memuat anotasi dan transkripsi...")
annotation_train, annotation_valid, annotation_test = load_annotations()
transcr_train, transcr_valid, transcr_test = load_transcriptions()

subsets = ["train", "val", "test"]
annotations = dict(zip(subsets, (annotation_train, annotation_valid, annotation_test)))
transcriptions = dict(zip(subsets, (transcr_train, transcr_valid, transcr_test)))

# Accumulators filled by the loop below
all_texts = []  # every cleaned text across all splits, later fed to the tokenizer
datasets = {}   # split name -> processed text DataFrame

for subset in subsets:
    video_dir = os.path.join("dataset", "videos", subset)

    # NOTE(review): this directory is created but nothing in this cell writes
    # to it — confirm whether another step relies on it.
    save_text_dir = os.path.join("dataset", "text", subset)
    os.makedirs(save_text_dir, exist_ok=True)

    annotation_data = annotations[subset]
    transcription_data = transcriptions[subset]

    # Build the per-split text table with `process_text`, then clean its
    # "text" column with `preprocess_text`
    text_df = process_text(video_dir, annotation_data, transcription_data)
    text_df["text"] = text_df["text"].apply(preprocess_text)

    # Keep the cleaned texts for the tokenizer and the table for later saving
    all_texts += text_df["text"].tolist()
    datasets[subset] = text_df

    print(f"[INFO] Teks dari {subset} diproses.")

# Build the tokenizer and the GloVe embedding matrix
print("[INFO] Mengunduh GloVe embeddings (100D), mohon tunggu...")
glove_model = api.load("glove-wiki-gigaword-100")

# Fit the tokenizer on the preprocessed texts collected above.
# NOTE(review): all_texts is filled from train, val and test, so the vocabulary
# also covers held-out text — confirm this is intended.
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(all_texts)

# One row per word index plus one extra row: Keras word indices start at 1,
# so index 0 stays free (it is the default padding value of pad_sequences).
vocab_size = len(tokenizer.word_index) + 1
embed_size = 100  # must match the dimensionality of the GloVe model loaded above

print(f"Total kata unik dalam dataset: {len(tokenizer.word_index)}")
print(f"Vocab Size yang digunakan: {vocab_size}")

# Row `idx` holds the GloVe vector of the word whose tokenizer index is `idx`.
# Words that GloVe does not know keep an all-zero row. Every index in
# word_index is below vocab_size by construction, so no bounds check is needed.
embed_matrix = np.zeros((vocab_size, embed_size))
for word, idx in tokenizer.word_index.items():
    if word in glove_model:
        embed_matrix[idx] = glove_model[word]

# Save the embedding matrix
embed_dir = "data/text/"
os.makedirs(embed_dir, exist_ok=True)
np.save(os.path.join(embed_dir, "embed_matrix.npy"), embed_matrix)

# Save the fitted tokenizer next to the matrix: the token ids stored in the
# datasets and the rows of embed_matrix are only meaningful together with this
# word -> index mapping (reload with keras' tokenizer_from_json).
with open(os.path.join(embed_dir, "tokenizer.json"), "w", encoding="utf-8") as fh:
    fh.write(tokenizer.to_json())

print(f"[INFO] Embedding matrix disimpan dengan shape {embed_matrix.shape}")

# Save each split as a `tf.data.Dataset` of (token ids, trait values)
save_tf_dataset_dir = "data/text/"
os.makedirs(save_tf_dataset_dir, exist_ok=True)

def tokenize_and_pad(texts):
    """Turn texts into integer id sequences and bring them to length 50.

    Uses the module-level `tokenizer` for the word -> id mapping and
    `pad_sequences` with its default padding/truncation settings.
    The length 50 should match the sentence length expected downstream.
    """
    return pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=50)

for subset in subsets:
    text_df = datasets[subset]

    # Features: padded token-id sequences built from the cleaned text column
    tokenized_texts = tokenize_and_pad(text_df["text"])

    # Targets: the five columns 'o', 'c', 'e', 'a', 'n' of the text table
    # (presumably the Big Five trait scores — verify against process_text)
    trait_values = text_df[['o', 'c', 'e', 'a', 'n']].values

    # Pair features with targets and write the dataset to <dir>/<subset>_ds
    subset_ds_path = os.path.join(save_tf_dataset_dir, f"{subset}_ds")
    dataset = tf.data.Dataset.from_tensor_slices((tokenized_texts, trait_values))
    dataset.save(subset_ds_path)

print("[INFO] Dataset teks dalam format `tf.data.Dataset` selesai disimpan!")


[INFO] Memuat anotasi dan transkripsi...
Update annotation path !
Update transcriptions path !
[INFO] Teks dari train diproses.
[INFO] Teks dari val diproses.
[INFO] Teks dari test diproses.
[INFO] Mengunduh GloVe embeddings (100D), mohon tunggu...
Total kata unik dalam dataset: 88
Vocab Size yang digunakan: 89
[INFO] Embedding matrix disimpan dengan shape (89, 100)
[INFO] Dataset teks dalam format `tf.data.Dataset` selesai disimpan!
