In [None]:
# %pip install google-api-python-client google-auth google-auth-httplib2 google-auth-oauthlib
# %pip install opencv-python tqdm ipywidgets scikit-learn

In [None]:
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request
import os

# OAuth scope requested for the Drive API; it is passed both when loading the
# cached token and when starting the interactive consent flow.
# NOTE(review): the cells visible here only list and download files, so the
# narrower ".../auth/drive.readonly" scope may be sufficient — confirm before
# changing, since an existing token.json was issued for this scope.
SCOPES = ["https://www.googleapis.com/auth/drive"]


def authenticate_drive():
    """Return Google Drive credentials, reusing the cached token when possible.

    Credentials are loaded from ``token.json`` if that file exists. When they
    are missing or no longer valid they are either refreshed (expired with a
    refresh token) or obtained through the local-server OAuth flow configured
    by ``credentials.json``; in both cases the result is written back to
    ``token.json``.

    Returns:
        The credentials object to pass to the Drive API client.
    """
    token_file = "token.json"
    credentials = (
        Credentials.from_authorized_user_file(token_file, SCOPES)
        if os.path.exists(token_file)
        else None
    )

    # A cached token that is still valid needs no network round trip and no rewrite.
    if credentials and credentials.valid:
        return credentials

    if credentials and credentials.expired and credentials.refresh_token:
        credentials.refresh(Request())
    else:
        # No usable cached token: ask the user to authorize in the browser.
        credentials = InstalledAppFlow.from_client_secrets_file(
            "credentials.json", SCOPES
        ).run_local_server(port=0)

    # Persist the new/refreshed token so the next call can skip the flow.
    with open(token_file, "w") as token_handle:
        token_handle.write(credentials.to_json())
    return credentials

# Authenticate once up front; when no valid cached token exists this writes
# token.json. As the last expression of the cell, the returned credentials
# object is also displayed as the cell output.
authenticate_drive()

<google.oauth2.credentials.Credentials at 0x2304e3e45c0>

In [None]:
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from googleapiclient.http import MediaIoBaseDownload
import os
import json
import cv2
from tqdm.notebook import tqdm


def download_video(drive_service, file_id, local_path):
    """Download the content of a Drive file to ``local_path``.

    Args:
        drive_service: Drive API service object exposing ``files().get_media``.
        file_id: ID of the Drive file to fetch.
        local_path: Destination path; the file is opened in binary write mode,
            so any existing file at that path is overwritten.
    """
    media_request = drive_service.files().get_media(fileId=file_id)
    with open(local_path, "wb") as target:
        chunk_loader = MediaIoBaseDownload(target, media_request)
        # next_chunk() yields a (status, done) pair; keep fetching until done.
        while True:
            _status, finished = chunk_loader.next_chunk()
            if finished:
                break


def get_owner_info(drive_service, file_id):
    """Look up the display name of a Drive file's first listed owner.

    Args:
        drive_service: Drive API service object exposing ``files().get``.
        file_id: ID of the file whose owner should be looked up.

    Returns:
        dict: ``{"name": <displayName>}``. The name is ``"unknown"`` when the
        request fails with ``HttpError``, when the response lists no owners,
        or when the owner entry carries no ``displayName``.
    """
    try:
        file_metadata = (
            drive_service.files().get(fileId=file_id, fields="owners").execute()
        )
    except HttpError as error:
        print(f"Error fetching owner for file {file_id}: {error}")
        return {"name": "unknown"}

    # The "owners" field may be absent or empty (the Drive API does not
    # populate it for items in shared drives), so indexing it directly would
    # raise KeyError/IndexError and abort the caller's whole download loop.
    owners = file_metadata.get("owners") or []
    if not owners:
        return {"name": "unknown"}

    # Only the first owner is reported.
    return {"name": owners[0].get("displayName", "unknown")}


def _list_drive_files(drive_service, query):
    """Return every file (id, name) matching ``query``, following all result pages."""
    matches = []
    page_token = None
    while True:
        response = (
            drive_service.files()
            .list(
                q=query,
                fields="nextPageToken, files(id, name)",
                pageToken=page_token,
            )
            .execute()
        )
        matches.extend(response.get("files", []))
        # files.list returns one page at a time; without following
        # nextPageToken, larger folders would be silently truncated.
        page_token = response.get("nextPageToken")
        if not page_token:
            return matches


def _read_video_properties(video_path):
    """Return ``(duration_in_seconds, "WIDTHxHEIGHT")`` for a local video via OpenCV."""
    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            print(f"Could not open {video_path} to read its properties")
        frame_count = cap.get(cv2.CAP_PROP_FRAME_COUNT)
        fps = cap.get(cv2.CAP_PROP_FPS)
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    finally:
        # Always free the capture handle, even if a property read fails.
        cap.release()

    # A capture that could not be opened reports 0 for its properties; guard
    # the division so one unreadable file does not abort the whole run.
    duration = int(frame_count / fps) if fps > 0 and frame_count >= 0 else 0
    return duration, f"{width}x{height}"


def process_main_folder(main_folder_id, local_save_path):
    """Download a labelled video dataset from Google Drive and record metadata.

    Every non-trashed subfolder of ``main_folder_id`` is treated as a class
    label. Each video in a subfolder is downloaded to
    ``<local_save_path>/<label>/<label>_<NN>.mp4`` (``NN`` is the 1-based
    position in the Drive listing), probed with OpenCV for duration and
    resolution, and annotated with its Drive owner's display name. The
    collected metadata is written to ``<local_save_path>/video_metadata.json``
    as ``{label: [ {videoID, contributor, duration, resolution}, ... ]}``.

    Args:
        main_folder_id: Drive ID of the folder containing the label subfolders.
        local_save_path: Local directory that receives the videos and the JSON.

    Returns:
        None. Returns early (without writing JSON) when the subfolder listing
        fails or is empty. Folders whose listing fails and videos whose
        download fails are reported and skipped.
    """
    creds = authenticate_drive()
    drive_service = build("drive", "v3", credentials=creds)

    # Get subfolders (labels)
    try:
        folders = _list_drive_files(
            drive_service,
            f"'{main_folder_id}' in parents"
            " and mimeType='application/vnd.google-apps.folder'"
            " and trashed = false",
        )
    except HttpError as error:
        print(f"An error occurred: {error}")
        return

    if not folders:
        print("No subfolders found in the main folder.")
        return

    video_data = {}

    for folder in tqdm(folders, desc="Processing folders"):
        label = folder["name"]
        folder_id = folder["id"]
        label_path = os.path.join(local_save_path, label)
        os.makedirs(label_path, exist_ok=True)

        # Get videos in the subfolder
        try:
            videos = _list_drive_files(
                drive_service,
                f"'{folder_id}' in parents"
                " and mimeType contains 'video/'"
                " and trashed = false",
            )
        except HttpError as error:
            print(
                f"An error occurred while fetching videos for folder {label}: {error}"
            )
            continue

        if not videos:
            print(f"No videos found in folder: {label}")
            continue

        video_data[label] = []
        for idx, video in enumerate(
            tqdm(videos, desc=f"Processing videos in {label}", leave=False)
        ):
            video_id = f"{label}_{idx+1:02d}"
            video_name = f"{video_id}.mp4"
            video_local_path = os.path.join(label_path, video_name)

            try:
                download_video(drive_service, video["id"], video_local_path)
            except HttpError as error:
                print(
                    f"An error occurred while downloading {video['name']} "
                    f"for folder {label}: {error}"
                )
                # Drop the partial file so later steps that scan this
                # directory do not pick up a truncated video.
                if os.path.exists(video_local_path):
                    os.remove(video_local_path)
                continue

            # Get video metadata
            duration, resolution = _read_video_properties(video_local_path)
            owner_info = get_owner_info(drive_service, video["id"])

            # Save video data
            video_data[label].append(
                {
                    "videoID": video_id,
                    "contributor": owner_info["name"],
                    "duration": duration,
                    "resolution": resolution,
                }
            )

    # Save all metadata to JSON
    output_json = os.path.join(local_save_path, "video_metadata.json")
    with open(output_json, "w") as json_file:
        json.dump(video_data, json_file, indent=4)
    print(f"All metadata saved to {output_json}")


# Usage
# main_folder_id: Drive ID of the folder whose subfolders are used as labels.
# local_save_path: local directory that receives the downloaded videos and
# video_metadata.json.
main_folder_id = "1Z5elhiU-za29NG8QdvSgY6ESYvqPyyOq"
local_save_path = "data_collected/"
process_main_folder(main_folder_id, local_save_path)

Processing folders:   0%|          | 0/30 [00:00<?, ?it/s]

Processing videos in buruk:   0%|          | 0/10 [00:00<?, ?it/s]

Processing videos in minum:   0%|          | 0/10 [00:00<?, ?it/s]

Processing videos in buka:   0%|          | 0/10 [00:00<?, ?it/s]

Processing videos in senyum:   0%|          | 0/10 [00:00<?, ?it/s]

Processing videos in anak:   0%|          | 0/10 [00:00<?, ?it/s]

Processing videos in main:   0%|          | 0/10 [00:00<?, ?it/s]

Processing videos in tidur:   0%|          | 0/9 [00:00<?, ?it/s]

Processing videos in makan:   0%|          | 0/9 [00:00<?, ?it/s]

Processing videos in jalan:   0%|          | 0/10 [00:00<?, ?it/s]

Processing videos in haus:   0%|          | 0/10 [00:00<?, ?it/s]

Processing videos in lapar:   0%|          | 0/10 [00:00<?, ?it/s]

Processing videos in dengar:   0%|          | 0/10 [00:00<?, ?it/s]

Processing videos in kertas:   0%|          | 0/10 [00:00<?, ?it/s]

Processing videos in keluarga:   0%|          | 0/10 [00:00<?, ?it/s]

Processing videos in gembira:   0%|          | 0/10 [00:00<?, ?it/s]

Processing videos in teman:   0%|          | 0/9 [00:00<?, ?it/s]

Processing videos in panggil:   0%|          | 0/10 [00:00<?, ?it/s]

Processing videos in nama:   0%|          | 0/10 [00:00<?, ?it/s]

Processing videos in kucing:   0%|          | 0/10 [00:00<?, ?it/s]

Processing videos in marah:   0%|          | 0/11 [00:00<?, ?it/s]

Processing videos in besar:   0%|          | 0/10 [00:00<?, ?it/s]

Processing videos in adik:   0%|          | 0/10 [00:00<?, ?it/s]

Processing videos in maaf:   0%|          | 0/10 [00:00<?, ?it/s]

Processing videos in sedikit:   0%|          | 0/10 [00:00<?, ?it/s]

Processing videos in orang:   0%|          | 0/9 [00:00<?, ?it/s]

Processing videos in rumah:   0%|          | 0/10 [00:00<?, ?it/s]

Processing videos in guru:   0%|          | 0/10 [00:00<?, ?it/s]

Processing videos in selamat:   0%|          | 0/10 [00:00<?, ?it/s]

Processing videos in ibu:   0%|          | 0/10 [00:00<?, ?it/s]

Processing videos in lihat:   0%|          | 0/10 [00:00<?, ?it/s]

All metadata saved to data_collected/video_metadata.json


In [None]:
import shutil
from sklearn.model_selection import train_test_split


# Input directory (one subfolder per label) and output directory for the split.
SOURCE_DIR = "data_collected/"
DEST_DIR = "data_splits/"

# Load the per-label metadata stored next to the collected videos.
with open(os.path.join(SOURCE_DIR, "video_metadata.json"), "r") as f:
    video_metadata = json.load(f)

os.makedirs(os.path.join(DEST_DIR, "train"), exist_ok=True)
os.makedirs(os.path.join(DEST_DIR, "test"), exist_ok=True)

file_paths = []
labels = []
video_ids = []  # not populated in this cell; kept in case other cells reference it

# os.listdir returns entries in arbitrary order, which would make the seeded
# split below depend on the filesystem. Sorting keeps it reproducible.
for word in sorted(os.listdir(SOURCE_DIR)):
    word_dir = os.path.join(SOURCE_DIR, word)
    # Skip plain files at the top level (e.g. video_metadata.json).
    if os.path.isdir(word_dir):
        for file in sorted(os.listdir(word_dir)):
            file_paths.append(os.path.join(word_dir, file))
            labels.append(word)

test_size = 0.2  # Proportion of test data
# Stratify so every label keeps roughly the same train/test proportion.
train_files, test_files, train_labels, test_labels = train_test_split(
    file_paths, labels, test_size=test_size, random_state=42, stratify=labels
)

# Video IDs are the file names without their extension.
train_ids = [os.path.splitext(os.path.basename(file))[0] for file in train_files]
test_ids = [os.path.splitext(os.path.basename(file))[0] for file in test_files]

# Build one ID -> subset lookup instead of scanning both lists for every video.
# "train" is applied last so it wins if an ID ever appeared in both lists.
subset_by_id = dict.fromkeys(test_ids, "test")
subset_by_id.update(dict.fromkeys(train_ids, "train"))

# Update video_metadata with subset information; entries whose ID is in
# neither list are left without a "subset" key.
for label in video_metadata:
    for video in video_metadata[label]:
        video_id = video["videoID"]
        subset = subset_by_id.get(video_id)
        if subset is not None:
            video["subset"] = subset

# Save updated metadata
with open(os.path.join(DEST_DIR, "video_metadata.json"), "w") as f:
    json.dump(video_metadata, f, indent=4)

def save_files(files, labels, base_dir):
    """Copy each file into ``base_dir/<label>/`` under its original file name.

    Args:
        files: Paths of the files to copy.
        labels: Label for each file, paired with ``files`` by position.
        base_dir: Directory under which one subfolder per label is created.
    """
    labelled_files = zip(files, labels)
    progress = tqdm(labelled_files, total=len(files), desc=f"Copying to {base_dir}")
    for source_path, class_name in progress:
        class_dir = os.path.join(base_dir, class_name)
        # exist_ok: the same class folder is reused for every file of that label.
        os.makedirs(class_dir, exist_ok=True)
        destination = os.path.join(class_dir, os.path.basename(source_path))
        shutil.copy(source_path, destination)

# Save train and test files
# Each subset is copied into DEST_DIR/<subset>/<label>/ with its original file name.
save_files(train_files, train_labels, os.path.join(DEST_DIR, "train"))
save_files(test_files, test_labels, os.path.join(DEST_DIR, "test"))

Copying to data_splits/train:   0%|          | 0/237 [00:00<?, ?it/s]

Copying to data_splits/test:   0%|          | 0/60 [00:00<?, ?it/s]

In [None]:
from moviepy.video.io.VideoFileClip import VideoFileClip
from tqdm.notebook import tqdm
import os
import warnings

# Ignore every UserWarning from this point on.
# NOTE(review): presumably meant to quiet warnings emitted while moviepy
# re-encodes the videos — confirm, since this filter is process-wide and also
# hides UserWarnings raised by any other library.
warnings.filterwarnings("ignore", category=UserWarning)

def process_videos(subset, source_dir="data_splits/", muted_dir="data_splits_muted/"):
    """
    Process videos for given subset (train/test) by removing audio

    Every file found in ``<source_dir>/<subset>/<label>/`` is re-encoded with
    libx264, without its audio track, to ``<muted_dir>/<subset>/<label>/``
    under the same file name. Entries of the subset directory that are not
    directories are skipped.

    Args:
        subset (str): Either 'train' or 'test'
        source_dir (str): Root directory holding the split videos.
        muted_dir (str): Root directory that receives the muted copies.

    Raises:
        Whatever ``VideoFileClip`` or ``write_videofile`` raises for a file
        that cannot be read or encoded; the clip is closed before the
        exception propagates.
    """
    # Create output directory for muted videos
    os.makedirs(os.path.join(muted_dir, subset), exist_ok=True)

    # Process videos (sorted so the processing order is deterministic)
    subset_dir = os.path.join(source_dir, subset)
    for label in tqdm(sorted(os.listdir(subset_dir)), desc=f"Processing {subset} folders"):
        label_dir = os.path.join(subset_dir, label)
        if not os.path.isdir(label_dir):
            continue

        # Create label directory in muted folder
        muted_label_dir = os.path.join(muted_dir, subset, label)
        os.makedirs(muted_label_dir, exist_ok=True)

        for video_file in tqdm(
            sorted(os.listdir(label_dir)), desc=f"Processing {label} videos", leave=False
        ):
            input_path = os.path.join(label_dir, video_file)
            output_path = os.path.join(muted_label_dir, video_file)

            # Load video and save without audio
            video = VideoFileClip(input_path)
            try:
                video.without_audio().write_videofile(
                    output_path, codec="libx264", logger=None
                )
            finally:
                # Close the clip even when encoding fails, so its resources
                # are not leaked on the error path.
                video.close()

# Process both train and test sets
# Muted copies are written under data_splits_muted/<subset>/<label>/.
process_videos('train')
process_videos('test')

Processing train folders:   0%|          | 0/30 [00:00<?, ?it/s]

Processing adik videos:   0%|          | 0/8 [00:00<?, ?it/s]

Processing anak videos:   0%|          | 0/8 [00:00<?, ?it/s]

Processing besar videos:   0%|          | 0/8 [00:00<?, ?it/s]

Processing buka videos:   0%|          | 0/8 [00:00<?, ?it/s]

Processing buruk videos:   0%|          | 0/8 [00:00<?, ?it/s]

Processing dengar videos:   0%|          | 0/8 [00:00<?, ?it/s]

Processing gembira videos:   0%|          | 0/8 [00:00<?, ?it/s]

Processing guru videos:   0%|          | 0/8 [00:00<?, ?it/s]

Processing haus videos:   0%|          | 0/8 [00:00<?, ?it/s]

Processing ibu videos:   0%|          | 0/8 [00:00<?, ?it/s]

Processing jalan videos:   0%|          | 0/8 [00:00<?, ?it/s]

Processing keluarga videos:   0%|          | 0/8 [00:00<?, ?it/s]

Processing kertas videos:   0%|          | 0/8 [00:00<?, ?it/s]

Processing kucing videos:   0%|          | 0/8 [00:00<?, ?it/s]

Processing lapar videos:   0%|          | 0/8 [00:00<?, ?it/s]

Processing lihat videos:   0%|          | 0/8 [00:00<?, ?it/s]

Processing maaf videos:   0%|          | 0/8 [00:00<?, ?it/s]

Processing main videos:   0%|          | 0/8 [00:00<?, ?it/s]

Processing makan videos:   0%|          | 0/7 [00:00<?, ?it/s]

Processing marah videos:   0%|          | 0/9 [00:00<?, ?it/s]

Processing minum videos:   0%|          | 0/8 [00:00<?, ?it/s]

Processing nama videos:   0%|          | 0/8 [00:00<?, ?it/s]

Processing orang videos:   0%|          | 0/7 [00:00<?, ?it/s]

Processing panggil videos:   0%|          | 0/8 [00:00<?, ?it/s]

Processing rumah videos:   0%|          | 0/8 [00:00<?, ?it/s]

Processing sedikit videos:   0%|          | 0/8 [00:00<?, ?it/s]

Processing selamat videos:   0%|          | 0/8 [00:00<?, ?it/s]

Processing senyum videos:   0%|          | 0/8 [00:00<?, ?it/s]

Processing teman videos:   0%|          | 0/7 [00:00<?, ?it/s]

Processing tidur videos:   0%|          | 0/7 [00:00<?, ?it/s]

Processing test folders:   0%|          | 0/30 [00:00<?, ?it/s]

Processing adik videos:   0%|          | 0/2 [00:00<?, ?it/s]

Processing anak videos:   0%|          | 0/2 [00:00<?, ?it/s]

Processing besar videos:   0%|          | 0/2 [00:00<?, ?it/s]

Processing buka videos:   0%|          | 0/2 [00:00<?, ?it/s]

Processing buruk videos:   0%|          | 0/2 [00:00<?, ?it/s]

Processing dengar videos:   0%|          | 0/2 [00:00<?, ?it/s]

Processing gembira videos:   0%|          | 0/2 [00:00<?, ?it/s]

Processing guru videos:   0%|          | 0/2 [00:00<?, ?it/s]

Processing haus videos:   0%|          | 0/2 [00:00<?, ?it/s]

Processing ibu videos:   0%|          | 0/2 [00:00<?, ?it/s]

Processing jalan videos:   0%|          | 0/2 [00:00<?, ?it/s]

Processing keluarga videos:   0%|          | 0/2 [00:00<?, ?it/s]

Processing kertas videos:   0%|          | 0/2 [00:00<?, ?it/s]

Processing kucing videos:   0%|          | 0/2 [00:00<?, ?it/s]

Processing lapar videos:   0%|          | 0/2 [00:00<?, ?it/s]

Processing lihat videos:   0%|          | 0/2 [00:00<?, ?it/s]

Processing maaf videos:   0%|          | 0/2 [00:00<?, ?it/s]

Processing main videos:   0%|          | 0/2 [00:00<?, ?it/s]

Processing makan videos:   0%|          | 0/2 [00:00<?, ?it/s]

Processing marah videos:   0%|          | 0/2 [00:00<?, ?it/s]

Processing minum videos:   0%|          | 0/2 [00:00<?, ?it/s]

Processing nama videos:   0%|          | 0/2 [00:00<?, ?it/s]

Processing orang videos:   0%|          | 0/2 [00:00<?, ?it/s]

Processing panggil videos:   0%|          | 0/2 [00:00<?, ?it/s]

Processing rumah videos:   0%|          | 0/2 [00:00<?, ?it/s]

Processing sedikit videos:   0%|          | 0/2 [00:00<?, ?it/s]

Processing selamat videos:   0%|          | 0/2 [00:00<?, ?it/s]

Processing senyum videos:   0%|          | 0/2 [00:00<?, ?it/s]

Processing teman videos:   0%|          | 0/2 [00:00<?, ?it/s]

Processing tidur videos:   0%|          | 0/2 [00:00<?, ?it/s]