In [None]:
# ============================================================
# 2. DOWNLOAD DATASET FROM KAGGLE
# ============================================================
import kagglehub
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from google.colab import drive
# Fetch the FakeAVCeleb dataset through kagglehub and report where the files
# ended up on the local disk.
dataset_path = kagglehub.dataset_download("mbulsss/fakeavceleb")
print(f"Path to dataset files: {dataset_path}")

# Locations derived from the download: the dataset folder and its metadata CSV.
BASE_PATH = os.path.join(dataset_path, "FakeAVCeleb_v1.2")
meta_path = os.path.join(dataset_path, "FakeAVCeleb_v1.2", "meta_data.csv")

Downloading from https://www.kaggle.com/api/v1/datasets/download/mbulsss/fakeavceleb?dataset_version_number=1...


100%|██████████| 5.97G/5.97G [02:34<00:00, 41.5MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/mbulsss/fakeavceleb/versions/1


In [None]:


# ============================================================
# 0) MOUNT DRIVE & SET PROJECT DIR
# ============================================================
drive.mount("/content/drive")

PROJECT_DIR = "/content/drive/MyDrive/FinalProjectAI"
SPLIT_DIR   = os.path.join(PROJECT_DIR, "splits_identity")
os.makedirs(SPLIT_DIR, exist_ok=True)

print("Project dir :", PROJECT_DIR)
print("Split dir   :", SPLIT_DIR)

# ============================================================
# 1) LOAD THE ORIGINAL METADATA FROM THE DATASET
# ============================================================
# Reuse the directory returned by kagglehub.dataset_download() in the download
# cell instead of a hard-coded cache path: the literal path embeds the dataset
# version ("versions/1") and the cache location, so it would go stale as soon
# as either changes. This cell already relies on the download cell for its
# imports (os, pd, drive), so dataset_path is available whenever it can run.
DATA_ROOT = dataset_path
META_PATH = os.path.join(DATA_ROOT, "FakeAVCeleb_v1.2", "meta_data.csv")  # adjust if the file name differs

print("Metadata path:", META_PATH)
all_videos = pd.read_csv(META_PATH)
print("Metadata shape:", all_videos.shape)
print(all_videos.head())
print("Columns:", all_videos.columns.tolist())

# The metadata is expected to provide at least these columns:
# ['source','target1','target2','method','category','type','race','gender','path', ...]
# Some versions also ship a 'rel_dir' column; it is created below when absent.

# ============================================================
# 2) NORMALIZE COLUMNS & ADD rel_dir
# ============================================================
# 'source' carries the identity folder name (id00076, etc.).
all_videos = all_videos.rename(columns=dict(source="identity_id"))

# An existing 'rel_dir' column is left untouched.
if "rel_dir" not in all_videos.columns:
    def build_rel_dir(row):
        """Return the dataset-relative folder of one metadata row.

        The folder is the fixed dataset directory followed by the row's
        type, race, gender and identity_id, for example
        FakeAVCeleb_v1.2/RealVideo-RealAudio/African/men/id00076.
        """
        parts = [row[col] for col in ("type", "race", "gender", "identity_id")]
        return os.path.join("FakeAVCeleb_v1.2", *parts)

    all_videos["rel_dir"] = all_videos.apply(build_rel_dir, axis=1)

# The 'path' column (video file name, e.g. 00109.mp4) is mandatory.
if "path" not in all_videos.columns:
    raise ValueError("Kolom 'path' (nama file video) tidak ditemukan di metadata.")

preview_columns = ["identity_id", "type", "race", "gender", "path", "rel_dir"]
print("\nSample rows after normalize:")
print(all_videos[preview_columns].head())

# ============================================================
# 3) BINARY LABEL: 1 = REAL, 0 = FAKE
# ============================================================
def map_binary_label(t):
    """Map a metadata ``type`` value to the binary target.

    Returns 1 (REAL) only when ``t`` equals "RealVideo-RealAudio"; any other
    value is mapped to 0 (FAKE).
    """
    return 1 if t == "RealVideo-RealAudio" else 0

# Derive the binary target of every video from its 'type' column.
all_videos["binary_label"] = all_videos["type"].map(map_binary_label)

print("\nBinary label distribution (1=REAL, 0=FAKE):")
print(all_videos["binary_label"].value_counts())

# ============================================================
# 4) IDENTITY-BASED SPLIT 70/15/15
# ============================================================
# The split is made over identity_id values rather than over individual
# videos, so all videos of one identity_id end up in a single split.
# NOTE(review): only the 'source' identity (renamed to identity_id) is used
# here; target1/target2 are not considered — confirm that this is the intended
# notion of identity separation.
identity_info = all_videos[["identity_id", "binary_label"]].copy()

# One label per identity for stratification: the most frequent binary_label
# among that identity's videos (mode() returns its values sorted, so a tie
# resolves to the smaller label); falls back to the first value if mode() is
# empty.
# NOTE(review): if fake videos outnumber real ones for every identity, this
# label is the same everywhere and stratify has no effect — confirm.
identity_info = (
    identity_info
    .groupby("identity_id")["binary_label"]
    .agg(lambda x: x.mode().iloc[0] if len(x.mode()) > 0 else x.iloc[0])
    .reset_index()
    .rename(columns={"binary_label": "identity_label"})
)

# Drop identities whose id is the empty string, if any.
identity_info = identity_info[identity_info["identity_id"] != ""].reset_index(drop=True)

print("\nUnique identities:", len(identity_info))
print(identity_info.head())

# Fixed seed shared by both splits below so the partition is reproducible.
RANDOM_SEED = 42

# 4.1) train+val vs test (85/15)
train_val_ids, test_ids = train_test_split(
    identity_info["identity_id"].values,
    test_size=0.15,
    random_state=RANDOM_SEED,
    stratify=identity_info["identity_label"].values
)

# 4.2) train vs val from train+val (70/15) → val fraction = 15/85 ≈ 0.176
# train_val_info keeps the row order of identity_info (filtered with isin),
# not the shuffled order of train_val_ids.
train_val_info = identity_info[identity_info["identity_id"].isin(train_val_ids)].copy()

train_ids, val_ids = train_test_split(
    train_val_info["identity_id"].values,
    test_size=0.176,
    random_state=RANDOM_SEED,
    stratify=train_val_info["identity_label"].values
)

print("\nIdentity counts per split:")
print("Train identities :", len(train_ids))
print("Val identities   :", len(val_ids))
print("Test identities  :", len(test_ids))

# ============================================================
# 5) DERIVE VIDEO-LEVEL SPLITS AND SAVE CSVs
# ============================================================
# A video goes to the split that holds its identity_id; each frame gets a
# fresh 0..n-1 index.
train_df = all_videos.loc[all_videos["identity_id"].isin(train_ids)].reset_index(drop=True)
val_df   = all_videos.loc[all_videos["identity_id"].isin(val_ids)].reset_index(drop=True)
test_df  = all_videos.loc[all_videos["identity_id"].isin(test_ids)].reset_index(drop=True)

def show_dist(name, df):
    """Print the row count and the binary_label counts of one split.

    ``name`` is the label printed in front of the line; ``df`` must have a
    'binary_label' column.
    """
    label_counts = df["binary_label"].value_counts().to_dict()
    print("{}: total={}, distribusi={}".format(name, len(df), label_counts))

print("\nBinary label distribution per split (1=REAL, 0=FAKE):")
show_dist("TRAIN", train_df)
show_dist("VAL",   val_df)
show_dist("TEST",  test_df)

# Check for identity overlap BEFORE anything is written, and fail loudly:
# a leaking split must never reach Drive, where later cells would pick it up.
train_ids_set = set(train_ids)
val_ids_set   = set(val_ids)
test_ids_set  = set(test_ids)

n_train_val  = len(train_ids_set & val_ids_set)
n_train_test = len(train_ids_set & test_ids_set)
n_val_test   = len(val_ids_set & test_ids_set)

print("\nLeakage check (should all be 0):")
print("Train ∩ Val :", n_train_val)
print("Train ∩ Test:", n_train_test)
print("Val ∩ Test  :", n_val_test)

if n_train_val or n_train_test or n_val_test:
    raise ValueError("Identity leakage between splits; split CSVs were not written.")

# With disjoint identity sets, any difference in row counts means some videos
# have an identity_id that belongs to no split (e.g. an empty id that was
# filtered out before splitting). Report it instead of losing rows silently.
n_unassigned = len(all_videos) - (len(train_df) + len(val_df) + len(test_df))
if n_unassigned:
    print(f"WARNING: {n_unassigned} videos are not assigned to any split and will not be saved.")

# Save to Drive
train_path = os.path.join(SPLIT_DIR, "train_videos_identity.csv")
val_path   = os.path.join(SPLIT_DIR, "val_videos_identity.csv")
test_path  = os.path.join(SPLIT_DIR, "test_videos_identity.csv")

train_df.to_csv(train_path, index=False)
val_df.to_csv(val_path, index=False)
test_df.to_csv(test_path, index=False)

print("\nSaved to Google Drive:")
print("TRAIN:", train_path)
print("VAL  :", val_path)
print("TEST :", test_path)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Project dir : /content/drive/MyDrive/FinalProjectAI
Split dir   : /content/drive/MyDrive/FinalProjectAI/splits_identity
Metadata path: /root/.cache/kagglehub/datasets/mbulsss/fakeavceleb/versions/1/FakeAVCeleb_v1.2/meta_data.csv
Metadata shape: (21566, 10)
    source target1 target2 method category                 type     race  \
0  id00076       -       -   real        A  RealVideo-RealAudio  African   
1  id00166       -       -   real        A  RealVideo-RealAudio  African   
2  id00173       -       -   real        A  RealVideo-RealAudio  African   
3  id00366       -       -   real        A  RealVideo-RealAudio  African   
4  id00391       -       -   real        A  RealVideo-RealAudio  African   

  gender       path                                         Unnamed: 9  
0    men  00109.mp4  FakeAVCeleb/RealVideo-RealAudio/African/men/id...  
1    men  0

In [None]:
import os
import cv2
import pandas as pd
from google.colab import drive
from tqdm import tqdm

# ============================================================
# 0) MOUNT DRIVE & SET PATH
# ============================================================
drive.mount("/content/drive")

PROJECT_DIR = "/content/drive/MyDrive/FinalProjectAI"
SPLIT_DIR   = os.path.join(PROJECT_DIR, "splits_identity")
FRAMES_DIR  = os.path.join(PROJECT_DIR, "frames_identity")

# Make sure every working folder exists before reading or writing.
os.makedirs(PROJECT_DIR, exist_ok=True)
os.makedirs(SPLIT_DIR, exist_ok=True)
os.makedirs(FRAMES_DIR, exist_ok=True)

print(f"Project dir : {PROJECT_DIR}")
print(f"Split dir   : {SPLIT_DIR}")
print(f"Frames dir  : {FRAMES_DIR}")

# ============================================================
# 1) DATASET ROOT (ALREADY DOWNLOADED BY kagglehub)
# ============================================================
BASE_ROOT = "/root/.cache/kagglehub/datasets/mbulsss/fakeavceleb/versions/1"
print(f"Dataset root: {BASE_ROOT}")

# ============================================================
# 2) LOAD THE SPLIT CSVs FROM DRIVE
# ============================================================
train_videos, val_videos, test_videos = (
    pd.read_csv(os.path.join(SPLIT_DIR, csv_name))
    for csv_name in (
        "train_videos_identity.csv",
        "val_videos_identity.csv",
        "test_videos_identity.csv",
    )
)

print(f"Train videos: {train_videos.shape}")
print(f"Val videos  : {val_videos.shape}")
print(f"Test videos : {test_videos.shape}")
print(f"Columns    : {train_videos.columns.tolist()}")

# The split CSVs must contain at least:
# ['identity_id','target1','target2','method','category',
#  'type','race','gender','path','rel_dir','binary_label']

# ============================================================
# 3) BUILD THE VIDEO PATH FROM rel_dir + path
# ============================================================
def get_video_path(row):
    """Return the full path of the video described by one split-CSV row.

    The path is BASE_ROOT joined with the row's 'rel_dir'
    (e.g. FakeAVCeleb_v1.2/RealVideo-RealAudio/African/men/id00076) and its
    'path' file name (e.g. 00109.mp4); both values are converted with str().
    """
    return os.path.join(BASE_ROOT, str(row["rel_dir"]), str(row["path"]))

# ============================================================
# 4) FRAME EXTRACTION FUNCTION
# ============================================================
def extract_frames_from_video(video_path, out_dir, video_id,
                              max_frames=10, frame_step=5):
    """Save every ``frame_step``-th frame of a video as a JPEG file.

    Frames are sampled at indices 0, frame_step, 2*frame_step, ... and written
    to ``out_dir`` as ``{video_id}_f{frame_idx:05d}.jpg`` until ``max_frames``
    frames have been sampled or the video ends.

    Args:
        video_path: Path of the video file handed to cv2.VideoCapture.
        out_dir: Folder that receives the JPEG files (created if missing).
        video_id: Prefix used in every frame file name.
        max_frames: Maximum number of frames to sample; values below 1 yield
            an empty result.
        frame_step: Distance, in frames, between two sampled frames.

    Returns:
        List of the frame paths that were actually written. Empty when the
        video cannot be opened (a message is printed) or ``max_frames`` < 1.

    Raises:
        ValueError: If ``frame_step`` is smaller than 1.
    """
    if frame_step < 1:
        raise ValueError(f"frame_step must be >= 1, got {frame_step}")
    if max_frames < 1:
        return []

    os.makedirs(out_dir, exist_ok=True)

    cap = cv2.VideoCapture(video_path)
    saved_paths = []
    try:
        if not cap.isOpened():
            print("Failed to open:", video_path)
            return saved_paths

        frame_idx = 0
        sampled = 0  # frames selected so far, whether or not the write succeeded

        while sampled < max_frames:
            ret, frame = cap.read()
            if not ret:
                break

            # Take every frame_step-th frame
            if frame_idx % frame_step == 0:
                out_path = os.path.join(out_dir, f"{video_id}_f{frame_idx:05d}.jpg")
                # imwrite reports failure through its return value; only
                # record frames that really exist on disk.
                if cv2.imwrite(out_path, frame):
                    saved_paths.append(out_path)
                else:
                    print("Failed to write:", out_path)
                sampled += 1

            frame_idx += 1
    finally:
        # Release the capture even if reading or writing raised.
        cap.release()

    return saved_paths

# ============================================================
# 5) LOOP OVER A SPLIT -> FRAME-LEVEL CSV
# ============================================================
def _unique_video_key(row):
    """Build a file-name-safe key from the row's rel_dir and video file stem."""
    rel_dir = str(row["rel_dir"]).replace("\\", "/").strip("/")
    stem = os.path.splitext(str(row["path"]))[0]
    return rel_dir.replace("/", "_") + "_" + stem


def process_split(df, split_name):
    """Extract frames for every video of one split and describe them in a frame.

    Frames are stored under FRAMES_DIR/<split_name>/<binary_label>/. Because
    all videos with the same label share one folder, frame files are named
    after a key built from rel_dir + file stem rather than the bare file stem;
    otherwise two videos with the same file name in different dataset folders
    would overwrite each other's frames.

    Args:
        df: Video-level split table with the columns 'path', 'rel_dir',
            'identity_id' and 'binary_label'.
        split_name: Name of the split; used as sub-folder name and stored in
            the 'split' column.

    Returns:
        DataFrame with one row per saved frame and the columns frame_path,
        video_id (file stem, as before), identity_id, binary_label, split and
        video_uid (the key used in the frame file names). Videos whose file is
        missing are reported and skipped. The columns are present even when no
        frame was saved.
    """
    columns = ["frame_path", "video_id", "identity_id",
               "binary_label", "split", "video_uid"]
    records = []
    split_dir = os.path.join(FRAMES_DIR, split_name)
    os.makedirs(split_dir, exist_ok=True)

    for _, row in tqdm(df.iterrows(), total=len(df), desc=f"Extracting {split_name}"):
        video_name   = str(row["path"])
        binary_label = int(row["binary_label"])   # 1=REAL, 0=FAKE
        identity_id  = str(row["identity_id"])
        video_id     = os.path.splitext(video_name)[0]
        video_uid    = _unique_video_key(row)

        video_path = get_video_path(row)
        if not os.path.exists(video_path):
            print("Missing video:", video_path)
            continue

        # Save into one folder per label: frames_identity/train/0, frames_identity/train/1, etc.
        label_dir = os.path.join(split_dir, str(binary_label))

        frame_paths = extract_frames_from_video(
            video_path=video_path,
            out_dir=label_dir,
            video_id=video_uid,   # unique prefix keeps frame files from colliding
            max_frames=10,   # change to sample more/fewer frames
            frame_step=5     # change to sample more densely/sparsely
        )

        for fp in frame_paths:
            records.append({
                "frame_path": fp,
                "video_id": video_id,
                "identity_id": identity_id,
                "binary_label": binary_label,
                "split": split_name,
                "video_uid": video_uid
            })

    return pd.DataFrame(records, columns=columns)

# Run the extraction for each split; every call returns a frame-level table.
train_frames = process_split(train_videos, "train")
val_frames   = process_split(val_videos, "val")
test_frames  = process_split(test_videos, "test")

print(f"Train frames: {train_frames.shape}")
print(f"Val frames  : {val_frames.shape}")
print(f"Test frames : {test_frames.shape}")

# ============================================================
# 6) SAVE THE FRAME-LEVEL CSVs TO DRIVE
# ============================================================
train_frames.to_csv(path_or_buf=os.path.join(FRAMES_DIR, "train_frames.csv"), index=False)
val_frames.to_csv(path_or_buf=os.path.join(FRAMES_DIR, "val_frames.csv"), index=False)
test_frames.to_csv(path_or_buf=os.path.join(FRAMES_DIR, "test_frames.csv"), index=False)

print(f"Done. Frame CSVs saved in: {FRAMES_DIR}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Project dir : /content/drive/MyDrive/FinalProjectAI
Split dir   : /content/drive/MyDrive/FinalProjectAI/splits_identity
Frames dir  : /content/drive/MyDrive/FinalProjectAI/frames_identity
Dataset root: /root/.cache/kagglehub/datasets/mbulsss/fakeavceleb/versions/1
Train videos: (15106, 12)
Val videos  : (3187, 12)
Test videos : (3273, 12)
Columns    : ['identity_id', 'target1', 'target2', 'method', 'category', 'type', 'race', 'gender', 'path', 'Unnamed: 9', 'rel_dir', 'binary_label']


Extracting train: 100%|██████████| 15106/15106 [16:53<00:00, 14.91it/s]
Extracting val: 100%|██████████| 3187/3187 [04:24<00:00, 12.06it/s]
Extracting test: 100%|██████████| 3273/3273 [04:48<00:00, 11.35it/s]


Train frames: (151014, 5)
Val frames  : (31870, 5)
Test frames : (32722, 5)
Done. Frame CSVs saved in: /content/drive/MyDrive/FinalProjectAI/frames_identity


# **========================================**