In [1]:
import mlflow

# Point MLflow at server
mlflow.set_tracking_uri("http://localhost:5000")

# Start a new run and a test metric
with mlflow.start_run():
    mlflow.log_metric("test_metric", 42)

print("Logged test_metric = 42 to local MLflow server")

🏃 View run righteous-quail-432 at: http://localhost:5000/#/experiments/0/runs/4e51901f967e4ba98ccb6756d435f6c4
🧪 View experiment at: http://localhost:5000/#/experiments/0
Logged test_metric = 42 to local MLflow server


In [24]:
import cv2
import glob
import os
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython import get_ipython

# move up one directory (only)
cwd = os.getcwd()
if os.path.basename(cwd) == "notebooks":
    get_ipython().run_line_magic("cd", "..")
    print("Moved up to project root:", os.getcwd())
else:
    print("Already at project root:", cwd)

Already at project root: /home


In [22]:
# Gather real & fake lists
orig_paths = glob.glob("data/video_raw/original/*.mp4")
fake_paths = glob.glob("data/video_raw/manipulated/*.mp4")

# Combine into one list
video_paths = orig_paths + fake_paths
print(f"Found {len(video_paths)} total videos")

# Create DataFrame with matching labels
video_df = pd.DataFrame({
    "path": video_paths,
    "label": ["real"] * len(orig_paths) + ["fake"] * len(fake_paths)
})

# Peek
video_df.head()
print(video_df.label.value_counts())

Found 0 total videos
Series([], Name: count, dtype: int64)


In [19]:
# A helper to grab 1 frame per second
def extract_frames(path, interval_sec=1, max_frames=5):
    
    cap = cv2.VideoCapture(path)
    fps = cap.get(cv2.CAP_PROP_FPS) or 30 # default to 30 if unreadable
    frames = []
    count = 0

    while len(frames) < max_frames:
        ret, frame = cap.read()
        if not ret:
            break
        # Grab every 'interval_sec * fps' frames
        if count % int(interval_sec * fps) == 0:
            frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        count += 1
    cap.release()

    return frames

In [20]:
# Shows 3 raw frames, 1 second apart
if video_paths:
    frames = extract_frames(video_paths[0], interval_sec=1, max_frames=3)
    print("-> Frames extracted from first video:", len(frames))
else:
    raise RuntimeError("No videos found at data/video_raw/*.mp4")

fig, axs = plt.subplots(1, 3, figsize=(12, 4))
for ax, img in zip(axs, frames):
    ax.imshow(img)
    ax.axis("off")

plt.suptitle(os.path.basename(video_paths[0]))
plt.show()

RuntimeError: No videos found at data/video_raw/*.mp4

In [16]:
# Gather statistics on dataset
shapes = []
histograms =[]

for vf in video_paths[:20]: # limit to first 20
    f = extract_frames(vf, interval_sec=2, max_frames=1)
    if not f: 
        print("Warning: no frames for", vf)
        continue
    shapes.append(f[0].shape[:2])

    print("-> Collected shapes:", shapes[:5], "... total", len(shapes))

    # Flattened intensity histogram
    hist, _ = np.histogram(img.ravel(), bins=50, range=(0, 255))
    histograms.append(hist)

# Only plot if there is data
if shapes:
    # Plot distribution of heights and widths
    heights, widths = zip(*shapes)
    plt.figure(figsize=(6, 4))
    plt.hist(heights, bins=10, alpha=0.7, label="height")
    plt.hist(widths, bins=10, alpha=0.7, label="width")
    plt.legend()
    plt.title("Frame dimension distribution (first 20 videos)")
    plt.ylabel("Count")
    plt.xlabel("Pixels")
    plt.show
else:
    print("No shapes to plot")

No shapes to plot
