In [None]:
import os
import cv2
import numpy as np
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, BinaryType, IntegerType

def create_spark_session():
    """Build the Spark session used by the GIF preprocessing pipeline.

    The session is named "GIF Preprocessing Pipeline" and requests 4g of
    memory for both the executors and the driver.

    Returns:
        The object produced by ``SparkSession.builder...getOrCreate()``.
    """
    memory_settings = (
        ("spark.executor.memory", "4g"),
        ("spark.driver.memory", "4g"),
    )
    builder = SparkSession.builder.appName("GIF Preprocessing Pipeline")
    # Apply the settings in the same order as they are listed above.
    for option, value in memory_settings:
        builder = builder.config(option, value)
    return builder.getOrCreate()

def extract_frames(gif_path):
    """Extract frames from a GIF file, resize them, and JPEG-encode them.

    Args:
        gif_path: Location of the GIF, handed to ``cv2.VideoCapture``.
            ``None`` or an empty string is treated as "no file".

    Returns:
        A list of ``bytes`` objects, one JPEG-encoded 224x224 image per
        decoded frame. The list is empty when ``gif_path`` is missing or
        when no frame could be read.
    """
    # A missing path (e.g. a null cell in the input table) cannot be opened;
    # report "no frames" instead of letting VideoCapture raise.
    if not gif_path:
        return []

    cap = cv2.VideoCapture(gif_path)
    frames = []
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                # End of stream, or the capture could not be read at all.
                break
            # Resize the frame to a standard size (224x224).
            frame_resized = cv2.resize(frame, (224, 224))
            # Encode the frame as JPEG bytes; skip it if encoding failed so
            # that no invalid buffer ends up in the result.
            encoded_ok, buffer = cv2.imencode(".jpg", frame_resized)
            if not encoded_ok:
                continue
            frames.append(buffer.tobytes())
    finally:
        # Always free the capture, even if resizing or encoding raised.
        cap.release()
    return frames

def save_frames_to_disk(frames, gif_id):
    """Write every frame to its own file under ``output/frames/<gif_id>``.

    Args:
        frames: Iterable of bytes-like objects, written in iteration order.
        gif_id: Value interpolated into the output directory path.

    Returns:
        None. Files are named ``frame_<index>.jpg`` with a zero-based index.
    """
    target_dir = f"output/frames/{gif_id}"
    # Create the directory (and parents); an existing directory is fine.
    os.makedirs(target_dir, exist_ok=True)

    for position, payload in enumerate(frames):
        destination = f"{target_dir}/frame_{position}.jpg"
        with open(destination, "wb") as sink:
            sink.write(payload)

def main():
    """Run the GIF preprocessing pipeline end to end.

    Reads the tab-separated file ``tgif.tsv`` (no header row), names its
    columns ``gif_path`` and ``description``, extracts frames for every row,
    writes the frames to disk, and stores the resulting table as Parquet
    under ``output/processed_frames`` (overwriting any previous output).
    """
    # Imported locally so this function does not depend on a change to the
    # file-level import list.
    from pyspark.sql.functions import size

    # Step 1: Create a Spark session
    spark = create_spark_session()

    # Step 2: Load the GIF metadata from the TSV file
    gif_df = spark.read.csv("tgif.tsv", sep="\t", header=False)

    # Step 3: Assign proper column names
    gif_df = gif_df.toDF("gif_path", "description")

    # Step 4: Display the schema for debugging
    gif_df.printSchema()

    # Step 5: Register a UDF for frame extraction.
    # Marked non-deterministic so the optimizer does not duplicate the
    # expensive decode for each later reference to the "frames" column.
    extract_frames_udf = udf(
        lambda gif: extract_frames(gif), ArrayType(BinaryType())
    ).asNondeterministic()
    gif_df = gif_df.withColumn("frames", extract_frames_udf(gif_df["gif_path"]))

    # Step 6: Add a column for the number of frames.
    # The built-in size() yields an integer column without a Python round-trip.
    gif_df = gif_df.withColumn("frame_count", size(gif_df["frames"]))

    # Step 7: Save the frames to disk (optional).
    # The UDF is called for its side effect only; it returns None, so the
    # "saved_frames" column is null. Non-deterministic marking keeps Spark
    # from re-running the disk writes.
    save_udf = udf(
        lambda frames, gif_id: save_frames_to_disk(frames, gif_id)
    ).asNondeterministic()
    gif_df = gif_df.withColumn(
        "saved_frames", save_udf(gif_df["frames"], gif_df["gif_path"])
    )

    # Step 8: Save the processed data as Parquet files
    gif_df.write.mode("overwrite").format("parquet").save("output/processed_frames")

    print("Preprocessing complete. Processed frames are saved to the output directory.")

if __name__ == "__main__":
    main()


root
 |-- gif_path: string (nullable = true)
 |-- description: string (nullable = true)

Preprocessing complete. Processed frames are saved to the output directory.
