In [None]:
import gdown
import zipfile
import os

# --- Step 1: Download the file from the Google Drive link ---
# [cite_start]This link corresponds to the "Green Screen RGB clips* (TEST)" data. [cite: 13, 14]
file_id = '1qTIXFsu8M55HrCiaGv7vZ7GkdB3ubjaG'
download_path = 'rgb_videos.zip'
url = f'https://drive.google.com/uc?id={file_id}'

print(f"Downloading file from Google Drive link...")
gdown.download(url, download_path, quiet=False)
print("Download complete.")

# --- Step 2: Unzip the downloaded file ---
extract_path = 'dev_test_data_rgb'

# Create the extraction directory if it doesn't exist
if not os.path.exists(extract_path):
    os.makedirs(extract_path)

print(f"\nExtracting {download_path} to {extract_path}...")
with zipfile.ZipFile(download_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)
print("Extraction complete.")

# --- Step 3: Verify Contents ---
print("\n--- Verifying Extracted Videos ---")
# The zip file contains a nested folder, so we need to look inside it.
# e.g., Green_Screen_RGB_clips_TEST/
extracted_folder_name = os.listdir(extract_path)[0] # Get the name of the single folder inside
video_folder_path = os.path.join(extract_path, extracted_folder_name)

if os.path.isdir(video_folder_path):
    video_files = [f for f in os.listdir(video_folder_path) if f.endswith('.mp4')]
    print(f"Found {len(video_files)} video files in '{video_folder_path}'.")
    print("Here are the first few:")
    for video_name in video_files[:5]:
        print(f"  - {video_name}")
else:
    print("Could not find the nested video folder. Please check the contents of the zip file.")

Downloading file from Google Drive link...


Downloading...
From (original): https://drive.google.com/uc?id=1qTIXFsu8M55HrCiaGv7vZ7GkdB3ubjaG
From (redirected): https://drive.google.com/uc?id=1qTIXFsu8M55HrCiaGv7vZ7GkdB3ubjaG&confirm=t&uuid=7fec0082-fcea-42b6-ac49-b838732148b1
To: /content/rgb_videos.zip
100%|██████████| 2.41G/2.41G [00:31<00:00, 76.9MB/s]


Download complete.

Extracting rgb_videos.zip to dev_test_data_rgb...
Extraction complete.

--- Verifying Extracted Videos ---
Found 2343 video files in 'dev_test_data_rgb/raw_videos'.
Here are the first few:
  - G3g0-BeFN3c_29-5-rgb_front.mp4
  - G3bMqicS4bQ_16-5-rgb_front.mp4
  - g1xdqxCZxTg_5-3-rgb_front.mp4
  - g1ccEYTMGGY_16-10-rgb_front.mp4
  - G25fic3QxDk_1-10-rgb_front.mp4


In [None]:
import gdown
import pandas as pd

# --- Step 1: Download the manually re-aligned CSV from Google Drive ---

# IMPORTANT: Replace 'YOUR_FILE_ID_HERE' with the actual file ID from the shareable link.
file_id = '1AgwBZW26kFHS4CWNMQTCMPGkBPkH3qCu'
output_path = 'manual_realigned_labels.csv'
url = f'https://drive.google.com/uc?id={file_id}'

print(f"Downloading the re-aligned labels CSV...")
gdown.download(url, output_path, quiet=False)
print("Download complete.")

# --- Step 2: Load the CSV into a pandas DataFrame and display the first few rows ---
try:
    labels_df = pd.read_csv(output_path)
    print("\nSuccessfully loaded the CSV into a DataFrame. Here are the first 5 rows:")
    print(labels_df.head())
except FileNotFoundError:
    print(f"\nERROR: Could not find the downloaded file at '{output_path}'. The download may have failed.")
except Exception as e:
    print(f"An error occurred while reading the CSV: {e}")


Downloading the re-aligned labels CSV...


Downloading...
From: https://drive.google.com/uc?id=1AgwBZW26kFHS4CWNMQTCMPGkBPkH3qCu
To: /content/manual_realigned_labels.csv
100%|██████████| 424k/424k [00:00<00:00, 76.1MB/s]

Download complete.
An error occurred while reading the CSV: Error tokenizing data. C error: Expected 1 fields in line 4, saw 2






In [None]:
import pandas as pd

# Define the path to the downloaded file
output_path = 'manual_realigned_labels.csv'

try:
    # --- The Correct Fix ---
    # We now use sep='\t' to specify that the columns are separated by tabs.
    labels_df = pd.read_csv(output_path, sep='\t')

    print("Successfully loaded the CSV using a tab delimiter.")
    print("Here is the DataFrame head:")
    print(labels_df.head())

except Exception as e:
    print(f"An unexpected error occurred: {e}")

Successfully loaded the CSV using a tab delimiter.
Here is the DataFrame head:
      VIDEO_ID               VIDEO_NAME    SENTENCE_ID  \
0  -fZc293MpJk  -fZc293MpJk-1-rgb_front  -fZc293MpJk_0   
1  -fZc293MpJk  -fZc293MpJk-1-rgb_front  -fZc293MpJk_2   
2  -fZc293MpJk  -fZc293MpJk-1-rgb_front  -fZc293MpJk_3   
3  -fZc293MpJk  -fZc293MpJk-1-rgb_front  -fZc293MpJk_4   
4  -fZc293MpJk  -fZc293MpJk-1-rgb_front  -fZc293MpJk_5   

               SENTENCE_NAME  START_REALIGNED  END_REALIGNED  \
0  -fZc293MpJk_0-1-rgb_front             0.26           6.79   
1  -fZc293MpJk_2-1-rgb_front             7.27          20.30   
2  -fZc293MpJk_3-1-rgb_front            21.25          25.51   
3  -fZc293MpJk_4-1-rgb_front            27.75          44.64   
4  -fZc293MpJk_5-1-rgb_front            46.68          52.44   

                                            SENTENCE  
0                                                Hi!  
1  The aileron is the control surface in the wing...  
2  By moving the stick

In [None]:
import cv2
import numpy as np
import os
import pandas as pd # Assuming labels_df is in memory from the previous step

# Define image dimensions and the path to the video folder
IMG_SIZE = 64
VIDEO_FOLDER_PATH = 'dev_test_data_rgb/raw_videos'

def load_video(video_path):
    """
    Loads a video file, extracts its frames, resizes them, and returns
    them as a NumPy array.
    """
    cap = cv2.VideoCapture(video_path)
    frames = []
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        # Resize the frame to our standard size and normalize pixel values
        resized_frame = cv2.resize(frame, (IMG_SIZE, IMG_SIZE))
        # You could also normalize here by dividing by 255.0
        frames.append(resized_frame)
    cap.release()
    return np.array(frames)

# --- Test the function on a single video ---

# Let's grab the first video from our DataFrame to test
if 'labels_df' in locals() and not labels_df.empty:
    sample_video_name = labels_df.loc[0, 'SENTENCE_NAME'] + '.mp4'
    sample_video_path = os.path.join(VIDEO_FOLDER_PATH, sample_video_name)

    print(f"Loading sample video: {sample_video_path}")

    if os.path.exists(sample_video_path):
        # Load the video frames
        video_frames = load_video(sample_video_path)

        # Display the output shape
        # Shape will be (num_frames, height, width, color_channels)
        print("\n--- Video Preprocessing Test Complete ---")
        print(f"Shape of the processed video data: {video_frames.shape}")
        print(f"Number of frames extracted: {video_frames.shape[0]}")
        print(f"Frame dimensions (Height, Width): {video_frames.shape[1]}x{video_frames.shape[2]}")
        print(f"Color channels (BGR): {video_frames.shape[3]}")
    else:
        print(f"ERROR: Sample video not found at '{sample_video_path}'")
        print("Please ensure the VIDEO_FOLDER_PATH is correct and the video files are in it.")
else:
    print("ERROR: `labels_df` not found or is empty. Please run the previous cell to load the labels CSV.")

Loading sample video: dev_test_data_rgb/raw_videos/-fZc293MpJk_0-1-rgb_front.mp4

--- Video Preprocessing Test Complete ---
Shape of the processed video data: (17, 64, 64, 3)
Number of frames extracted: 17
Frame dimensions (Height, Width): 64x64
Color channels (BGR): 3


In [None]:
import cv2
import numpy as np
import os
import pandas as pd
from tensorflow.keras.preprocessing.sequence import pad_sequences

# --- Parameters ---
# We'll process a smaller subset for this smoke test to save time.
NUM_SAMPLES_TO_PROCESS = 50
MAX_FRAMES = 30 # As per the plan, we need a fixed number of frames for the model.
IMG_SIZE = 64
VIDEO_FOLDER_PATH = 'dev_test_data_rgb/raw_videos'

# --- Data Processing Loop ---
# Ensure the DataFrame is available
if 'labels_df' not in locals() or labels_df.empty:
    print("ERROR: `labels_df` not found. Please run the cell that loads the CSV.")
else:
    X_frames = []
    # Take a subset of the data for this test
    data_subset = labels_df.head(NUM_SAMPLES_TO_PROCESS)

    for index, row in data_subset.iterrows():
        video_name = row['SENTENCE_NAME'] + '.mp4'
        video_path = os.path.join(VIDEO_FOLDER_PATH, video_name)

        if os.path.exists(video_path):
            frames = load_video(video_path)

            # Standardize the number of frames
            if len(frames) > MAX_FRAMES:
                # Truncate if too long
                frames = frames[:MAX_FRAMES]
            elif len(frames) < MAX_FRAMES:
                # Pad with black frames if too short
                padding_needed = MAX_FRAMES - len(frames)
                pad_width = ((0, padding_needed), (0, 0), (0, 0), (0, 0))
                frames = np.pad(frames, pad_width, mode='constant', constant_values=0)

            X_frames.append(frames)
            print(f"Processed {video_name}, shape: {frames.shape}")

    # Convert the list of video arrays into a single large array
    X_frames = np.array(X_frames)

    print("\n--- Full Video Preprocessing Complete ---")
    print(f"Final shape of X_frames: {X_frames.shape}")
    print(f"This shape represents (num_samples, num_frames, height, width, channels).")

Processed -fZc293MpJk_0-1-rgb_front.mp4, shape: (30, 64, 64, 3)
Processed -fZc293MpJk_2-1-rgb_front.mp4, shape: (30, 64, 64, 3)
Processed -fZc293MpJk_3-1-rgb_front.mp4, shape: (30, 64, 64, 3)
Processed -fZc293MpJk_4-1-rgb_front.mp4, shape: (30, 64, 64, 3)
Processed -fZc293MpJk_5-1-rgb_front.mp4, shape: (30, 64, 64, 3)
Processed -fZc293MpJk_6-1-rgb_front.mp4, shape: (30, 64, 64, 3)
Processed -fZc293MpJk_7-1-rgb_front.mp4, shape: (30, 64, 64, 3)
Processed -g0iPSnQt6w_0-1-rgb_front.mp4, shape: (30, 64, 64, 3)
Processed -g0iPSnQt6w_1-1-rgb_front.mp4, shape: (30, 64, 64, 3)
Processed -g0iPSnQt6w_10-1-rgb_front.mp4, shape: (30, 64, 64, 3)
Processed -g0iPSnQt6w_11-1-rgb_front.mp4, shape: (30, 64, 64, 3)
Processed -g0iPSnQt6w_12-1-rgb_front.mp4, shape: (30, 64, 64, 3)
Processed -g0iPSnQt6w_13-1-rgb_front.mp4, shape: (30, 64, 64, 3)
Processed -g0iPSnQt6w_14-1-rgb_front.mp4, shape: (30, 64, 64, 3)
Processed -g0iPSnQt6w_15-1-rgb_front.mp4, shape: (30, 64, 64, 3)
Processed -g0iPSnQt6w_16-1-rgb_fro

In [None]:
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, TimeDistributed, LSTM, Dense, GlobalAveragePooling2D

# Define the model input shape
# (num_frames, height, width, channels)
input_shape = (30, 64, 64, 3)

# Load a pre-trained CNN model (MobileNetV2) as the feature extractor
# We exclude the top classification layer and freeze the weights.
base_model = MobileNetV2(
    input_shape=(64, 64, 3),
    include_top=False,
    weights='imagenet'
)
base_model.trainable = False

# --- Build the full CNN-LSTM architecture ---
# Define the input layer that expects sequences of video frames
video_input = Input(shape=input_shape)

# Use TimeDistributed to apply the CNN to each frame of the video
# This creates our feature extractor
cnn_features = TimeDistributed(base_model)(video_input)
cnn_features = TimeDistributed(GlobalAveragePooling2D())(cnn_features) # Flatten the features for each frame

# Feed the sequence of features into the LSTM layer to understand the motion
lstm_output = LSTM(64)(cnn_features)

# Add a Dense layer for classification
# We'll use 10 output classes as a placeholder for the smoke test
num_classes = 10
output_layer = Dense(num_classes, activation='softmax')(lstm_output)

# Create the final model
model = Model(inputs=video_input, outputs=output_layer)

# Compile the model
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

# Print the model summary to see the architecture
print("--- CNN-LSTM Model Architecture ---")
model.summary()

  base_model = MobileNetV2(


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/mobilenet_v2/mobilenet_v2_weights_tf_dim_ordering_tf_kernels_1.0_224_no_top.h5
[1m9406464/9406464[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
--- CNN-LSTM Model Architecture ---


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, TimeDistributed, LSTM, Dense, GlobalAveragePooling2D

# --- Step 1: Prepare Labels and Get Class Count ---
data_subset = labels_df.head(50)
lb = LabelBinarizer()
y_labels = lb.fit_transform(data_subset['SENTENCE'])
num_classes = y_labels.shape[1] # Dynamically get the number of classes

print(f"Dynamically determined number of classes: {num_classes}")

# --- Step 2: Re-build the Model with the Correct Output Layer ---
input_shape = (30, 64, 64, 3)
base_model = MobileNetV2(input_shape=(64, 64, 3), include_top=False, weights='imagenet')
base_model.trainable = False

video_input = Input(shape=input_shape)
cnn_features = TimeDistributed(base_model)(video_input)
cnn_features = TimeDistributed(GlobalAveragePooling2D())(cnn_features)
lstm_output = LSTM(64)(cnn_features)

# Use the correct number of classes in the final layer
output_layer = Dense(num_classes, activation='softmax')(lstm_output)

model = Model(inputs=video_input, outputs=output_layer)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

print("\n--- Model Re-built with Correct Output Shape ---")
model.summary()

# --- Step 3: Split Data and Train the Corrected Model ---
X_train, X_test, y_train, y_test = train_test_split(X_frames, y_labels, test_size=0.20, random_state=42)

print("\n--- Starting Model Training ---")
history = model.fit(
    X_train,
    y_train,
    epochs=4,
    batch_size=8,
    validation_data=(X_test, y_test)
)

print("\n--- Smoke Test Complete ---")
print("The CNN-LSTM pipeline ran successfully from start to finish.")

Dynamically determined number of classes: 50


  base_model = MobileNetV2(input_shape=(64, 64, 3), include_top=False, weights='imagenet')



--- Model Re-built with Correct Output Shape ---



--- Starting Model Training ---
Epoch 1/4
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m233s[0m 15s/step - accuracy: 0.0000e+00 - loss: 4.1349 - val_accuracy: 0.0000e+00 - val_loss: 4.2985
Epoch 2/4
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 947ms/step - accuracy: 0.1240 - loss: 3.6921 - val_accuracy: 0.0000e+00 - val_loss: 4.4558
Epoch 3/4
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 1s/step - accuracy: 0.1510 - loss: 3.5444 - val_accuracy: 0.0000e+00 - val_loss: 4.5125
Epoch 4/4
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 1s/step - accuracy: 0.1365 - loss: 3.3192 - val_accuracy: 0.0000e+00 - val_loss: 4.8045

--- Smoke Test Complete ---
The CNN-LSTM pipeline ran successfully from start to finish.
