In [None]:
# --- 0. 导入库 ---
from google.colab import drive
import zipfile
import os
import shutil

def _unzip_to(archive_path, destination):
    """Create *destination* if it is missing and extract the whole archive into it."""
    os.makedirs(destination, exist_ok=True)
    with zipfile.ZipFile(archive_path, 'r') as archive:
        archive.extractall(destination)


# --- 1. Mount Google Drive ---
print("Mounting Google Drive...")
drive.mount('/content/drive')
print("Google Drive mounted successfully.")

# --- 2. Define paths ---
# Archives live on the mounted Drive.
base_drive_path = '/content/drive/MyDrive/train-CNN+LSTM+BO'
train_zip_gdrive_path = os.path.join(base_drive_path, 'train_rgb_front_clips.zip')
val_zip_gdrive_path = os.path.join(base_drive_path, 'val_rgb_front_clips.zip')

# Extraction targets are relative paths, i.e. on the Colab runtime's local disk.
train_extract_folder = 'frontal_train_videos'
val_extract_folder = 'frontal_val_videos'

# --- 3. Extract the training set locally ---
print("\nExtracting train videos to local disk (this will take time but speed up training later)...")
_unzip_to(train_zip_gdrive_path, train_extract_folder)
print("Train videos extracted locally.")

# --- 4. Extract the validation set locally ---
print("\nExtracting validation videos to local disk...")
_unzip_to(val_zip_gdrive_path, val_extract_folder)
print("Validation videos extracted locally.")

print("\n--- Data Preparation for Phase 3 Complete ---")

Mounting Google Drive...
Mounted at /content/drive
Google Drive mounted successfully.

Extracting train videos to local disk (this will take time but speed up training later)...
Train videos extracted locally.

Extracting validation videos to local disk...
Validation videos extracted locally.

--- Data Preparation for Phase 3 Complete ---


In [None]:
# (此单元格顶部的导入和标签加载部分保持不变)
# ...
import pandas as pd
import numpy as np
import math
import cv2
import optuna
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import Sequence, to_categorical
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, TimeDistributed, LSTM, Dense, GlobalAveragePooling2D, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

# Location of the label files on the mounted Drive.
base_drive_path = '/content/drive/MyDrive/train-CNN+LSTM+BO'
TRAIN_LABELS_CSV = os.path.join(base_drive_path, 'how2sign_realigned_train.csv')
VAL_LABELS_CSV = os.path.join(base_drive_path, 'how2sign_realigned_val.csv')

# The label files are tab-separated despite the .csv extension.
train_labels_df = pd.read_csv(TRAIN_LABELS_CSV, sep='\t')
val_labels_df = pd.read_csv(VAL_LABELS_CSV, sep='\t')
all_labels_df = pd.concat([train_labels_df, val_labels_df], ignore_index=True)

# Fit the encoder on train + val so both splits share one label space.
# Every distinct SENTENCE value becomes its own class.
label_encoder = LabelEncoder()
label_encoder.fit(all_labels_df['SENTENCE'])
NUM_CLASSES = len(label_encoder.classes_)

# Clip geometry and batching.
IMG_SIZE = 64
MAX_FRAMES = 30
BATCH_SIZE = 32

# Folders holding the extracted .mp4 clips.
TRAIN_VIDEO_FOLDER = 'frontal_train_videos/raw_videos'
VAL_VIDEO_FOLDER = 'frontal_val_videos/raw_videos'
# ...

# --- 2. 定义数据生成器 (最终修复版) ---
class SignLanguageGenerator(Sequence):
    """Keras ``Sequence`` yielding batches of fixed-length video clips with one-hot labels.

    Only videos that exist both as ``<name>.mp4`` in ``data_folder`` and as a
    ``SENTENCE_NAME`` entry in ``labels_df`` are used. Clip length and frame
    size come from the module-level ``MAX_FRAMES`` and ``IMG_SIZE`` constants.

    Args:
        data_folder: Directory containing the ``.mp4`` clips.
        labels_df: DataFrame with ``SENTENCE_NAME`` and ``SENTENCE`` columns.
        label_encoder: Fitted encoder used to map ``SENTENCE`` text to class ids.
        batch_size: Number of videos requested per batch.
        num_classes: Width of the one-hot label vectors.
        **kwargs: Forwarded unchanged to the ``Sequence`` base-class constructor.
    """

    def __init__(self, data_folder, labels_df, label_encoder, batch_size, num_classes, **kwargs):
        # The base class must be initialised; skipping it is what triggers
        # Keras' "super().__init__() not called" warning.
        super().__init__(**kwargs)
        self.data_folder = data_folder
        self.labels_df = labels_df.copy()
        self.label_encoder = label_encoder
        self.batch_size = batch_size
        self.num_classes = num_classes

        # Keep only the videos present both on disk and in the label table.
        all_disk_files = {
            os.path.splitext(f)[0]
            for f in os.listdir(self.data_folder)
            if f.endswith('.mp4')
        }
        all_csv_files = set(self.labels_df['SENTENCE_NAME'].tolist())
        self.video_files = list(all_disk_files.intersection(all_csv_files))
        self.labels_df = self.labels_df[self.labels_df['SENTENCE_NAME'].isin(self.video_files)]
        print(f"Generator for '{self.data_folder}' initialized with {len(self.video_files)} valid files.")

    def __len__(self):
        """Return the number of full batches; a trailing partial batch is dropped."""
        return math.floor(len(self.video_files) / self.batch_size)

    @staticmethod
    def _read_frames(video_path):
        """Return up to ``MAX_FRAMES`` resized frames from the start of *video_path*.

        Returns an empty list when no frame can be decoded.
        """
        frames = []
        cap = cv2.VideoCapture(video_path)
        try:
            # Only the first MAX_FRAMES frames are ever used, so stop decoding there.
            while len(frames) < MAX_FRAMES:
                ret, frame = cap.read()
                if not ret:
                    break
                frames.append(cv2.resize(frame, (IMG_SIZE, IMG_SIZE)))
        finally:
            # Release the capture handle even if decoding/resizing raises.
            cap.release()
        return frames

    def __getitem__(self, idx):
        """Build batch *idx*.

        Returns:
            ``(X, y)`` where ``X`` is a float32 array of shape
            ``(n, MAX_FRAMES, IMG_SIZE, IMG_SIZE, 3)`` with pixel values divided
            by 255 and ``y`` is the one-hot label matrix with ``n`` rows.
            ``n`` can be smaller than the number of label rows in the batch
            when some videos yield no frames.
        """
        batch_files = self.video_files[idx * self.batch_size:(idx + 1) * self.batch_size]

        # Select the label rows for this batch; X is sized from these rows so the
        # sample count follows the label table rather than the file list.
        batch_labels_df = self.labels_df[self.labels_df['SENTENCE_NAME'].isin(batch_files)]

        X = np.zeros((len(batch_labels_df), MAX_FRAMES, IMG_SIZE, IMG_SIZE, 3), dtype=np.float32)
        y_text = []

        # NOTE(review): frames are used in the channel order OpenCV returns and
        # scaled to [0, 1]; confirm this matches what the downstream CNN expects.
        for row in batch_labels_df.itertuples():
            video_path = os.path.join(self.data_folder, row.SENTENCE_NAME + '.mp4')
            frames = self._read_frames(video_path)
            if not frames:
                # Unreadable video: skip it without consuming an output slot.
                continue

            # Write into the next free slot (not the row's position) so every
            # kept clip stays aligned with its label after skips. Frames past
            # the clip's end stay zero from np.zeros, i.e. zero padding.
            slot = len(y_text)
            X[slot, :len(frames)] = np.asarray(frames) / 255.0
            y_text.append(row.SENTENCE)

        # Drop the unused trailing slots left by skipped videos.
        X = X[:len(y_text)]

        y_int = self.label_encoder.transform(y_text)
        y = to_categorical(y_int, num_classes=self.num_classes)
        return X, y

# (实例化生成器和定义objective函数的代码保持不变)
# ...
# Build one generator per split; both share the same encoder and class count.
train_generator = SignLanguageGenerator(
    data_folder=TRAIN_VIDEO_FOLDER,
    labels_df=train_labels_df,
    label_encoder=label_encoder,
    batch_size=BATCH_SIZE,
    num_classes=NUM_CLASSES,
)
validation_generator = SignLanguageGenerator(
    data_folder=VAL_VIDEO_FOLDER,
    labels_df=val_labels_df,
    label_encoder=label_encoder,
    batch_size=BATCH_SIZE,
    num_classes=NUM_CLASSES,
)
print("\nData Generators are ready for optimization trials.")
# ...
def objective(trial):
    """Optuna objective: train one CNN+LSTM model and return its best validation accuracy.

    Samples ``learning_rate`` (log scale, 1e-5..1e-3), ``lstm_units`` (64..256)
    and ``dropout_rate`` (0.2..0.5) from *trial*, builds a frozen MobileNetV2
    applied per frame followed by an LSTM and a softmax classifier, and trains
    it on the module-level ``train_generator`` / ``validation_generator``.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        The highest ``val_accuracy`` observed across the trained epochs.
    """
    from tensorflow.keras import backend as keras_backend

    # Each trial builds a brand-new model; reset Keras' global state first so
    # models from earlier trials do not accumulate in memory over the study.
    keras_backend.clear_session()

    # --- Hyperparameters sampled for this trial ---
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-3, log=True)
    lstm_units = trial.suggest_int('lstm_units', 64, 256)
    dropout_rate = trial.suggest_float('dropout_rate', 0.2, 0.5)

    # --- Per-frame feature extractor (frozen, ImageNet weights) ---
    base_model = MobileNetV2(input_shape=(IMG_SIZE, IMG_SIZE, 3), include_top=False, weights='imagenet')
    base_model.trainable = False

    # --- Video model: CNN on every frame -> pooled features -> LSTM -> softmax ---
    video_input = Input(shape=(MAX_FRAMES, IMG_SIZE, IMG_SIZE, 3))
    cnn_features = TimeDistributed(base_model)(video_input)
    cnn_features = TimeDistributed(GlobalAveragePooling2D())(cnn_features)
    cnn_features = Dropout(dropout_rate)(cnn_features)
    lstm_output = LSTM(lstm_units)(cnn_features)
    lstm_output = Dropout(dropout_rate)(lstm_output)
    output_layer = Dense(NUM_CLASSES, activation='softmax')(lstm_output)
    model = Model(inputs=video_input, outputs=output_layer)

    model.compile(
        optimizer=Adam(learning_rate=learning_rate),
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )

    # Stop once val_loss has not improved for 3 consecutive epochs.
    early_stopping = EarlyStopping(monitor='val_loss', patience=3, verbose=0)
    history = model.fit(
        train_generator,
        validation_data=validation_generator,
        epochs=20,
        callbacks=[early_stopping],
        verbose=0,
    )

    # Report the best epoch rather than the last one.
    return float(np.max(history.history['val_accuracy']))

Generator for 'frontal_train_videos/raw_videos' initialized with 31047 valid files.
Generator for 'frontal_val_videos/raw_videos' initialized with 1739 valid files.

Data Generators are ready for optimization trials.


In [None]:
import optuna

# --- 1. Create the Optuna study ---
# direction='maximize': larger return values from objective() are better.
study = optuna.create_study(direction='maximize')

# --- 2. Run the optimization ---
# n_trials=25: evaluate 25 different hyperparameter combinations.
print("Starting Bayesian Optimization for 25 trials...")
study.optimize(objective, n_trials=25)

# --- 3. Report the best result ---
best_trial = study.best_trial
print("\n--- Optimization Finished ---")
print(f"Best trial number: {best_trial.number}")
print(f"Best validation accuracy: {study.best_value:.4f}")
print("Best hyperparameters found:")
for param_name, param_value in study.best_params.items():
    print(f"  - {param_name}: {param_value}")

[I 2025-07-18 20:22:41,252] A new study created in memory with name: no-name-eee6c93c-7e38-4cd6-a8cc-546cd0a16703
  base_model = MobileNetV2(input_shape=(IMG_SIZE, IMG_SIZE, 3), include_top=False, weights='imagenet'); base_model.trainable = False


Starting Bayesian Optimization for 25 trials...


  self._warn_if_super_not_called()
  self._warn_if_super_not_called()
[I 2025-07-19 03:07:12,173] Trial 0 finished with value: 0.0017361111240461469 and parameters: {'learning_rate': 0.00021001054934091255, 'lstm_units': 93, 'dropout_rate': 0.45079469677480605}. Best is trial 0 with value: 0.0017361111240461469.
  base_model = MobileNetV2(input_shape=(IMG_SIZE, IMG_SIZE, 3), include_top=False, weights='imagenet'); base_model.trainable = False
