In [None]:
# ==============================================================================
# Final Project Script: Train, Evaluate, and Summarize All Models (with Debugging)
# ==============================================================================

# --- 0. Import all required libraries ---
# (Omitted... identical to the previous version)
from google.colab import drive
import cv2; import numpy as np; import os; import pandas as pd; import math; import shutil
from tensorflow.keras.utils import Sequence, to_categorical; from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.applications import MobileNetV2; from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, TimeDistributed, LSTM, Dense, GlobalAveragePooling2D, Dropout
from tensorflow.keras.optimizers import Adam; from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from sklearn.metrics import classification_report, accuracy_score

# --- 1. Environment setup: mount Drive and stage the raw data locally ---
# `zipfile` is required for the extraction below but is missing from the
# import block at the top of this cell; import it here so the cell also runs
# in a fresh runtime instead of failing with NameError.
import zipfile

print("--- Step 1: Setting up environment and preparing raw data ---")
drive.mount('/content/drive', force_remount=True)

# Archive locations on Google Drive.
base_drive_path = '/content/drive/MyDrive/train-CNN+LSTM+BO'
train_zip_gdrive_path = os.path.join(base_drive_path, 'train_rgb_front_clips.zip')
val_zip_gdrive_path = os.path.join(base_drive_path, 'val_rgb_front_clips.zip')
test_zip_gdrive_path = os.path.join(base_drive_path, 'test_rgb_front_clips.zip')

# Temporary local copies of the archives (deleted after extraction).
train_zip_local_path = 'train_rgb_front_clips.zip'
val_zip_local_path = 'val_rgb_front_clips.zip'
test_zip_local_path = 'test_rgb_front_clips.zip'

# Local folders that receive the extracted clips.
train_extract_folder = 'frontal_train_videos'
val_extract_folder = 'frontal_val_videos'
test_extract_folder = 'frontal_test_videos'

print("\nCopying and extracting all video datasets to local disk...")
for gdrive_path, local_path, extract_folder in [
    (train_zip_gdrive_path, train_zip_local_path, train_extract_folder),
    (val_zip_gdrive_path, val_zip_local_path, val_extract_folder),
    (test_zip_gdrive_path, test_zip_local_path, test_extract_folder),
]:
    print(f"Processing {os.path.basename(gdrive_path)}...")
    shutil.copy(gdrive_path, local_path)
    os.makedirs(extract_folder, exist_ok=True)
    try:
        with zipfile.ZipFile(local_path, 'r') as zip_ref:
            zip_ref.extractall(extract_folder)
    finally:
        # Always reclaim the disk space used by the temporary archive copy,
        # even when extraction fails part-way through.
        os.remove(local_path)
print("All video data is ready on the local disk.")

# --- 2. Labels and constants ---
print("\n--- Step 2: Preparing labels and constants ---")

# Label CSVs as stored on Google Drive.
TRAIN_LABELS_CSV_GDRIVE = os.path.join(base_drive_path, 'how2sign_realigned_train.csv')
VAL_LABELS_CSV_GDRIVE = os.path.join(base_drive_path, 'how2sign_realigned_val.csv')
TEST_LABELS_CSV_GDRIVE = os.path.join(base_drive_path, 'how2sign_realigned_test.csv')

# Copy each CSV into the working directory under its own file name.
for _csv_on_drive in (TRAIN_LABELS_CSV_GDRIVE, VAL_LABELS_CSV_GDRIVE, TEST_LABELS_CSV_GDRIVE):
    shutil.copy(_csv_on_drive, os.path.basename(_csv_on_drive))

# Clip geometry and batching constants read by the generator and the model.
IMG_SIZE = 64
MAX_FRAMES = 30
BATCH_SIZE = 32

# Folders holding the extracted .mp4 clips for each split.
TRAIN_VIDEO_FOLDER = 'frontal_train_videos/raw_videos'
VAL_VIDEO_FOLDER = 'frontal_val_videos/raw_videos'
TEST_VIDEO_FOLDER = 'frontal_test_videos/raw_videos'

# The label files are tab-separated.
train_labels_df, val_labels_df, test_labels_df = (
    pd.read_csv(_local_csv, sep='\t')
    for _local_csv in (
        'how2sign_realigned_train.csv',
        'how2sign_realigned_val.csv',
        'how2sign_realigned_test.csv',
    )
)

# Fit the encoder on the sentences of every split so that all three splits
# share a single class-index space.
all_labels_df = pd.concat([train_labels_df, val_labels_df, test_labels_df], ignore_index=True)
label_encoder = LabelEncoder().fit(all_labels_df['SENTENCE'])
NUM_CLASSES = len(label_encoder.classes_)
print(f"Total unique classes found across ALL datasets: {NUM_CLASSES}")

# --- 3. Define the data generator ---
class SignLanguageGenerator(Sequence):
    """Keras ``Sequence`` that turns labelled ``.mp4`` clips into training batches.

    Only clips that are both present in ``data_folder`` (as ``<name>.mp4``)
    and listed in the ``SENTENCE_NAME`` column of ``labels_df`` are served.
    Each batch is a pair ``(X, y)``:

    * ``X`` -- float32 array of shape
      ``(batch, MAX_FRAMES, IMG_SIZE, IMG_SIZE, 3)``; frames are resized,
      divided by 255 and zero-padded / truncated to ``MAX_FRAMES``.
    * ``y`` -- one-hot array of shape ``(batch, num_classes)`` built from the
      ``SENTENCE`` column via ``label_encoder``.

    ``MAX_FRAMES`` and ``IMG_SIZE`` are read from module-level constants.

    Attributes:
        video_files: clip names (no extension) in serving order.
        labels_df: the rows of the input frame that are served, one per clip,
            in the same order as ``video_files``.
    """

    def __init__(self, data_folder, labels_df, label_encoder, batch_size, num_classes):
        """Index the clips that exist on disk and have a label row.

        Args:
            data_folder: directory containing the ``.mp4`` clips.
            labels_df: DataFrame with ``SENTENCE_NAME`` and ``SENTENCE`` columns.
            label_encoder: fitted encoder used to map sentences to class ids.
            batch_size: number of clips per batch.
            num_classes: width of the one-hot label vectors.
        """
        # The Keras base class expects its constructor to run; skipping it
        # triggers the "_warn_if_super_not_called" warning during fit().
        super().__init__()
        self.data_folder = data_folder
        self.label_encoder = label_encoder
        self.batch_size = batch_size
        self.num_classes = num_classes

        clips_on_disk = {
            os.path.splitext(file_name)[0]
            for file_name in os.listdir(self.data_folder)
            if file_name.endswith('.mp4')
        }
        # Keep exactly one row per clip, in CSV order, so that `video_files`,
        # `labels_df` and the batches all share one deterministic ordering
        # (a set intersection would yield an arbitrary order).
        usable_rows = labels_df[labels_df['SENTENCE_NAME'].isin(clips_on_disk)]
        self.labels_df = usable_rows.drop_duplicates(subset='SENTENCE_NAME').reset_index(drop=True)
        self.video_files = self.labels_df['SENTENCE_NAME'].tolist()
        # Direct name -> sentence lookup used when assembling a batch.
        self._sentence_by_clip = dict(zip(self.video_files, self.labels_df['SENTENCE']))

    def __len__(self):
        """Return the number of full batches; a trailing partial batch is dropped."""
        return len(self.video_files) // self.batch_size

    def _load_clip(self, clip_name):
        """Decode at most ``MAX_FRAMES`` resized frames of one clip.

        Returns:
            A list of ``(IMG_SIZE, IMG_SIZE, 3)`` uint8 frames; the list is
            empty when the file cannot be opened or holds no frames.
        """
        video_path = os.path.join(self.data_folder, clip_name + '.mp4')
        cap = cv2.VideoCapture(video_path)
        frames = []
        try:
            # Frames beyond MAX_FRAMES would be discarded anyway, so stop
            # decoding as soon as enough have been collected.
            while len(frames) < MAX_FRAMES:
                ret, frame = cap.read()
                if not ret:
                    break
                # NOTE(review): OpenCV yields BGR frames, while the ImageNet
                # MobileNetV2 weights are normally fed RGB input scaled to
                # [-1, 1] -- confirm the channel order / scaling is intended.
                frames.append(cv2.resize(frame, (IMG_SIZE, IMG_SIZE)))
        finally:
            cap.release()
        return frames

    def __getitem__(self, idx):
        """Build batch ``idx``.

        Row ``i`` of ``X`` and row ``i`` of ``y`` always describe the same
        clip. A clip that cannot be decoded keeps its true label and is
        served as an all-zero tensor (with a printed warning), so the batch
        size and the sample order never change.

        Raises:
            ValueError: if a sentence in the batch is unknown to the encoder.
        """
        first = idx * self.batch_size
        batch_files = self.video_files[first:first + self.batch_size]

        X = np.zeros((len(batch_files), MAX_FRAMES, IMG_SIZE, IMG_SIZE, 3), dtype=np.float32)
        if not batch_files:
            # Out-of-range index: return an empty but well-shaped batch.
            return X, np.zeros((0, self.num_classes), dtype=np.float32)

        for row, clip_name in enumerate(batch_files):
            frames = self._load_clip(clip_name)
            if not frames:
                print(f"WARNING: no frames decoded for '{clip_name}.mp4'; using an all-zero clip.")
                continue
            # Rows past len(frames) stay zero, which is the temporal padding.
            X[row, :len(frames)] = np.asarray(frames, dtype=np.float32) / 255.0

        y_text = [self._sentence_by_clip[clip_name] for clip_name in batch_files]
        try:
            y_int = self.label_encoder.transform(y_text)
        except ValueError as err:
            # Silently substituting all-zero labels would make the loss 0 and
            # hide the problem, so surface it instead.
            raise ValueError(
                f"Batch {idx} contains sentences unknown to the label encoder"
            ) from err
        y = to_categorical(y_int, num_classes=self.num_classes)
        return X, y

# --- (The code below is unchanged) ---

# --- 4. Instantiate the training and validation generators ---
print("\n--- Step 4: Creating Data Generators ---")

# Arguments that are identical for every split.
_shared_generator_args = dict(
    label_encoder=label_encoder,
    batch_size=BATCH_SIZE,
    num_classes=NUM_CLASSES,
)
train_generator = SignLanguageGenerator(
    data_folder=TRAIN_VIDEO_FOLDER,
    labels_df=train_labels_df,
    **_shared_generator_args,
)
validation_generator = SignLanguageGenerator(
    data_folder=VAL_VIDEO_FOLDER,
    labels_df=val_labels_df,
    **_shared_generator_args,
)
print("Train and Validation Generators are ready.")

# --- 5. Train the final optimized model ---
print("\n--- Step 5: Training the Final Optimized Model ---")
best_params = {
    'learning_rate': 0.00021001054934091255,
    'lstm_units': 93,
    'dropout_rate': 0.45079469677480605,
}
print("Using best hyperparameters found by Optuna:", best_params)

# Per-clip input: MAX_FRAMES frames of IMG_SIZE x IMG_SIZE x 3.
input_shape = (MAX_FRAMES, IMG_SIZE, IMG_SIZE, 3)

# Frozen ImageNet MobileNetV2 backbone, applied to every frame.
base_model = MobileNetV2(input_shape=input_shape[1:], include_top=False, weights='imagenet')
base_model.trainable = False

video_input = Input(shape=input_shape)
_frame_maps = TimeDistributed(base_model)(video_input)
_frame_vectors = TimeDistributed(GlobalAveragePooling2D())(_frame_maps)
cnn_features = Dropout(best_params['dropout_rate'])(_frame_vectors)

# The LSTM summarises the frame sequence into a single vector.
_sequence_summary = LSTM(best_params['lstm_units'])(cnn_features)
lstm_output = Dropout(best_params['dropout_rate'])(_sequence_summary)

output_layer = Dense(NUM_CLASSES, activation='softmax')(lstm_output)
final_model = Model(inputs=video_input, outputs=output_layer)

final_model.compile(
    optimizer=Adam(learning_rate=best_params['learning_rate']),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
final_model.summary()

# Checkpoint the best model by validation accuracy; stop early once the
# validation loss has not improved for 5 epochs.
final_model_path = '/content/drive/MyDrive/train-CNN+LSTM+BO/cnn_lstm_optimized_model.h5'
model_checkpoint = ModelCheckpoint(
    filepath=final_model_path,
    save_best_only=True,
    monitor='val_accuracy',
    mode='max',
    verbose=1,
)
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
    verbose=1,
)

history_final = final_model.fit(
    train_generator,
    validation_data=validation_generator,
    epochs=50,
    callbacks=[model_checkpoint, early_stopping],
)
print("Final Model Training Complete.")

# --- 6. Evaluate the final model and compute the summary metrics ---
print("\n--- Step 6: Evaluating the Final Model and Generating Summary ---")
# Load the best checkpoint written during training.
print("Loading the best saved optimized model for final evaluation...")
optimized_model = load_model(final_model_path)

# Build the test-set generator, restricted to sentences the encoder knows.
known_classes = set(label_encoder.classes_)
test_labels_df_filtered = test_labels_df[test_labels_df['SENTENCE'].isin(known_classes)]
test_generator = SignLanguageGenerator(
    data_folder=TEST_VIDEO_FOLDER,
    labels_df=test_labels_df_filtered,
    label_encoder=label_encoder,
    batch_size=BATCH_SIZE,
    num_classes=NUM_CLASSES,
)

# Evaluate on the test set.
print("\nEvaluating on the official TEST set...")
batches_to_run = len(test_generator)
if batches_to_run == 0:
    raise ValueError("Test generator produced no full batches; nothing to evaluate.")

# Take the ground truth from the very batches that are fed to the model.
# Reading labels from the DataFrame separately does not guarantee the same
# sample order as the generator, which would pair predictions with the
# wrong labels and invalidate every metric below.
_pred_chunks = []
_true_chunks = []
for _batch_idx in range(batches_to_run):
    _X_batch, _y_batch = test_generator[_batch_idx]
    _pred_chunks.append(optimized_model.predict_on_batch(_X_batch))
    _true_chunks.append(np.argmax(_y_batch, axis=1))

y_pred_one_hot = np.concatenate(_pred_chunks, axis=0)
y_pred_labels = np.argmax(y_pred_one_hot, axis=1)
y_true_int = np.concatenate(_true_chunks, axis=0)
y_true_text = label_encoder.inverse_transform(y_true_int)
samples_to_consider = len(y_true_int)

final_accuracy = accuracy_score(y_true_int, y_pred_labels)
report = classification_report(y_true_int, y_pred_labels, output_dict=True, zero_division=0)
final_f1_score = report['macro avg']['f1-score']

# --- 7. Build the final summary table ---
# Scores of the two earlier models (assumed values).
lstm_baseline_accuracy = 0.0017
lstm_baseline_f1 = 0.0
manual_cnn_lstm_accuracy = 0.0012
manual_cnn_lstm_f1 = 0.0

# One entry per model, in table-row order.
_accuracies = (lstm_baseline_accuracy, manual_cnn_lstm_accuracy, final_accuracy)
_f1_scores = (lstm_baseline_f1, manual_cnn_lstm_f1, final_f1_score)

summary_data = {
    'Model': [
        '1. Baseline LSTM (Keypoints)',
        '2. Manually-Tuned CNN-LSTM (RGB)',
        '3. Optimized CNN-LSTM (RGB)',
    ],
    # Accuracy is shown as a percentage, F1 as a raw score.
    'Test Accuracy': [f"{acc*100:.4f}%" for acc in _accuracies],
    'Test F1-Score (Macro Avg)': [f"{f1:.4f}" for f1 in _f1_scores],
}
summary_df = pd.DataFrame(summary_data)
print("\n\n--- PROJECT SUMMARY TABLE ---")
print(summary_df.to_markdown(index=False))

--- Step 1: Setting up environment and preparing raw data ---
Mounted at /content/drive

Copying and extracting all video datasets to local disk...
Processing train_rgb_front_clips.zip...
Processing val_rgb_front_clips.zip...
Processing test_rgb_front_clips.zip...
All video data is ready on the local disk.

--- Step 2: Preparing labels and constants ---
Total unique classes found across ALL datasets: 33483

--- Step 4: Creating Data Generators ---
Train and Validation Generators are ready.

--- Step 5: Training the Final Optimized Model ---
Using best hyperparameters found by Optuna: {'learning_rate': 0.00021001054934091255, 'lstm_units': 93, 'dropout_rate': 0.45079469677480605}


  base_model = MobileNetV2(input_shape=(IMG_SIZE, IMG_SIZE, 3), include_top=False, weights='imagenet')


  self._warn_if_super_not_called()


Epoch 1/50
[1m970/970[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6s/step - accuracy: 0.0000e+00 - loss: 0.0000e+00
Epoch 1: val_accuracy improved from -inf to 0.00000, saving model to /content/drive/MyDrive/train-CNN+LSTM+BO/cnn_lstm_optimized_model.h5




[1m970/970[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6037s[0m 6s/step - accuracy: 0.0000e+00 - loss: 0.0000e+00 - val_accuracy: 0.0000e+00 - val_loss: 0.0000e+00
Epoch 2/50
[1m970/970[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6s/step - accuracy: 0.0000e+00 - loss: 0.0000e+00
Epoch 2: val_accuracy did not improve from 0.00000
[1m970/970[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5794s[0m 6s/step - accuracy: 0.0000e+00 - loss: 0.0000e+00 - val_accuracy: 0.0000e+00 - val_loss: 0.0000e+00
Epoch 3/50
[1m970/970[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6s/step - accuracy: 0.0000e+00 - loss: 0.0000e+00
Epoch 3: val_accuracy did not improve from 0.00000
[1m970/970[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5738s[0m 6s/step - accuracy: 0.0000e+00 - loss: 0.0000e+00 - val_accuracy: 0.0000e+00 - val_loss: 0.0000e+00
Epoch 4/50
[1m850/970[0m [32m━━━━━━━━━━━━━━━━━[0m[37m━━━[0m [1m11:13[0m 6s/step - accuracy: 0.0000e+00 - loss: 0.0000e+00