In [None]:
from google.colab import drive
import zipfile
import os
import shutil

# --- 1. Mount Google Drive so the archived clips are reachable ---
print("Mounting Google Drive...")
drive.mount('/content/drive')
print("Google Drive mounted successfully.")

# --- 2. Paths: archives on Drive, their local copies, and the extraction targets ---
base_drive_path = '/content/drive/MyDrive/train-CNN+LSTM+BO'

# Local (Colab working directory) file and folder names.
train_zip_local_path = 'train_rgb_front_clips.zip'
val_zip_local_path = 'val_rgb_front_clips.zip'
train_extract_folder = 'frontal_train_videos'
val_extract_folder = 'frontal_val_videos'

# The archives on Drive carry the same file names as their local copies.
train_zip_gdrive_path = os.path.join(base_drive_path, train_zip_local_path)
val_zip_gdrive_path = os.path.join(base_drive_path, val_zip_local_path)

# --- 3. Copy both archives from Drive into the local environment ---
print("\nCopying zip files from Drive to local environment...")
for drive_zip, local_zip in (
    (train_zip_gdrive_path, train_zip_local_path),
    (val_zip_gdrive_path, val_zip_local_path),
):
    shutil.copy(drive_zip, local_zip)
print("Zip files copied.")

# --- 4. Unpack each archive into its own folder (train first, then validation) ---
for split_label, local_zip, extract_folder in (
    ("train", train_zip_local_path, train_extract_folder),
    ("validation", val_zip_local_path, val_extract_folder),
):
    print(f"\nExtracting {split_label} videos locally...")
    os.makedirs(extract_folder, exist_ok=True)
    with zipfile.ZipFile(local_zip, 'r') as archive:
        archive.extractall(extract_folder)
    print(f"{split_label.capitalize()} videos extracted.")

# --- 5. Remove the local archives now that their contents are extracted ---
for local_zip in (train_zip_local_path, val_zip_local_path):
    os.remove(local_zip)
print("\nLocal zip files cleaned up.")
print("--- Data is now ready on the fast local disk ---")

Mounting Google Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Google Drive mounted successfully.

Copying zip files from Drive to local environment...
Zip files copied.

Extracting train videos locally...
Train videos extracted.

Extracting validation videos locally...
Validation videos extracted.

Local zip files cleaned up.
--- Data is now ready on the fast local disk ---


In [None]:
# ==============================================================================
# Phase 2: CNN-LSTM
# ==============================================================================

# --- 0. IMPORT ---
from google.colab import drive
import cv2
import numpy as np
import os
import pandas as pd
import math
import shutil
from tensorflow.keras.utils import Sequence, to_categorical
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, TimeDistributed, LSTM, Dense, GlobalAveragePooling2D, Dropout
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

# --- 1. Mount Drive (forcing a fresh mount) and stage the label files locally ---
print("Mounting Google Drive...")
drive.mount('/content/drive', force_remount=True)
print("Google Drive mounted successfully.")

base_drive_path = '/content/drive/MyDrive/train-CNN+LSTM+BO'

# The label CSVs keep the same file name on Drive and in the local working directory.
TRAIN_LABELS_CSV_LOCAL = 'how2sign_realigned_train.csv'
VAL_LABELS_CSV_LOCAL = 'how2sign_realigned_val.csv'
TRAIN_LABELS_CSV_GDRIVE = os.path.join(base_drive_path, TRAIN_LABELS_CSV_LOCAL)
VAL_LABELS_CSV_GDRIVE = os.path.join(base_drive_path, VAL_LABELS_CSV_LOCAL)

print("\nCopying label files from Google Drive to local environment...")
for drive_csv, local_csv in (
    (TRAIN_LABELS_CSV_GDRIVE, TRAIN_LABELS_CSV_LOCAL),
    (VAL_LABELS_CSV_GDRIVE, VAL_LABELS_CSV_LOCAL),
):
    shutil.copy(drive_csv, local_csv)
print("Label files are ready in the local environment.")

# --- 2. Constants, label tables, and the sentence -> class-id encoder ---
IMG_SIZE = 64      # frames are resized to IMG_SIZE x IMG_SIZE
MAX_FRAMES = 30    # every clip is cut or zero-padded to this many frames
BATCH_SIZE = 32
TRAIN_VIDEO_FOLDER = 'frontal_train_videos/raw_videos'
VAL_VIDEO_FOLDER = 'frontal_val_videos/raw_videos'

# The label files are tab-separated.
train_labels_df = pd.read_csv(TRAIN_LABELS_CSV_LOCAL, sep='\t')
val_labels_df = pd.read_csv(VAL_LABELS_CSV_LOCAL, sep='\t')
all_labels_df = pd.concat([train_labels_df, val_labels_df], ignore_index=True)

# The encoder is fitted on train + val sentences so both splits share one class index space.
label_encoder = LabelEncoder()
label_encoder.fit(all_labels_df['SENTENCE'])
NUM_CLASSES = len(label_encoder.classes_)
print(f"\nTotal unique classes found: {NUM_CLASSES}")

# --- 3. SignLanguageGenerator ---
class SignLanguageGenerator(Sequence):
    """Keras ``Sequence`` yielding batches of fixed-length video clips with one-hot labels.

    Each item is a pair ``(X, y)``:

    * ``X`` -- float32 array of shape ``(n, MAX_FRAMES, IMG_SIZE, IMG_SIZE, 3)``
      holding resized frames divided by 255.
    * ``y`` -- one-hot array of shape ``(n, num_classes)``.

    ``n`` is ``batch_size`` unless a video in the batch yields no frames; such
    videos are dropped from both ``X`` and ``y`` so the two always stay aligned.

    Args:
        data_folder: Directory containing ``<SENTENCE_NAME>.mp4`` files.
        labels_df: DataFrame with ``SENTENCE_NAME`` and ``SENTENCE`` columns.
        label_encoder: Fitted encoder mapping ``SENTENCE`` text to class ids.
        batch_size: Number of videos requested per batch.
        num_classes: Width of the one-hot label vectors.
    """

    def __init__(self, data_folder, labels_df, label_encoder, batch_size, num_classes):
        # Keras warns when a Sequence subclass does not call the base constructor.
        super().__init__()
        self.data_folder = data_folder
        self.label_encoder = label_encoder
        self.batch_size = batch_size
        self.num_classes = num_classes
        self.labels_df = labels_df.copy()
        print(f"Verifying files for generator in '{self.data_folder}'...")
        # Keep only clips that exist on disk AND have a row in the label table.
        all_disk_files = {os.path.splitext(f)[0] for f in os.listdir(self.data_folder) if f.endswith('.mp4')}
        all_csv_files = set(self.labels_df['SENTENCE_NAME'].tolist())
        self.video_files = list(all_disk_files.intersection(all_csv_files))
        self.labels_df = self.labels_df[self.labels_df['SENTENCE_NAME'].isin(self.video_files)]
        # Exactly one label per clip name (first occurrence wins), so a duplicated
        # SENTENCE_NAME in the CSV can never produce more rows than the batch holds.
        self._sentence_by_name = (
            self.labels_df.drop_duplicates(subset='SENTENCE_NAME')
            .set_index('SENTENCE_NAME')['SENTENCE']
            .to_dict()
        )
        print(f"Found {len(self.video_files)} valid and labeled video files.")

    def __len__(self):
        """Number of full batches per epoch; a trailing partial batch is not served."""
        return math.floor(len(self.video_files) / self.batch_size)

    def _load_clip(self, video_path):
        """Decode one video into a ``(MAX_FRAMES, IMG_SIZE, IMG_SIZE, 3)`` float32 array.

        Returns ``None`` when no frame could be read. Clips shorter than
        ``MAX_FRAMES`` are zero-padded at the end.
        """
        # NOTE(review): OpenCV returns frames in BGR order and the values are only
        # divided by 255; the ImageNet MobileNetV2 weights presumably expect RGB
        # input run through its own preprocess_input -- confirm before retraining.
        cap = cv2.VideoCapture(video_path)
        frames = []
        try:
            # Frames beyond MAX_FRAMES are never used, so decoding stops there.
            while len(frames) < MAX_FRAMES:
                ret, frame = cap.read()
                if not ret:
                    break
                frames.append(cv2.resize(frame, (IMG_SIZE, IMG_SIZE)))
        finally:
            cap.release()
        if not frames:
            return None
        clip = np.zeros((MAX_FRAMES, IMG_SIZE, IMG_SIZE, 3), dtype=np.float32)
        clip[:len(frames)] = np.array(frames) / 255.0
        return clip

    def __getitem__(self, idx):
        """Return batch ``idx`` as ``(X, y)``; unreadable videos are omitted from both."""
        batch_files = self.video_files[idx * self.batch_size:(idx + 1) * self.batch_size]
        clips = []
        y_text = []
        for name in batch_files:
            clip = self._load_clip(os.path.join(self.data_folder, name + '.mp4'))
            if clip is None:
                # Skip the sample entirely rather than leaving an unlabeled zero row in X.
                continue
            clips.append(clip)
            y_text.append(self._sentence_by_name[name])
        if clips:
            X = np.stack(clips)
        else:
            X = np.zeros((0, MAX_FRAMES, IMG_SIZE, IMG_SIZE, 3), dtype=np.float32)
        y_int = self.label_encoder.transform(y_text)
        y = to_categorical(y_int, num_classes=self.num_classes)
        return X, y

# --- 4. Build the train and validation generators ---
print("\nCreating Data Generators...")
# Settings that are identical for both splits.
shared_generator_kwargs = dict(
    label_encoder=label_encoder,
    batch_size=BATCH_SIZE,
    num_classes=NUM_CLASSES,
)
train_generator = SignLanguageGenerator(
    data_folder=TRAIN_VIDEO_FOLDER,
    labels_df=train_labels_df,
    **shared_generator_kwargs,
)
validation_generator = SignLanguageGenerator(
    data_folder=VAL_VIDEO_FOLDER,
    labels_df=val_labels_df,
    **shared_generator_kwargs,
)
print("Data Generators are ready.")

# --- 5. Load an existing checkpoint to resume from, or build a new model ---
model_path = '/content/drive/MyDrive/train-CNN+LSTM+BO/cnn_lstm_frontal_model_v1.h5'

if os.path.exists(model_path):
    print(f"\nFound existing model at '{model_path}'. Loading to resume training...")
    # load_model() compiles the model from the training configuration stored in
    # the file. Re-compiling with optimizer='adam' would replace that restored
    # optimizer with a brand-new one, so only compile when nothing was restored.
    model = load_model(model_path)
    if getattr(model, 'optimizer', None) is None:
        model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
        print("Model loaded and re-compiled successfully.")
    else:
        print("Model loaded with its saved training configuration.")
else:
    print("\nNo existing model found. Building a new model from scratch...")
    input_shape = (MAX_FRAMES, IMG_SIZE, IMG_SIZE, 3)
    # Frozen ImageNet MobileNetV2 used as a per-frame feature extractor.
    base_model = MobileNetV2(input_shape=(IMG_SIZE, IMG_SIZE, 3), include_top=False, weights='imagenet')
    base_model.trainable = False
    video_input = Input(shape=input_shape)
    # Apply the CNN to every frame, then pool each frame's feature map to a vector.
    cnn_features = TimeDistributed(base_model)(video_input)
    cnn_features = TimeDistributed(GlobalAveragePooling2D())(cnn_features)
    cnn_features = Dropout(0.5)(cnn_features)
    # The LSTM summarises the frame sequence into a single 128-d vector.
    lstm_output = LSTM(128)(cnn_features)
    lstm_output = Dropout(0.5)(lstm_output)
    output_layer = Dense(NUM_CLASSES, activation='softmax')(lstm_output)
    model = Model(inputs=video_input, outputs=output_layer)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    print("New model built and compiled successfully.")

model.summary()

# --- 6. Define the callbacks ---
checkpoint_path = model_path  # save to the same file the model is resumed from

# When resuming, ModelCheckpoint would otherwise start with "best = -inf" and
# overwrite the existing checkpoint after the first epoch even if that epoch is
# worse. Seed the threshold with the loaded model's current validation accuracy.
# This costs one extra pass over the validation generator before training starts.
resume_baseline = None
if os.path.exists(checkpoint_path):
    print("\nEvaluating the loaded model to set the checkpoint baseline...")
    baseline_metrics = model.evaluate(validation_generator, verbose=1, return_dict=True)
    # Falls back to None (the default "no threshold") if the metric name differs.
    resume_baseline = baseline_metrics.get('accuracy')
    print(f"Checkpoint baseline val_accuracy: {resume_baseline}")

model_checkpoint = ModelCheckpoint(
    filepath=checkpoint_path,
    save_best_only=True,
    monitor='val_accuracy',
    mode='max',
    verbose=1,
    initial_value_threshold=resume_baseline,
)
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True, verbose=1)

# --- 7. Start or resume training ---
print("\n--- Starting or Resuming CNN-LSTM Model Training ---")
history = model.fit(
    train_generator,
    validation_data=validation_generator,
    epochs=50,
    callbacks=[model_checkpoint, early_stopping]
)

print("\n--- Model Training Complete ---")

Mounting Google Drive...
Mounted at /content/drive
Google Drive mounted successfully.

Copying label files from Google Drive to local environment...
Label files are ready in the local environment.

Total unique classes found: 31592

Creating Data Generators...
Verifying files for generator in 'frontal_train_videos/raw_videos'...
Found 31047 valid and labeled video files.
Verifying files for generator in 'frontal_val_videos/raw_videos'...
Found 1739 valid and labeled video files.
Data Generators are ready.

Found existing model at '/content/drive/MyDrive/train-CNN+LSTM+BO/cnn_lstm_frontal_model_v1.h5'. Loading to resume training...




Model loaded and re-compiled successfully.



--- Starting or Resuming CNN-LSTM Model Training ---


  self._warn_if_super_not_called()


Epoch 1/50
[1m970/970[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6s/step - accuracy: 0.0019 - loss: 10.3464

  self._warn_if_super_not_called()



Epoch 1: val_accuracy improved from -inf to 0.00116, saving model to /content/drive/MyDrive/train-CNN+LSTM+BO/cnn_lstm_frontal_model_v1.h5




[1m970/970[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6111s[0m 6s/step - accuracy: 0.0019 - loss: 10.3465 - val_accuracy: 0.0012 - val_loss: 11.1479
Epoch 2/50
[1m970/970[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6s/step - accuracy: 0.0026 - loss: 10.1945
Epoch 2: val_accuracy did not improve from 0.00116
[1m970/970[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5958s[0m 6s/step - accuracy: 0.0026 - loss: 10.1945 - val_accuracy: 0.0012 - val_loss: 11.7448
Epoch 3/50
[1m970/970[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6s/step - accuracy: 0.0031 - loss: 9.9439
Epoch 3: val_accuracy did not improve from 0.00116
[1m970/970[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5906s[0m 6s/step - accuracy: 0.0031 - loss: 9.9439 - val_accuracy: 0.0012 - val_loss: 12.1941
Epoch 4/50
[1m970/970[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6s/step - accuracy: 0.0043 - loss: 9.6834
E

In [None]:
# ==============================================================================
# Phase 2: CNN-LSTM model, full pipeline
# ==============================================================================

# --- 0. Import all required libraries ---
from google.colab import drive
import cv2
import numpy as np
import os
import pandas as pd
import math
import shutil
from tensorflow.keras.utils import Sequence, to_categorical
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, TimeDistributed, LSTM, Dense, GlobalAveragePooling2D, Dropout
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

# --- 1. Mount Drive and define the paths ---
print("Mounting Google Drive...")
drive.mount('/content/drive')
print("Google Drive mounted successfully.")

base_drive_path = '/content/drive/MyDrive/train-CNN+LSTM+BO'

# Local (Colab working directory) names of the label files.
TRAIN_LABELS_CSV_LOCAL = 'how2sign_realigned_train.csv'
VAL_LABELS_CSV_LOCAL = 'how2sign_realigned_val.csv'

# Locations on Drive: the extracted videos and the label CSVs (same file names as local).
TRAIN_VIDEO_FOLDER = os.path.join(base_drive_path, 'extracted_frontal_videos/train/raw_videos')
VAL_VIDEO_FOLDER = os.path.join(base_drive_path, 'extracted_frontal_videos/val/raw_videos')
TRAIN_LABELS_CSV_GDRIVE = os.path.join(base_drive_path, TRAIN_LABELS_CSV_LOCAL)
VAL_LABELS_CSV_GDRIVE = os.path.join(base_drive_path, VAL_LABELS_CSV_LOCAL)

# --- 2. Preparation: copy the CSV files from Drive to the local environment ---
print("\nCopying label files from Google Drive to local environment for faster access...")
for drive_csv, local_csv in (
    (TRAIN_LABELS_CSV_GDRIVE, TRAIN_LABELS_CSV_LOCAL),
    (VAL_LABELS_CSV_GDRIVE, VAL_LABELS_CSV_LOCAL),
):
    shutil.copy(drive_csv, local_csv)
print("Label files are ready in the local environment.")

# --- 3. Hyper-parameters and label loading ---
IMG_SIZE, MAX_FRAMES, BATCH_SIZE = 64, 30, 32

# The label files are tab-separated.
train_labels_df = pd.read_csv(TRAIN_LABELS_CSV_LOCAL, sep='\t')
val_labels_df = pd.read_csv(VAL_LABELS_CSV_LOCAL, sep='\t')
all_labels_df = pd.concat([train_labels_df, val_labels_df], ignore_index=True)

# Fit the encoder on train + val sentences so both splits share one class index space.
label_encoder = LabelEncoder()
label_encoder.fit(all_labels_df['SENTENCE'])
NUM_CLASSES = len(label_encoder.classes_)
print(f"\nTotal unique classes found: {NUM_CLASSES}")

# --- 4. 定义数据生成器 (SignLanguageGenerator) ---
class SignLanguageGenerator(Sequence):
    """Keras ``Sequence`` yielding batches of fixed-length video clips with one-hot labels.

    Each item is a pair ``(X, y)``:

    * ``X`` -- float32 array of shape ``(n, MAX_FRAMES, IMG_SIZE, IMG_SIZE, 3)``
      holding resized frames divided by 255.
    * ``y`` -- one-hot array of shape ``(n, num_classes)``.

    ``n`` is ``batch_size`` unless a video in the batch yields no frames; such
    videos are dropped from both ``X`` and ``y`` so the two always stay aligned.

    Args:
        data_folder: Directory containing ``<SENTENCE_NAME>.mp4`` files.
        labels_df: DataFrame with ``SENTENCE_NAME`` and ``SENTENCE`` columns.
        label_encoder: Fitted encoder mapping ``SENTENCE`` text to class ids.
        batch_size: Number of videos requested per batch.
        num_classes: Width of the one-hot label vectors.
    """

    def __init__(self, data_folder, labels_df, label_encoder, batch_size, num_classes):
        # Keras warns when a Sequence subclass does not call the base constructor.
        super().__init__()
        self.data_folder = data_folder
        self.label_encoder = label_encoder
        self.batch_size = batch_size
        self.num_classes = num_classes
        # Keep only clips that exist on disk AND have a row in the label table.
        all_disk_files = {os.path.splitext(f)[0] for f in os.listdir(self.data_folder) if f.endswith('.mp4')}
        all_csv_files = set(labels_df['SENTENCE_NAME'].tolist())
        self.video_files = list(all_disk_files.intersection(all_csv_files))
        self.labels_df = labels_df[labels_df['SENTENCE_NAME'].isin(self.video_files)].copy()
        # Exactly one label per clip name (first occurrence wins), so a duplicated
        # SENTENCE_NAME in the CSV can never produce more rows than the batch holds.
        self._sentence_by_name = (
            self.labels_df.drop_duplicates(subset='SENTENCE_NAME')
            .set_index('SENTENCE_NAME')['SENTENCE']
            .to_dict()
        )

    def __len__(self):
        """Number of full batches per epoch; a trailing partial batch is not served."""
        return math.floor(len(self.video_files) / self.batch_size)

    def _load_clip(self, video_path):
        """Decode one video into a ``(MAX_FRAMES, IMG_SIZE, IMG_SIZE, 3)`` float32 array.

        Returns ``None`` when no frame could be read. Clips shorter than
        ``MAX_FRAMES`` are zero-padded at the end.
        """
        # NOTE(review): OpenCV returns frames in BGR order and the values are only
        # divided by 255; the ImageNet MobileNetV2 weights presumably expect RGB
        # input run through its own preprocess_input -- confirm before retraining.
        cap = cv2.VideoCapture(video_path)
        frames = []
        try:
            # Frames beyond MAX_FRAMES are never used, so decoding stops there.
            while len(frames) < MAX_FRAMES:
                ret, frame = cap.read()
                if not ret:
                    break
                frames.append(cv2.resize(frame, (IMG_SIZE, IMG_SIZE)))
        finally:
            cap.release()
        if not frames:
            return None
        clip = np.zeros((MAX_FRAMES, IMG_SIZE, IMG_SIZE, 3), dtype=np.float32)
        clip[:len(frames)] = np.array(frames) / 255.0
        return clip

    def __getitem__(self, idx):
        """Return batch ``idx`` as ``(X, y)``; unreadable videos are omitted from both."""
        batch_files = self.video_files[idx * self.batch_size:(idx + 1) * self.batch_size]
        clips = []
        y_text = []
        for name in batch_files:
            clip = self._load_clip(os.path.join(self.data_folder, name + '.mp4'))
            if clip is None:
                # Skip the sample entirely rather than leaving an unlabeled zero row in X.
                continue
            clips.append(clip)
            y_text.append(self._sentence_by_name[name])
        if clips:
            X = np.stack(clips)
        else:
            X = np.zeros((0, MAX_FRAMES, IMG_SIZE, IMG_SIZE, 3), dtype=np.float32)
        y_int = self.label_encoder.transform(y_text)
        y = to_categorical(y_int, num_classes=self.num_classes)
        return X, y

# --- 5. Instantiate the generators ---
print("\nCreating Data Generators...")
# Settings that are identical for both splits.
shared_generator_kwargs = dict(
    label_encoder=label_encoder,
    batch_size=BATCH_SIZE,
    num_classes=NUM_CLASSES,
)
train_generator = SignLanguageGenerator(
    data_folder=TRAIN_VIDEO_FOLDER,
    labels_df=train_labels_df,
    **shared_generator_kwargs,
)
validation_generator = SignLanguageGenerator(
    data_folder=VAL_VIDEO_FOLDER,
    labels_df=val_labels_df,
    **shared_generator_kwargs,
)
print("Data Generators are ready.")

# --- 6. Build the model architecture ---
print("\nBuilding CNN-LSTM model...")
frame_shape = (IMG_SIZE, IMG_SIZE, 3)
input_shape = (MAX_FRAMES,) + frame_shape

# Frozen ImageNet MobileNetV2 (without its classifier head) as the per-frame backbone.
base_model = MobileNetV2(input_shape=frame_shape, include_top=False, weights='imagenet')
base_model.trainable = False

video_input = Input(shape=input_shape)
# Run the backbone on every frame, then pool each frame's feature map to a vector.
per_frame_maps = TimeDistributed(base_model)(video_input)
per_frame_vectors = TimeDistributed(GlobalAveragePooling2D())(per_frame_maps)
cnn_features = Dropout(0.5)(per_frame_vectors)
# The LSTM collapses the frame sequence into one 128-d vector.
sequence_summary = LSTM(128)(cnn_features)
lstm_output = Dropout(0.5)(sequence_summary)
output_layer = Dense(NUM_CLASSES, activation='softmax')(lstm_output)
model = Model(inputs=video_input, outputs=output_layer)

# --- 7. Compile the model ---
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

# --- 8. Define the callbacks ---
checkpoint_path = '/content/drive/MyDrive/train-CNN+LSTM+BO/cnn_lstm_frontal_model_v1.h5'
# Keep only the checkpoint with the highest validation accuracy seen so far.
model_checkpoint = ModelCheckpoint(
    filepath=checkpoint_path,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
    verbose=1,
)
# Stop after 5 epochs without a val_loss improvement and roll back to the best weights.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
    verbose=1,
)
training_callbacks = [model_checkpoint, early_stopping]

# --- 9. Start training ---
print("\n--- Starting CNN-LSTM Model Training ---")
history = model.fit(
    train_generator,
    validation_data=validation_generator,
    epochs=50,
    callbacks=training_callbacks,
)

print("\n--- Model Training Complete ---")

Mounting Google Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Google Drive mounted successfully.

Copying label files from Google Drive to local environment for faster access...
Label files are ready in the local environment.

Total unique classes found: 31592

Creating Data Generators...
Data Generators are ready.

Building CNN-LSTM model...


  base_model = MobileNetV2(input_shape=(IMG_SIZE, IMG_SIZE, 3), include_top=False, weights='imagenet')



--- Starting CNN-LSTM Model Training ---


  self._warn_if_super_not_called()


Epoch 1/50
[1m970/970[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7s/step - accuracy: 0.0017 - loss: 10.4032

  self._warn_if_super_not_called()



Epoch 1: val_accuracy improved from -inf to 0.00116, saving model to /content/drive/MyDrive/train-CNN+LSTM+BO/cnn_lstm_frontal_model_v1.h5




[1m970/970[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8254s[0m 8s/step - accuracy: 0.0017 - loss: 10.4033 - val_accuracy: 0.0012 - val_loss: 10.8065
Epoch 2/50
[1m970/970[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6s/step - accuracy: 0.0023 - loss: 10.3423
Epoch 2: val_accuracy did not improve from 0.00116
[1m970/970[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6164s[0m 6s/step - accuracy: 0.0023 - loss: 10.3422 - val_accuracy: 0.0012 - val_loss: 11.2941
Epoch 3/50
[1m970/970[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6s/step - accuracy: 0.0032 - loss: 10.1566
Epoch 3: val_accuracy did not improve from 0.00116
[1m970/970[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6136s[0m 6s/step - accuracy: 0.0032 - loss: 10.1566 - val_accuracy: 0.0012 - val_loss: 11.8183
Epoch 4/50
[1m970/970[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6s/step - accuracy: 0.0028 - loss: 10.0513
Epoch 4: val_accuracy did not improve from 0.00116
[1m970/970[0m [32m━