In [None]:
# --- 0. 导入所有必需的库 ---
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import shutil
import cv2
import math
from sklearn.metrics import confusion_matrix, classification_report
from tensorflow.keras.models import load_model
from tensorflow.keras.utils import Sequence, to_categorical
from sklearn.preprocessing import LabelEncoder

# --- (请确保您已经从之前的Notebook复制了SignLanguageGenerator类的完整定义) ---
class SignLanguageGenerator(Sequence):
    """Keras ``Sequence`` serving batches of video clips with one-hot sentence labels.

    Each sample is one ``<SENTENCE_NAME>.mp4`` file in ``data_folder``. Only
    videos that are both on disk and listed in ``labels_df`` are used. They are
    served in the row order of ``labels_df`` (first row wins when a
    ``SENTENCE_NAME`` is listed more than once), so ``self.labels_df.iloc[k]``
    and ``self.video_files[k]`` always describe the same sample ``k``.

    Args:
        data_folder: Directory containing the ``.mp4`` files.
        labels_df: DataFrame with ``SENTENCE_NAME`` (file stem) and
            ``SENTENCE`` (label text) columns. It is not modified.
        label_encoder: Fitted encoder whose ``transform`` maps ``SENTENCE``
            values to integer class ids.
        batch_size: Number of videos per batch; must be positive.
        num_classes: Width of the one-hot label vectors.
        num_frames: Frames per clip. Longer videos are truncated, shorter ones
            are zero-padded at the end.
        frame_size: ``(width, height)`` handed to ``cv2.resize``.
        drop_remainder: If true (the default), a final incomplete batch is
            skipped; if false it is served as a smaller batch.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """

    def __init__(self, data_folder, labels_df, label_encoder, batch_size, num_classes,
                 num_frames=30, frame_size=(64, 64), drop_remainder=True):
        # Keras warns (and skips its own setup) when the base initializer is not run.
        super().__init__()
        if batch_size <= 0:
            raise ValueError(f"batch_size must be positive, got {batch_size}")
        self.data_folder = data_folder
        self.label_encoder = label_encoder
        self.batch_size = batch_size
        self.num_classes = num_classes
        self.num_frames = num_frames
        self.frame_size = tuple(frame_size)
        self.drop_remainder = drop_remainder

        print(f"Verifying files for generator in '{self.data_folder}'...")
        all_disk_files = {
            os.path.splitext(f)[0]
            for f in os.listdir(self.data_folder)
            if f.endswith('.mp4')
        }
        on_disk = labels_df[labels_df['SENTENCE_NAME'].isin(all_disk_files)]
        # One row per video, in label-file order: this makes batch order
        # deterministic and keeps labels_df and video_files index-aligned.
        self.labels_df = on_disk.drop_duplicates(subset='SENTENCE_NAME')
        self.video_files = self.labels_df['SENTENCE_NAME'].tolist()
        print(f"Found {len(self.video_files)} valid and labeled video files.")

    def __len__(self):
        """Return the number of batches per epoch."""
        total = len(self.video_files)
        if self.drop_remainder:
            return total // self.batch_size
        return math.ceil(total / self.batch_size)

    def _read_clip(self, video_path):
        """Return up to ``num_frames`` resized frames from ``video_path`` as a list."""
        cap = cv2.VideoCapture(video_path)
        frames = []
        try:
            # Stop as soon as enough frames are collected; the rest of the
            # video would be discarded anyway.
            while len(frames) < self.num_frames:
                ret, frame = cap.read()
                if not ret:
                    break
                frames.append(cv2.resize(frame, self.frame_size))
        finally:
            cap.release()
        return frames

    def __getitem__(self, idx):
        """Return batch ``idx`` as ``(X, y)``.

        ``X`` has shape ``(batch, num_frames, height, width, 3)`` with pixel
        values scaled to ``[0, 1]``; ``y`` is the one-hot label matrix of shape
        ``(batch, num_classes)``. A video that yields no frames stays an
        all-zero clip but keeps its label, so rows of ``X`` and ``y`` always
        correspond. If the encoder rejects a label, an all-zero batch is
        returned and a warning is printed.
        """
        start = idx * self.batch_size
        batch_rows = self.labels_df.iloc[start:start + self.batch_size]
        width, height = self.frame_size
        X = np.zeros((len(batch_rows), self.num_frames, height, width, 3), dtype=np.float32)
        y_text = []
        for i, row in enumerate(batch_rows.itertuples()):
            video_path = os.path.join(self.data_folder, row.SENTENCE_NAME + '.mp4')
            frames = self._read_clip(video_path)
            if frames:
                # Rows beyond len(frames) stay zero, which is the end padding.
                X[i, :len(frames)] = np.stack(frames) / 255.0
            else:
                print(f"WARNING: no frames could be read from '{video_path}'; using an all-zero clip.")
            y_text.append(row.SENTENCE)
        try:
            y_int = self.label_encoder.transform(y_text)
            y = to_categorical(y_int, num_classes=self.num_classes)
        except ValueError as err:
            # Best-effort fallback kept from the original design, but no longer silent.
            print(f"WARNING: batch {idx} has labels the encoder rejects ({err}); returning an all-zero batch.")
            return np.zeros_like(X), np.zeros((X.shape[0], self.num_classes))
        return X, y
# --- (SignLanguageGenerator类定义结束) ---


# --- 1. Load the best model checkpoint that was saved earlier ---
model_path = '/content/drive/MyDrive/train-CNN+LSTM+BO/cnn_lstm_frontal_model_v1.h5'
print("Loading the best CNN-LSTM model: cnn_lstm_frontal_model_v1.h5")
model = load_model(filepath=model_path)

# --- 2. Rebuild the label encoder the same way as for training ---
# The encoder is fitted on the TRAIN + VAL sentences only, so the class ids
# produced here come from exactly those two label files.
print("\nLoading ONLY TRAIN+VAL label files to create a consistent encoder...")
base_drive_path = '/content/drive/MyDrive/train-CNN+LSTM+BO'

# Locations of the three label CSVs on Drive.
TRAIN_LABELS_CSV_GDRIVE = os.path.join(base_drive_path, 'how2sign_realigned_train.csv')
VAL_LABELS_CSV_GDRIVE = os.path.join(base_drive_path, 'how2sign_realigned_val.csv')
TEST_LABELS_CSV_GDRIVE = os.path.join(base_drive_path, 'how2sign_realigned_test.csv')

# Copy each CSV into the working directory under its own file name.
for drive_csv in (TRAIN_LABELS_CSV_GDRIVE, VAL_LABELS_CSV_GDRIVE, TEST_LABELS_CSV_GDRIVE):
    shutil.copy(drive_csv, os.path.basename(drive_csv))

# Only the training and validation labels feed the encoder.
train_labels_df = pd.read_csv('how2sign_realigned_train.csv', sep='\t')
val_labels_df = pd.read_csv('how2sign_realigned_val.csv', sep='\t')
train_val_labels_df = pd.concat([train_labels_df, val_labels_df], ignore_index=True)

label_encoder = LabelEncoder().fit(train_val_labels_df['SENTENCE'])
NUM_CLASSES = len(label_encoder.classes_)
# The author's note says this count should be 31592.
print(f"Total classes the model was trained on: {NUM_CLASSES}")


# --- 3. Build the data generator for the test split ---
print("\nCreating a generator for the TEST set...")
TEST_VIDEO_FOLDER = 'frontal_test_videos/raw_videos'
test_labels_df = pd.read_csv('how2sign_realigned_test.csv', sep='\t')

# Key step: keep only test rows whose sentence is a class the encoder knows.
known_classes = set(label_encoder.classes_)
is_known_sentence = test_labels_df['SENTENCE'].isin(known_classes)
test_labels_df_filtered = test_labels_df[is_known_sentence]
print(f"Original test samples: {len(test_labels_df)}, Filtered test samples: {len(test_labels_df_filtered)}")

# The generator receives the filtered DataFrame, not the full test labels.
test_generator_kwargs = {
    'data_folder': TEST_VIDEO_FOLDER,
    'labels_df': test_labels_df_filtered,
    'label_encoder': label_encoder,
    'batch_size': 32,
    'num_classes': NUM_CLASSES,
}
test_generator = SignLanguageGenerator(**test_generator_kwargs)

# --- 4. Evaluate on the test set and build the classification report ---
print("\n--- Evaluating model on the official TEST data ---")

num_test_samples = len(test_generator.video_files)
batches_to_run = len(test_generator)
# A generator with zero batches makes Keras fail with an unhelpful error, so
# stop here with a message that names the actual cause.
if batches_to_run == 0:
    raise ValueError(
        f"Test generator yields no batches: {num_test_samples} usable videos "
        f"for batch_size={test_generator.batch_size}."
    )

loss, accuracy = model.evaluate(test_generator, verbose=1)
print(f"\nOfficial Test Accuracy: {accuracy * 100:.4f}%")
print(f"Official Test Loss: {loss:.4f}")

print("\nGenerating classification report...")
# Predictions and ground truth are taken from the SAME batches. The generator
# decides which samples go into which batch; reading labels separately from
# test_generator.labels_df by position can pair a prediction with another
# sample's label.
batch_probabilities = []
batch_targets = []
for batch_index in range(batches_to_run):
    batch_x, batch_y = test_generator[batch_index]
    batch_probabilities.append(np.asarray(model.predict_on_batch(batch_x)))
    batch_targets.append(np.asarray(batch_y))

y_pred_one_hot = np.concatenate(batch_probabilities, axis=0)
y_pred_labels = np.argmax(y_pred_one_hot, axis=1)
y_true_int = np.argmax(np.concatenate(batch_targets, axis=0), axis=1)
y_true_text = label_encoder.inverse_transform(y_true_int)
samples_to_consider = len(y_true_int)
print(f"Scored {samples_to_consider} of {num_test_samples} test videos.")

report = classification_report(y_true_int, y_pred_labels, output_dict=True, zero_division=0)
print(f"Official F1-score (macro avg): {report['macro avg']['f1-score']:.4f}")
print("--- Report Generation Elements Complete ---")

Loading the best CNN-LSTM model: cnn_lstm_frontal_model_v1.h5


  self._warn_if_super_not_called()



Loading ONLY TRAIN+VAL label files to create a consistent encoder...
Total classes the model was trained on: 31592

Creating a generator for the TEST set...
Original test samples: 2357, Filtered test samples: 76
Verifying files for generator in 'frontal_test_videos/raw_videos'...
Found 76 valid and labeled video files.

--- Evaluating model on the official TEST data ---
2/2 ━━━━━━━━━━━━━━━━━━━━ 49s 1s/step - accuracy: 0.0000e+00 - loss: 8.1313

Official Test Accuracy: 0.0000%
Official Test Loss: 8.1375

Generating classification report...
2/2 ━━━━━━━━━━━━━━━━━━━━ 31s 470ms/step
Official F1-score (macro avg): 0.0000
--- Report Generation Elements Complete ---
