In [2]:
import torch
import torch.nn as nn
import torchvision.transforms as T
import torchvision.models as models
import numpy as np
import cv2
import time
import sys

# ===================================================================
# STEP 1: RE-DEFINE THE ARCHITECTURE AND THE PROCESSING FUNCTIONS
# (MUST BE IDENTICAL TO THE TRAINING FILE)
# ===================================================================

# --- Constants that must match the values used at training time ---
# Number of frames every clip is sub-sampled or zero-padded to in
# preprocess_single_video; the model's positional embedding is allocated
# with MAX_FRAMES + 1 positions (one extra for the CLS token).
MAX_FRAMES = 128
# Target size handed to torchvision's Resize for every frame.
RESIZE_TO = (128, 128)

# --- VideoTransformerClassifier architecture (copied from the training file) ---
class VideoTransformerClassifier(nn.Module):
    """Video classifier: frozen ResNet-18 frame features fed to a Transformer encoder.

    Every frame is encoded by a ResNet-18 with its last child module removed,
    projected to ``embed_dim``, prefixed with a learnable CLS token, offset by
    learnable positional embeddings and passed through a Transformer encoder.
    The encoder output at the CLS position is mapped to class logits.

    Attribute names and parameter shapes are what ``load_state_dict`` matches
    against a checkpoint, so they must stay in sync with the training script.

    Args:
        num_classes: Number of output classes (size of the logits vector).
        embed_dim: Transformer model dimension.
        num_heads: Number of attention heads per encoder layer.
        num_layers: Number of stacked encoder layers.
        dropout: Dropout probability inside each encoder layer.
    """

    def __init__(self, num_classes, embed_dim=512, num_heads=8, num_layers=6, dropout=0.1):
        super().__init__()

        # ResNet-18 backbone with its last child module dropped, used purely
        # as a per-frame feature extractor.
        pretrained_cnn = models.resnet18(weights=models.ResNet18_Weights.DEFAULT)
        modules = list(pretrained_cnn.children())[:-1]
        self.cnn_extractor = nn.Sequential(*modules)

        # The backbone is frozen: none of its parameters receive gradients.
        for param in self.cnn_extractor.parameters():
            param.requires_grad = False

        # Width of the flattened per-frame feature vector fed to the projection.
        cnn_output_dim = 512
        self.projection = nn.Linear(cnn_output_dim, embed_dim)

        # One learnable CLS token plus one positional slot per frame (+1 for CLS).
        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        self.positional_embedding = nn.Parameter(torch.zeros(1, MAX_FRAMES + 1, embed_dim))

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim,
            nhead=num_heads,
            dropout=dropout,
            batch_first=True,
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        self.classifier = nn.Linear(embed_dim, num_classes)

    def forward(self, x):
        """Compute class logits for a batch of clips.

        Args:
            x: Tensor of shape ``(batch, num_frames, C, H, W)``. ``num_frames``
                may be anything up to the number of frame positions held by
                ``positional_embedding``; shorter clips use the leading
                positions.

        Returns:
            Tensor of shape ``(batch, num_classes)`` with unnormalised logits.

        Raises:
            ValueError: If ``num_frames`` exceeds the available frame positions.
        """
        batch_size, num_frames, C, H, W = x.shape

        # Position 0 belongs to the CLS token, the remaining ones to frames.
        max_frames = self.positional_embedding.shape[1] - 1
        if num_frames > max_frames:
            raise ValueError(
                f"Got {num_frames} frames but the positional embedding only "
                f"covers {max_frames}"
            )

        # reshape (not view) so non-contiguous inputs, e.g. permuted tensors,
        # are accepted as well.
        cnn_in = x.reshape(batch_size * num_frames, C, H, W)
        cnn_out = torch.flatten(self.cnn_extractor(cnn_in), 1)
        frame_features = cnn_out.reshape(batch_size, num_frames, -1)
        seq_in = self.projection(frame_features)

        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
        seq_in = torch.cat((cls_tokens, seq_in), dim=1)
        # Slice the embedding to the actual sequence length so clips shorter
        # than the maximum do not fail on a broadcast mismatch.
        seq_in = seq_in + self.positional_embedding[:, : num_frames + 1]

        transformer_out = self.transformer_encoder(seq_in)
        # The CLS position summarises the whole clip.
        cls_output = transformer_out[:, 0, :]
        return self.classifier(cls_output)

# --- Read every frame of a video file (using OpenCV) ---
def load_video_frames(video_path):
    """Decode every frame of a video file into a single tensor.

    Args:
        video_path: Path of the video file, passed to ``cv2.VideoCapture``.

    Returns:
        A tensor stacking all decoded frames along a new first axis, i.e.
        ``(T, H, W, C)`` with channels converted from OpenCV's BGR order to
        RGB. If the file cannot be opened (an error message is printed) or
        no frame could be read, ``torch.empty(0)`` is returned so callers can
        test ``result.shape[0] == 0``.
    """
    cap = cv2.VideoCapture(video_path)
    frames = []
    try:
        if not cap.isOpened():
            print(f"L·ªói: Kh√¥ng th·ªÉ m·ªü file video t·∫°i: {video_path}")
            return torch.empty(0)
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            # Convert from BGR (OpenCV's channel order) to RGB.
            frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    finally:
        # Always free the capture handle, even if decoding raised.
        cap.release()

    if not frames:
        # Opened but yielded no frames: same sentinel as the open failure.
        return torch.empty(0)
    # Stack into one (T, H, W, C) array before handing it to torch.
    return torch.from_numpy(np.stack(frames))

# --- Preprocessing for a new video ---
def preprocess_single_video(video_tensor):
    """Bring a raw clip to the fixed length and per-frame format the model takes.

    Args:
        video_tensor: Frames laid out as ``(T, H, W, C)``, as produced by
            ``load_video_frames``.

    Returns:
        A float32 tensor of ``MAX_FRAMES`` frames in ``(T, C, H, W)`` layout,
        each resized to ``RESIZE_TO`` and normalised with the mean/std values
        listed below.
    """
    frame_count = video_tensor.shape[0]
    missing = MAX_FRAMES - frame_count

    if missing < 0:
        # Too long: keep MAX_FRAMES evenly spaced frames.
        keep = torch.linspace(0, frame_count - 1, MAX_FRAMES).long()
        video_tensor = video_tensor[keep]
    elif missing > 0:
        # Too short: append all-zero frames of the same shape and dtype.
        blank_shape = (missing,) + video_tensor.shape[1:]
        blank = torch.zeros(blank_shape, dtype=video_tensor.dtype)
        video_tensor = torch.cat([video_tensor, blank], dim=0)

    # *** IMPORTANT ***
    # Move channels forward: (T, H, W, C) -> (T, C, H, W), the layout the
    # torchvision transforms expect.
    channels_first = video_tensor.permute(0, 3, 1, 2)

    pipeline = T.Compose([
        T.Resize(RESIZE_TO),
        T.ConvertImageDtype(torch.float32),
        T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    # Each item iterated here is a single (C, H, W) frame.
    transformed = []
    for frame in channels_first:
        transformed.append(pipeline(frame))
    return torch.stack(transformed)


if __name__ == '__main__':
    # ===================================================================
    # STEP 2: CONFIGURE AND LOAD THE MODEL
    # ===================================================================
    DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    MODEL_PATH = "sign_language_transformer_final.pth"  # trained weights file to load
    VIDEO_PATH = "random_10_videos/label_book_idx_3221.mp4"  # <--- CHANGE THIS TO THE VIDEO YOU WANT TO CHECK

    # The position of a name in this list is the class index it is looked up
    # by after prediction; presumably it must match the label order used in
    # training - confirm against the training script.
    # NOTE(review): 'see you letter' looks like a typo for 'see you later';
    # left untouched because it is a runtime label.
    CLASS_NAMES = [
        'again', 'bad', 'bathroom', 'book', 'busy', 'do not want', 'eat',
        'father', 'fine', 'finish', 'forget', 'go', 'good', 'happy', 'hello',
        'help', 'how', 'i', 'learn', 'like', 'meet', 'milk', 'more', 'mother',
        'my', 'name', 'need', 'nice', 'no', 'please', 'question', 'right',
        'sad', 'same', 'see you letter', 'thank you', 'want', 'what', 'when',
        'where', 'which', 'who', 'why', 'wrong', 'yes', 'you', 'your',
    ]
    NUM_CLASSES = len(CLASS_NAMES)
    print(f"S·ª≠ d·ª•ng thi·∫øt b·ªã: {DEVICE}")

    # Build the same architecture the checkpoint was trained with.
    model = VideoTransformerClassifier(num_classes=NUM_CLASSES)

    print("ƒêang t·∫£i m√¥ h√¨nh...")
    # NOTE(review): torch.load unpickles the file; only load checkpoints from
    # a trusted source (or pass weights_only=True on torch versions that
    # support it).
    model.load_state_dict(torch.load(MODEL_PATH, map_location=DEVICE))
    model.to(DEVICE)
    model.eval()
    print("T·∫£i m√¥ h√¨nh th√†nh c√¥ng!")

    # ===================================================================
    # STEP 3: PREDICT ON A NEW VIDEO
    # ===================================================================
    print(f"\nƒêang x·ª≠ l√Ω video: {VIDEO_PATH}")
    start_time = time.time()

    video_frames_tensor = load_video_frames(VIDEO_PATH)

    if video_frames_tensor.shape[0] > 0:
        processed_video = preprocess_single_video(video_frames_tensor)

        # Add the batch dimension (batch_size=1).
        # Final shape: (1, T, C, H, W)
        input_tensor = processed_video[None].to(DEVICE)

        print("B·∫Øt ƒë·∫ßu d·ª± ƒëo√°n...")
        with torch.no_grad():
            output = model(input_tensor)
            probabilities = torch.softmax(output, dim=1)
            top_prob, top_catid = probabilities.topk(1)

        # Pull the single best class and its probability out as Python scalars.
        predicted_class_index = top_catid[0].item()
        predicted_class_name = CLASS_NAMES[predicted_class_index]
        prediction_confidence = top_prob[0].item()

        end_time = time.time()

        print("\nüöÄ K·∫æT QU·∫¢ D·ª∞ ƒêO√ÅN:")
        print(f"==> T√™n l·ªõp d·ª± ƒëo√°n: {predicted_class_name}")
        print(f"==> ƒê·ªô tin c·∫≠y: {prediction_confidence:.2%}")
        print(f"==> Th·ªùi gian x·ª≠ l√Ω: {end_time - start_time:.2f} gi√¢y")
    else:
        # load_video_frames returned an empty tensor: nothing to classify.
        print("K·∫øt th√∫c do kh√¥ng ƒë·ªçc ƒë∆∞·ª£c video.")

S·ª≠ d·ª•ng thi·∫øt b·ªã: cpu
ƒêang t·∫£i m√¥ h√¨nh...
T·∫£i m√¥ h√¨nh th√†nh c√¥ng!

ƒêang x·ª≠ l√Ω video: random_10_videos/label_book_idx_3221.mp4
B·∫Øt ƒë·∫ßu d·ª± ƒëo√°n...

üöÄ K·∫æT QU·∫¢ D·ª∞ ƒêO√ÅN:
==> T√™n l·ªõp d·ª± ƒëo√°n: book
==> ƒê·ªô tin c·∫≠y: 99.94%
==> Th·ªùi gian x·ª≠ l√Ω: 2.23 gi√¢y
