In [None]:
import os
import glob
import numpy as np
import pandas as pd
import cv2
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
from pathlib import Path
import json
import random
from tqdm import tqdm
import multiprocessing
from datetime import datetime

# GPU configuration: turn on memory growth for every GPU TensorFlow can see,
# so device memory is allocated on demand instead of all at once.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        print(f"üöÄ GPU ƒë∆∞·ª£c ph√°t hi·ªán v√† c·∫•u h√¨nh: {len(gpus)} GPU(s)")
    except RuntimeError as e:
        # NOTE(review): presumably raised when memory growth is changed after
        # the GPUs were already initialized; the error is reported and the
        # notebook keeps running with the default allocation strategy.
        print(f"‚ùå L·ªói c·∫•u h√¨nh GPU: {e}")

# Enable mixed precision globally to speed up training.
# The model's output layers are created with dtype='float32' explicitly.
policy = tf.keras.mixed_precision.Policy('mixed_float16')
tf.keras.mixed_precision.set_global_policy(policy)
print(f"üéØ Mixed Precision enabled: {policy.name}")

# Environment summary for the run log.
print(f"TensorFlow version: {tf.__version__}")
print(f"GPU available: {tf.config.list_physical_devices('GPU')}")
print(f"CPU cores: {multiprocessing.cpu_count()}")


In [None]:
# Central configuration read by the data-loading, model-building and training
# functions in this notebook.
CONFIG = {
    # Data location - update when the dataset is moved.
    # Expected layout: <base_path>/<dataset>/<dataset>/{Videos,Annotation_files_processed}
    'base_path': 'khoaminh/data',
    'datasets': ['Coffee_room_01', 'Coffee_room_02', 'Home_01', 'Home_02'],
    
    # Image parameters
    'input_size': (416, 416),      # Model input size (square)
    'num_keypoints': 17,           # COCO pose: 17 keypoints
    
    # Training parameters
    'batch_size': 16,
    'epochs': 100,
    'learning_rate': 1e-4,
    'validation_split': 0.2,       # Fraction of videos held out for validation
    
    # Loss weights
    'lambda_bbox': 10.0,           # Weight for bbox regression
    'lambda_pose': 5.0,            # Weight for pose keypoints
    'lambda_conf': 1.0,            # Weight for confidence
    
    # Miscellaneous
    'max_samples_per_video': 300,  # Upper bound on frames sampled from one video
    'confidence_threshold': 0.5,
    'save_model_path': 'models/',
}

# COCO pose keypoints, in the order they are stored in the 51-value
# (x, y, confidence) keypoint vector.
KEYPOINT_NAMES = [
    'nose', 'left_eye', 'right_eye', 'left_ear', 'right_ear',
    'left_shoulder', 'right_shoulder', 'left_elbow', 'right_elbow',
    'left_wrist', 'right_wrist', 'left_hip', 'right_hip',
    'left_knee', 'right_knee', 'left_ankle', 'right_ankle'
]

# Skeleton connections used when drawing a pose.
# Indices are 1-based positions in KEYPOINT_NAMES (the drawing code subtracts 1).
SKELETON = [
    [16, 14], [14, 12], [17, 15], [15, 13], [12, 13],  # Legs
    [6, 12], [7, 13], [6, 7], [6, 8], [7, 9],         # Torso + arms
    [8, 10], [9, 11], [2, 3], [1, 2], [1, 3],         # Arms + eyes
    [2, 4], [3, 5], [4, 6], [5, 7]                    # Face + shoulders
]

# Echo the effective configuration so it is recorded in the notebook output.
print(f"üìã Person Detection + Pose Configuration:")
for key, value in CONFIG.items():
    print(f"   {key}: {value}")


In [None]:
def find_data_files():
    """Find every annotation file that has a matching video.

    For each dataset listed in ``CONFIG['datasets']`` this scans
    ``<base_path>/<dataset>/<dataset>/Videos`` for ``.avi``/``.mp4``/``.mov``
    files and ``.../Annotation_files_processed`` for ``*_with_pose.txt`` files.
    An annotation is paired with the video whose file stem equals the
    annotation name, either with underscores replaced by spaces (tried first)
    or verbatim.

    Matching is done on the file stem, so every globbed extension
    (including ``.mov``) can be paired.  Glob results are sorted so the
    returned list, and any seeded split derived from it, is reproducible
    across runs and file systems.

    Returns:
        list[dict]: one entry per matched pair with the keys ``'dataset'``,
        ``'video_name'``, ``'video_path'`` and ``'annotation_path'``.
        Annotations without a matching video are skipped.
    """
    base_path = CONFIG['base_path']
    datasets = CONFIG['datasets']
    video_extensions = ('avi', 'mp4', 'mov')
    annotation_suffix = '_with_pose.txt'

    print(f"üîç T√¨m ki·∫øm d·ªØ li·ªáu trong: {base_path}")

    matched_data = []

    for dataset_name in datasets:
        print(f"\nüìÅ X·ª≠ l√Ω dataset: {dataset_name}")

        # Directory layout: <base>/<dataset>/<dataset>/{Videos,Annotation_files_processed}
        video_dir = f"{base_path}/{dataset_name}/{dataset_name}/Videos"
        annotation_dir = f"{base_path}/{dataset_name}/{dataset_name}/Annotation_files_processed"

        # Collect videos extension by extension (avi first), each group sorted.
        video_files = []
        for extension in video_extensions:
            video_files.extend(sorted(glob.glob(f"{video_dir}/*.{extension}")))

        annotation_files = sorted(glob.glob(f"{annotation_dir}/*{annotation_suffix}"))

        print(f"   üé¨ Videos: {len(video_files)}")
        print(f"   üìù Annotations: {len(annotation_files)}")

        # Index videos by stem; setdefault keeps the first (highest-priority)
        # file when the same stem exists with several extensions.
        videos_by_stem = {}
        for video_file in video_files:
            stem = os.path.splitext(os.path.basename(video_file))[0]
            videos_by_stem.setdefault(stem, video_file)

        # Pair each annotation with its video.
        for ann_file in annotation_files:
            ann_basename = os.path.basename(ann_file)
            # The glob guarantees the suffix is present at the end of the name.
            video_name_from_ann = ann_basename[:-len(annotation_suffix)]
            video_name_expected = video_name_from_ann.replace('_', ' ')

            video_path = (
                videos_by_stem.get(video_name_expected)
                or videos_by_stem.get(video_name_from_ann)
            )

            if video_path:
                matched_data.append({
                    'dataset': dataset_name,
                    'video_name': video_name_from_ann,
                    'video_path': video_path,
                    'annotation_path': ann_file
                })

    print(f"\nüìä T·ªïng k·∫øt: {len(matched_data)} c·∫∑p video-annotation ƒë∆∞·ª£c t√¨m th·∫•y")
    return matched_data


In [None]:
def parse_annotation_file(annotation_path):
    """Read one annotation file into per-frame bbox / keypoint records.

    File layout as consumed here: the first two lines are integers (fall start
    and fall end); every following line is comma separated as
    ``frame,label,x1,y1,x2,y2[,a:x:y:conf ...]``.  Lines with fewer than six
    fields are ignored.  Each colon-separated field with exactly four parts
    contributes ``x, y, conf`` (parts 1-3) to the keypoint list, in file order.
    The list is zero-padded / truncated to exactly 51 values.

    Returns:
        tuple: ``(annotations, fall_start, fall_end)`` where ``annotations``
        maps frame number to ``{'bbox': {...}, 'keypoints': [...], 'label': int}``.
        On any error a message is printed and ``({}, -1, -1)`` is returned.
    """
    keypoint_vector_len = 51  # 17 keypoints * (x, y, conf)

    try:
        per_frame = {}
        with open(annotation_path, 'r') as handle:
            fall_start = int(handle.readline().strip())
            fall_end = int(handle.readline().strip())

            for raw_line in handle:
                fields = raw_line.strip().split(',')
                if len(fields) < 6:
                    continue

                frame_num, label, x1, y1, x2, y2 = (int(value) for value in fields[:6])

                # Gather x, y, conf from every well-formed "a:x:y:conf" field.
                keypoints = []
                for field in fields[6:]:
                    pieces = field.split(':')
                    if len(pieces) != 4:
                        continue
                    keypoints += [float(piece) for piece in pieces[1:]]

                # Force exactly 51 values: pad with zeros, then cut the excess.
                keypoints = (keypoints + [0] * keypoint_vector_len)[:keypoint_vector_len]

                per_frame[frame_num] = {
                    'bbox': {'x1': x1, 'y1': y1, 'x2': x2, 'y2': y2},
                    'keypoints': keypoints,
                    'label': label,
                }

        return per_frame, fall_start, fall_end

    except Exception as e:
        print(f"‚ùå L·ªói parse {annotation_path}: {e}")
        return {}, -1, -1


In [None]:
def normalize_bbox(bbox, img_width, img_height):
    """Scale a pixel-space bbox dict to fractions of the image size.

    Args:
        bbox: mapping with the keys ``'x1'``, ``'y1'``, ``'x2'``, ``'y2'``.
        img_width: image width; divides the x coordinates.
        img_height: image height; divides the y coordinates.

    Returns:
        list: ``[x1, y1, x2, y2]`` divided by width / height respectively.
        Values are not clipped.
    """
    left, top, right, bottom = (bbox[key] for key in ('x1', 'y1', 'x2', 'y2'))
    return [
        left / img_width,
        top / img_height,
        right / img_width,
        bottom / img_height,
    ]

def normalize_keypoints(keypoints, img_width, img_height):
    """Scale the x / y entries of a flat ``[x, y, conf, ...]`` keypoint list.

    Each complete ``(x, y, conf)`` triple becomes
    ``(x / img_width, y / img_height, conf)``; the confidence is passed through
    unchanged.  Trailing values that do not form a full triple are dropped.

    Returns:
        list: the normalized values as a new flat list.
    """
    xs, ys, confs = keypoints[0::3], keypoints[1::3], keypoints[2::3]

    scaled = []
    # zip stops at the shortest slice, i.e. at the last complete triple.
    for x, y, conf in zip(xs, ys, confs):
        scaled += [x / img_width, y / img_height, conf]
    return scaled


In [None]:
@tf.function
def augment_image_and_targets(image, bbox, keypoints):
    """Randomly augment an image together with its bbox and keypoint targets.

    Photometric jitter (brightness, contrast, saturation) is always applied to
    the image.  With probability 0.5 the sample is mirrored horizontally; the
    bbox and keypoints are then updated to stay consistent with the image.

    Args:
        image: image tensor accepted by ``tf.image`` colour ops and
            ``flip_left_right``.
        bbox: rank-1 tensor ``[x1, y1, x2, y2]`` with x normalized to [0, 1].
        keypoints: rank-1 tensor of 51 values, 17 * ``(x, y, conf)`` in COCO
            keypoint order with x normalized to [0, 1].

    Returns:
        tuple: ``(image, bbox, keypoints)`` after augmentation.
    """
    # After mirroring, an anatomical "left" joint appears on the other side,
    # so the left/right slots of each COCO pair have to trade places:
    # nose stays, (eyes, ears, shoulders, elbows, wrists, hips, knees, ankles) swap.
    flip_order = [0, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 14, 13, 16, 15]

    # Photometric jitter.
    # NOTE(review): results are not clipped, so values may leave the input
    # range - confirm whether the consumer expects [0, 1].
    image = tf.image.random_brightness(image, max_delta=0.2)
    image = tf.image.random_contrast(image, lower=0.8, upper=1.2)
    image = tf.image.random_saturation(image, lower=0.8, upper=1.2)

    # Random horizontal flip.
    if tf.random.uniform([]) > 0.5:
        image = tf.image.flip_left_right(image)

        # Mirror the box: the old right edge becomes the new left edge.
        x1, y1, x2, y2 = bbox[0], bbox[1], bbox[2], bbox[3]
        bbox = tf.stack([1.0 - x2, y1, 1.0 - x1, y2])

        # Mirror every x coordinate in one shot, then swap left/right slots.
        kp = tf.reshape(keypoints, [17, 3])
        kp = tf.stack([1.0 - kp[:, 0], kp[:, 1], kp[:, 2]], axis=-1)
        kp = tf.gather(kp, flip_order)
        keypoints = tf.reshape(kp, [51])

    return image, bbox, keypoints


In [None]:
def create_person_pose_dataset(data_files):
    """Build a ``tf.data.Dataset`` of (image, targets) pairs from videos.

    For each entry of ``data_files`` the annotation file is parsed, up to
    ``CONFIG['max_samples_per_video']`` annotated frames are drawn at random,
    and each readable frame is resized to the model input size and scaled to
    [0, 1].  Bbox and keypoints are normalized by the original frame size.

    Args:
        data_files: list of dicts with at least ``'video_path'`` and
            ``'annotation_path'`` (as produced by ``find_data_files``).

    Returns:
        tf.data.Dataset: unbatched elements ``(image, targets)`` where
        ``image`` is float32 ``(H, W, 3)`` and ``targets`` is a dict with the
        keys ``'bbox_output'`` (4,), ``'pose_output'`` (51,) and
        ``'conf_output'`` (scalar, always 1.0).
    """
    # input_size is used as (height, width) by the TensorSpec below and by
    # the model input, whereas cv2.resize expects dsize=(width, height).
    input_height, input_width = CONFIG['input_size']

    def data_generator():
        print("üîÑ T·∫°o Person Detection + Pose dataset...")
        sample_count = 0

        for data_item in tqdm(data_files, desc="Processing videos"):
            video_path = data_item['video_path']
            annotation_path = data_item['annotation_path']

            # Parse annotations; skip videos without usable annotations.
            annotations, _fall_start, _fall_end = parse_annotation_file(annotation_path)
            if not annotations:
                continue

            cap = cv2.VideoCapture(video_path)
            # try/finally so the capture is released even when the consumer
            # stops iterating early or an error occurs mid-video.
            try:
                if not cap.isOpened():
                    continue

                frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
                frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
                if frame_width <= 0 or frame_height <= 0:
                    # Cannot normalize targets without valid dimensions.
                    continue

                # Randomly sample annotated frames, capped per video.
                annotated_frames = list(annotations.keys())
                max_samples = min(len(annotated_frames), CONFIG['max_samples_per_video'])
                sampled_frames = random.sample(annotated_frames, max_samples)

                for frame_num in sampled_frames:
                    # NOTE(review): the annotation frame number is used as-is
                    # as the capture position - confirm both use the same
                    # (0- or 1-based) indexing.
                    cap.set(cv2.CAP_PROP_POS_FRAMES, frame_num)
                    ret, frame = cap.read()

                    if not ret:
                        continue

                    # Resize to the model input and scale pixels to [0, 1].
                    frame_resized = cv2.resize(frame, (input_width, input_height))
                    frame_normalized = frame_resized.astype(np.float32) / 255.0

                    ann = annotations[frame_num]

                    # Targets are normalized by the ORIGINAL frame size.
                    bbox_norm = normalize_bbox(ann['bbox'], frame_width, frame_height)
                    keypoints_norm = normalize_keypoints(ann['keypoints'], frame_width, frame_height)

                    # Every annotated frame is treated as containing a person.
                    confidence = 1.0

                    yield frame_normalized, bbox_norm, keypoints_norm, confidence
                    sample_count += 1
            finally:
                cap.release()

        print(f"‚úÖ Dataset ho√†n th√†nh: {sample_count} samples")

    dataset = tf.data.Dataset.from_generator(
        data_generator,
        output_signature=(
            tf.TensorSpec(shape=(input_height, input_width, 3), dtype=tf.float32),
            tf.TensorSpec(shape=(4,), dtype=tf.float32),   # bbox
            tf.TensorSpec(shape=(51,), dtype=tf.float32),  # keypoints
            tf.TensorSpec(shape=(), dtype=tf.float32)      # confidence
        )
    )

    # Re-key the targets by output-layer name for the multi-output model.
    def reformat_data(image, bbox, keypoints, confidence):
        return image, {
            'bbox_output': bbox,
            'pose_output': keypoints,
            'conf_output': confidence
        }

    dataset = dataset.map(reformat_data)

    return dataset


In [None]:
def create_person_pose_model():
    """Assemble the multi-task CNN (bbox, pose keypoints, confidence).

    Architecture: a strided 7x7 stem, four residual stages (64, 128, 256, 512
    filters) each followed by 2x2 max pooling, global average pooling and a
    1024-unit dense layer with dropout.  Three sigmoid heads branch off the
    shared features; their final layers are pinned to float32.

    Returns:
        keras.Model: model named ``'PersonPoseDetector'`` with the outputs
        ``[bbox_output (4), pose_output (51), conf_output (1)]``.
    """

    def residual_stage(tensor, filters):
        """Two 3x3 convs with a (projected if needed) skip, then 2x2 pooling."""
        skip = tensor
        out = layers.Conv2D(filters, 3, padding='same', activation='relu')(tensor)
        out = layers.BatchNormalization()(out)
        out = layers.Conv2D(filters, 3, padding='same')(out)
        out = layers.BatchNormalization()(out)

        # Project the skip path with a 1x1 conv when channel counts differ.
        if skip.shape[-1] != filters:
            skip = layers.Conv2D(filters, 1, padding='same')(skip)
            skip = layers.BatchNormalization()(skip)

        out = layers.Add()([out, skip])
        out = layers.Activation('relu')(out)
        return layers.MaxPooling2D(2)(out)

    def sigmoid_head(features, hidden_units, output_units, prefix):
        """Hidden ReLU layer plus float32 sigmoid output, named by prefix."""
        hidden = layers.Dense(hidden_units, activation='relu', name=f'{prefix}_dense')(features)
        return layers.Dense(
            output_units, activation='sigmoid', name=f'{prefix}_output', dtype='float32'
        )(hidden)

    input_height, input_width = CONFIG['input_size']
    inputs = keras.Input(shape=(input_height, input_width, 3))

    # Stem
    features = layers.Conv2D(64, 7, strides=2, padding='same', activation='relu')(inputs)
    features = layers.BatchNormalization()(features)
    features = layers.MaxPooling2D(3, strides=2, padding='same')(features)

    # Residual backbone
    for stage_filters in (64, 128, 256, 512):
        features = residual_stage(features, stage_filters)

    # Shared global descriptor
    features = layers.GlobalAveragePooling2D()(features)
    features = layers.Dense(1024, activation='relu')(features)
    features = layers.Dropout(0.5)(features)

    # Task heads: bbox (x1, y1, x2, y2), pose (17 * 3 values), confidence.
    bbox_output = sigmoid_head(features, 256, 4, 'bbox')
    pose_output = sigmoid_head(features, 512, 51, 'pose')
    conf_output = sigmoid_head(features, 128, 1, 'conf')

    return keras.Model(
        inputs=inputs,
        outputs=[bbox_output, pose_output, conf_output],
        name='PersonPoseDetector'
    )


In [None]:
def bbox_loss(y_true, y_pred):
    """Smooth L1 loss for bbox regression, averaged over all elements.

    Element-wise: ``0.5 * d**2`` where ``d = |y_true - y_pred| < 1``,
    otherwise ``d - 0.5``.
    """
    abs_error = tf.abs(y_true - y_pred)
    # 1.0 where the quadratic branch applies, 0.0 where the linear one does.
    quadratic_zone = tf.cast(abs_error < 1.0, tf.float32)
    quadratic_term = quadratic_zone * 0.5 * tf.square(abs_error)
    linear_term = (1.0 - quadratic_zone) * (abs_error - 0.5)
    return tf.reduce_mean(quadratic_term + linear_term)

def pose_loss(y_true, y_pred):
    """Masked MSE over pose keypoints, averaged over annotated keypoints only.

    Both tensors are reshaped to ``[batch, 17, 3]`` with ``(x, y, conf)`` per
    keypoint.  A keypoint contributes to the loss only when its ground-truth
    confidence is > 0; for such keypoints all three values are compared.

    The summed squared error is divided by the number of values that actually
    contribute (valid keypoints * 3) rather than by the full tensor size, so
    the loss magnitude does not shrink when few keypoints are annotated.  A
    batch with no valid keypoints yields 0.

    Args:
        y_true: ground-truth keypoints, shape ``[batch, 51]``.
        y_pred: predicted keypoints, shape ``[batch, 51]``.

    Returns:
        Scalar tensor with the mean squared error over valid keypoint values.
    """
    y_true = tf.cast(y_true, y_pred.dtype)

    y_true_reshaped = tf.reshape(y_true, [-1, 17, 3])
    y_pred_reshaped = tf.reshape(y_pred, [-1, 17, 3])

    # 1.0 for keypoints with ground-truth confidence > 0, shape [batch, 17, 1].
    valid_mask = tf.cast(y_true_reshaped[..., 2:3] > 0, y_pred.dtype)

    squared_error = tf.square(y_true_reshaped - y_pred_reshaped) * valid_mask

    # Each valid keypoint contributes three values (x, y, conf).
    num_valid_values = tf.reduce_sum(valid_mask) * 3.0

    # divide_no_nan returns 0 when there is nothing to average.
    return tf.math.divide_no_nan(tf.reduce_sum(squared_error), num_valid_values)

def confidence_loss(y_true, y_pred):
    """Binary cross-entropy between target and predicted person confidence."""
    bce = tf.keras.losses.binary_crossentropy(y_true, y_pred)
    return bce


In [None]:
def train_person_pose_model():
    """Train the person-detection + pose model end to end.

    Steps: discover video/annotation pairs, split them per video into train
    and validation sets, build the datasets, compile the model with the custom
    losses and ``CONFIG`` loss weights, and fit with checkpointing, learning-
    rate reduction and early stopping (all monitoring ``val_loss``).

    Returns:
        tuple: ``(model, history)`` after training, or ``(None, None)`` when
        no data was found, so callers can always unpack two values.
    """
    print("üöÄ B·∫Øt ƒë·∫ßu training Person Detection + Pose model...")

    # Discover data
    data_files = find_data_files()
    if not data_files:
        print("‚ùå Kh√¥ng t√¨m th·∫•y d·ªØ li·ªáu!")
        return None, None

    # Train / validation split at video level
    train_files, val_files = train_test_split(
        data_files,
        test_size=CONFIG['validation_split'],
        random_state=42
    )

    print(f"üìä Chia d·ªØ li·ªáu: {len(train_files)} train, {len(val_files)} validation")

    # Build datasets
    print("üì¶ T·∫°o datasets...")
    train_dataset_raw = create_person_pose_dataset(train_files)
    val_dataset_raw = create_person_pose_dataset(val_files)

    # Input pipeline: only the training stream is shuffled.
    train_dataset = train_dataset_raw.shuffle(1000).batch(CONFIG['batch_size']).prefetch(tf.data.AUTOTUNE)
    val_dataset = val_dataset_raw.batch(CONFIG['batch_size']).prefetch(tf.data.AUTOTUNE)

    model = create_person_pose_model()

    # One custom loss per output head, combined with the configured weights.
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=CONFIG['learning_rate']),
        loss={
            'bbox_output': bbox_loss,
            'pose_output': pose_loss,
            'conf_output': confidence_loss
        },
        loss_weights={
            'bbox_output': CONFIG['lambda_bbox'],
            'pose_output': CONFIG['lambda_pose'],
            'conf_output': CONFIG['lambda_conf']
        },
        metrics={
            'bbox_output': 'mae',
            'pose_output': 'mae',
            'conf_output': 'accuracy'
        }
    )

    print("üèóÔ∏è Person Detection + Pose Model architecture:")
    model.summary()

    # Checkpoint location; os.path.join works with or without a trailing
    # separator in CONFIG['save_model_path'].
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    os.makedirs(CONFIG['save_model_path'], exist_ok=True)
    model_save_path = os.path.join(CONFIG['save_model_path'], f"PersonPose_{timestamp}.h5")

    callbacks = [
        # Keep only the weights with the best validation loss.
        keras.callbacks.ModelCheckpoint(
            model_save_path,
            monitor='val_loss',
            save_best_only=True,
            mode='min',
            verbose=1
        ),
        keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.7,
            patience=8,
            min_lr=1e-8,
            verbose=1
        ),
        keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=15,
            restore_best_weights=True,
            verbose=1
        )
    ]

    print("üî• B·∫Øt ƒë·∫ßu training...")
    history = model.fit(
        train_dataset,
        validation_data=val_dataset,
        epochs=CONFIG['epochs'],
        callbacks=callbacks,
        verbose=1
    )

    print(f"‚úÖ Training ho√†n th√†nh! Model ƒë√£ l∆∞u t·∫°i: {model_save_path}")

    return model, history


In [None]:
def draw_pose_skeleton(image, keypoints, confidence_threshold=0.3):
    """Draw keypoints and skeleton limbs onto ``image`` (modified in place).

    Args:
        image: array with shape ``(height, width, ...)``; drawn on directly.
        keypoints: flat sequence ``[x, y, conf, ...]`` with x / y as fractions
            of the image width / height.  Trailing values that do not form a
            complete triple are ignored.
        confidence_threshold: a keypoint is drawn only if its confidence is
            strictly greater than this value; a limb needs both endpoints.

    Returns:
        The same ``image`` object, for call chaining.
    """
    img_height, img_width = image.shape[:2]
    num_keypoints = len(keypoints) // 3

    def visible_point(kp_index):
        """Pixel (x, y) of a keypoint, or None if missing / low confidence."""
        # Whole-triple bounds check: x, y and conf must all exist.
        if not 0 <= kp_index < num_keypoints:
            return None
        base = kp_index * 3
        if keypoints[base + 2] > confidence_threshold:
            # Convert to pixels only for points that will actually be drawn.
            return (
                int(keypoints[base] * img_width),
                int(keypoints[base + 1] * img_height),
            )
        return None

    # Keypoints
    for kp_index in range(num_keypoints):
        point = visible_point(kp_index)
        if point is not None:
            cv2.circle(image, point, 4, (0, 255, 0), -1)

    # Skeleton limbs; SKELETON stores 1-based keypoint indices.
    for start, end in SKELETON:
        start_point = visible_point(start - 1)
        end_point = visible_point(end - 1)
        if start_point is not None and end_point is not None:
            cv2.line(image, start_point, end_point, (255, 0, 0), 2)

    return image

def predict_and_visualize(model, test_files, num_samples=3):
    """Run the model on one random annotated frame per video and plot results.

    For each of the first ``num_samples`` entries of ``test_files`` a random
    annotated frame is read, resized to the model input size and passed
    through ``model``.  A three-panel figure shows the resized frame, the
    ground-truth bbox + pose, and the predicted bbox + pose.  Videos that
    cannot be opened or read, have no annotations, or report invalid frame
    dimensions are skipped.

    Args:
        model: model whose ``predict`` returns ``(bbox, pose, confidence)``
            batches, as built by ``create_person_pose_model``.
        test_files: list of dicts with ``'video_path'`` and
            ``'annotation_path'``.
        num_samples: number of entries from ``test_files`` to visualize.
    """
    print(f"üß™ Testing Person Detection + Pose v·ªõi {num_samples} samples...")

    # input_size is (height, width); cv2.resize wants dsize=(width, height).
    img_h, img_w = CONFIG['input_size']

    for data_item in test_files[:num_samples]:
        video_path = data_item['video_path']
        annotation_path = data_item['annotation_path']

        annotations, _, _ = parse_annotation_file(annotation_path)

        cap = cv2.VideoCapture(video_path)
        # Everything that needs the capture happens inside try/finally so the
        # handle is released on every path, including exceptions.
        try:
            if not cap.isOpened():
                continue

            annotated_frames = list(annotations.keys())
            if not annotated_frames:
                continue

            # Pick one random annotated frame.
            random_frame = random.choice(annotated_frames)
            cap.set(cv2.CAP_PROP_POS_FRAMES, random_frame)
            ret, frame = cap.read()
            if not ret:
                continue

            frame_width = cap.get(cv2.CAP_PROP_FRAME_WIDTH)
            frame_height = cap.get(cv2.CAP_PROP_FRAME_HEIGHT)
        finally:
            cap.release()

        if frame_width <= 0 or frame_height <= 0:
            # Ground truth cannot be rescaled without valid dimensions.
            continue

        # Preprocess exactly like the training pipeline.
        frame_resized = cv2.resize(frame, (img_w, img_h))
        frame_normalized = frame_resized.astype(np.float32) / 255.0
        input_data = np.expand_dims(frame_normalized, axis=0)

        bbox_pred, pose_pred, conf_pred = model.predict(input_data, verbose=0)
        bbox = bbox_pred[0]
        pose = pose_pred[0]
        confidence = conf_pred[0][0]

        # Predicted bbox in pixels of the resized frame.
        x1 = int(bbox[0] * img_w)
        y1 = int(bbox[1] * img_h)
        x2 = int(bbox[2] * img_w)
        y2 = int(bbox[3] * img_h)

        result_img = frame_resized.copy()
        cv2.rectangle(result_img, (x1, y1), (x2, y2), (255, 0, 0), 2)
        cv2.putText(result_img, f'Conf: {confidence:.2f}', (x1, y1-10),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 1)
        result_img = draw_pose_skeleton(result_img, pose)

        # Ground truth, rescaled from original to resized frame coordinates.
        gt_ann = annotations[random_frame]
        gt_bbox = gt_ann['bbox']
        gt_x1 = int(gt_bbox['x1'] * img_w / frame_width)
        gt_y1 = int(gt_bbox['y1'] * img_h / frame_height)
        gt_x2 = int(gt_bbox['x2'] * img_w / frame_width)
        gt_y2 = int(gt_bbox['y2'] * img_h / frame_height)

        gt_img = frame_resized.copy()
        cv2.rectangle(gt_img, (gt_x1, gt_y1), (gt_x2, gt_y2), (0, 255, 0), 2)

        gt_keypoints = normalize_keypoints(gt_ann['keypoints'], frame_width, frame_height)
        gt_img = draw_pose_skeleton(gt_img, gt_keypoints)

        # Side-by-side display; OpenCV frames are BGR, matplotlib expects RGB.
        panels = [
            (frame_resized, 'Original Frame'),
            (gt_img, 'Ground Truth'),
            (result_img, f'Prediction (Conf: {confidence:.2f})'),
        ]

        plt.figure(figsize=(15, 5))
        for position, (panel_img, panel_title) in enumerate(panels, start=1):
            plt.subplot(1, 3, position)
            plt.imshow(cv2.cvtColor(panel_img, cv2.COLOR_BGR2RGB))
            plt.title(panel_title)
            plt.axis('off')

        plt.tight_layout()
        plt.show()


In [None]:
def plot_training_history(history):
    """Plot train / validation curves from a Keras ``History`` object.

    Draws a 2x3 grid: total loss, bbox loss, pose loss, confidence loss,
    bbox MAE and confidence accuracy.  Each panel reads ``history.history[key]``
    and ``history.history['val_' + key]``.
    """
    # (history key, legend label stem, panel title, y-axis label)
    panels = [
        ('loss', 'Loss', 'Total Loss', 'Loss'),
        ('bbox_output_loss', 'Bbox Loss', 'Bbox Loss', 'Loss'),
        ('pose_output_loss', 'Pose Loss', 'Pose Loss', 'Loss'),
        ('conf_output_loss', 'Conf Loss', 'Confidence Loss', 'Loss'),
        ('bbox_output_mae', 'Bbox MAE', 'Bbox MAE', 'MAE'),
        ('conf_output_accuracy', 'Conf Acc', 'Confidence Accuracy', 'Accuracy'),
    ]

    plt.figure(figsize=(15, 10))

    for position, (key, series_label, title, y_label) in enumerate(panels, start=1):
        plt.subplot(2, 3, position)
        plt.plot(history.history[key], label=f'Train {series_label}', linewidth=2)
        plt.plot(history.history[f'val_{key}'], label=f'Val {series_label}', linewidth=2)
        plt.title(title, fontweight='bold')
        plt.xlabel('Epoch')
        plt.ylabel(y_label)
        plt.legend()
        plt.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.suptitle('Person Detection + Pose Training Results', fontsize=16, fontweight='bold', y=1.02)
    plt.show()


In [None]:
if __name__ == "__main__":
    # Entry point: print a banner, train, then plot curves and sample predictions.
    print("üéØ PERSON DETECTION + POSE ESTIMATION")
    print("=" * 50)
    print("üé™ Model CNN t√πy ch·ªânh cho person detection v√† pose estimation!")
    print("üìã Features:")
    print("   ‚úÖ Custom CNN architecture")
    print("   ‚úÖ Multi-task learning")
    print("   ‚úÖ Bbox detection")
    print("   ‚úÖ Pose keypoint estimation")
    print("   ‚úÖ Confidence scoring")
    print("   ‚úÖ End-to-end training")

    # Training. The trainer may signal "no data" with a bare None instead of
    # a (model, history) pair; normalize that so unpacking cannot fail.
    training_result = train_person_pose_model()
    model, history = training_result if training_result is not None else (None, None)

    if model is not None and history is not None:
        # Training curves
        plot_training_history(history)

        # Qualitative check on a few videos
        data_files = find_data_files()
        if data_files:
            predict_and_visualize(model, data_files, num_samples=3)

    print("\nüéâ Person Detection + Pose training ho√†n th√†nh!")
