## **💡Reference Notebook - Fernandosr85 - Dashcam Collision Prediction Project 🚗**
#### **https://www.kaggle.com/code/fernandosr85/dashcam-collision-prediction-project**

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

max_files = 10  # upper bound on how many input file paths get printed

count = 0

# Walk the read-only input tree and print at most `max_files` file paths.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        count += 1
        if count >= max_files:
            break
    # The `break` above only leaves the inner loop; stop the directory walk as
    # well, otherwise one more file per remaining directory is still printed.
    if count >= max_files:
        break

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import os
import gc
import time
import warnings
from multiprocessing import Pool

import cv2
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torchvision
import torchvision.models

# Silence every warning category for the rest of the session
warnings.filterwarnings("ignore")

# Select the compute device: use CUDA when PyTorch reports a usable GPU, otherwise the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"Using device: {device}")

In [None]:
# Silence RuntimeWarning messages emitted while loading / formatting the data
import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)

# Locations of the competition CSV files
train_csv_path = '/kaggle/input/nexar-collision-prediction/train.csv'
test_csv_path = '/kaggle/input/nexar-collision-prediction/test.csv'
submission_csv_path = '/kaggle/input/nexar-collision-prediction/sample_submission.csv'

# Locations of the video folders
train_video_dir = '/kaggle/input/nexar-collision-prediction/train'
test_video_dir = '/kaggle/input/nexar-collision-prediction/test'

# Read the three tables into DataFrames
train_df = pd.read_csv(train_csv_path)
test_df = pd.read_csv(test_csv_path)
submission_df = pd.read_csv(submission_csv_path)

# Store the training ids as strings (added to avoid a trailing ".0" in the ids)
train_df['id'] = train_df['id'].astype(str)

# Preview the first rows of each table
for heading, table in (
    ("Train.csv:", train_df),
    ("\nTest.csv:", test_df),
    ("\nSample Submission:", submission_df),
):
    print(heading)
    print(table.head())

# Optional: replace missing event / alert timestamps with zero
for time_column in ('time_of_event', 'time_of_alert'):
    train_df[time_column] = train_df[time_column].fillna(0)

# Report which video directories are going to be used
print("\nVideo Directory Paths:")
print(f"Train videos are located at: {train_video_dir}")
print(f"Test videos are located at: {test_video_dir}")


## **Data Preprocessing and Feature Extraction** 

In [None]:
# Extract key frames from a video, concentrating on the final part of the clip,
# where collisions typically occur.
# Quadratically spaced indices put more of the sampled frames near the very end.

def extract_keyframes(video_path, num_frames=12, target_size=(160, 160)):
    """
    Sample ``num_frames`` RGB frames from a video, biased towards its end.

    Clips shorter than 10 seconds (or with an unknown frame rate) are sampled
    uniformly. For longer clips, ``int(num_frames * 0.8)`` frames are taken
    from the last 3 seconds with quadratic spacing (denser towards the final
    frame); the remaining frames are spread uniformly over the preceding part
    of the clip as context.

    Args:
        video_path (str): Path to the video file.
        num_frames (int): Number of frames to sample.
        target_size (tuple): Output frame size as ``(width, height)``, the
            order ``cv2.resize`` expects.

    Returns:
        numpy.ndarray: ``uint8`` array of shape
        ``(num_frames, height, width, 3)``. Frames that cannot be read are
        black; an all-black clip is returned when the video cannot be opened
        or reports no frames.
    """
    # cv2.resize takes (width, height) while image arrays are (height, width, channels),
    # so the black fallback frames must be built with the axes swapped.
    width, height = target_size
    blank_clip_shape = (num_frames, height, width, 3)

    cap = cv2.VideoCapture(video_path)
    try:
        # Guard against files that OpenCV cannot open
        if not cap.isOpened():
            print(f"Could not open the video: {video_path}")
            return np.zeros(blank_clip_shape, dtype=np.uint8)

        # Total frame count and frames per second reported by the container
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        fps = cap.get(cv2.CAP_PROP_FPS)

        if total_frames <= 0:
            print(f"Video without frames: {video_path}")
            return np.zeros(blank_clip_shape, dtype=np.uint8)

        # Clip length in seconds (0 when the frame rate is unknown)
        duration = total_frames / fps if fps > 0 else 0

        if duration < 10:
            # Short clip: evenly spaced frames over the whole video
            frame_indices = np.linspace(0, total_frames - 1, num_frames, dtype=int)
        else:
            # Long clip: spend 80% of the frame budget on the last 3 seconds
            end_frames = int(num_frames * 0.8)
            start_frames = num_frames - end_frames

            # First frame index of the final 3-second window
            last_seconds = 3
            last_frame_count = min(int(fps * last_seconds), total_frames - 1)
            start_idx = max(0, total_frames - last_frame_count)

            # Quadratic spacing inside the window: the sampling step changes
            # so that indices become denser towards the last frame
            end_indices = np.array(
                [
                    start_idx + int((total_frames - start_idx - 1) * (i / end_frames) ** 2)
                    for i in range(1, end_frames + 1)
                ],
                dtype=int,
            )

            # Context frames: uniformly spaced over the part before the window
            if start_idx > 0:
                begin_indices = np.linspace(0, start_idx - 1, start_frames, dtype=int)
            else:
                begin_indices = np.zeros(start_frames, dtype=int)

            frame_indices = np.concatenate([begin_indices, end_indices])

        # Decode the selected frames
        frames = []
        for idx in frame_indices:
            cap.set(cv2.CAP_PROP_POS_FRAMES, int(idx))
            ret, frame = cap.read()
            if ret:
                frame = cv2.resize(frame, (width, height), interpolation=cv2.INTER_LANCZOS4)
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frames.append(frame)
            else:
                # Unreadable frame: keep the sequence length with a black frame
                frames.append(np.zeros((height, width, 3), dtype=np.uint8))
    finally:
        # Always free the capture handle, including on early return or error
        cap.release()

    return np.array(frames, dtype=np.uint8)

# The transform classes are defined at module (global) scope.
# Mirrors the input frames left-to-right with a given probability to add data diversity.
class RandomHorizontalFlip(object):
    """Flip every frame of a (T, H, W, C) clip horizontally with probability ``p``."""

    def __init__(self, p=0.5):
        self.p = p

    def __call__(self, frames):
        draw = np.random.random()
        if not (draw < self.p):
            # No flip this time: hand back the very same array
            return frames
        # Reverse the width axis of every frame, then materialise the reversed view as a new array
        mirrored = frames[:, :, ::-1, :]
        return mirrored.copy()

# Randomly adjusts the brightness and contrast of the frames to simulate
# different lighting conditions.
class ColorJitter(object):
    """
    Random brightness / contrast jitter for frames on a 0-255 scale.

    Args:
        brightness (float): Brightness factor is drawn uniformly from
            ``[max(0, 1 - brightness), 1 + brightness]``; 0 disables it.
        contrast (float): Contrast factor is drawn uniformly from
            ``[max(0, 1 - contrast), 1 + contrast]`` and applied around the
            mid-grey value 128; 0 disables it.
    """

    def __init__(self, brightness=0, contrast=0):
        self.brightness = brightness
        self.contrast = contrast

    def __call__(self, frames):
        """
        Apply the jitter.

        Args:
            frames (numpy.ndarray): Frames with values in [0, 255].

        Returns:
            numpy.ndarray: The input itself when both jitters are disabled,
            otherwise a floating-point array clipped to [0, 255].
        """
        apply_brightness = self.brightness > 0
        apply_contrast = self.contrast > 0
        if not (apply_brightness or apply_contrast):
            return frames

        # Work in floating point: on uint8 input `frames - 128` would wrap
        # around for pixels below 128 before the contrast factor is applied.
        if not np.issubdtype(frames.dtype, np.floating):
            frames = frames.astype(np.float64)

        # Apply brightness jitter
        if apply_brightness:
            brightness_factor = np.random.uniform(max(0, 1 - self.brightness), 1 + self.brightness)
            frames = np.clip(frames * brightness_factor, 0, 255)

        # Apply contrast jitter (scale the distance from mid-grey)
        if apply_contrast:
            contrast_factor = np.random.uniform(max(0, 1 - self.contrast), 1 + self.contrast)
            frames = np.clip((frames - 128) * contrast_factor + 128, 0, 255)

        return frames

# Overlays a hazy fog layer on the frames to simulate poor-visibility weather.
class AddFog(object):
    """Dim the frames to 80% and add a random bright haze (0-255 pixel scale)."""

    def __call__(self, frames):
        # One random haze value in [0.7, 0.9) per array element
        haze = np.random.uniform(0.7, 0.9, frames.shape).astype(np.float32)
        dimmed = frames * 0.8
        # Haze is scaled by 50 to match the 0-255 pixel range
        return dimmed + haze * 50

# Adds sparse bright speckles ("rain drops") to the frames to simulate rainy weather.
class AddRain(object):
    """Darken the frames to 90% and brighten roughly 3% of the pixels by 200."""

    def __call__(self, frames):
        clip_length = len(frames)
        height, width = frames.shape[1:3]
        # One random value per pixel, shared by all channels of that pixel
        noise = np.random.uniform(0, 1, (clip_length, height, width, 1)).astype(np.float32)
        # Pixels whose draw exceeds 0.97 become drops of intensity 200
        drops = (noise > 0.97).astype(np.float32) * 200
        darkened = frames * 0.9
        return np.clip(darkened + drops, 0, 255)

# Controller that decides at random, with a given probability, whether a transform is applied.
class RandomApply(object):
    """Apply ``transform`` with probability ``p``; otherwise return the input unchanged."""

    def __init__(self, transform, p=0.5):
        self.transform = transform
        self.p = p

    def __call__(self, frames):
        draw = np.random.random()
        return self.transform(frames) if draw < self.p else frames

# Augmentation pipeline that applies several transforms (flip, jitter, fog, ...) in order.
class Compose(object):
    """Chain transforms: the output of each one is the input of the next."""

    def __init__(self, transforms):
        self.transforms = transforms

    def __call__(self, frames):
        result = frames
        for step in self.transforms:
            result = step(result)
        return result

# Converts an array of video frames to a PyTorch tensor and scales pixel values to the 0-1 range.
class ToTensor(object):
    """Turn (T, H, W, C) frames on a 0-255 scale into a float (T, C, H, W) tensor in [0, 1]."""

    def __call__(self, frames):
        # Move the channel axis in front of the spatial axes: (T, H, W, C) -> (T, C, H, W)
        channels_first = frames.transpose((0, 3, 1, 2))
        tensor = torch.from_numpy(channels_first)
        # Cast to float32 and divide by 255
        return tensor.float() / 255.0

In [None]:
# Returns the transforms used for video data augmentation

def get_video_transforms():
    """
    Build the augmentation pipelines for video clips.

    Returns:
        dict: ``'train'`` maps to the full random augmentation chain ending in
        ``ToTensor``; ``'val'`` maps to a pipeline that only does the tensor
        conversion.
    """
    train_steps = [
        RandomHorizontalFlip(p=0.5),
        ColorJitter(brightness=0.3, contrast=0.3),
        RandomApply(AddFog(), p=0.15),
        RandomApply(AddRain(), p=0.15),
        RandomApply(RandomNoise(0.05), p=0.2),
        RandomApply(RandomOcclusion(), p=0.1),
        ToTensor(),
    ]
    # Validation data is not augmented, only converted to a tensor
    val_steps = [ToTensor()]

    return {
        'train': Compose(train_steps),
        'val': Compose(val_steps),
    }

# Adds random Gaussian (normally distributed) noise to the video frames so the model
# becomes more robust to the sensor noise found in real recordings.
class RandomNoise(object):
    """
    Gaussian-noise augmentation for video frames.

    Args:
        std (float): Standard deviation of the noise, expressed as a fraction
                     of the 0-255 pixel range (default: 0.05)
    """
    def __init__(self, std=0.05):
        self.std = std

    def __call__(self, frames):
        """
        Add zero-mean Gaussian noise to every element of ``frames``.

        Args:
            frames (numpy.ndarray): Input video frames of shape (T, H, W, C)
                                   where T is number of frames

        Returns:
            numpy.ndarray: ``uint8`` frames with noise added, clipped to [0, 255]
        """
        # Noise standard deviation on the 0-255 pixel scale
        sigma = self.std * 255
        noise = np.random.normal(0, sigma, frames.shape).astype(np.float32)

        # Add the noise, clip to the valid range, and cast back to integer pixels
        noisy = frames + noise
        return np.clip(noisy, 0, 255).astype(np.uint8)

# Covers a random rectangle of the frames with black so the model learns to cope
# with partially hidden information.
class RandomOcclusion(object):
    """
    Occlusion augmentation: blacks out one random rectangle.

    The same rectangle is applied to every frame of the clip.
    """
    def __call__(self, frames):
        """
        Black out a random rectangle in all frames.

        Args:
            frames (numpy.ndarray): Input video frames of shape (T, H, W, C)
                                   where T is number of frames

        Returns:
            numpy.ndarray: Copy of the frames with the rectangle set to 0
        """
        # Frame height and width
        height, width = frames.shape[1:3]

        # Rectangle size: between 10% and 25% of each frame dimension
        box_h = np.random.randint(int(height * 0.1), int(height * 0.25))
        box_w = np.random.randint(int(width * 0.1), int(width * 0.25))

        # Top-left corner, chosen so the rectangle stays inside the frame
        left = np.random.randint(0, width - box_w)
        top = np.random.randint(0, height - box_h)

        # Work on a copy so the caller's array is left untouched
        occluded = frames.copy()

        # Zero the rectangle in every frame with a single slice assignment
        occluded[:, top:top + box_h, left:left + box_w, :] = 0

        return occluded

In [None]:
# Measures motion between consecutive video frames with dense optical flow
# (optical flow describes, per pixel, how far and in which direction it moved between two frames).
# Only the Farneback method is used.
# Each per-pixel flow field is reduced to one number: its mean magnitude.
def compute_optical_flow_sequence(frames, skip_frames=1):
    """
    Build a sequence of mean optical-flow magnitudes, one value per frame.

    Args:
        frames (numpy.ndarray): (T, H, W, C) RGB frames
        skip_frames (int): Step between the frame indices that are processed

    Returns:
        numpy.ndarray: (T, 1) float32 array of flow magnitudes (first frame is 0)
    """
    num_frames = len(frames)
    if num_frames < 2:
        # No frame pair available, so there is no motion to measure
        return np.zeros((num_frames, 1), dtype=np.float32)

    previous = cv2.cvtColor(frames[0], cv2.COLOR_RGB2GRAY)

    # The first frame has no predecessor, so its entry is 0
    per_frame = [0.0]

    for frame_idx in range(1, num_frames, skip_frames):
        current = cv2.cvtColor(frames[frame_idx], cv2.COLOR_RGB2GRAY)
        try:
            flow = cv2.calcOpticalFlowFarneback(
                previous, current,
                None, 0.5, 3, 15, 3, 5, 1.2, 0
            )
            # Per-pixel vector length (H, W), averaged into a single scalar
            per_frame.append(np.linalg.norm(flow, axis=-1).mean())
        except Exception as e:
            print(f"Error calculating optical flow: {str(e)}")
            per_frame.append(0.0)

        previous = current

    # Pad with zeros when fewer than T values were produced
    missing = num_frames - len(per_frame)
    if missing > 0:
        per_frame.extend([0.0] * missing)

    return np.array(per_frame, dtype=np.float32).reshape(num_frames, 1)  # (T, 1)


In [None]:
import warnings
warnings.filterwarnings('ignore')

# Deep Learning
import tensorflow as tf
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.applications import InceptionV3
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler
from tensorflow.keras.applications.inception_v3 import preprocess_input
from tensorflow.keras.applications import EfficientNetB0

**Positive이면 alert_event 사이만 뽑고, Negative이면 마지막 3초 구간을 기준으로 추출**
**그리고 num_frames만큼 균등하게 뽑고 CNN + Optical Flow 둘 다 계산** 

In [None]:
# Per-frame feature extractor: InceptionV3 with ImageNet weights, no classifier head,
# global average pooling on the output
base_model = InceptionV3(weights='imagenet', include_top=False, pooling='avg')
# Width of the pooled CNN feature vector, read from the model instead of being hard-coded
cnn_feature_dim = base_model.output_shape[-1]

def get_hybrid_feature_sequence(video_path, num_frames=12):
    """
    Extract per-frame hybrid features (CNN + Optical flow) as a sequence.

    Args:
        video_path (str): Path to video file.
        num_frames (int): Number of frames to extract.

    Returns:
        np.ndarray: (T, cnn_feature_dim + 1) array of per-frame features: the
        pooled CNN features followed by one optical-flow magnitude column.
        NOTE(review): Keras documents a 2048-wide pooled InceptionV3 output,
        so the width is expected to be 2049 rather than the 1281 the original
        code assumed - confirm against downstream ``input_dim`` settings.
    """
    # 1. Frame extraction
    frames = extract_keyframes(video_path, num_frames=num_frames, target_size=(160,160))

    if len(frames) == 0:
        print(f"Skipping {video_path}: no frames")
        # Fallback must have the same width as a real feature sequence
        return np.zeros((num_frames, cnn_feature_dim + 1), dtype=np.float32)

    # 2. CNN feature per frame (Inception expects (N, H, W, C))
    spatial_features = base_model.predict(
        preprocess_input(frames.astype('float32')),
        batch_size=32,
        verbose=0
    )  # shape: (T, cnn_feature_dim)

    # 3. Optical flow sequence
    flow_magnitudes = compute_optical_flow_sequence(frames)  # shape: (T, 1)

    # 4. Concatenate per frame
    hybrid_features = np.concatenate([spatial_features, flow_magnitudes], axis=1)  # (T, cnn_feature_dim + 1)

    return hybrid_features


In [None]:
def get_hybrid_feature_sequence_from_frames(frames):
    """
    Extract per-frame hybrid features (CNN + Optical flow) from pre-loaded frames.

    Args:
        frames (torch.Tensor): (T, 3, H, W) tensor scaled to [0, 1]
            (the output of the ``ToTensor`` transform).

    Returns:
        np.ndarray: (T, cnn_feature_dim + 1) array of per-frame features; a
        single all-zero row of that width when ``frames`` is empty.
    """
    if len(frames) == 0:
        print("Warning: empty frames input")
        # Fallback width follows the backbone instead of a hard-coded 1281
        return np.zeros((1, cnn_feature_dim + 1), dtype=np.float32)

    # 1. PyTorch tensor -> numpy (T, H, W, 3) on the [0, 255] scale.
    #    detach()/cpu() make this work for CUDA tensors and tensors tracking gradients.
    frames_np = frames.detach().cpu().permute(0, 2, 3, 1).numpy() * 255.0
    #    Round before casting: plain truncation turns e.g. 99.99999 into 99, and
    #    clipping keeps out-of-range values from wrapping around in uint8.
    frames_np = np.clip(np.rint(frames_np), 0, 255).astype(np.uint8)
    #    The permuted layout is not C-contiguous; OpenCV is given contiguous frames.
    frames_np = np.ascontiguousarray(frames_np)

    # 2. CNN features per frame
    spatial_features = base_model.predict(
        preprocess_input(frames_np.astype('float32')),
        batch_size=32,
        verbose=0
    )  # shape: (T, cnn_feature_dim)

    # 3. Optical flow per frame
    flow_magnitudes = compute_optical_flow_sequence(frames_np)  # shape: (T, 1)

    # 4. Concatenate -> (T, cnn_feature_dim + 1)
    hybrid_features = np.concatenate([spatial_features, flow_magnitudes], axis=1)

    return hybrid_features


In [None]:
def compute_optical_flow_sequence(frames, skip_frames=1):
    """
    Computes per-frame optical flow magnitudes (Farneback dense flow).

    NOTE: this redefines the function of the same name declared earlier in the
    file. Here the zero padding goes at the END of the sequence (the flow
    between frame i-1 and frame i is stored at position i-1), whereas the
    earlier definition puts the zero at the start.

    Args:
        frames (np.ndarray): (T, H, W, 3) numpy array of RGB frames.
        skip_frames (int): Step between the frame indices that are processed.

    Returns:
        np.ndarray: (T, 1) float32 array of mean optical flow magnitudes,
        zero-padded at the end up to T rows.
    """
    T = len(frames)
    if T < 2:
        return np.zeros((T, 1), dtype=np.float32)

    magnitudes = []

    prev_gray = cv2.cvtColor(frames[0], cv2.COLOR_RGB2GRAY)

    for i in range(1, T, skip_frames):
        curr_gray = cv2.cvtColor(frames[i], cv2.COLOR_RGB2GRAY)
        try:
            flow = cv2.calcOpticalFlowFarneback(prev_gray, curr_gray,
                                                None, 0.5, 3, 15, 3, 5, 1.2, 0)
            # magnitude = sqrt(u^2 + v^2)
            mag = np.linalg.norm(flow, axis=-1)  # shape (H, W)
            avg_mag = np.mean(mag)  # scalar
            magnitudes.append(avg_mag)
        except Exception as e:
            print(f"Error calculating flow at frame {i}: {str(e)}")
            magnitudes.append(0.0)

        prev_gray = curr_gray

    # Pad up to exactly T entries. A single appended zero is only enough when
    # skip_frames == 1; with a larger step more than one value is missing and
    # the result would otherwise have fewer than T rows.
    missing = T - len(magnitudes)
    if missing > 0:
        magnitudes.extend([0.0] * missing)

    magnitudes = np.array(magnitudes, dtype=np.float32).reshape(T, 1)  # (T, 1)

    return magnitudes


In [None]:
# Up to this point the final per-video sequence fed to the Transformer has not been built.
# 1. Extract per-frame CNN features (InceptionV3)
# 2. Extract the optical flow sequence (compute_optical_flow_sequence)
# 3. Concatenate the two results
# 4. Use the result as the Transformer input sequence

# End-to-end helper that joins both parts:
# CNN features + optical flow, concatenated per frame

def prepare_transformer_input(video_path, num_frames=12, target_size=(160, 160)):
    """
    Build the per-frame feature sequence (CNN features + optical flow) for one video.

    Returns:
        np.ndarray of shape (T, D + 1), where D is the width of
        ``base_model``'s output, or ``None`` when no frames were extracted.
        NOTE(review): the notebook refers to this width as 1281; Keras
        documents a 2048-wide pooled InceptionV3 output, so 2049 is expected
        here - confirm.
    """
    # Step 1: sample the frames
    clip = extract_keyframes(video_path, num_frames=num_frames, target_size=target_size)
    if clip.shape[0] == 0:
        print(f"Skipping video {video_path} (no frames)")
        return None

    # Step 2: appearance features, one vector per frame
    cnn_batch = preprocess_input(clip.astype('float32'))  # InceptionV3 input scaling
    appearance = base_model.predict(cnn_batch, batch_size=32, verbose=0)  # (T, D)

    # Step 3: motion feature, one optical-flow magnitude per frame
    motion = compute_optical_flow_sequence(clip)  # (T, 1)

    # Step 4: join appearance and motion along the feature axis
    return np.concatenate([appearance, motion], axis=1)  # (T, D + 1)

In [None]:
from tqdm import tqdm

# 1) Build the augmentation pipelines (not used by the feature extraction in this cell)
transforms = get_video_transforms()
train_transform = transforms['train']

# 2) Containers for the extracted sequences and the train_df rows they belong to
all_sequences = []
kept_rows = []

# 3) Iterate over all of train_df (with a tqdm progress bar)
for idx, row in tqdm(train_df.iterrows(), total=len(train_df)):
    video_id = row['id']
    # File names are the id zero-padded to 5 digits; going through float()
    # also accepts ids stored as strings such as "123.0"
    video_path = os.path.join(train_video_dir, f"{int(float(video_id)):05d}.mp4")

    sequence = prepare_transformer_input(video_path, num_frames=12)

    if sequence is None:
        print(f"Skipping video {video_id} (no valid sequence)")
        continue

    all_sequences.append(sequence)
    kept_rows.append(idx)

# Keep train_df aligned with all_sequences: labels are read from train_df
# later, so rows of skipped videos must be dropped or every following label
# would be paired with the wrong sequence.
if len(kept_rows) != len(train_df):
    train_df = train_df.loc[kept_rows].reset_index(drop=True)

# 4) Stack the per-video sequences into one array
all_sequences = np.array(all_sequences)
print(f"\nAll sequences shape: {all_sequences.shape}")
# Expected shape: (n_videos, 12, feature_dim)


In [None]:
# Dataset 클래스 정의
import torch
from torch.utils.data import Dataset

class VideoSequenceDataset(Dataset):
    """In-memory dataset of per-video feature sequences and their labels."""

    def __init__(self, sequences, labels):
        """
        Args:
            sequences (numpy.ndarray): shape (n_samples, T, feature_dim)
            labels (numpy.ndarray or list): shape (n_samples,)

        Raises:
            ValueError: if the number of sequences and labels differs.
        """
        self.sequences = torch.tensor(sequences, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.float32)

        # A length mismatch means samples and labels are no longer paired up;
        # fail loudly instead of training on misaligned targets.
        if len(self.sequences) != len(self.labels):
            raise ValueError(
                f"Got {len(self.sequences)} sequences but {len(self.labels)} labels"
            )

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        x = self.sequences[idx]  # (T, feature_dim)
        y = self.labels[idx]     # scalar label
        return x, y


In [None]:
# DataLoader 만들기 
from torch.utils.data import DataLoader, random_split

# Targets for binary classification, one per video: shape (n_videos,)
labels = train_df['target'].values

# Wrap sequences and labels in a Dataset
dataset = VideoSequenceDataset(all_sequences, labels)

# 80/20 train/validation split sizes (validation takes the remainder)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

# Random, non-overlapping split into the two subsets
train_dataset, val_dataset = random_split(dataset, lengths=[train_size, val_size])

# Batched loaders: training batches are reshuffled every epoch, validation keeps a fixed order
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=16, shuffle=False)

In [None]:
# Temporal Transformer 모델 설계
import torch
import torch.nn as nn

class TemporalTransformerModel(nn.Module):
    """
    Binary classifier over a sequence of per-frame feature vectors.

    Pipeline: linear projection -> Transformer encoder -> mean over time ->
    MLP head ending in a sigmoid.

    NOTE(review): ``input_dim`` must equal the feature width of the sequences
    fed in; the default 1281 may not match the features produced elsewhere in
    this notebook - confirm before training.
    """

    def __init__(self, input_dim=1281, embed_dim=256, num_heads=4, num_layers=2, dropout=0.1):
        super().__init__()

        # Per-frame projection into the encoder's embedding space
        self.input_proj = nn.Linear(input_dim, embed_dim)

        # Stack of self-attention layers working on (batch, time, feature) tensors
        layer = nn.TransformerEncoderLayer(
            d_model=embed_dim,
            nhead=num_heads,
            dropout=dropout,
            batch_first=True,
        )
        self.transformer_encoder = nn.TransformerEncoder(layer, num_layers=num_layers)

        # Classification head; the final sigmoid maps the score to (0, 1)
        head_layers = [
            nn.Linear(embed_dim, 128),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(128, 1),
            nn.Sigmoid(),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, x):
        """
        Args:
            x: (batch_size, T, input_dim)

        Returns:
            (batch_size, 1) tensor of probabilities.
        """
        embedded = self.input_proj(x)                 # (batch_size, T, embed_dim)
        encoded = self.transformer_encoder(embedded)  # (batch_size, T, embed_dim)
        pooled = encoded.mean(dim=1)                  # average over time -> (batch_size, embed_dim)
        return self.classifier(pooled)                # (batch_size, 1)

In [None]:
# Spatial Transformer 모델 설계

import torch
import torch.nn as nn

class SpatialTransformer(nn.Module):
    """
    Transformer encoder over per-frame spatial features.

    The encoded sequence is mean-pooled over time and passed through a linear
    layer, giving one ``input_dim``-wide vector per clip.
    """

    def __init__(self, input_dim=1280, hidden_dim=512, num_heads=8, num_layers=2, dropout=0.1):
        super().__init__()

        # The encoder works directly at the input feature width (no projection)
        layer = nn.TransformerEncoderLayer(
            d_model=input_dim,
            nhead=num_heads,
            dim_feedforward=hidden_dim,
            dropout=dropout,
            batch_first=True,
        )
        self.transformer = nn.TransformerEncoder(layer, num_layers=num_layers)

        # Linear projection applied to the pooled summary vector
        self.output_layer = nn.Linear(input_dim, input_dim)

    def forward(self, x):
        """
        Args:
            x: shape (batch_size, T, input_dim), per-frame spatial features

        Returns:
            out: shape (batch_size, input_dim), aggregated spatial feature
        """
        encoded = self.transformer(x)      # (batch_size, T, input_dim)
        summary = encoded.mean(dim=1)      # mean pooling over time -> (batch_size, input_dim)
        return self.output_layer(summary)  # (batch_size, input_dim)


### **1단계: 일반 전처리** 
#### **(1) 프레임 추출 - extract_keyframes()** 
#### **(2) Optical Flow 계산 - compute_optical_flow_sequence()** 
#### **(3) 기본 증강 - ColorJitter, AddRain, AddFog, ToTensor() 등**
#### **(4) 데이터 구성 - DashcamDataset 또는 PreprocessDashcamDataset으로 구성**

### **2단계: AAT-DA 전용 전처리 (기존 데이터에서 Transformer용 구조 변환)**
##### AAT-DA는 단순한 영상 프레임이 아니라, "객체 중심의 시공간 Attention 입력 구조"를 요구하기 때문에 기존 전처리된 데이터를 바탕으로 추가적인 전처리가 필요
#### **(1) 객체 감지 - 프레임에서 객체 감지 (Cascade R-CNN 등)** 
#### **(2) 객체 특징 추출 - 감지된 박스마다 VGG16 FC7 feature 추출 (4096-dim)** 
#### **(3) 시선 맵 예측 - Gate-DAP 등으로 driver attention heatmap 생성**
#### **(4) 주의 가중치 계산 - 시선 맵 + 객체 위치 → 객체별 attention weight αᵢ 계산**
#### **(5) Feature 가중 - 객체 feature × αᵢ → 강조된 객체 feature**
#### **(6) 시퀀스 구성 - 모든 프레임의 결과를 (T, N, 4096) 시퀀스로 패딩 정리**