## **💡Reference Notebook - Fernandosr85 - Dashcam Collision Prediction Project 🚗**
#### **https://www.kaggle.com/code/fernandosr85/dashcam-collision-prediction-project**

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

max_files = 10 

count = 0

for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        count += 1
        if count >= max_files:
            break

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nexar-collision-prediction/sample_submission.csv
/kaggle/input/nexar-collision-prediction/train.csv
/kaggle/input/nexar-collision-prediction/test.csv
/kaggle/input/nexar-collision-prediction/test/02772.mp4
/kaggle/input/nexar-collision-prediction/test/02807.mp4
/kaggle/input/nexar-collision-prediction/test/02509.mp4
/kaggle/input/nexar-collision-prediction/test/00350.mp4
/kaggle/input/nexar-collision-prediction/test/02163.mp4
/kaggle/input/nexar-collision-prediction/test/02707.mp4
/kaggle/input/nexar-collision-prediction/test/02741.mp4
/kaggle/input/nexar-collision-prediction/train/02059.mp4


In [2]:
import os
import gc
import time
import warnings
from multiprocessing import Pool

import cv2
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torchvision
import torchvision.models

warnings.filterwarnings("ignore")

# Check GPU availability and set device
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

Using device: cpu


In [3]:
# Suppress unnecessary formatting warnings
import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)

# Paths to the CSV files
train_csv_path = '/kaggle/input/nexar-collision-prediction/train.csv'
test_csv_path = '/kaggle/input/nexar-collision-prediction/test.csv'
submission_csv_path = '/kaggle/input/nexar-collision-prediction/sample_submission.csv'

# Paths to the video directories
train_video_dir = '/kaggle/input/nexar-collision-prediction/train'
test_video_dir = '/kaggle/input/nexar-collision-prediction/test'

# Load the CSV files
train_df = pd.read_csv(train_csv_path)
test_df = pd.read_csv(test_csv_path)
submission_df = pd.read_csv(submission_csv_path)

# (추가) id 컬럼을 문자열(str)로 변환해서 .0 문제 없애기
train_df['id'] = train_df['id'].astype(str)

# Display the first few rows of the DataFrames
print("Train.csv:")
print(train_df.head())

print("\nTest.csv:")
print(test_df.head())

print("\nSample Submission:")
print(submission_df.head())

# Optional: handle NaN values if needed, filling with zero or another value
train_df['time_of_event'] = train_df['time_of_event'].fillna(0)
train_df['time_of_alert'] = train_df['time_of_alert'].fillna(0)

# (추가) Check the video directory paths
print("\nVideo Directory Paths:")
print(f"Train videos are located at: {train_video_dir}")
print(f"Test videos are located at: {test_video_dir}")


Train.csv:
     id  time_of_event  time_of_alert  target
0  1924            NaN            NaN       0
1   822           19.5         18.633       1
2  1429            NaN            NaN       0
3   208           19.8         19.233       1
4  1904            NaN            NaN       0

Test.csv:
    id
0  204
1   30
2  146
3   20
4  511

Sample Submission:
    id  target
0  204       0
1   30       0
2  146       0
3   20       0
4  511       0

Video Directory Paths:
Train videos are located at: /kaggle/input/nexar-collision-prediction/train
Test videos are located at: /kaggle/input/nexar-collision-prediction/test


## **Data Preprocessing and Feature Extraction** 

In [4]:
# 일반적으로 충돌이 발생하는 마지막 부분에 초점을 맞춰 비디오에서 주요 프레임을 추출
# 지수 분포를 사용하여 마지막에 가까운 프레임에 더 많은 가중치를 부여

def extract_keyframes(video_path, num_frames=12, target_size=(160, 160)):
    """
    Extracts key frames from the video, focusing on the final part where collisions typically occur.
    Uses exponential distribution to give more weight to frames closer to the end.
    """
    cap = cv2.VideoCapture(video_path) # 동영상을 불러오기 위해 OpenCV의 videoCapture 객체 생성 

    # 파일이 제대로 열리지 않았을 경우 대비한 예외 처리
    if not cap.isOpened():
        print(f"Could not open the video: {video_path}")
        return np.zeros((num_frames, target_size[0], target_size[1], 3), dtype=np.uint8)

    # 총 프레임 수와 초당 프레임 수(FPS)를 가져오기 
    frames = []
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    fps = cap.get(cv2.CAP_PROP_FPS)
    
    if total_frames <= 0:
        print(f"Video without frames: {video_path}")
        cap.release()
        return np.zeros((num_frames, target_size[0], target_size[1], 3), dtype=np.uint8)
    
    # 영상 길이(초 단위) 계산
    duration = total_frames / fps if fps > 0 else 0
    
    # 짧은 영상 (10초 미만): 균등한 간격으로 프레임 추출
    if duration < 10:
        frame_indices = np.linspace(0, total_frames - 1, num_frames, dtype=int)

    # 긴 영상 (10초 이상): 후반부에 더 집중해서 추출
    else:
        # 마지막 3초 동안 프레임의 80% 집중(중요 영역)
        end_frames = int(num_frames * 0.8)
        start_frames = num_frames - end_frames
        
        # 지난 3초 동안의 시작 인덱스를 계산
        last_seconds = 3
        last_frame_count = min(int(fps * last_seconds), total_frames - 1)
        start_idx = max(0, total_frames - last_frame_count)
        
        # 마지막 프레임에 더 많은 가중치를 부여하는 지수 분포
        # 이렇게 하면 마지막에 더 밀집된 인덱스가 생성된다 ("프레임을 뽑는 간격"자체를 조절 → 끝부분에 더 많이 몰리게 만드는 방식)
        end_indices = np.array([
            start_idx + int((total_frames - start_idx - 1) * (i/end_frames)**2) 
            for i in range(1, end_frames + 1)
        ])
        
        # context에 맞게 균일하게 배포된 초기 프레임 (초반부에서 균등하게 추출한 프레임들)
        # context란? 사고 직전에 어떤 상황이 펼쳐졌는지에 대한 흐름, 배경, 맥락 
        begin_indices = np.linspace(0, start_idx - 1, start_frames, dtype=int) if start_idx > 0 else np.zeros(start_frames, dtype=int)
        
        # 인덱스 결합
        frame_indices = np.concatenate([begin_indices, end_indices])
    
    # 선택한 프레임 추출 
    for idx in frame_indices:
        cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
        ret, frame = cap.read()
        if ret:
            # Use higher resolution and better interpolation
            frame = cv2.resize(frame, target_size, interpolation=cv2.INTER_LANCZOS4)
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frames.append(frame)
        else:
            frames.append(np.zeros((target_size[0], target_size[1], 3), dtype=np.uint8))
    
    cap.release()
    return np.array(frames, dtype=np.uint8)

# 먼저, 전역 범위에서 변환 클래스를 정의 
# 입력된 영상 프레임을 일정 확률로 좌우 반전시켜서, 데이터 다양성을 늘리는 역할
class RandomHorizontalFlip(object):
    def __init__(self, p=0.5):
        self.p = p
        
    def __call__(self, frames):
        if np.random.random() < self.p:
            return frames[:, :, ::-1, :].copy()  # horizontally flip each frame
        return frames

# 영상 프레임의 밝기와 대비를 무작위로 조정해, 다양한 조명 환경을 시뮬레이션하는 증강 클래스
class ColorJitter(object):
    def __init__(self, brightness=0, contrast=0):
        self.brightness = brightness
        self.contrast = contrast
        
    def __call__(self, frames):
        # Apply brightness jitter
        if self.brightness > 0:
            brightness_factor = np.random.uniform(max(0, 1-self.brightness), 1+self.brightness)
            frames = frames * brightness_factor
            frames = np.clip(frames, 0, 255)
        
        # Apply contrast jitter
        if self.contrast > 0:
            contrast_factor = np.random.uniform(max(0, 1-self.contrast), 1+self.contrast)
            frames = (frames - 128) * contrast_factor + 128
            frames = np.clip(frames, 0, 255)
            
        return frames

# 프레임에 흐릿한 안개 효과를 넣어, 시야가 나쁜 날씨 상황을 시뮬레이션하는 클래스
class AddFog(object):
    def __call__(self, frames):
        fog = np.random.uniform(0.7, 0.9, frames.shape).astype(np.float32)
        return frames * 0.8 + fog * 50  # Adjusted for 0-255 scale

# 프레임에 흰색 선형 노이즈(빗방울)를 추가해 비 오는 날씨를 시뮬레이션하는 클래스
class AddRain(object):
    def __call__(self, frames):
        h, w = frames.shape[1:3]
        rain = np.random.uniform(0, 1, (len(frames), h, w, 1)).astype(np.float32)
        rain = (rain > 0.97).astype(np.float32) * 200  # White rain drops
        return np.clip(frames * 0.9 + rain, 0, 255)  # Darken a bit and add drops

# 지정된 확률에 따라 어떤 변환을 적용할지 말지를 무작위로 결정하는 컨트롤러 클래스(랜덤성 부여)
class RandomApply(object):
    def __init__(self, transform, p=0.5):
        self.transform = transform
        self.p = p
        
    def __call__(self, frames):
        if np.random.random() < self.p:
            return self.transform(frames)
        return frames

# 여러 개의 변환(Flip, Jitter, Fog 등)을 순서대로 적용하는 데이터 증강 파이프라인 클래스
class Compose(object):
    def __init__(self, transforms):
        self.transforms = transforms
        
    def __call__(self, frames):
        for t in self.transforms:
            frames = t(frames)
        return frames

# 영상 프레임 배열을 PyTorch 텐서로 바꾸고, 픽셀 값을 0~1 범위로 정규화하는 클래스
class ToTensor(object):
    def __call__(self, frames):
        # Convert from (T, H, W, C) to (T, C, H, W)
        frames = frames.transpose(0, 3, 1, 2)
        # Convert to tensor and normalize to [0, 1]
        return torch.from_numpy(frames).float() / 255.0

In [5]:
# 동영상에서 데이터 증강을 위한 변환을 반환

def get_video_transforms():
    """
    Returns transformations for data augmentation in videos.
    """
    return {
        'train': Compose([
            RandomHorizontalFlip(p=0.5),
            ColorJitter(brightness=0.3, contrast=0.3),
            RandomApply(AddFog(), p=0.15),
            RandomApply(AddRain(), p=0.15),
            RandomApply(RandomNoise(0.05), p=0.2), 
            RandomApply(RandomOcclusion(), p=0.1),
            ToTensor()
        ]),
        'val': Compose([
            ToTensor()  # Only tensor conversion for validation
        ])
    }

# 비디오 프레임에서 무작위 가우시안(정규분포) 노이즈를 추가하여, 실제 촬영 환경에서 
# 발생할 수 있는 잡음에 대해 모델이 더 강건해지도록 만드는 클래스
class RandomNoise(object):
    """
    Applies random Gaussian noise to video frames for data augmentation.
    
    This transformation helps the model become more robust to noise
    that may be present in real-world video data.
    
    Args:
        std (float): Standard deviation of the Gaussian noise as a fraction
                     of the pixel value range (default: 0.05)
    """
    def __init__(self, std=0.05):
        self.std = std
        
    def __call__(self, frames):
        """
        Apply random noise to the input frames.
        
        Args:
            frames (numpy.ndarray): Input video frames of shape (T, H, W, C)
                                   where T is number of frames
        
        Returns:
            numpy.ndarray: Noise-augmented frames, clipped to valid pixel range [0, 255]
        """
        # 지정된 표준 편차를 가진 가우시안 노이즈 생성
        noise = np.random.normal(0, self.std * 255, frames.shape).astype(np.float32)
        
        # 유효한 픽셀 범위에 노이즈 및 클립 추가하기
        # 영상은 정수형 데이터여야 하므로 형 변환 (astype)
        return np.clip(frames + noise, 0, 255).astype(np.uint8)

# 영상 프레임에 검은색 사각형을 무작위로 덮어 씌워, 일부 정보가 가려졌을 때도 모델이 견딜 수 있도록 훈련시키는 클래스
class RandomOcclusion(object):
    """
    Simulates occlusion in video frames by adding black rectangles.
    
    This transformation helps the model learn to handle partial occlusions
    that may occur in real-world scenarios when objects block the camera view.
    """
    def __call__(self, frames):
        """
        Apply random occlusion to the input frames.
        
        Args:
            frames (numpy.ndarray): Input video frames of shape (T, H, W, C)
                                   where T is number of frames
        
        Returns:
            numpy.ndarray: Frames with random occlusion applied
        """
        # 프레임 하나의 세로(h), 가로(w) 길이 가져오기
        h, w = frames.shape[1:3]
        
        # 전체 프레임 크기의 10%~25% 사이 크기의 가림 영역 크기 설정
        occl_h = np.random.randint(int(h * 0.1), int(h * 0.25))
        occl_w = np.random.randint(int(w * 0.1), int(w * 0.25))
        
        # 이 가림 영역이 들어갈 무작위 위치 좌표 설정 
        occl_x = np.random.randint(0, w - occl_w)
        occl_y = np.random.randint(0, h - occl_h)
        
        # 원본 프레임을 수정하지 않도록 복사본 만들기
        frames_copy = frames.copy()
        
        # 픽셀을 0(검정색)으로 설정하여 모든 프레임에 occlusion 적용
        for i in range(len(frames)):
            frames_copy[i, occl_y:occl_y+occl_h, occl_x:occl_x+occl_w, :] = 0
            
        return frames_copy

In [6]:
# 비디오 프레임 간 움직임(모션)을 추적하는 'optical_flow'를 계산해, 객체나 배경의 이동 방향과 속도를 벡터 형태로 반환하는 함수
# 두 연속된 이미지(또는 프레임) 사이에서, 각 픽셀이 어떻게 이동했는지를 벡터로 표현하는 기술 -> optical_flow
# Farneback 방식만 사용
# "모든 픽셀의 방향 + 속도 정보를 다 남김"
def compute_optical_flow_sequence(frames, skip_frames=1):
    """
    Calculates per-frame optical flow magnitudes as a sequence.
    
    Args:
        frames (numpy.ndarray): (T, H, W, C)
        
    Returns:
        numpy.ndarray: (T, 1) array of flow magnitudes (first frame is 0)
    """
    T = len(frames)
    if T < 2:
        return np.zeros((T, 1), dtype=np.float32)
    
    magnitudes = [0.0]  # 첫 프레임은 optical flow가 없으니 0으로 채움

    prev_gray = cv2.cvtColor(frames[0], cv2.COLOR_RGB2GRAY)
    
    for i in range(1, T, skip_frames):
        curr_gray = cv2.cvtColor(frames[i], cv2.COLOR_RGB2GRAY)
        try:
            flow = cv2.calcOpticalFlowFarneback(
                prev_gray, curr_gray,
                None, 0.5, 3, 15, 3, 5, 1.2, 0
            )
            flow_magnitude = np.linalg.norm(flow, axis=-1).mean()  # (H, W) → scalar mean
            magnitudes.append(flow_magnitude)
        except Exception as e:
            print(f"Error calculating optical flow: {str(e)}")
            magnitudes.append(0.0)
        
        prev_gray = curr_gray

    # 길이가 부족하면 padding
    while len(magnitudes) < T:
        magnitudes.append(0.0)
    
    return np.array(magnitudes, dtype=np.float32).reshape(T, 1)  # (T, 1)


In [7]:
import warnings
warnings.filterwarnings('ignore')

# Deep Learning
import tensorflow as tf
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.applications import InceptionV3
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler
from tensorflow.keras.applications.inception_v3 import preprocess_input
from tensorflow.keras.applications import EfficientNetB0

2025-05-03 06:21:03.293860: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746253263.565585      13 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746253263.640758      13 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


**Positive이면 alert_event 사이만 뽑고, Negative이면 마지막 3초 구간을 기준으로 추출**
**그리고 num_frames만큼 균등하게 뽑고 CNN + Optical Flow 둘 다 계산** 

In [8]:
# InceptionV3 모델로 특성 추출
base_model = InceptionV3(weights='imagenet', include_top=False, pooling='avg')
cnn_feature_dim = base_model.output_shape[-1]

def get_hybrid_feature_sequence(video_path, num_frames=12):
    """
    Extract per-frame hybrid features (CNN + Optical flow) as a sequence.
    
    Args:
        video_path (str): Path to video file.
        num_frames (int): Number of frames to extract.
    
    Returns:
        np.ndarray: (T, 1281) array of per-frame features.
    """
    # 1. 프레임 추출
    frames = extract_keyframes(video_path, num_frames=num_frames, target_size=(160,160))
    
    if len(frames) == 0:
        print(f"Skipping {video_path}: no frames")
        return np.zeros((num_frames, 1281), dtype=np.float32)

    # 2. CNN feature per frame (Inception expects (N, H, W, C))
    spatial_features = base_model.predict(
        preprocess_input(frames.astype('float32')),
        batch_size=32,
        verbose=0
    )  # shape: (T, 1280)

    # 3. Optical flow sequence
    flow_magnitudes = compute_optical_flow_sequence(frames)  # shape: (T, 1)

    # 4. Concatenate per frame
    hybrid_features = np.concatenate([spatial_features, flow_magnitudes], axis=1)  # (T, 1281)

    return hybrid_features


2025-05-03 06:21:18.364151: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/inception_v3/inception_v3_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m87910968/87910968[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [9]:
def get_hybrid_feature_sequence_from_frames(frames):
    """
    Extract per-frame hybrid features (CNN + Optical flow) from pre-loaded frames.
    
    Args:
        frames (torch.Tensor): (T, 3, 160, 160) tensor (after transform).
    
    Returns:
        np.ndarray: (T, 1281) array of per-frame features.
    """
    if len(frames) == 0:
        print("Warning: empty frames input")
        return np.zeros((1, 1281), dtype=np.float32)

    # 1️⃣ PyTorch tensor → numpy (T, 160, 160, 3), [0, 255] scale
    frames_np = frames.permute(0, 2, 3, 1).numpy() * 255.0  # [0,1] → [0,255]
    frames_np = frames_np.astype(np.uint8)

    # 2️⃣ CNN Features per frame
    spatial_features = base_model.predict(
        preprocess_input(frames_np.astype('float32')),
        batch_size=32,
        verbose=0
    )  # shape: (T, 1280)

    # 3️⃣ Optical Flow per frame
    flow_magnitudes = compute_optical_flow_sequence(frames_np)  # shape: (T, 1)

    # 4️⃣ Concatenate → (T, 1281)
    hybrid_features = np.concatenate([spatial_features, flow_magnitudes], axis=1)

    return hybrid_features


In [10]:
def compute_optical_flow_sequence(frames, skip_frames=1):
    """
    Computes per-frame optical flow magnitudes.
    
    Args:
        frames (np.ndarray): (T, H, W, 3) numpy array of frames.
    
    Returns:
        np.ndarray: (T, 1) array of per-frame optical flow magnitudes.
    """
    T = len(frames)
    if T < 2:
        return np.zeros((T, 1), dtype=np.float32)

    magnitudes = []

    prev_gray = cv2.cvtColor(frames[0], cv2.COLOR_RGB2GRAY)

    for i in range(1, T, skip_frames):
        curr_gray = cv2.cvtColor(frames[i], cv2.COLOR_RGB2GRAY)
        try:
            flow = cv2.calcOpticalFlowFarneback(prev_gray, curr_gray,
                                                None, 0.5, 3, 15, 3, 5, 1.2, 0)
            # magnitude = sqrt(u^2 + v^2)
            mag = np.linalg.norm(flow, axis=-1)  # shape (H, W)
            avg_mag = np.mean(mag)  # scalar
            magnitudes.append(avg_mag)
        except Exception as e:
            print(f"Error calculating flow at frame {i}: {str(e)}")
            magnitudes.append(0.0)

        prev_gray = curr_gray

    # 마지막 길이 맞춤 (T, 1)
    if len(magnitudes) < T:
        magnitudes.append(0.0)  # 마지막 프레임은 flow가 없음

    magnitudes = np.array(magnitudes, dtype=np.float32).reshape(-1, 1)  # (T, 1)

    return magnitudes


In [11]:
# 아직도 최종적으로 Transformer에 넣은 (T, 1281) 시퀀스는 만들어지지 않음
# 1. 프레임별 CNN Feature 추출 (InceptionV3) 추출
# 2. optical flow sequence (compute_optical_flow_sequence) 추출
# 3. 두 결과물 concat
# 4. 이걸 Transformer의 input 시퀀스로 사용

# 전체 처리 함수 (이제 둘을 결합하는 함수 생성)
# CNN + optical flow 붙여서 (T, 1281) 만들어주는 함수 

def prepare_transformer_input(video_path, num_frames=12, target_size=(160, 160)):
    """
    Prepares (T, 1281) input sequence combining CNN features + optical flow
    for a given video.
    """
    # === Step 1: Extract frames ===
    frames = extract_keyframes(video_path, num_frames=num_frames, target_size=target_size)
    if frames.shape[0] == 0:
        print(f"Skipping video {video_path} (no frames)")
        return None  # or np.zeros((num_frames, 1281)) as fallback

    # === Step 2: Extract CNN (spatial) features per frame ===
    frames_float = preprocess_input(frames.astype('float32'))  # preprocess for InceptionV3
    spatial_features = base_model.predict(frames_float, batch_size=32, verbose=0)  # (T, 1280)

    # === Step 3: Compute optical flow sequence ===
    optical_flow_sequence = compute_optical_flow_sequence(frames)  # (T, 1)

    # === Step 4: Combine both ===
    combined_features = np.concatenate([spatial_features, optical_flow_sequence], axis=1)  # (T, 1281)

    return combined_features

In [12]:
# 중간 저장 포함 코드
import numpy as np
import os
from tqdm import tqdm

# 1️⃣ transform 준비
transforms = get_video_transforms()
train_transform = transforms['train']

# 2️⃣ 저장할 폴더 설정
output_dir = '/kaggle/working/'
os.makedirs(output_dir, exist_ok=True)

# 3️⃣ feature 저장용 리스트
all_sequences = []

# 4️⃣ 중간 저장 주기
save_every = 50

# 5️⃣ 반복
for idx, row in tqdm(train_df.iterrows(), total=len(train_df)):
    video_id = row['id']
    video_path = os.path.join(train_video_dir, f"{int(video_id):05d}.mp4")

    sequence = prepare_transformer_input(video_path, num_frames=12)

    if sequence is None:
        print(f"Skipping video {video_id} (no valid sequence)")
        continue

    all_sequences.append(sequence)

    # 🔥 N개마다 중간 저장
    if (idx + 1) % save_every == 0:
        partial_path = os.path.join(output_dir, f'all_sequences_partial_{idx+1}.npy')
        np.save(partial_path, np.array(all_sequences))
        print(f"Saved {idx + 1} sequences → {partial_path}")

# 6️⃣ 최종 저장
final_path = os.path.join(output_dir, 'all_sequences_final.npy')
np.save(final_path, np.array(all_sequences))
print(f"\nFinal saved → {final_path}")


  3%|▎         | 50/1500 [03:57<1:42:41,  4.25s/it]

Saved 50 sequences → /kaggle/working/all_sequences_partial_50.npy


  7%|▋         | 100/1500 [07:57<2:05:28,  5.38s/it]

Saved 100 sequences → /kaggle/working/all_sequences_partial_100.npy


 10%|█         | 150/1500 [11:51<1:42:24,  4.55s/it]

Saved 150 sequences → /kaggle/working/all_sequences_partial_150.npy


 13%|█▎        | 200/1500 [15:51<1:48:44,  5.02s/it]

Saved 200 sequences → /kaggle/working/all_sequences_partial_200.npy


 17%|█▋        | 250/1500 [19:44<1:45:17,  5.05s/it]

Saved 250 sequences → /kaggle/working/all_sequences_partial_250.npy


 20%|██        | 300/1500 [23:40<1:37:48,  4.89s/it]

Saved 300 sequences → /kaggle/working/all_sequences_partial_300.npy


 23%|██▎       | 350/1500 [27:39<1:20:30,  4.20s/it]

Saved 350 sequences → /kaggle/working/all_sequences_partial_350.npy


 27%|██▋       | 400/1500 [31:42<1:29:44,  4.89s/it]

Saved 400 sequences → /kaggle/working/all_sequences_partial_400.npy


 30%|███       | 450/1500 [35:48<1:28:02,  5.03s/it]

Saved 450 sequences → /kaggle/working/all_sequences_partial_450.npy


 33%|███▎      | 500/1500 [39:56<1:19:01,  4.74s/it]

Saved 500 sequences → /kaggle/working/all_sequences_partial_500.npy


 37%|███▋      | 550/1500 [43:52<1:21:18,  5.14s/it]

Saved 550 sequences → /kaggle/working/all_sequences_partial_550.npy


 40%|████      | 600/1500 [47:55<1:16:37,  5.11s/it]

Saved 600 sequences → /kaggle/working/all_sequences_partial_600.npy


 43%|████▎     | 650/1500 [51:52<1:15:37,  5.34s/it]

Saved 650 sequences → /kaggle/working/all_sequences_partial_650.npy


 47%|████▋     | 700/1500 [55:32<1:00:59,  4.57s/it]

Saved 700 sequences → /kaggle/working/all_sequences_partial_700.npy


 50%|█████     | 750/1500 [59:28<52:49,  4.23s/it]

Saved 750 sequences → /kaggle/working/all_sequences_partial_750.npy


 53%|█████▎    | 800/1500 [1:03:41<55:48,  4.78s/it]

Saved 800 sequences → /kaggle/working/all_sequences_partial_800.npy


 57%|█████▋    | 850/1500 [1:07:35<57:34,  5.31s/it]

Saved 850 sequences → /kaggle/working/all_sequences_partial_850.npy


 60%|██████    | 900/1500 [1:11:49<44:24,  4.44s/it]

Saved 900 sequences → /kaggle/working/all_sequences_partial_900.npy


 63%|██████▎   | 950/1500 [1:15:53<38:06,  4.16s/it]

Saved 950 sequences → /kaggle/working/all_sequences_partial_950.npy


 67%|██████▋   | 1000/1500 [1:19:46<41:36,  4.99s/it]

Saved 1000 sequences → /kaggle/working/all_sequences_partial_1000.npy


 70%|███████   | 1050/1500 [1:23:39<29:18,  3.91s/it]

Saved 1050 sequences → /kaggle/working/all_sequences_partial_1050.npy


 73%|███████▎  | 1100/1500 [1:27:48<33:54,  5.09s/it]

Saved 1100 sequences → /kaggle/working/all_sequences_partial_1100.npy


 77%|███████▋  | 1150/1500 [1:31:38<29:59,  5.14s/it]

Saved 1150 sequences → /kaggle/working/all_sequences_partial_1150.npy


 80%|████████  | 1200/1500 [1:35:27<22:51,  4.57s/it]

Saved 1200 sequences → /kaggle/working/all_sequences_partial_1200.npy


 83%|████████▎ | 1250/1500 [1:39:15<22:36,  5.43s/it]

Saved 1250 sequences → /kaggle/working/all_sequences_partial_1250.npy


 87%|████████▋ | 1300/1500 [1:43:18<16:06,  4.83s/it]

Saved 1300 sequences → /kaggle/working/all_sequences_partial_1300.npy


 90%|█████████ | 1350/1500 [1:47:30<12:05,  4.83s/it]

Saved 1350 sequences → /kaggle/working/all_sequences_partial_1350.npy


 93%|█████████▎| 1400/1500 [1:51:33<07:57,  4.77s/it]

Saved 1400 sequences → /kaggle/working/all_sequences_partial_1400.npy


 97%|█████████▋| 1450/1500 [1:55:39<04:12,  5.05s/it]

Saved 1450 sequences → /kaggle/working/all_sequences_partial_1450.npy


100%|██████████| 1500/1500 [1:59:44<00:00,  4.79s/it]

Saved 1500 sequences → /kaggle/working/all_sequences_partial_1500.npy






Final saved → /kaggle/working/all_sequences_final.npy


In [13]:
import os
import numpy as np

# 경로 설정
output_dir = '/kaggle/working/'
final_file = os.path.join(output_dir, 'all_sequences_final.npy')
combined_save_path = os.path.join(output_dir, 'all_sequences_combined.npy')

# ✅ 최종 저장된 파일만 불러오기
final_array = np.load(final_file)
print(f"Final array shape: {final_array.shape}")

# ✅ combined 파일로 따로 저장 (만약 필요할 때 대비)
np.save(combined_save_path, final_array)
print(f"Combined array saved to: {combined_save_path}")


Final array shape: (1500, 12, 2049)
Combined array saved to: /kaggle/working/all_sequences_combined.npy


In [14]:
# (1) combined feature 불러오기
import numpy as np
all_sequences = np.load('/kaggle/working/all_sequences_combined.npy')
print(all_sequences.shape)  # → (n_videos, 12, 2049) 같은 출력 확인

# (2) train.csv 다시 로드
import pandas as pd
train_df = pd.read_csv('/kaggle/input/nexar-collision-prediction/train.csv')

# (3) label 추출
labels = train_df['target'].values  # shape (n_videos,)


(1500, 12, 2049)


In [15]:
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader, random_split

# 1️⃣ 저장된 feature 불러오기
all_sequences = np.load('all_sequences_combined.npy')
print(f"Loaded all_sequences shape: {all_sequences.shape}")  # (1500, 12, 2049)

# 2️⃣ train_df에서 label 불러오기 (주의: feature 개수에 맞게 잘라주기!)
labels = train_df['target'].values[:all_sequences.shape[0]]  # shape (1500,)
print(f"Labels shape: {labels.shape}")

# 3️⃣ Dataset 클래스 정의
class VideoSequenceDataset(Dataset):
    def __init__(self, sequences, labels):
        """
        Args:
            sequences (numpy.ndarray): shape (n_samples, T, feature_dim)
            labels (numpy.ndarray or list): shape (n_samples,)
        """
        self.sequences = torch.tensor(sequences, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.float32)

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        x = self.sequences[idx]  # (T, feature_dim)
        y = self.labels[idx]     # scalar or class
        return x, y

# 4️⃣ Dataset 객체 생성
# Dataset 객체 생성
dataset = VideoSequenceDataset(all_sequences, labels)

# 정확한 길이 체크
print(f"Dataset length: {len(dataset)}")  # 꼭 찍어봐!

# Train/Val split
train_size = int(0.8 * len(dataset))  # 80% split → 1200 if 1500 total
val_size = len(dataset) - train_size

print(f"Train size: {train_size}, Val size: {val_size}")

train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# DataLoader
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)


Loaded all_sequences shape: (1500, 12, 2049)
Labels shape: (1500,)
Dataset length: 1500
Train size: 1200, Val size: 300


In [16]:
# # DataLoader 생성
# from torch.utils.data import DataLoader, random_split

# dataset = VideoSequenceDataset(all_sequences, labels)

# # Train/Val 분할
# train_size = int(0.8 * len(dataset))
# val_size = len(dataset) - train_size
# train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
# val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

In [17]:
# Temporal Transformer 모델 설계
import torch
import torch.nn as nn

class TemporalTransformerModel(nn.Module):
    def __init__(self, input_dim=1281, embed_dim=256, num_heads=4, num_layers=2, dropout=0.1):
        super(TemporalTransformerModel, self).__init__()

        # 1. Input → embedding layer
        self.input_proj = nn.Linear(input_dim, embed_dim)

        # 2. Transformer Encoder
        encoder_layer = nn.TransformerEncoderLayer(d_model=embed_dim, nhead=num_heads, dropout=dropout, batch_first=True)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        # 3. Classification head (binary classification)
        self.classifier = nn.Sequential(
            nn.Linear(embed_dim, 128),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(128, 1),
            nn.Sigmoid()  # Binary output (0~1)
        )

    def forward(self, x):
        """
        x: (batch_size, T, input_dim)
        """
        # Step 1: Project input features
        x = self.input_proj(x)  # → (batch_size, T, embed_dim)

        # Step 2: Apply Transformer Encoder
        x = self.transformer_encoder(x)  # → (batch_size, T, embed_dim)

        # Step 3: Aggregate (mean pooling over time)
        x = x.mean(dim=1)  # → (batch_size, embed_dim)

        # Step 4: Final classification
        out = self.classifier(x)  # → (batch_size, 1)

        return out

In [18]:
# Spatial Transformer 모델 설계

import torch
import torch.nn as nn

class SpatialTransformer(nn.Module):
    def __init__(self, input_dim=1280, hidden_dim=512, num_heads=8, num_layers=2, dropout=0.1):
        super(SpatialTransformer, self).__init__()

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=input_dim, nhead=num_heads, dim_feedforward=hidden_dim, dropout=dropout, batch_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        # 마지막 summary를 위한 pooling 또는 projection
        self.output_layer = nn.Linear(input_dim, input_dim)

    def forward(self, x):
        """
        Args:
            x: shape (batch_size, T, input_dim) → per-frame spatial features
        
        Returns:
            out: shape (batch_size, input_dim) → aggregated spatial feature
        """
        # transformer expects (batch_size, T, input_dim)
        x_transformed = self.transformer(x)  # (batch_size, T, input_dim)

        # Pooling over time (mean pooling)
        x_pooled = x_transformed.mean(dim=1)  # (batch_size, input_dim)

        out = self.output_layer(x_pooled)  # (batch_size, input_dim)
        return out


In [19]:
class CollisionPredictionModel(nn.Module):
    def __init__(self, temporal_input_dim=2049, spatial_input_dim=1280, embed_dim=256, dropout=0.1):
        super(CollisionPredictionModel, self).__init__()

        self.temporal_transformer = TemporalTransformerModel(
            input_dim=temporal_input_dim, embed_dim=embed_dim
        )
        self.spatial_transformer = SpatialTransformer(
            input_dim=spatial_input_dim
        )

        fused_dim = embed_dim + spatial_input_dim  # Temporal + Spatial 출력 연결

        self.classifier = nn.Sequential(
            nn.Linear(fused_dim, 256),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(256, 1),
            nn.Sigmoid()  # Binary classification
        )

    def forward(self, temporal_input, spatial_input):
        """
        temporal_input: (batch, T, 2049)
        spatial_input: (batch, T, 1280)
        """
        temporal_out = self.temporal_transformer(temporal_input)  # (batch, embed_dim)
        spatial_out = self.spatial_transformer(spatial_input)      # (batch, spatial_input_dim)

        # Fusion: concatenate
        fused = torch.cat([temporal_out, spatial_out], dim=1)  # (batch, fused_dim)

        out = self.classifier(fused)  # (batch, 1)

        return out


In [20]:
import torch
import torch.nn as nn

class TemporalTransformerModel(nn.Module):
    def __init__(self, input_dim, embed_dim=256, num_heads=4, num_layers=2, dropout=0.1):
        super(TemporalTransformerModel, self).__init__()

        self.input_proj = nn.Linear(input_dim, embed_dim)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim, nhead=num_heads, dropout=dropout, batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

    def forward(self, x):
        x = self.input_proj(x)            # (batch, T, embed_dim)
        x = self.transformer_encoder(x)   # (batch, T, embed_dim)
        x = x.mean(dim=1)                 # (batch, embed_dim)
        return x


class SpatialTransformer(nn.Module):
    def __init__(self, input_dim=1280, hidden_dim=512, num_heads=8, num_layers=2, dropout=0.1):
        super(SpatialTransformer, self).__init__()

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=input_dim, nhead=num_heads, dim_feedforward=hidden_dim, dropout=dropout, batch_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.output_layer = nn.Linear(input_dim, input_dim)

    def forward(self, x):
        x_transformed = self.transformer(x)  # (batch, T, input_dim)
        x_pooled = x_transformed.mean(dim=1) # (batch, input_dim)
        out = self.output_layer(x_pooled)    # (batch, input_dim)
        return out


class CombinedModel(nn.Module):
    def __init__(self, temporal_input_dim=2049, spatial_input_dim=1280,
                 temporal_embed_dim=256, combined_dim=256, dropout=0.1):
        super(CombinedModel, self).__init__()

        self.temporal_transformer = TemporalTransformerModel(
            input_dim=temporal_input_dim, embed_dim=temporal_embed_dim
        )
        self.spatial_transformer = SpatialTransformer(
            input_dim=spatial_input_dim
        )

        # temporal (256) + spatial (1280) = 1536
        self.classifier = nn.Sequential(
            nn.Linear(1536, combined_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(combined_dim, 1),
            nn.Sigmoid()
        )

    def forward(self, temporal_input, spatial_input):
        temporal_out = self.temporal_transformer(temporal_input)  # (batch, 256)
        spatial_out = self.spatial_transformer(spatial_input)     # (batch, 1280)

        combined = torch.cat([temporal_out, spatial_out], dim=1) # (batch, 1536)
        out = self.classifier(combined)                          # (batch, 1)

        return out


In [21]:
import torch
import torch.nn as nn
import torch.optim as optim

# 모델 준비
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = CombinedModel(
    temporal_input_dim=2049, spatial_input_dim=1280,
    temporal_embed_dim=256, combined_dim=256
).to(device)

# 손실 함수 및 옵티마이저
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

num_epochs = 20

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0

    for inputs, labels in train_loader:
        inputs = inputs.to(device)  # (batch, T, 2049)
        labels = labels.to(device).unsqueeze(1)  # (batch, 1)

        # temporal_input = spatial(1280) + flow(1) + 추가 optical flow들 → (2049)
        temporal_input = inputs[:, :, :2049]

        # spatial_input = spatial part only → (1280)
        spatial_input = inputs[:, :, :1280]

        # forward
        outputs = model(temporal_input, spatial_input)
        loss = criterion(outputs, labels)

        # backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    avg_loss = running_loss / len(train_loader)
    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {avg_loss:.4f}")

    # === Validation ===
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device).unsqueeze(1)
    
            temporal_input = inputs[:, :, :2049]
            spatial_input = inputs[:, :, :1280]
    
            outputs = model(temporal_input, spatial_input)
            predicted = (outputs > 0.5).float()
    
            total += labels.size(0)
            correct += (predicted == labels).sum().item()


    val_acc = correct / total
    print(f"Validation Accuracy: {val_acc:.4f}")


Epoch [1/20], Loss: 0.6834
Validation Accuracy: 0.5667
Epoch [2/20], Loss: 0.6250
Validation Accuracy: 0.6667
Epoch [3/20], Loss: 0.5442
Validation Accuracy: 0.6433
Epoch [4/20], Loss: 0.4935
Validation Accuracy: 0.6467
Epoch [5/20], Loss: 0.3240
Validation Accuracy: 0.6667
Epoch [6/20], Loss: 0.2347
Validation Accuracy: 0.6400
Epoch [7/20], Loss: 0.1393
Validation Accuracy: 0.6067
Epoch [8/20], Loss: 0.0583
Validation Accuracy: 0.5967
Epoch [9/20], Loss: 0.0777
Validation Accuracy: 0.6500
Epoch [10/20], Loss: 0.0611
Validation Accuracy: 0.6800
Epoch [11/20], Loss: 0.0505
Validation Accuracy: 0.6400
Epoch [12/20], Loss: 0.0149
Validation Accuracy: 0.6633
Epoch [13/20], Loss: 0.0724
Validation Accuracy: 0.6033
Epoch [14/20], Loss: 0.0141
Validation Accuracy: 0.6533
Epoch [15/20], Loss: 0.0007
Validation Accuracy: 0.6600
Epoch [16/20], Loss: 0.0003
Validation Accuracy: 0.6467
Epoch [17/20], Loss: 0.0001
Validation Accuracy: 0.6500
Epoch [18/20], Loss: 0.0001
Validation Accuracy: 0.6467
E

In [22]:
# 모델 저장 
torch.save(model.state_dict(), 'best_model.pth')

In [23]:
# 같은 모델 아키텍처 준비 (모델 불러오기)
model = CombinedModel(
    temporal_input_dim=2049,  # 주의: 학습할 때와 동일해야 한다
    spatial_input_dim=1280,
    temporal_embed_dim=256,
    combined_dim=256
)
model.load_state_dict(torch.load('best_model.pth'))
model.eval()


CombinedModel(
  (temporal_transformer): TemporalTransformerModel(
    (input_proj): Linear(in_features=2049, out_features=256, bias=True)
    (transformer_encoder): TransformerEncoder(
      (layers): ModuleList(
        (0-1): 2 x TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
          )
          (linear1): Linear(in_features=256, out_features=2048, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=2048, out_features=256, bias=True)
          (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.1, inplace=False)
          (dropout2): Dropout(p=0.1, inplace=False)
        )
      )
    )
  )
  (spatial_transformer): SpatialTransformer(
    (transformer): TransformerEncoder(
      (layers): Module

In [24]:
test_sequences = []
for _, row in tqdm(test_df.iterrows(), total=len(test_df), desc="Processing Test Videos"):
    video_path = f"{test_video_dir}/{int(float(row['id'])):05d}.mp4"
    sequence = prepare_transformer_input(video_path, num_frames=12)
    if sequence is not None:
        test_sequences.append(sequence)

test_sequences = np.array(test_sequences)  # shape: (n_test, 12, 2049)


Processing Test Videos: 100%|██████████| 1344/1344 [1:39:29<00:00,  4.44s/it]


In [25]:
# 테스트용 Dataset, DataLoader

import torch
from torch.utils.data import DataLoader

# 테스트용: dummy labels (예측용이라 실제 label은 필요 없음)
dummy_labels = np.zeros(len(test_sequences))

# Dataset
test_dataset = VideoSequenceDataset(test_sequences, dummy_labels)

# DataLoader
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)


In [26]:
# 테스트 데이터 예측 실행

model.eval()
all_predictions = []

with torch.no_grad():
    for inputs, _ in test_loader:
        inputs = inputs.to(device)
        temporal_input = inputs[:, :, :2049]
        spatial_input = inputs[:, :, :1280]
        outputs = model(temporal_input, spatial_input)
        all_predictions.extend(outputs.cpu().numpy().flatten())

all_predictions = np.array(all_predictions)
print(f"Predictions shape: {all_predictions.shape}")


Predictions shape: (1344,)


In [27]:
# Kaggle 제출용 CSV
submission = pd.DataFrame({
    'id': test_df['id'],
    'score': all_predictions
})

submission.to_csv('submission.csv', index=False)
print("Saved submission.csv!")

# 요약 확인
print(submission.describe())

Saved submission.csv!
                id         score
count  1344.000000  1.344000e+03
mean   1906.876488  4.461332e-01
std     847.105655  4.724515e-01
min       1.000000  4.918239e-07
25%    1206.750000  7.033291e-05
50%    2243.500000  1.134222e-01
75%    2579.250000  9.994114e-01
max    2915.000000  9.999987e-01
