In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm
import random
import os
import math

def seed_everything(seed):
    """Seed every RNG used by the notebook and request deterministic cuDNN kernels.

    Args:
        seed: Integer seed applied to `random`, NumPy and PyTorch (CPU and every
            CUDA device). It is also exported as ``PYTHONHASHSEED``.
    """
    random.seed(seed)
    # Only influences hash randomisation of interpreters started after this point.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seeds all visible GPUs; silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune and pick different (possibly
    # non-deterministic) kernels between runs, which defeats the flag above.
    torch.backends.cudnn.benchmark = False

SEED = 42
seed_everything(SEED)
if torch.cuda.is_available():
    DEVICE = torch.device('cuda')
else:
    DEVICE = torch.device('cpu')
print("Device:", DEVICE)

# ======================================================
# [V8] Single Bi-LSTM + Attention model
# ======================================================
# Optimisation / schedule
BATCH_SIZE = 64
LR_BASE = 1e-3
WARMUP_EPOCHS = 3
EPOCHS_BASE = 40
GRAD_CLIP = 1.0

# Regularisation and sequence truncation
DROPOUT = 0.2
MAX_SEQ_LEN = 30

# LSTM parameters
HIDDEN_DIM = 256       # LSTM hidden dimension
LSTM_LAYERS = 2        # number of stacked LSTM layers
BIDIRECTIONAL = True   # run the LSTM in both directions

# One call, one line per entry: the emitted text is the same as separate prints.
print(
    "[V8] Bi-LSTM + Attention ë‹¨ì¼ ëª¨ë¸",
    f"  - Hidden Dim: {HIDDEN_DIM}",
    f"  - Layers: {LSTM_LAYERS}",
    f"  - Bidirectional: {BIDIRECTIONAL}",
    sep="\n",
)

In [None]:
# ======================================================
# 1. Path setup and data loading
# ======================================================
BASE_DIR = "./open_track1"  # folder that holds the data (adjust to your environment)
if not os.path.exists(BASE_DIR): BASE_DIR = "."  # fall back to the current directory

TRAIN_PATH = os.path.join(BASE_DIR, "train.csv")
TEST_META_PATH = os.path.join(BASE_DIR, "test.csv")  # index file listing the per-episode test CSVs
MATCH_PATH = os.path.join(BASE_DIR, "match_info.csv")

# 1. Load train
train_df = pd.read_csv(TRAIN_PATH)
print(f"âœ… Train Loaded: {train_df.shape}")

# 2. Load every test CSV referenced by the index file and concatenate them
if not os.path.exists(TEST_META_PATH):
    raise FileNotFoundError("test.csv íŒŒì¼ì´ ì—†ìŠµë‹ˆë‹¤.")

test_meta = pd.read_csv(TEST_META_PATH)
print(f"â„¹ï¸ Reading {len(test_meta)} test files...")

test_dfs = []
missing_test_paths = []
for _, row in tqdm(test_meta.iterrows(), total=len(test_meta), desc="Loading Test CSVs"):
    rel_path = str(row['path'])

    # Candidate locations: 1) as given, 2) relative to BASE_DIR, 3) BASE_DIR/test/<game_id>/<file>.
    # normpath drops a leading "./" without eating other leading dots or slashes
    # (str.lstrip("./") strips a *character set*, so it would mangle "../x" or ".hidden/x").
    paths_to_try = [
        rel_path,
        os.path.join(BASE_DIR, os.path.normpath(rel_path)),
        os.path.join(BASE_DIR, "test", str(row['game_id']), os.path.basename(rel_path)),
    ]

    found_path = next((p for p in paths_to_try if os.path.exists(p)), None)
    if found_path is None:
        # Remember the miss so it is reported instead of silently dropping the episode.
        missing_test_paths.append(rel_path)
        continue
    test_dfs.append(pd.read_csv(found_path))

if not test_dfs:
    raise FileNotFoundError("Test íŒŒì¼ë“¤ì„ ì°¾ì„ ìˆ˜ ì—†ìŠµë‹ˆë‹¤.")
if missing_test_paths:
    print(f"WARNING: {len(missing_test_paths)} of {len(test_meta)} listed test files were not found "
          f"and were skipped (first missing: {missing_test_paths[0]})")

test_df = pd.concat(test_dfs, ignore_index=True)
print(f"âœ… Test Data Merged: {test_df.shape}")

# 3. Merge match info (home team and venue) onto both frames
if os.path.exists(MATCH_PATH):
    match_info = pd.read_csv(MATCH_PATH)
    match_subset = match_info[['game_id', 'home_team_id', 'venue']]
    train_df = pd.merge(train_df, match_subset, on='game_id', how='left')
    test_df = pd.merge(test_df, match_subset, on='game_id', how='left')

# 4. ì „ì²˜ë¦¬ (ê²°ì¸¡ì¹˜ ì±„ìš°ê¸°)
def preprocess(df):
    """Add the `is_home` flag and make sure `end_x` / `end_y` exist without NaNs.

    The frame is modified in place and the same object is returned.

    Args:
        df: Event DataFrame with a `team_id` column and, optionally,
            `home_team_id`, `end_x` and `end_y`.

    Returns:
        The same DataFrame, with `is_home` set to 1.0 / 0.0 (or 0.5 everywhere
        when `home_team_id` is absent) and missing end coordinates set to 0.0.
    """
    if 'home_team_id' not in df.columns:
        # No home/away information available: use a neutral value.
        df['is_home'] = 0.5
    else:
        acting_team_is_home = df['team_id'] == df['home_team_id']
        df['is_home'] = acting_team_is_home.astype(float)

    # Downstream feature code expects end coordinates; the presence of `end_x`
    # decides whether the columns are filled or created from scratch.
    end_columns = ('end_x', 'end_y')
    if 'end_x' in df.columns:
        for column in end_columns:
            df[column] = df[column].fillna(0.0)
    else:
        for column in end_columns:
            df[column] = 0.0
    return df

# Apply the shared clean-up to both splits (train first, then test).
train_df, test_df = preprocess(train_df), preprocess(test_df)

# Episode key: prefer `game_episode`, otherwise fall back to `episode_id`.
if 'game_episode' in train_df.columns:
    ID_COL = 'game_episode'
else:
    ID_COL = 'episode_id'
print(f"Data Ready. ID Column: {ID_COL}")

In [None]:
# 1. ë²”ì£¼í˜• ë³€ìˆ˜ ì •ì˜ (Train ë°ì´í„° ë¶„ì„ ê¸°ë°˜)
TOP_TYPES = ['Pass', 'Carry', 'Recovery', 'Interception', 'Duel', 'Tackle',
             'Throw-In', 'Clearance', 'Intervention', 'Block', 'Pass_Freekick',
             'Cross', 'Goal Kick', 'Error', 'Shot']
ALL_RESULTS = ['Successful', 'Unsuccessful', 'On Target', 'Yellow_Card',
               'Blocked', 'Keeper Rush-Out', 'Low Quality Shot', 'Off Target']

# Pitch geometry in metres; raw coordinates are divided by these to normalise them.
_PITCH_LENGTH = 105.0
_PITCH_WIDTH = 68.0
_GOAL_X = 105.0  # the attacked goal is taken to be on the right-hand goal line
_GOAL_Y = 34.0   # centre of the goal
_ZONE_EDGES = (35.0 / 105.0, 70.0 / 105.0)  # defensive | midfield | attacking thirds
_N_SCALARS = 16
_N_ZONES = 3


def _one_hot(values, categories):
    """One-hot encode `values`; anything not in `categories` goes to a trailing "other" column."""
    index = {name: pos for pos, name in enumerate(categories)}
    other = len(categories)
    encoded = np.zeros((len(values), other + 1), dtype=np.float32)
    columns = [index.get(value, other) for value in values]
    encoded[np.arange(len(values)), columns] = 1.0
    return encoded


def _lag(values, first):
    """Shift a non-empty 1-D array one step later in time; element 0 becomes `first`."""
    lagged = np.empty_like(values)
    lagged[0] = first
    lagged[1:] = values[:-1]
    return lagged


def _spatial_features(x_norm, y_norm):
    """Return (scalars, zone_onehot) for normalised positions.

    `scalars` has 5 columns: goal distance (divided by pitch length), sin and
    cos of the direction to the goal, distance to the nearest touchline and
    distance to the nearest goal line (both normalised). `zone_onehot` marks
    the third of the pitch the x position falls in.
    """
    x_m = x_norm * _PITCH_LENGTH
    y_m = y_norm * _PITCH_WIDTH
    to_goal_x = _GOAL_X - x_m
    to_goal_y = _GOAL_Y - y_m
    angle = np.arctan2(to_goal_y, to_goal_x)
    scalars = np.column_stack([
        np.sqrt(to_goal_x ** 2 + to_goal_y ** 2) / _PITCH_LENGTH,
        np.sin(angle),
        np.cos(angle),
        np.minimum(y_m, _PITCH_WIDTH - y_m) / _PITCH_WIDTH,
        np.minimum(x_m, _PITCH_LENGTH - x_m) / _PITCH_LENGTH,
    ])
    # digitize: x < 1/3 -> 0, 1/3 <= x < 2/3 -> 1, otherwise 2.
    zone_index = np.digitize(x_norm, _ZONE_EDGES)
    zone_onehot = np.zeros((len(x_norm), _N_ZONES), dtype=np.float32)
    zone_onehot[np.arange(len(x_norm)), zone_index] = 1.0
    return scalars, zone_onehot


def make_features(group):
    """Convert the rows of one episode into a node sequence for the model.

    Each action contributes a "start" node; every action except the last also
    contributes an "end" node placed right after its start node. An episode of
    n actions therefore yields 2n - 1 rows, and the end position of the last
    action never enters the features.

    Row layout (44 columns with the default category lists):
        0-1   position (normalised x, y)
        2-3   cumulative displacement so far, in pitch fractions
        4     distance of the previous action / 100
        5     lagged speed / 10
        6     time since the previous action / 10
        7     progress within the episode (0..1)
        8     is_home, 9 same player as previous action, 10 second half
        11-15 spatial features of the node position
        16-18 pitch-third one-hot
        19-34 action type one-hot (+ "other")
        35-43 result one-hot (+ "other")

    Args:
        group: DataFrame slice for a single episode, rows in playing order.
            Requires `start_x`, `start_y`, `end_x`, `end_y`, `is_home`,
            `type_name`, `result_name`; `time_seconds`, `player_id` and
            `period_id` are used when present.

    Returns:
        float32 array of shape (2n - 1, D); shape (0, D) for an empty group.
    """
    n = len(group)
    dim = _N_SCALARS + _N_ZONES + (len(TOP_TYPES) + 1) + (len(ALL_RESULTS) + 1)
    if n == 0:
        # Keep the result 2-D so callers can rely on shape[1] even for empty input.
        return np.zeros((0, dim), dtype=np.float32)

    # --- 1. Normalised coordinates and context ---
    sx = group['start_x'].values / _PITCH_LENGTH
    sy = group['start_y'].values / _PITCH_WIDTH
    ex = group['end_x'].values / _PITCH_LENGTH
    ey = group['end_y'].values / _PITCH_WIDTH
    is_home = group['is_home'].values

    # --- 2. Time deltas (floored at 0.1 s to avoid division by ~0) ---
    if 'time_seconds' in group.columns:
        times = group['time_seconds'].values
        dt = np.zeros(n, dtype=np.float32)
        dt[1:] = times[1:] - times[:-1]
        dt = np.maximum(dt, 0.1)
    else:
        dt = np.ones(n, dtype=np.float32)

    # --- 3. Per-action displacement and distance ---
    dx = ex - sx
    dy = ey - sy
    dist_meter = np.sqrt((dx * _PITCH_LENGTH) ** 2 + (dy * _PITCH_WIDTH) ** 2)

    # --- 4. Cumulative / lagged statistics ---
    # dx and dy are already in pitch fractions, so the running sum needs no
    # further scaling (dividing by 105 / 68 again would normalise twice).
    cumsum_dx = np.cumsum(dx)
    cumsum_dy = np.cumsum(dy)

    lag_dist_m = _lag(dist_meter, 0.0)
    lag_cumsum_dx = _lag(cumsum_dx, 0.0)
    lag_cumsum_dy = _lag(cumsum_dy, 0.0)
    lag_dt = _lag(dt, 1.0)
    # NOTE(review): this pairs the previous action's distance with the previous
    # row's dt; confirm that is the intended duration for the speed estimate.
    lag_speed = lag_dist_m / np.maximum(lag_dt, 0.1)

    # --- 5. Same player as the previous action? ---
    is_same = np.zeros(n, dtype=np.float32)
    if 'player_id' in group.columns:
        p_ids = group['player_id'].values
        is_same[1:] = (p_ids[1:] == p_ids[:-1]).astype(np.float32)

    # --- 6. Progress and match half ---
    progress = np.arange(n) / max(n - 1, 1)
    if 'period_id' in group.columns:
        is_second_half = (group['period_id'].values > 1).astype(np.float32)
    else:
        is_second_half = np.zeros(n, dtype=np.float32)

    # --- 7. Categorical one-hots and spatial features ---
    types_onehot = _one_hot(group['type_name'].values, TOP_TYPES)
    results_onehot = _one_hot(group['result_name'].values, ALL_RESULTS)
    start_spatial, start_zone = _spatial_features(sx, sy)
    end_spatial, end_zone = _spatial_features(ex, ey)

    # --- 8. Assemble start / end nodes ---
    start_scalars = np.column_stack([
        sx, sy,                          # 0-1: position
        lag_cumsum_dx, lag_cumsum_dy,    # 2-3: cumulative flow before this action
        lag_dist_m / 100.0,              # 4: previous distance
        lag_speed / 10.0,                # 5: previous speed
        dt / 10.0,                       # 6: elapsed time
        progress,                        # 7: progress
        is_home,                         # 8: home flag
        is_same,                         # 9: player continuity
        is_second_half,                  # 10: second half
        start_spatial,                   # 11-15: spatial features
    ])

    # End nodes reuse the action's context but sit at the end position and
    # include this action's own displacement in the cumulative flow.
    end_scalars = start_scalars.copy()
    end_scalars[:, 0] = ex
    end_scalars[:, 1] = ey
    end_scalars[:, 2] = cumsum_dx
    end_scalars[:, 3] = cumsum_dy
    end_scalars[:, 11:16] = end_spatial

    start_nodes = np.hstack([start_scalars, start_zone, types_onehot, results_onehot])
    end_nodes = np.hstack([end_scalars, end_zone, types_onehot, results_onehot])

    # Interleave: start_0, end_0, start_1, end_1, ..., start_{n-1}.
    features = np.empty((2 * n - 1, dim), dtype=np.float32)
    features[0::2] = start_nodes
    features[1::2] = end_nodes[:-1]
    return features

# ìž…ë ¥ ì°¨ì› ìžë™ ê³„ì‚°
# Scalar(16) + Zone(3) + Type(16) + Result(9) = 44
# Run the feature builder on a few rows to discover the per-node feature width.
dummy_group = train_df.iloc[:5].copy()
dummy_feat = make_features(dummy_group)
INPUT_DIM = dummy_feat.shape[-1]

# Single print call; the emitted lines match one print per line.
print(
    "âœ… [V6 Phase 1] Spatial Features Added!",
    f"   Input Dimension: {INPUT_DIM}",
    "   - Base Scalars: 11",
    "   - Spatial Features: 5 (goal_dist, angle_sin/cos, sideline_dist, endline_dist)",
    "   - Field Zones: 3 (defensive/midfield/attacking)",
    "   - Event Types: 16",
    "   - Results: 9",
    f"   Total: 16 + 3 + 16 + 9 = {INPUT_DIM}",
    sep="\n",
)

In [None]:
class SoccerDataset(Dataset):
    """Episode-level dataset: one item per episode, features from `make_features`.

    In 'train' mode each item is ``(seq, target, team_id)`` where the target is
    the normalised end position of the episode's last row; episodes with fewer
    than two rows are skipped. In any other mode each item is
    ``(seq, team_id, episode_id)`` with the episode id as a string.
    """

    def __init__(self, df, mode='train'):
        self.mode = mode
        self.episodes = []
        self.targets = []
        self.team_ids = []
        self.episode_ids = []

        is_train = mode == 'train'

        # sort=False keeps the episodes in order of first appearance.
        for name, group in tqdm(df.groupby(ID_COL, sort=False), desc=f"Dataset ({mode})"):
            if is_train and len(group) < 2:
                continue

            seq = make_features(group)
            team_id = group.iloc[0]['team_id']

            if is_train:
                final_row = group.iloc[-1]
                target = [final_row['end_x'] / 105.0, final_row['end_y'] / 68.0]
                self.targets.append(target)
            self.episodes.append(seq)
            self.team_ids.append(team_id)
            if not is_train:
                self.episode_ids.append(str(name))

    def __len__(self):
        return len(self.episodes)

    def __getitem__(self, idx):
        # Keep only the most recent MAX_SEQ_LEN nodes; shorter sequences are unchanged.
        seq = torch.FloatTensor(self.episodes[idx])[-MAX_SEQ_LEN:]
        team_id = self.team_ids[idx]

        if self.mode != 'train':
            # Test item: (seq, team_id, episode_id)
            return seq, team_id, self.episode_ids[idx]
        # Train item: (seq, target, team_id)
        return seq, torch.FloatTensor(self.targets[idx]), team_id

# [V8 ìˆ˜ì •] collate_fn: lengths ì •ë³´ ì¶”ê°€
def collate_fn(batch):
    """Pad a list of dataset items into batch tensors.

    The mode is inferred from the first item: a string in position 2 means a
    test item ``(seq, team_id, episode_id)``, anything else a train item
    ``(seq, target, team_id)``.

    Returns:
        Test:  ``(padded, mask, lens, team_ids, episode_ids)``
        Train: ``(padded, targets, mask, lens, team_ids)``
        where `padded` is (batch, max_len, dim), `mask` is True at padded
        positions and `lens` holds the true sequence lengths.
    """
    sequences = [item[0] for item in batch]
    second_fields = [item[1] for item in batch]
    third_fields = [item[2] for item in batch]

    lengths = torch.LongTensor([sequence.size(0) for sequence in sequences])
    padded = pad_sequence(sequences, batch_first=True, padding_value=0)
    positions = torch.arange(padded.size(1)).unsqueeze(0)
    mask = positions >= lengths.unsqueeze(1)

    is_test_batch = isinstance(batch[0][2], str)
    if not is_test_batch:
        # Train: field 1 is the target coordinate, field 2 the team id.
        targets = torch.stack(second_fields)
        team_ids = torch.LongTensor(third_fields)
        return padded, targets, mask, lengths, team_ids

    # Test: field 1 is the team id, field 2 the episode id string.
    team_ids = torch.LongTensor(second_fields)
    return padded, mask, lengths, team_ids, third_fields

# ë°ì´í„°ì…‹ ìƒì„±
# Build the datasets (train first, then test)
full_dataset, test_dataset = SoccerDataset(train_df, mode='train'), SoccerDataset(test_df, mode='test')

# Distribution of episode lengths, measured in feature rows per episode
episode_lengths = list(map(len, full_dataset.episodes))
print(
    "\nðŸ“Š [V8] ì—í”¼ì†Œë“œ ê¸¸ì´ ë¶„í¬ ë¶„ì„",
    f"   ì´ ì—í”¼ì†Œë“œ: {len(episode_lengths):,}ê°œ",
    f"   í‰ê·  ê¸¸ì´: {np.mean(episode_lengths):.2f}",
    f"   ì¤‘ì•™ê°’: {np.median(episode_lengths):.2f}",
    f"   ìµœì†Œ/ìµœëŒ€: {min(episode_lengths)} / {max(episode_lengths)}",
    f"   í‘œì¤€íŽ¸ì°¨: {np.std(episode_lengths):.2f}",
    sep="\n",
)

In [None]:
# ======================================================
# [V8] Bi-LSTM + Attention model definition
# ======================================================

class LSTMPredictor(nn.Module):
    """(Bi-)LSTM encoder with attention pooling and a small MLP head.

    The LSTM encodes the node sequence, a two-layer scorer assigns one weight
    per time step (softmax over time, padded steps excluded), and the weighted
    sum of LSTM outputs is mapped to a 2-D prediction.
    """

    def __init__(self, input_dim, hidden_dim, num_layers, dropout, bidirectional=True):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.bidirectional = bidirectional

        # Sequence encoder. nn.LSTM only applies dropout between stacked
        # layers, so it is disabled for a single layer.
        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0,
            bidirectional=bidirectional
        )

        # Attention scorer: one scalar score per time step.
        lstm_output_dim = hidden_dim * 2 if bidirectional else hidden_dim
        self.attention = nn.Sequential(
            nn.Linear(lstm_output_dim, lstm_output_dim // 2),
            nn.Tanh(),
            nn.Linear(lstm_output_dim // 2, 1)
        )

        # Prediction head: pooled context -> (x, y).
        self.fc = nn.Sequential(
            nn.Linear(lstm_output_dim, lstm_output_dim // 2),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(lstm_output_dim // 2, 2)
        )

    def forward(self, x, mask=None, lengths=None, return_attention=False):
        """Predict the target coordinates for a batch of padded sequences.

        Args:
            x: (batch, seq_len, input_dim) padded input.
            mask: (batch, seq_len) bool tensor, True at padded positions. When
                omitted but `lengths` is given, it is derived from `lengths`.
            lengths: (batch,) true sequence lengths; enables packed LSTM
                processing so padding never enters the recurrence.
            return_attention: also return the attention weights.

        Returns:
            prediction: (batch, 2).
            attn_weights: (batch, seq_len), only when `return_attention` is True.
        """
        seq_len = x.size(1)

        if lengths is not None:
            # pack_padded_sequence requires the lengths on the CPU.
            lengths_cpu = lengths.cpu()
            packed_x = pack_padded_sequence(x, lengths_cpu, batch_first=True, enforce_sorted=False)
            packed_output, _ = self.lstm(packed_x)
            # total_length keeps the time axis aligned with `mask`.
            lstm_out, _ = pad_packed_sequence(packed_output, batch_first=True, total_length=seq_len)
            if mask is None:
                # Without a mask the zero-filled padded steps would still get a
                # finite score and dilute the attention, so build it here.
                steps = torch.arange(seq_len, device=x.device)
                mask = steps[None, :] >= lengths.to(x.device)[:, None]
        else:
            lstm_out, _ = self.lstm(x)

        # Raw attention scores, (batch, seq_len, 1).
        attn_weights = self.attention(lstm_out)

        # Padded steps get -inf so softmax assigns them zero weight.
        if mask is not None:
            attn_weights = attn_weights.masked_fill(mask.unsqueeze(-1), float('-inf'))

        attn_weights = torch.softmax(attn_weights, dim=1)  # (batch, seq_len, 1)

        # Attention-weighted sum over time, (batch, lstm_output_dim).
        context = torch.sum(lstm_out * attn_weights, dim=1)

        prediction = self.fc(context)

        if return_attention:
            return prediction, attn_weights.squeeze(-1)  # (batch, 2), (batch, seq_len)
        else:
            return prediction

# ëª¨ë¸ ì´ˆê¸°í™”
model = LSTMPredictor(INPUT_DIM, HIDDEN_DIM, LSTM_LAYERS, DROPOUT, BIDIRECTIONAL).to(DEVICE)

# Summary of the instantiated model, printed as one line per entry.
print(
    "=" * 60,
    "âœ… [V8] Bi-LSTM + Attention ëª¨ë¸ ì¤€ë¹„ ì™„ë£Œ",
    "=" * 60,
    "ðŸ“Š ëª¨ë¸ êµ¬ì¡°:",
    f"   - Input Dim: {INPUT_DIM}",
    f"   - Hidden Dim: {HIDDEN_DIM}",
    f"   - Layers: {LSTM_LAYERS}",
    f"   - Bidirectional: {BIDIRECTIONAL}",
    f"   - Parameters: {sum(p.numel() for p in model.parameters()):,}",
    "=" * 60,
    sep="\n",
)

In [None]:
# ======================================================
# [V8] Bi-LSTM + Attention training
# ======================================================
from torch.optim.lr_scheduler import CosineAnnealingLR

SEEDS = [42, 2024, 777]
print(f"\n{'='*70}")
print(f"ðŸš€ [V8] Bi-LSTM + Attention í•™ìŠµ ì‹œìž‘")
print(f"{'='*70}")
print(f"ì „ëžµ: ë‹¨ì¼ LSTM ëª¨ë¸ë¡œ ëª¨ë“  ê¸¸ì´ ì²˜ë¦¬")
print(f"  - Bi-LSTM: ì–‘ë°©í–¥ìœ¼ë¡œ ìˆœì°¨ ì •ë³´ í¬ì°©")
print(f"  - Attention: ì¤‘ìš”í•œ ì•¡ì…˜ì— ì§‘ì¤‘")
print(f"  - ë‹¨ìˆœí•˜ê³  íš¨ìœ¨ì ì¸ êµ¬ì¡°")
print(f"{'='*70}\n")

# Per-seed training histories (train loss, validation distance, learning rate)
all_histories = []

for i, seed in enumerate(SEEDS):
    print(f"\n{'='*70}")
    # Model count comes from SEEDS so the message stays right if seeds change.
    print(f"ðŸ“¦ [ëª¨ë¸ {i+1}/{len(SEEDS)}] Seed {seed} í•™ìŠµ ì‹œìž‘")
    print(f"{'='*70}")
    seed_everything(seed)

    # Seed-specific 80/20 train/validation split
    train_idx, val_idx = train_test_split(range(len(full_dataset)), test_size=0.2, random_state=seed)
    train_loader = DataLoader(torch.utils.data.Subset(full_dataset, train_idx), batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
    val_loader = DataLoader(torch.utils.data.Subset(full_dataset, val_idx), batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

    # Fresh model, optimiser and schedule for every seed
    model = LSTMPredictor(INPUT_DIM, HIDDEN_DIM, LSTM_LAYERS, DROPOUT, BIDIRECTIONAL).to(DEVICE)
    optimizer = optim.AdamW(model.parameters(), lr=LR_BASE, weight_decay=1e-4)
    scheduler = CosineAnnealingLR(optimizer, T_max=EPOCHS_BASE - WARMUP_EPOCHS, eta_min=1e-6)
    criterion = nn.HuberLoss(delta=1.0)

    best_dist = float('inf')
    history = {'train_loss': [], 'val_dist': [], 'lr': []}

    for epoch in range(EPOCHS_BASE):
        # Linear warm-up: ramp the LR up to LR_BASE over the first WARMUP_EPOCHS epochs
        if epoch < WARMUP_EPOCHS:
            lr = LR_BASE * (epoch + 1) / WARMUP_EPOCHS
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr

        # Read the LR now, i.e. the value this epoch's updates actually use.
        # Reading it after scheduler.step() would log the next epoch's LR.
        current_lr = optimizer.param_groups[0]['lr']

        # Training
        model.train()
        train_loss = 0.0
        for seqs, targets, mask, lens, _ in train_loader:
            seqs, targets, mask, lens = seqs.to(DEVICE), targets.to(DEVICE), mask.to(DEVICE), lens.to(DEVICE)

            optimizer.zero_grad()
            pred = model(seqs, mask, lens)
            loss = criterion(pred, targets)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP)
            optimizer.step()
            train_loss += loss.item()

        # Validation: mean Euclidean error in metres (predictions are in pitch fractions)
        model.eval()
        dists = []
        with torch.no_grad():
            for seqs, targets, mask, lens, _ in val_loader:
                seqs, targets, mask, lens = seqs.to(DEVICE), targets.to(DEVICE), mask.to(DEVICE), lens.to(DEVICE)
                pred = model(seqs, mask, lens)
                p_real = pred.cpu().numpy() * np.array([105.0, 68.0])
                t_real = targets.cpu().numpy() * np.array([105.0, 68.0])
                dists.extend(np.sqrt(np.sum((p_real - t_real)**2, axis=1)))

        avg_dist = np.mean(dists)
        avg_train_loss = train_loss / len(train_loader)

        # Cosine annealing takes over once the warm-up is finished
        if epoch >= WARMUP_EPOCHS:
            scheduler.step()

        history['train_loss'].append(avg_train_loss)
        history['val_dist'].append(avg_dist)
        history['lr'].append(current_lr)

        if (epoch + 1) % 5 == 0 or epoch < 5:
            print(f"  [Epoch {epoch+1:2d}/{EPOCHS_BASE}] "
                  f"Loss: {avg_train_loss:.4f} | "
                  f"Val Dist: {avg_dist:.4f}m | "
                  f"LR: {current_lr:.6f}")

        # Keep the checkpoint with the lowest validation distance
        if avg_dist < best_dist:
            best_dist = avg_dist
            torch.save(model.state_dict(), f'v8_model_{i}.pth')

    history['best_dist'] = best_dist
    history['seed'] = seed
    all_histories.append(history)

    print(f"  âœ… Model {i+1} ì™„ë£Œ. Best Val Dist: {best_dist:.4f}m")

print(f"\n{'='*70}")
print(f"âœ… ì „ì²´ í•™ìŠµ ì™„ë£Œ!")
print(f"{'='*70}")
print(f"ðŸ“Š í‰ê·  Best Distance: {np.mean([h['best_dist'] for h in all_histories]):.4f}m")
print(f"{'='*70}")

In [None]:
# ======================================================
# [V8] Bi-LSTM + Attention inference
# ======================================================
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

print(f"\n{'='*70}")
print(f"ðŸ”® [V8] Bi-LSTM + Attention ì¶”ë¡  ì‹œìž‘")
print(f"{'='*70}")

# Ensemble weights, one per checkpoint v8_model_<i>.pth
weights = [0.25, 0.25, 0.5]

# Load one model per ensemble weight
models = []
for i in range(len(weights)):
    model = LSTMPredictor(INPUT_DIM, HIDDEN_DIM, LSTM_LAYERS, DROPOUT, BIDIRECTIONAL).to(DEVICE)
    # map_location lets checkpoints saved on a GPU load on a CPU-only machine.
    model.load_state_dict(torch.load(f'v8_model_{i}.pth', map_location=DEVICE))
    model.eval()
    models.append(model)

print(f"âœ… ëª¨ë¸ ë¡œë“œ ì™„ë£Œ: 3ê°œ ì•™ìƒë¸”")

# Inference
results = []
print(f"\nðŸ”„ ì¶”ë¡  ì¤‘...")

with torch.no_grad():
    for seqs, mask, lens, team_ids, episode_ids in tqdm(test_loader, desc="Inference"):
        seqs, mask, lens = seqs.to(DEVICE), mask.to(DEVICE), lens.to(DEVICE)

        # Weighted average of the individual model predictions
        member_preds = [member(seqs, mask, lens).cpu().numpy() for member in models]
        avg_pred = sum(pred * weight for pred, weight in zip(member_preds, weights))

        # Convert pitch fractions back to metres and collect the rows
        for i, eid in enumerate(episode_ids):
            px = avg_pred[i, 0] * 105.0
            py = avg_pred[i, 1] * 68.0
            results.append({'game_episode': eid, 'pred_x': px, 'pred_y': py})

# Build the submission file
pred_df = pd.DataFrame(results)

# Resolve the template under BASE_DIR so it follows the same fallback as the data files.
SUBMISSION_PATH = os.path.join(BASE_DIR, "sample_submission.csv")
if os.path.exists(SUBMISSION_PATH):
    sub = pd.read_csv(SUBMISSION_PATH)
else:
    # No template available: derive the episode list from the test index file.
    sub = pd.read_csv(TEST_META_PATH)
    col_map = {'episode_id': 'game_episode'}
    sub = sub.rename(columns=col_map)
    sub = sub[['game_episode']]

print(f"\nðŸ“„ ì œì¶œ íŒŒì¼ ìƒì„± ì¤‘...")
print(f"   Submission Shape: {sub.shape}, Prediction Shape: {pred_df.shape}")

final_sub = pd.merge(sub[['game_episode']], pred_df, on='game_episode', how='left')
final_sub = final_sub.rename(columns={'pred_x': 'end_x', 'pred_y': 'end_y'})

# Episodes without a prediction (e.g. a test file that could not be loaded)
# show up as NaN after the left merge; they get a constant fallback value.
nan_count = final_sub.isnull().sum().sum()
if nan_count > 0:
    print(f"   âš ï¸ ê²½ê³ : {nan_count}ê°œì˜ ê²°ì¸¡ì¹˜ê°€ ë°œìƒí–ˆìŠµë‹ˆë‹¤.")
    final_sub = final_sub.fillna(50.0)

filename = "v8_lstm_submit.csv"
final_sub.to_csv(filename, index=False)

print(f"\n{'='*70}")
print(f"âœ… [V8] Bi-LSTM + Attention ì¶”ë¡  ì™„ë£Œ!")
print(f"{'='*70}")
print(f"ðŸ“ ì œì¶œ íŒŒì¼: {filename}")
print(f"ðŸ“Š ëª¨ë¸ íŠ¹ì§•:")
print(f"   - Bi-LSTM: ì–‘ë°©í–¥ ì‹œí€€ìŠ¤ ì²˜ë¦¬")
print(f"   - Attention: ì¤‘ìš” ì•¡ì…˜ ê°•ì¡°")
print(f"   - 3-Seed ì•™ìƒë¸”: ì•ˆì •ì  ì˜ˆì¸¡")
print(f"{'='*70}")
print(final_sub.head())

In [None]:
# ======================================================
# [V8] Bi-LSTM + Attention training-curve visualisation
# ======================================================
import matplotlib.pyplot as plt

# Font able to render the Korean title; keep the minus sign as plain ASCII
plt.rcParams.update({'font.family': 'Malgun Gothic', 'axes.unicode_minus': False})

fig, axes = plt.subplots(2, 2, figsize=(14, 10))
ax1, ax2 = axes[0, 0], axes[0, 1]
ax3, ax4 = axes[1, 0], axes[1, 1]
fig.suptitle('[V8] Bi-LSTM + Attention í•™ìŠµ ê³¡ì„ ', fontsize=16, fontweight='bold')

colors = ['#1f77b4', '#ff7f0e', '#2ca02c']
labels = [f'Model {i+1} (Seed {h["seed"]})' for i, h in enumerate(all_histories)]

# 1. Training loss, one curve per seed
for i, history in enumerate(all_histories):
    epochs = range(1, len(history['train_loss']) + 1)
    ax1.plot(epochs, history['train_loss'], label=labels[i], color=colors[i], linewidth=2)
ax1.set(xlabel='Epoch', ylabel='Huber Loss', title='Training Loss')
ax1.legend()
ax1.grid(True, alpha=0.3)

# 2. Validation distance, with the best epoch of each run starred and labelled
for i, history in enumerate(all_histories):
    epochs = range(1, len(history['val_dist']) + 1)
    ax2.plot(epochs, history['val_dist'], label=labels[i], color=colors[i], linewidth=2)
    best_idx = np.argmin(history['val_dist'])
    best_val = history['val_dist'][best_idx]
    best_epoch = best_idx + 1
    ax2.scatter(best_epoch, best_val, color=colors[i], s=100, zorder=5, marker='*')
    ax2.annotate(f'{best_val:.2f}m', xy=(best_epoch, best_val),
                 xytext=(5, 5), textcoords='offset points', fontsize=9, color=colors[i])
ax2.set(xlabel='Epoch', ylabel='Distance (m)', title='Validation Distance')
ax2.legend()
ax2.grid(True, alpha=0.3)

# 3. Learning-rate schedule (taken from the first run)
history = all_histories[0]
epochs = range(1, len(history['lr']) + 1)
ax3.plot(epochs, history['lr'], color='purple', linewidth=2)
ax3.axvline(x=WARMUP_EPOCHS, color='red', linestyle='--', label=f'Warmup End (Epoch {WARMUP_EPOCHS})')
ax3.set(xlabel='Epoch', ylabel='Learning Rate',
        title='Learning Rate Schedule (Warmup + Cosine Annealing)')
ax3.legend()
ax3.grid(True, alpha=0.3)
ax3.set_yscale('log')

# 4. Best validation distance per model
best_dists = [h['best_dist'] for h in all_histories]
seeds = [h['seed'] for h in all_histories]
bar_positions = range(len(best_dists))
bars = ax4.bar(bar_positions, best_dists, color=colors)
ax4.set_xticks(bar_positions)
ax4.set_xticklabels([f'Model {i+1}\n(Seed {s})' for i, s in enumerate(seeds)])
ax4.set(ylabel='Best Validation Distance (m)', title='Best Performance per Model')
ax4.grid(True, alpha=0.3, axis='y')

# Value label centred on top of every bar
for i, (bar, dist) in enumerate(zip(bars, best_dists)):
    bar_center = bar.get_x() + bar.get_width()/2.
    ax4.text(bar_center, bar.get_height(), f'{dist:.4f}m',
             ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.savefig('v8_training_history.png', dpi=150, bbox_inches='tight')
plt.show()

print("\ní•™ìŠµ ê³¡ì„  ì €ìž¥ ì™„ë£Œ: v8_training_history.png")
print("\nìµœì¢… ê²°ê³¼:")
print('=' * 60)
for i, h in enumerate(all_histories):
    print(f"  Model {i+1} (Seed {h['seed']:4d}): {h['best_dist']:.4f}m")
print(f"  í‰ê· : {np.mean([h['best_dist'] for h in all_histories]):.4f}m")
print('=' * 60)

In [None]:
# ======================================================
# [V8] Attention analysis: which actions does the model focus on?
# ======================================================
import matplotlib.pyplot as plt
import seaborn as sns

print(f"\n{'='*70}")
print(f"ðŸ” [V8] Attention ë¶„ì„ ì‹œìž‘")
print(f"{'='*70}")

# Load the trained checkpoint of the first seed.
# map_location lets a checkpoint saved on a GPU load on a CPU-only machine.
analysis_model = LSTMPredictor(INPUT_DIM, HIDDEN_DIM, LSTM_LAYERS, DROPOUT, BIDIRECTIONAL).to(DEVICE)
analysis_model.load_state_dict(torch.load('v8_model_0.pth', map_location=DEVICE))
analysis_model.eval()

# Rebuild the validation split with seed 42 (same test_size and random_state as training run 0)
seed_everything(42)
train_idx, val_idx = train_test_split(range(len(full_dataset)), test_size=0.2, random_state=42)
val_subset = torch.utils.data.Subset(full_dataset, val_idx)

print(f"âœ… ë¶„ì„ ë°ì´í„°: {len(val_subset):,}ê°œ ì—í”¼ì†Œë“œ")

# Attention extraction helper
def extract_attention_weights(model, dataset_subset, num_samples=500):
    """
    Collect attention weights for a random sample of episodes.

    Args:
        model: callable invoked as ``model(seq, mask, length, return_attention=True)``
            and expected to return ``(prediction, attention_weights)``.
        dataset_subset: indexable dataset whose items unpack as
            ``(seq, target, team_id)``; only ``seq`` is used here.
        num_samples: maximum number of episodes to draw (without replacement,
            using NumPy's global random state).

    Returns:
        A list with one dict per sampled episode holding
        'attention' (first batch row of the returned weights, as a NumPy array),
        'features' (the possibly truncated input sequence, as a NumPy array) and
        'seq_length' (length of the 'attention' array).
    """
    population = len(dataset_subset)
    picked = np.random.choice(population, min(num_samples, population), replace=False)

    records = []
    with torch.no_grad():
        for sample_idx in tqdm(picked, desc="Extracting Attention"):
            episode, _target, _team_id = dataset_subset[sample_idx]

            # Keep only the most recent MAX_SEQ_LEN steps of long episodes.
            if len(episode) > MAX_SEQ_LEN:
                episode = episode[-MAX_SEQ_LEN:]

            # Batch of one: (1, seq_len, features)
            batch = episode.unsqueeze(0).to(DEVICE)
            n_steps = len(batch[0])
            # NOTE(review): the length tensor is moved to DEVICE; if the model packs
            # sequences with pack_padded_sequence it must move it back to CPU - confirm.
            lengths = torch.LongTensor([n_steps]).to(DEVICE)
            # All-False mask: a single unpadded sequence has nothing to mask out.
            pad_mask = torch.zeros(1, n_steps, dtype=torch.bool).to(DEVICE)

            _pred, weights = model(batch, pad_mask, lengths, return_attention=True)

            weights_np = weights.cpu().numpy()[0]  # presumably (seq_len,) - TODO confirm
            records.append({
                'attention': weights_np,
                'features': batch.cpu().numpy()[0],  # (seq_len, n_features)
                'seq_length': len(weights_np)
            })

    return records

# Extract attention weights from up to 1000 randomly chosen validation episodes.
print(f"\nðŸ“Š Attention Weights ì¶”ì¶œ ì¤‘...")
attention_data = extract_attention_weights(analysis_model, val_subset, num_samples=1000)

# Report the number of collected samples and their mean sequence length.
print(f"âœ… ì¶”ì¶œ ì™„ë£Œ: {len(attention_data)}ê°œ ìƒ˜í”Œ")
print(f"   í‰ê·  ì‹œí€€ìŠ¤ ê¸¸ì´: {np.mean([d['seq_length'] for d in attention_data]):.2f}")

In [None]:
# ======================================================
# Attention analysis by action type
# ======================================================

# Feature column layout (as produced by make_features):
# Scalar(16) + Zone(3) + Type(16) + Result(9) = 44
ZONE_START = 16
TYPE_START = 19  # 16 + 3
RESULT_START = 35  # 16 + 3 + 16

# One bucket of attention weights per known category, plus a catch-all 'Other'.
type_attention_scores = {t: [] for t in TOP_TYPES}
type_attention_scores['Other'] = []

result_attention_scores = {r: [] for r in ALL_RESULTS}
result_attention_scores['Other'] = []

print(f"\nðŸ“Š ì•¡ì…˜ íƒ€ìž…ë³„ Attention ë¶„ì„...")

for sample in tqdm(attention_data, desc="Analyzing"):
    step_features = sample['features']  # (seq_len, 44)
    step_weights = sample['attention']  # (seq_len,)

    for step in range(len(step_weights)):
        weight = step_weights[step]
        row = step_features[step]

        # Action type: position of the largest value in the 16-wide type slice;
        # positions beyond TOP_TYPES are counted as 'Other'.
        type_idx = np.argmax(row[TYPE_START:TYPE_START+16])
        type_name = TOP_TYPES[type_idx] if type_idx < len(TOP_TYPES) else 'Other'
        type_attention_scores[type_name].append(weight)

        # Result: same decoding on the 9-wide result slice.
        result_idx = np.argmax(row[RESULT_START:RESULT_START+9])
        result_name = ALL_RESULTS[result_idx] if result_idx < len(ALL_RESULTS) else 'Other'
        result_attention_scores[result_name].append(weight)

# Mean attention per category; categories that never occurred get 0.
type_avg_attention = {
    name: (np.mean(weights) if weights else 0)
    for name, weights in type_attention_scores.items()
}
result_avg_attention = {
    name: (np.mean(weights) if weights else 0)
    for name, weights in result_attention_scores.items()
}

# Rank categories from highest to lowest mean attention.
sorted_types = sorted(type_avg_attention.items(), key=lambda pair: pair[1], reverse=True)
sorted_results = sorted(result_avg_attention.items(), key=lambda pair: pair[1], reverse=True)

# Print the 10 action types with the highest mean attention, together with
# the number of sequence steps that contributed to each mean.
print(f"\n{'='*70}")
print(f"ðŸŽ¯ ì•¡ì…˜ íƒ€ìž…ë³„ í‰ê·  Attention (ìƒìœ„ 10ê°œ)")
print(f"{'='*70}")
for i, (type_name, avg_attn) in enumerate(sorted_types[:10], 1):
    count = len(type_attention_scores[type_name])
    print(f"  {i:2d}. {type_name:20s}: {avg_attn:.6f} (ìƒ˜í”Œ ìˆ˜: {count:,})")

# Same report for the 8 results with the highest mean attention.
print(f"\n{'='*70}")
print(f"ðŸŽ¯ ê²°ê³¼ë³„ í‰ê·  Attention (ìƒìœ„ 8ê°œ)")
print(f"{'='*70}")
for i, (result_name, avg_attn) in enumerate(sorted_results[:8], 1):
    count = len(result_attention_scores[result_name])
    print(f"  {i:2d}. {result_name:20s}: {avg_attn:.6f} (ìƒ˜í”Œ ìˆ˜: {count:,})")

In [None]:
# ======================================================
# Attention visualization
# ======================================================

# Korean font setup so the Korean labels below can be rendered.
# NOTE(review): 'Malgun Gothic' is presumably only installed on Windows - confirm
# a fallback font for other platforms.
plt.rcParams['font.family'] = 'Malgun Gothic'
# Keep the minus sign renderable with this font.
plt.rcParams['axes.unicode_minus'] = False


# 3x2 grid: panel 1 spans the top row, panels 2 and 3 share the middle row,
# panel 4 spans the bottom row.
fig = plt.figure(figsize=(18, 12))
gs = fig.add_gridspec(3, 2, hspace=0.3, wspace=0.3)

# 1. Mean attention per action type (top 12), as a horizontal bar chart
ax1 = fig.add_subplot(gs[0, :])
top_types = sorted_types[:12]
type_names = [t[0] for t in top_types]
type_scores = [t[1] for t in top_types]

bars = ax1.barh(type_names, type_scores, color='steelblue', edgecolor='black', linewidth=0.8)
ax1.set_xlabel('í‰ê·  Attention Weight', fontsize=12, fontweight='bold')
ax1.set_title('[V8] ì•¡ì…˜ íƒ€ìž…ë³„ í‰ê·  Attention - ëª¨ë¸ì´ ì§‘ì¤‘í•˜ëŠ” ì•¡ì…˜', fontsize=14, fontweight='bold')
# Invert the y axis so the first (highest-ranked) type is drawn at the top.
ax1.invert_yaxis()
ax1.grid(axis='x', alpha=0.3)

# Annotate every bar with its value and the number of contributing steps.
for i, (bar, score) in enumerate(zip(bars, type_scores)):
    count = len(type_attention_scores[type_names[i]])
    ax1.text(score + 0.0001, i, f'{score:.5f} (n={count:,})', 
             va='center', fontsize=9, fontweight='bold')

# 2. Mean attention per result (top 8), as a vertical bar chart
ax2 = fig.add_subplot(gs[1, 0])
top_results = sorted_results[:8]
result_names = [r[0] for r in top_results]
result_scores = [r[1] for r in top_results]

bars2 = ax2.bar(range(len(result_names)), result_scores, color='coral', edgecolor='black', linewidth=0.8)
ax2.set_xticks(range(len(result_names)))
ax2.set_xticklabels(result_names, rotation=45, ha='right', fontsize=9)
ax2.set_ylabel('í‰ê·  Attention Weight', fontsize=11, fontweight='bold')
ax2.set_title('ê²°ê³¼ë³„ í‰ê·  Attention', fontsize=12, fontweight='bold')
ax2.grid(axis='y', alpha=0.3)

# Write each bar's value slightly above its top edge.
for i, (bar, score) in enumerate(zip(bars2, result_scores)):
    height = bar.get_height()
    ax2.text(i, height + 0.0001, f'{score:.5f}', ha='center', va='bottom', fontsize=8, fontweight='bold')
# 3. Mean attention by relative position within the sequence (drawn as a bar chart)
ax3 = fig.add_subplot(gs[1, 1])

# Split every sequence into 10 equal-width bins over its normalised position
# (0 = first step, 1 = last step) and record the mean attention of each
# occupied bin for each sample.
bins = np.linspace(0, 1, 11)  # 11 edges -> 10 bins; identical for every sample
position_attention = []
for sample in attention_data:
    attention = sample['attention']
    seq_len = len(attention)

    # Normalised position of every step (0~1)
    normalized_positions = np.linspace(0, 1, seq_len)

    # np.digitize(...) - 1 maps a position of exactly 1.0 (the final step of
    # any sequence with two or more steps) to index 10, one past the last bin,
    # so that step used to be silently left out of the statistics. Clipping
    # folds it into the last bin (index 9), i.e. the bin covers [0.9, 1.0].
    bin_indices = np.clip(np.digitize(normalized_positions, bins) - 1, 0, 9)

    for pos_idx in range(10):
        mask = bin_indices == pos_idx
        if mask.sum() > 0:
            position_attention.append({
                'position': pos_idx,
                'attention': attention[mask].mean()
            })

# Group the per-sample bin means by bin index
position_groups = {}
for item in position_attention:
    position_groups.setdefault(item['position'], []).append(item['attention'])

# Average over samples; a bin that no sample reached is reported as 0
position_avg = [np.mean(position_groups.get(i, [0])) for i in range(10)]
# Tick labels for the 10 relative-position bins (first and last carry an extra caption line).
position_labels = ['ì‹œìž‘\n0-10%', '10-20%', '20-30%', '30-40%', '40-50%', 
                   '50-60%', '60-70%', '70-80%', '80-90%', 'ë\n90-100%']

# Bar chart of the mean attention in each relative-position bin.
bars3 = ax3.bar(range(10), position_avg, color='mediumseagreen', edgecolor='black', linewidth=0.8)
ax3.set_xticks(range(10))
ax3.set_xticklabels(position_labels, rotation=0, fontsize=9)
ax3.set_ylabel('í‰ê·  Attention Weight', fontsize=11, fontweight='bold')
ax3.set_title('ì‹œí€€ìŠ¤ ìœ„ì¹˜ë³„ í‰ê·  Attention', fontsize=12, fontweight='bold')
ax3.grid(axis='y', alpha=0.3)

# Write each bar's value slightly above its top edge.
for i, (bar, score) in enumerate(zip(bars3, position_avg)):
    height = bar.get_height()
    ax3.text(i, height + 0.0001, f'{score:.5f}', ha='center', va='bottom', fontsize=8, fontweight='bold', rotation=0)

# 4. Histogram of all attention weights
ax4 = fig.add_subplot(gs[2, :])

# Flatten the per-sample attention arrays into one list of weights.
all_attention_weights = []
for sample in attention_data:
    all_attention_weights.extend(sample['attention'])

ax4.hist(all_attention_weights, bins=100, color='purple', alpha=0.7, edgecolor='black')
ax4.set_xlabel('Attention Weight', fontsize=12, fontweight='bold')
ax4.set_ylabel('ë¹ˆë„', fontsize=12, fontweight='bold')
ax4.set_title('ì „ì²´ Attention Weight ë¶„í¬', fontsize=14, fontweight='bold')
# Dashed vertical reference lines at the mean (red) and the median (orange).
ax4.axvline(np.mean(all_attention_weights), color='red', linestyle='--', linewidth=2, label=f'í‰ê· : {np.mean(all_attention_weights):.5f}')
ax4.axvline(np.median(all_attention_weights), color='orange', linestyle='--', linewidth=2, label=f'ì¤‘ì•™ê°’: {np.median(all_attention_weights):.5f}')
ax4.legend(fontsize=11)
ax4.grid(alpha=0.3)

# Figure-level title, then write the figure to a PNG file and display it.
plt.suptitle('[V8] Bi-LSTM + Attention ë¶„ì„ - ëª¨ë¸ì´ ì§‘ì¤‘í•˜ëŠ” ì•¡ì…˜ íŒ¨í„´', fontsize=16, fontweight='bold', y=0.995)
plt.savefig('v8_attention_analysis.png', dpi=150, bbox_inches='tight')
plt.show()

# Final console summary: the action type with the highest mean attention, the
# overall mean and standard deviation of the attention weights, and the mean
# attention in the last and the first relative-position bin.
print(f"\n{'='*70}")
print(f"âœ… Attention ë¶„ì„ ì™„ë£Œ!")
print(f"{'='*70}")
print(f"ðŸ“Š ì£¼ìš” ë°œê²¬:")
print(f"   - ê°€ìž¥ ë†’ì€ Attention: {sorted_types[0][0]} ({sorted_types[0][1]:.5f})")
print(f"   - ì „ì²´ í‰ê·  Attention: {np.mean(all_attention_weights):.5f}")
print(f"   - Attention í‘œì¤€íŽ¸ì°¨: {np.std(all_attention_weights):.5f}")
print(f"   - ì‹œí€€ìŠ¤ ëë¶€ë¶„ Attention: {position_avg[-1]:.5f}")
print(f"   - ì‹œí€€ìŠ¤ ì‹œìž‘ë¶€ë¶„ Attention: {position_avg[0]:.5f}")
print(f"{'='*70}")
print(f"ðŸ’¾ ì €ìž¥ ì™„ë£Œ: v8_attention_analysis.png")