In [None]:
import os
import gc
import warnings
import logging
import time
import math
import json
import glob
import cupy
from pathlib import Path

import numpy as np
import pandas as pd
import librosa
import cv2
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.onnx
import timm
import lightgbm as lgb
from tqdm.auto import tqdm

# Suppress every Python warning and only let ERROR-level (and above) log
# records through, so the notebook output is limited to the explicit prints.
warnings.filterwarnings("ignore")
logging.basicConfig(level=logging.ERROR)

In [None]:
class CFG:
    """Central configuration for the inference notebook.

    Every setting is a plain class attribute so it can be read either from
    the class or from an instance (``cfg`` below). File-system locations are
    derived from a single root per dataset so that switching dataset version
    or model run only requires editing one line.
    """

    # --- Competition data ---
    data_dir = '/kaggle/input/birdclef-2025/'
    test_soundscapes = data_dir + 'test_soundscapes'
    submission_csv = data_dir + 'sample_submission.csv'
    taxonomy_csv = data_dir + 'taxonomy.csv'

    # --- Trained model artefacts (paths are specified explicitly) ---
    model_base_path = '/kaggle/input/conv_gbm/pytorch/default/1/'
    model_run_id = '20250605_203642'  # suffix shared by the checkpoint and the LightGBM folder
    feature_extractor_path = model_base_path + 'convnext_feature_extractor_' + model_run_id + '.pth'
    lgbm_models_dir = model_base_path + 'lgbm_models_' + model_run_id

    # --- Audio parameters ---
    FS = 32000       # sample rate passed to librosa
    WINDOW_SIZE = 5  # segment length in seconds (FS * WINDOW_SIZE samples per segment)

    # --- Mel spectrogram parameters ---
    N_FFT = 1024
    HOP_LENGTH = 512
    N_MELS = 128
    FMIN = 50
    FMAX = 14000
    TARGET_SHAPE = (256, 256)  # spectrogram size fed to the feature extractor

    # --- Model parameters ---
    model_name = 'convnextv2_nano.fcmae'
    in_channels = 1
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # --- Inference parameters (not referenced by the code visible in this notebook) ---
    batch_size = 32
    threshold = 0.5

    # --- Test-time augmentation ---
    use_tta = True
    tta_count = 5  # keep only the first `tta_count` augmentations; <= 0 keeps all

    # --- Debugging ---
    debug = False
    debug_count = 3  # number of soundscapes processed when `debug` is True


cfg = CFG()
print(f"Using device: {cfg.device}")
print(f"Feature extractor path: {cfg.feature_extractor_path}")
print(f"LightGBM models directory: {cfg.lgbm_models_dir}")

In [None]:
# Read the taxonomy table; the order of its `primary_label` column defines the
# class order used for the per-class models and the prediction vectors.
print("Loading taxonomy data...")
taxonomy_df = pd.read_csv(cfg.taxonomy_csv)
species_ids = taxonomy_df['primary_label'].tolist()
num_classes = len(species_ids)
print(f"Number of classes: {num_classes}")

In [None]:
class ConvNeXtFeatureExtractor(nn.Module):
    """timm backbone wrapper that turns a spectrogram batch into feature vectors.

    Args:
        cfg: configuration object. ``cfg.in_channels`` is required;
            ``cfg.model_name`` selects the timm architecture and falls back to
            ``'convnextv2_nano.fcmae'`` when the attribute is absent.
    """

    def __init__(self, cfg):
        super().__init__()
        # The settings below are the ones the original author noted as being
        # identical to training; the architecture name now comes from the
        # config instead of being duplicated here.
        self.backbone = timm.create_model(
            getattr(cfg, 'model_name', 'convnextv2_nano.fcmae'),
            pretrained=False,    # same as training
            in_chans=cfg.in_channels,
            num_classes=0,
            drop_rate=0.2,       # setting used during training
            drop_path_rate=0.1,  # setting used during training
        )
        # Average + max pooling pair used when the backbone returns a 4-D map.
        self.pooling = nn.AdaptiveAvgPool2d(1)
        self.global_pool = nn.AdaptiveMaxPool2d(1)

    def forward(self, x):
        """Return backbone features for ``x``.

        If the backbone output is 4-D, the average-pooled and max-pooled
        vectors are concatenated along dim 1 (doubling the channel count);
        any other output is returned unchanged.
        """
        features = self.backbone(x)
        # NOTE(review): timm models built with num_classes=0 presumably already
        # return pooled 2-D features, in which case this 4-D branch is never
        # taken - confirm against the training code.
        if features.dim() != 4:
            return features

        batch_size = features.size(0)
        avg_pool = self.pooling(features).view(batch_size, -1)
        max_pool = self.global_pool(features).view(batch_size, -1)
        return torch.cat([avg_pool, max_pool], dim=1)


In [None]:
def audio2melspec(audio_data, cfg):
    """Convert a raw audio array into a min-max normalised mel spectrogram.

    Args:
        audio_data: 1-D audio samples. NaN samples are replaced by the mean of
            the non-NaN samples before the transform.
        cfg: configuration providing FS, N_FFT, HOP_LENGTH, N_MELS, FMIN, FMAX
            (and WINDOW_SIZE for the fallback shape).

    Returns:
        The dB-scaled mel spectrogram rescaled to the [0, 1] range. If any
        step raises, the error is printed and an all-zero float32 array of
        shape ``(N_MELS, int(FS * WINDOW_SIZE / HOP_LENGTH) + 1)`` is returned
        instead.
    """
    try:
        if np.isnan(audio_data).any():
            mean_signal = np.nanmean(audio_data)
            audio_data = np.nan_to_num(audio_data, nan=mean_signal)

        mel_spec = librosa.feature.melspectrogram(
            y=audio_data,
            sr=cfg.FS,
            n_fft=cfg.N_FFT,
            hop_length=cfg.HOP_LENGTH,
            n_mels=cfg.N_MELS,
            fmin=cfg.FMIN,
            fmax=cfg.FMAX,
            power=2.0
        )

        mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
        db_min = mel_spec_db.min()
        db_range = mel_spec_db.max() - db_min
        # The epsilon keeps a constant spectrogram from dividing by zero.
        return (mel_spec_db - db_min) / (db_range + 1e-8)

    except Exception as e:
        print(f"Error in mel spectrogram conversion: {e}")
        # On failure return a zero spectrogram with the frame count of a full window.
        n_frames = int(cfg.FS * cfg.WINDOW_SIZE / cfg.HOP_LENGTH) + 1
        return np.zeros((cfg.N_MELS, n_frames), dtype=np.float32)

def apply_tta_augmentations(spec):
    """Build the list of test-time-augmented variants of a spectrogram.

    The variants are produced in a fixed order: the untouched copy, two rolls
    along axis 1 (-2, +2), two rolls along axis 0 (-1, +1), two gain-scaled
    versions (0.9, 1.1) clipped to [0, 1], one version with additive Gaussian
    noise (sigma 0.02) clipped to [0, 1], and one left-right flip.

    Returns:
        A list of nine arrays, or ``[spec]`` alone if any augmentation raises.
    """
    try:
        # 1. Original (no augmentation)
        variants = [spec.copy()]

        # 2. Shifts along axis 1
        variants.extend(np.roll(spec, offset, axis=1) for offset in (-2, 2))

        # 3. Shifts along axis 0
        variants.extend(np.roll(spec, offset, axis=0) for offset in (-1, 1))

        # 4. Brightness/contrast scaling
        variants.extend(np.clip(spec * factor, 0, 1) for factor in (0.9, 1.1))

        # 5. Gaussian noise
        noisy = np.clip(spec + np.random.normal(0, 0.02, spec.shape), 0, 1)
        variants.append(noisy)

        # 6. Horizontal flip
        variants.append(np.fliplr(spec))

    except Exception as e:
        print(f"Error in TTA augmentation: {e}")
        # On error fall back to the original spectrogram only.
        variants = [spec]

    return variants


def process_audio_segment(audio_data, cfg):
    """Turn one audio segment into a float32 spectrogram of ``cfg.TARGET_SHAPE``.

    Args:
        audio_data: 1-D audio samples; segments shorter than
            ``cfg.FS * cfg.WINDOW_SIZE`` samples are zero-padded at the end.
        cfg: configuration providing FS, WINDOW_SIZE and TARGET_SHAPE, where
            TARGET_SHAPE is interpreted as ``(rows, cols)``.

    Returns:
        A float32 array of shape ``cfg.TARGET_SHAPE``. If anything raises, the
        error is printed and an all-zero array of that shape is returned.
    """
    try:
        window_samples = cfg.FS * cfg.WINDOW_SIZE
        missing = window_samples - len(audio_data)
        if missing > 0:
            audio_data = np.pad(audio_data, (0, missing), mode='constant')

        mel_spec = audio2melspec(audio_data, cfg)

        # Resize if needed
        target_rows, target_cols = cfg.TARGET_SHAPE
        if mel_spec.shape != (target_rows, target_cols):
            # cv2.resize expects dsize as (width, height), i.e. (cols, rows);
            # passing TARGET_SHAPE unchanged would transpose non-square targets.
            mel_spec = cv2.resize(mel_spec, (target_cols, target_rows),
                                  interpolation=cv2.INTER_LINEAR)

        return mel_spec.astype(np.float32)

    except Exception as e:
        print(f"Error processing audio segment: {e}")
        # On error return a zero spectrogram.
        return np.zeros(cfg.TARGET_SHAPE, dtype=np.float32)

In [None]:
def _load_lgbm_boosters(models_dir, num_classes):
    """Load ``lgbm_model_class_<i>.txt`` for every class index in ``range(num_classes)``.

    Returns:
        ``(boosters, loaded_count)`` where ``boosters`` has exactly
        ``num_classes`` entries; a missing or unloadable file yields ``None``
        at that index.
    """
    boosters = []
    loaded_count = 0

    for class_idx in range(num_classes):
        model_path = os.path.join(models_dir, f'lgbm_model_class_{class_idx}.txt')
        log_this_class = class_idx < 5  # only log the first five classes

        if not os.path.exists(model_path):
            if log_this_class:
                print(f"✗ Model file not found: {model_path}")
            boosters.append(None)
            continue

        try:
            booster = lgb.Booster(model_file=model_path)
        except Exception as e:
            print(f"✗ Error loading LightGBM model {class_idx}: {e}")
            boosters.append(None)
            continue

        boosters.append(booster)
        loaded_count += 1
        if log_this_class:
            print(f"✓ Loaded LightGBM model {class_idx}")

    return boosters, loaded_count


def load_models(cfg, num_classes):
    """Load the feature extractor and the per-class LightGBM models.

    Args:
        cfg: configuration providing feature_extractor_path, lgbm_models_dir,
            in_channels, TARGET_SHAPE and device.
        num_classes: number of per-class LightGBM files to look for.

    Returns:
        A dict with keys ``'feature_extractor'`` (module in eval mode on
        ``cfg.device``), ``'lgbm_models'`` (list of length ``num_classes``
        with ``None`` for unavailable models) and ``'use_openvino'`` (always
        ``False``). An empty dict is returned when a path is missing, no
        LightGBM model could be loaded, or any step raises (the traceback is
        printed in that case).
    """
    models = {}

    try:
        # Check that the model files exist.
        if not os.path.exists(cfg.feature_extractor_path):
            print(f"Error: Feature extractor not found at {cfg.feature_extractor_path}")
            return {}

        if not os.path.exists(cfg.lgbm_models_dir):
            print(f"Error: LightGBM models directory not found at {cfg.lgbm_models_dir}")
            return {}

        print(f"Loading PyTorch feature extractor from {cfg.feature_extractor_path}")

        # Build the PyTorch feature extractor.
        feature_model = ConvNeXtFeatureExtractor(cfg)

        # Load and inspect the checkpoint.
        # NOTE(security): torch.load unpickles the file; only point this at
        # checkpoints from a trusted source.
        print("Loading checkpoint...")
        checkpoint = torch.load(cfg.feature_extractor_path, map_location=torch.device('cpu'))
        print(f"Checkpoint keys: {list(checkpoint.keys())}")

        # Restore the weights, accepting either a wrapped or a bare state dict.
        if 'model_state_dict' in checkpoint:
            feature_model.load_state_dict(checkpoint['model_state_dict'])
            print("Loaded model_state_dict successfully")
        else:
            print("Warning: model_state_dict not found in checkpoint, trying direct load...")
            feature_model.load_state_dict(checkpoint)

        feature_model = feature_model.to(cfg.device)
        feature_model.eval()

        # Smoke-test the extractor with a random input that has the configured
        # number of channels (previously hard-coded to 1).
        print("Testing feature extractor...")
        test_input = torch.randn(
            1, cfg.in_channels, cfg.TARGET_SHAPE[0], cfg.TARGET_SHAPE[1]
        ).to(cfg.device)
        with torch.no_grad():
            test_features = feature_model(test_input)
            print(f"Feature extractor output shape: {test_features.shape}")

        # Load LightGBM models
        print(f"Loading LightGBM models from {cfg.lgbm_models_dir}")

        # Report how many model files actually exist on disk.
        existing_models = glob.glob(os.path.join(cfg.lgbm_models_dir, 'lgbm_model_class_*.txt'))
        print(f"Found {len(existing_models)} LightGBM model files")

        lgbm_models, loaded_count = _load_lgbm_boosters(cfg.lgbm_models_dir, num_classes)

        # Require at least one usable model.
        if loaded_count == 0:
            print("Error: No valid LightGBM models found!")
            return {}

        models['feature_extractor'] = feature_model
        models['lgbm_models'] = lgbm_models
        models['use_openvino'] = False  # PyTorch is used for feature extraction

        print(f"✓ Successfully loaded feature extractor and {loaded_count}/{num_classes} LightGBM models")

    except Exception as e:
        print(f"Error loading models: {e}")
        import traceback
        traceback.print_exc()
        return {}

    return models


In [None]:
def _predict_raw_scores(specs, feature_extractor, lgbm_models, n_classes, device):
    """Average the per-class LightGBM outputs over a list of spectrograms.

    All spectrograms are stacked into one ``(len(specs), 1, H, W)`` batch so
    the feature extractor runs once and every booster is queried once.

    Returns:
        A 1-D array of length ``n_classes``; classes whose booster is ``None``
        keep the value 0.0.
    """
    batch = torch.tensor(np.stack(specs), dtype=torch.float32).unsqueeze(1).to(device)
    with torch.no_grad():
        features = feature_extractor(batch).cpu().numpy()

    # Release the input batch before the (CPU-side) LightGBM predictions.
    del batch
    torch.cuda.empty_cache()

    scores = np.zeros((len(specs), n_classes))
    for class_idx, booster in enumerate(lgbm_models):
        if booster is not None:
            scores[:, class_idx] = booster.predict(features)

    return scores.mean(axis=0)


def predict_on_spectrogram_with_tta(audio_path, models, cfg, species_ids):
    """Predict species scores for every window of one soundscape file.

    The audio is cut into consecutive windows of ``cfg.FS * cfg.WINDOW_SIZE``
    samples (the last one zero-padded). Each window is converted to a
    spectrogram, optionally expanded with test-time augmentation, and scored
    by the feature extractor plus the per-class LightGBM models.

    Args:
        audio_path: path of the audio file; its stem becomes the row-id prefix.
        models: dict with ``'feature_extractor'`` and ``'lgbm_models'``.
        cfg: configuration providing FS, WINDOW_SIZE, device, use_tta, tta_count.
        species_ids: class labels; only their count is used here.

    Returns:
        ``(row_ids, predictions)`` - two lists of equal length. ``row_ids[k]``
        is ``"<stem>_<end second of window k>"`` and ``predictions[k]`` is a
        1-D array with one value per species. If processing fails part-way
        the error is printed and the windows completed so far are returned.
    """
    predictions = []
    row_ids = []
    soundscape_id = Path(audio_path).stem

    feature_extractor = models['feature_extractor']
    lgbm_models = models['lgbm_models']
    n_classes = len(species_ids)

    try:
        print(f"Processing {soundscape_id}")
        audio_data, _ = librosa.load(audio_path, sr=cfg.FS)

        segment_length = cfg.FS * cfg.WINDOW_SIZE
        segment_starts = range(0, len(audio_data), segment_length)

        for segment_idx, start_sample in enumerate(segment_starts):
            segment_audio = audio_data[start_sample:start_sample + segment_length]

            # Pad the trailing window if it is shorter than a full window.
            shortfall = segment_length - len(segment_audio)
            if shortfall > 0:
                segment_audio = np.pad(segment_audio, (0, shortfall), mode='constant')

            mel_spec = process_audio_segment(segment_audio, cfg)

            if cfg.use_tta:
                specs = apply_tta_augmentations(mel_spec)
                # Limit the number of augmentations if specified.
                if cfg.tta_count > 0:
                    specs = specs[:cfg.tta_count]
            else:
                specs = [mel_spec]

            raw_scores = _predict_raw_scores(
                specs, feature_extractor, lgbm_models, n_classes, cfg.device
            )

            # Convert to probabilities using sigmoid.
            # NOTE(review): Booster.predict presumably already returns
            # probabilities for a built-in binary objective unless
            # raw_score=True - confirm how the boosters were trained before
            # relying on the absolute scale of these values.
            final_preds = 1 / (1 + np.exp(-raw_scores))

            # Record the row id only once its prediction exists, so the two
            # lists can never get out of step when a later window fails.
            end_time_sec = (segment_idx + 1) * cfg.WINDOW_SIZE
            row_ids.append(f"{soundscape_id}_{end_time_sec}")
            predictions.append(final_preds)

    except Exception as e:
        print(f"Error processing {audio_path}: {e}")
        import traceback
        traceback.print_exc()

    return row_ids, predictions

In [None]:
def run_inference(cfg, models, species_ids):
    """Run prediction over every ``*.ogg`` file in ``cfg.test_soundscapes``.

    Files are processed in sorted path order so that the debug subset and the
    order of the returned rows are reproducible between runs.

    Args:
        cfg: configuration providing test_soundscapes, debug, debug_count,
            use_tta and tta_count.
        models: model dict forwarded to ``predict_on_spectrogram_with_tta``.
        species_ids: class labels forwarded to the per-file predictor.

    Returns:
        ``(all_row_ids, all_predictions)`` concatenated over all files, or
        ``([], [])`` if an exception is raised (the traceback is printed).
    """
    try:
        # Path.glob gives no ordering guarantee, so sort explicitly.
        test_files = sorted(Path(cfg.test_soundscapes).glob('*.ogg'))

        if cfg.debug:
            print(f"Debug mode enabled, using only {cfg.debug_count} files")
            test_files = test_files[:cfg.debug_count]

        print(f"Found {len(test_files)} test soundscapes")

        if cfg.use_tta:
            print(f"TTA enabled with {cfg.tta_count} augmentations per sample")
        else:
            print("TTA disabled - using single prediction per sample")

        all_row_ids = []
        all_predictions = []

        for audio_path in tqdm(test_files):
            row_ids, predictions = predict_on_spectrogram_with_tta(
                str(audio_path), models, cfg, species_ids
            )
            all_row_ids.extend(row_ids)
            all_predictions.extend(predictions)

        return all_row_ids, all_predictions

    except Exception as e:
        print(f"Error in inference: {e}")
        import traceback
        traceback.print_exc()
        return [], []

In [None]:
def create_submission(row_ids, predictions, species_ids, cfg):
    """Assemble the submission table from row ids and per-row predictions.

    One column is created per entry of ``species_ids`` (column ``i`` holds
    ``pred[i]`` of every prediction). The frame is then re-indexed to the
    column layout of the sample submission at ``cfg.submission_csv``; columns
    missing from the predictions are filled with 0.0.

    Returns:
        The submission DataFrame. If building it raises, the traceback is
        printed and the sample submission (NaNs replaced by 0.0) is returned.
    """
    try:
        print("Creating submission dataframe...")

        # Base structure: row ids first, then one list of scores per species.
        table = {'row_id': row_ids}
        table.update(
            (species, [row_pred[col] for row_pred in predictions])
            for col, species in enumerate(species_ids)
        )
        submission_df = pd.DataFrame(table)

        # Align column order and content with the sample submission.
        sample_sub = pd.read_csv(cfg.submission_csv)
        submission_df = submission_df.reindex(columns=sample_sub.columns, fill_value=0.0)

        columns_match = list(sample_sub.columns) == list(submission_df.columns)
        print(f"Submission shape: {submission_df.shape}")
        print(f"Sample submission shape: {sample_sub.shape}")
        print(f"Columns match: {columns_match}")

        return submission_df

    except Exception as e:
        print(f"Error creating submission: {e}")
        import traceback
        traceback.print_exc()
        # Fall back to the sample submission itself.
        fallback = pd.read_csv(cfg.submission_csv)
        return fallback.fillna(0.0)

In [None]:
def main():
    """Run the full pipeline: load models, predict, write ``submission.csv``.

    Returns early (after printing a message) when the models could not be
    loaded or when inference produced no rows.
    """
    started_at = time.time()
    print("Starting BirdCLEF-2025 inference...")

    # Load the models.
    models = load_models(cfg, num_classes)

    if not (models and 'feature_extractor' in models):
        print("Critical: Models not loaded properly!")
        return

    lgbm_models = models.get('lgbm_models')
    if not lgbm_models:
        print("Critical: LightGBM models not loaded!")
        return

    engine_info = "PyTorch"
    if cfg.use_tta:
        tta_info = f"TTA enabled ({cfg.tta_count} augmentations)"
    else:
        tta_info = "TTA disabled"
    valid_lgbm_count = sum(1 for booster in lgbm_models if booster is not None)
    print(f"Model usage: {engine_info} ConvNeXt + {valid_lgbm_count}/{len(lgbm_models)} LightGBM models with {tta_info}")

    # Run inference.
    row_ids, predictions = run_inference(cfg, models, species_ids)

    if not (row_ids and predictions):
        print("No predictions generated!")
        return

    # Build and save the submission file.
    submission_path = 'submission.csv'
    create_submission(row_ids, predictions, species_ids, cfg).to_csv(submission_path, index=False)
    print(f"Submission saved to {submission_path}")

    elapsed_minutes = (time.time() - started_at) / 60
    print(f"Inference completed in {elapsed_minutes:.2f} minutes")


if __name__ == "__main__":
    main()