In [1]:
!pip install -q torch torchvision transformers
!pip install -q pandas numpy scikit-learn
!pip install -q pillow requests tqdm
!pip install -q faiss-cpu # or faiss-gpu if you have GPU
!pip install -q sentence-transformers

!pip install -q git+https://github.com/openai/CLIP.git

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m80.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m62.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m40.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m24.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
import clip
from transformers import CLIPProcessor, CLIPModel
import requests
from tqdm import tqdm
import json
import pickle
import faiss
from sklearn.metrics.pairwise import cosine_similarity
import warnings
# Silence library warnings for the whole notebook session
warnings.filterwarnings('ignore')

# Pick the compute device once; later cells read this module-level name
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

2025-08-12 09:01:33.888306: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1754989294.219333      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1754989294.314817      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Using device: cuda


In [3]:
class Config:
    """Notebook-wide constants: working directories plus model and inference settings."""

    # ---- Working directories (created right after this class is defined) ----
    DATA_PATH = "/kaggle/working/newsimages_data/"
    YFCC_EMBEDDINGS_PATH = "/kaggle/working/yfcc_embeddings/"
    MODELS_PATH = "/kaggle/working/models/"
    RESULTS_PATH = "/kaggle/working/results/"

    # ---- Model ----
    # Hugging Face checkpoint id handed to CLIPModel / CLIPProcessor
    CLIP_MODEL_NAME = "openai/clip-vit-base-patch32"
    BATCH_SIZE = 32
    # Token cap passed to the processor when encoding text
    MAX_TEXT_LENGTH = 77
    IMAGE_SIZE = 224
    # Output width of the projection heads and dimension of the FAISS index
    EMBEDDING_DIM = 512

    # ---- Inference ----
    # Default number of neighbours requested from the FAISS index
    TOP_K_CANDIDATES = 8500
    # (width, height) every submitted image is resized to
    TARGET_IMG_SIZE = (460, 260)

# Ensure every working directory exists before any later cell writes into it
for path in (
    Config.DATA_PATH,
    Config.YFCC_EMBEDDINGS_PATH,
    Config.MODELS_PATH,
    Config.RESULTS_PATH,
):
    os.makedirs(path, exist_ok=True)

In [4]:
class NewsDataset:
    """News articles table paired with a local folder of article images.

    The CSV is loaded into ``self.df`` and gains a ``combined_text`` column
    (title plus tags) that is used as the retrieval query. Image files are
    looked up in ``images_folder_path`` by ``image_id`` (filename without
    extension). The ``image_id`` and ``article_id`` columns are required by
    the lookup methods; the free-text columns are created empty when absent.
    """

    # Recognised image extensions, in lookup-priority order
    IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp')
    # Free-text columns that are normalised to (possibly empty) strings
    TEXT_COLUMNS = ('article_title', 'article_tags', 'article_url', 'image_url')

    def __init__(self, csv_path, images_folder_path):
        """Load and preprocess news articles dataset with local images

        Args:
        csv_path: Path to the CSV file with news data
        images_folder_path: Path to folder containing images named by image_id
        """
        self.df = pd.read_csv(csv_path)
        self.images_folder_path = images_folder_path

        print(f"Loaded {len(self.df)} news articles")
        print(f"Images folder: {images_folder_path}")
        print(f"Columns: {list(self.df.columns)}")

        # Verify expected columns are present
        expected_cols = ['article_id', 'article_url', 'article_title', 'article_tags', 'image_id', 'image_url']
        missing_cols = [col for col in expected_cols if col not in self.df.columns]
        if missing_cols:
            print(f"Warning: Missing columns: {missing_cols}")

        # Clean and preprocess text
        self.df = self.preprocess_data()

        # Check available images
        self.check_available_images()

    def check_available_images(self):
        """Scan the images folder and report how many dataset images exist.

        Populates ``self.available_images`` (set of image ids) and
        ``self._image_files`` (image id -> real filenames on disk). Keeping
        the real filenames lets ``get_image_path`` find files whose extension
        is not lowercase (e.g. ``123.JPG``) on case-sensitive filesystems.
        """
        self.available_images = set()
        self._image_files = {}

        if not os.path.exists(self.images_folder_path):
            print(f"Warning: Images folder {self.images_folder_path} does not exist!")
            return

        for file in os.listdir(self.images_folder_path):
            image_id, ext = os.path.splitext(file)
            if ext.lower() in self.IMAGE_EXTENSIONS:
                self._image_files.setdefault(image_id, []).append(file)

        self.available_images = set(self._image_files)

        # Check how many dataset images are available
        dataset_image_ids = set(self.df['image_id'].astype(str))
        available_dataset_images = dataset_image_ids.intersection(self.available_images)

        print(f"Found {len(self.available_images)} image files in folder")
        print(f"Dataset has {len(dataset_image_ids)} unique image IDs")
        print(f"Available images for dataset: {len(available_dataset_images)}")
        print(f"Missing images: {len(dataset_image_ids) - len(available_dataset_images)}")

        if len(available_dataset_images) < len(dataset_image_ids) * 0.1:
            print("Warning: Very few images available! Check image_id naming convention.")

    def preprocess_data(self):
        """Return a cleaned copy of ``self.df`` with a ``combined_text`` column.

        Missing free-text columns are created as empty strings (``__init__``
        only warns about them), NaNs become ``''`` and non-string values are
        converted to text so the string operations below cannot fail.
        """
        df = self.df.copy()

        for column in self.TEXT_COLUMNS:
            if column not in df.columns:
                df[column] = ''
            # fillna first so NaN becomes '' rather than the string 'nan'
            df[column] = df[column].fillna('').astype(str)

        # Combine title and tags for better matching. Built as an explicit
        # object Series so an empty table also yields a usable text column.
        combined = pd.Series(
            [self.combine_text_features(row) for _, row in df.iterrows()],
            index=df.index,
            dtype=object,
        )

        # Collapse line breaks and trim surrounding whitespace
        combined = combined.str.replace('\n', ' ', regex=False)
        combined = combined.str.replace('\r', ' ', regex=False)
        df['combined_text'] = combined.str.strip()

        return df

    def combine_text_features(self, row):
        """Combine only title and tags (no web content).

        Args:
            row: mapping/Series with ``article_title`` and ``article_tags``.

        Returns:
            ``"<title>. Keywords: <tags>"`` with empty parts omitted.
        """
        parts = []

        title = row['article_title']
        if pd.notna(title) and str(title).strip():
            parts.append(str(title))

        tags = row['article_tags']
        if pd.notna(tags) and str(tags).strip():
            # Turn tag separators into spaces and add the tags as keywords
            cleaned = str(tags).replace(',', ' ').replace(';', ' ')
            parts.append(f"Keywords: {cleaned}")

        return ". ".join(parts)

    def _extension_rank(self, filename):
        """Sort key: position of the file's extension in IMAGE_EXTENSIONS."""
        return self.IMAGE_EXTENSIONS.index(os.path.splitext(filename)[1].lower())

    def get_image_path(self, idx):
        """Get local path to the image of the row at position ``idx``.

        Returns:
            Path of the first candidate file that passes ``Image.verify()``,
            or ``None`` when the image is unknown, missing or invalid.
        """
        row = self.df.iloc[idx]
        image_id = str(row['image_id'])

        # Check if image exists locally
        if image_id not in self.available_images:
            return None

        # Prefer the real filenames recorded by the folder scan; fall back to
        # probing lowercase extensions for instances built without that map.
        recorded = getattr(self, '_image_files', {}).get(image_id)
        if recorded:
            candidates = sorted(recorded, key=self._extension_rank)
        else:
            candidates = [f"{image_id}{ext}" for ext in self.IMAGE_EXTENSIONS]

        for filename in candidates:
            image_path = os.path.join(self.images_folder_path, filename)
            if not os.path.exists(image_path):
                continue
            # Verify it's a valid image
            try:
                with Image.open(image_path) as img:
                    img.verify()
                return image_path
            except Exception as e:
                print(f"Invalid image file {image_path}: {e}")

        return None

    def get_available_image_indices(self):
        """Get positional indices of rows that have a valid local image"""
        return [idx for idx in range(len(self.df)) if self.get_image_path(idx) is not None]

    def create_filtered_dataset(self):
        """Create a filtered dataset containing only rows with available images.

        Returns:
            A new ``NewsDataset`` (built without re-reading the CSV), or
            ``None`` when no row has a usable image.
        """
        available_indices = self.get_available_image_indices()

        if not available_indices:
            print("Warning: No images available! Cannot create filtered dataset.")
            return None

        filtered_df = self.df.iloc[available_indices].copy().reset_index(drop=True)

        # Bypass __init__ so the CSV is not loaded and scanned a second time
        filtered_dataset = NewsDataset.__new__(NewsDataset)
        filtered_dataset.df = filtered_df
        filtered_dataset.images_folder_path = self.images_folder_path
        filtered_dataset.available_images = self.available_images
        filtered_dataset._image_files = getattr(self, '_image_files', {})

        print(f"Created filtered dataset with {len(filtered_df)} articles (all have images)")

        return filtered_dataset

    def get_article_text(self, idx):
        """Get preprocessed text for the article at position ``idx``"""
        return self.df.iloc[idx]['combined_text']

    def get_article_id(self, idx):
        """Get article ID of the article at position ``idx``"""
        return self.df.iloc[idx]['article_id']

    def __len__(self):
        return len(self.df)

class NewsImageRetrieval(nn.Module):
    """CLIP backbone with linear projection heads for text and images.

    The CLIP encoders always run under ``torch.no_grad()``; only the
    projection heads (and the unused-here concept classifier) receive
    gradients. Both encoders return L2-normalised embeddings of width
    ``Config.EMBEDDING_DIM``.
    """

    def __init__(self, clip_model_name=Config.CLIP_MODEL_NAME):
        super().__init__()

        # Load CLIP model and processor
        self.clip_model = CLIPModel.from_pretrained(clip_model_name).to(device)  # Move to device here
        self.processor = CLIPProcessor.from_pretrained(clip_model_name)

        # Width of CLIP's shared text/image space, read from the checkpoint
        # config instead of assuming 512 so other CLIP variants also work.
        clip_dim = self.clip_model.config.projection_dim

        # Additional layers for news domain adaptation
        self.text_projection = nn.Linear(clip_dim, Config.EMBEDDING_DIM).to(device)
        self.image_projection = nn.Linear(clip_dim, Config.EMBEDDING_DIM).to(device)

        # Start both heads as identity maps. With independent random weights
        # the text and image heads would send CLIP's aligned features into
        # unrelated spaces, so a model used without a checkpoint would rank
        # images at random. A loaded state dict overwrites these values.
        for head in (self.text_projection, self.image_projection):
            nn.init.eye_(head.weight)
            nn.init.zeros_(head.bias)

        # News-specific concept classifier (optional enhancement)
        self.concept_classifier = nn.Linear(Config.EMBEDDING_DIM, 100)  # 100 news concepts

        self.dropout = nn.Dropout(0.1)
        self.temperature = 0.07  # Use fixed temperature

    def encode_text(self, texts):
        """Encode text using CLIP text encoder.

        Args:
            texts: a string or list of strings; truncated to
                ``Config.MAX_TEXT_LENGTH`` tokens by the processor.

        Returns:
            L2-normalised tensor with one row per text.
        """
        inputs = self.processor(text=texts, return_tensors="pt",
                                padding=True, truncation=True, max_length=Config.MAX_TEXT_LENGTH)
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Backbone is frozen: no gradients flow into CLIP itself
        with torch.no_grad():
            text_features = self.clip_model.get_text_features(**inputs)

        # Apply additional projection for news domain
        text_features = self.text_projection(text_features)
        text_features = F.normalize(text_features, p=2, dim=1)

        return text_features

    def encode_image(self, images):
        """Encode images using CLIP image encoder.

        Args:
            images: a list of PIL images, or a batched tensor. A tensor with
                values outside [0, 1] is min-max rescaled over the whole
                batch before being converted to PIL images.

        Returns:
            L2-normalised tensor with one row per image.
        """
        if isinstance(images, torch.Tensor):
            lowest, highest = images.min(), images.max()
            # If tensor is already normalized, bring it back to [0,1]
            if lowest < 0 or highest > 1:
                span = highest - lowest
                if span == 0:
                    # Constant batch: avoid 0/0 producing NaN pixels
                    images = torch.zeros_like(images, dtype=torch.float32)
                else:
                    images = (images - lowest) / span

            # Convert to list of PIL images (processor input format)
            to_pil = transforms.ToPILImage()
            images = [to_pil(img.cpu()) for img in images]

        inputs = self.processor(images=images, return_tensors="pt")
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Backbone is frozen: no gradients flow into CLIP itself
        with torch.no_grad():
            image_features = self.clip_model.get_image_features(**inputs)

        # Apply additional projection
        image_features = self.image_projection(image_features)
        image_features = F.normalize(image_features, p=2, dim=1)

        return image_features

    def forward(self, texts, images):
        """Forward pass for training: returns (text_features, image_features)"""
        text_features = self.encode_text(texts)
        image_features = self.encode_image(images)

        return text_features, image_features

    def compute_similarity(self, text_features, image_features):
        """Compute temperature-scaled cosine similarity, shape (n_texts, n_images)"""
        # Normalize features
        text_features = F.normalize(text_features, p=2, dim=1)
        image_features = F.normalize(image_features, p=2, dim=1)

        # Compute cosine similarity
        similarity = torch.matmul(text_features, image_features.T) / self.temperature
        return similarity

class YFCC100MHandler:
    """Handle YFCC100M dataset for image retrieval with proper directory structure.

    Attributes:
        yfcc_path: root directory that is scanned for images.
        image_paths: image id (filename stem) -> full file path.
        image_metadata: image id -> dict with path and directory-derived tags.
        image_embeddings: image id -> numpy embedding vector.
    """

    def __init__(self, yfcc_path):
        self.yfcc_path = yfcc_path
        self.image_embeddings = {}
        self.image_metadata = {}
        self.image_paths = {}  # Store mapping of image_id to full path

    def scan_yfcc_directory(self, max_images=None):
        """Scan YFCC100M directory structure and collect image paths.

        Directories and files are visited in sorted order so that a
        ``max_images`` cap selects the same subset on every run; os.walk
        itself makes no ordering promise.

        Args:
            max_images: stop after this many unique image ids (None = all).

        Returns:
            Dict mapping image id to full path (also stored on the handler).
        """
        print("Scanning YFCC100M directory structure...")

        image_paths = {}

        def limit_reached():
            return bool(max_images) and len(image_paths) >= max_images

        # Walk through the hierarchical directory structure
        for root, dirs, files in os.walk(self.yfcc_path):
            dirs.sort()  # in-place: controls the order os.walk descends in
            for file in sorted(files):
                if file.lower().endswith(('.jpg', '.jpeg', '.png')):
                    # Image ID is the filename without extension; a later
                    # file with the same stem replaces the earlier entry.
                    image_id = os.path.splitext(file)[0]
                    image_paths[image_id] = os.path.join(root, file)

                    if limit_reached():
                        break

            if limit_reached():
                break

        self.image_paths = image_paths
        print(f"Found {len(image_paths)} images in YFCC100M dataset")

        # Extract location information from directory structure (as fallback since no metadata)
        self.extract_location_metadata()

        return image_paths

    def extract_location_metadata(self):
        """Extract location/landmark information from directory paths.

        Every directory component between ``yfcc_path`` and the file, except
        the structural names test/train/calibration/images, becomes a tag.
        """
        print("Extracting location metadata from directory structure...")

        metadata = {}
        for image_id, full_path in self.image_paths.items():
            # Get relative path from yfcc_path
            rel_path = os.path.relpath(full_path, self.yfcc_path)
            path_parts = rel_path.split(os.sep)

            # Extract location/landmark information
            location_tags = []
            for part in path_parts[:-1]:  # Exclude filename
                if part not in ['test', 'train', 'calibration', 'images']:
                    # Clean up landmark names (replace underscores with spaces)
                    landmark = part.replace('_', ' ').title()
                    location_tags.append(landmark)

            metadata[image_id] = {
                'path': full_path,
                'relative_path': rel_path,
                'location_tags': location_tags,
                'landmark': ' '.join(location_tags) if location_tags else 'Unknown'
            }

        self.image_metadata = metadata
        print(f"Extracted metadata for {len(metadata)} images")

    def generate_image_embeddings(self, model, max_images=300000, batch_size=32):
        """Generate embeddings for YFCC100M images using actual image files.

        Args:
            model: object exposing ``encode_image(list_of_PIL_images)``.
            max_images: upper bound on the number of images processed.
            batch_size: number of images encoded per call.

        Returns:
            Dict mapping image id to numpy embedding (also stored on the
            handler). Unreadable images and failing batches are reported and
            skipped.
        """
        print(f"Generating embeddings for up to {max_images} YFCC100M images...")

        if not self.image_paths:
            print("No image paths found. Running directory scan first...")
            self.scan_yfcc_directory(max_images=max_images)

        embeddings = {}
        processed = 0

        # Process images in batches
        image_items = list(self.image_paths.items())[:max_images]

        for i in tqdm(range(0, len(image_items), batch_size), desc="Generating embeddings"):
            batch_items = image_items[i:i+batch_size]
            batch_images = []
            batch_ids = []

            # Load batch of images
            for image_id, image_path in batch_items:
                try:
                    # Context manager closes the file handle immediately;
                    # convert() returns an independent in-memory image.
                    with Image.open(image_path) as raw_image:
                        image = raw_image.convert('RGB')
                    batch_images.append(image)
                    batch_ids.append(image_id)
                except Exception as e:
                    print(f"Error loading image {image_path}: {e}")
                    continue

            # Generate embeddings for batch
            if batch_images:
                try:
                    with torch.no_grad():
                        batch_embeddings = model.encode_image(batch_images)

                    # Store embeddings
                    for j, image_id in enumerate(batch_ids):
                        embedding = batch_embeddings[j].cpu().numpy()
                        embeddings[image_id] = embedding
                        processed += 1

                except Exception as e:
                    print(f"Error generating embeddings for batch: {e}")

        self.image_embeddings = embeddings
        print(f"Generated embeddings for {processed} images")
        return embeddings

    def get_image_info(self, image_id):
        """Get the metadata dict for an image, or None when unknown"""
        return self.image_metadata.get(image_id)

    def save_embeddings(self, filepath):
        """Save embeddings, metadata and image paths to disk as one pickle"""
        data_to_save = {
            'embeddings': self.image_embeddings,
            'metadata': self.image_metadata,
            'image_paths': self.image_paths
        }

        with open(filepath, 'wb') as f:
            pickle.dump(data_to_save, f)
        print(f"Saved {len(self.image_embeddings)} embeddings and metadata to {filepath}")

    def load_embeddings(self, filepath):
        """Load embeddings and metadata from disk.

        SECURITY: pickle.load executes arbitrary code from the file. Only
        load files written by ``save_embeddings`` in a trusted environment.
        """
        with open(filepath, 'rb') as f:
            data = pickle.load(f)

        self.image_embeddings = data.get('embeddings', {})
        self.image_metadata = data.get('metadata', {})
        self.image_paths = data.get('image_paths', {})

        print(f"Loaded {len(self.image_embeddings)} embeddings and metadata from {filepath}")

    def copy_image_for_submission(self, image_id, output_path):
        """Copy and resize YFCC image for submission.

        The image is converted to RGB, resized to ``Config.TARGET_IMG_SIZE``
        (aspect ratio is not preserved) and written as PNG.

        Returns:
            True on success, False when the id is unknown or processing fails.
        """
        if image_id not in self.image_paths:
            print(f"Image {image_id} not found in YFCC dataset")
            return False

        source_path = self.image_paths[image_id]

        try:
            with Image.open(source_path) as img:
                # Convert to RGB if necessary
                if img.mode != 'RGB':
                    img = img.convert('RGB')

                # Resize to submission requirements (460x260)
                img_resized = img.resize(Config.TARGET_IMG_SIZE, Image.Resampling.LANCZOS)

                # Save as PNG
                img_resized.save(output_path, 'PNG')
            return True

        except Exception as e:
            print(f"Error processing image {source_path}: {e}")
            return False

class EfficientImageRetrieval:
    """Efficient image retrieval using FAISS for similarity search.

    Embeddings are L2-normalised and stored in an exact inner-product index,
    so returned scores are cosine similarities.
    """

    def __init__(self, embedding_dim=Config.EMBEDDING_DIM):
        self.embedding_dim = embedding_dim
        self.index = None
        self.image_ids = []

    def build_index(self, image_embeddings):
        """Build FAISS index for fast similarity search.

        Args:
            image_embeddings: dict mapping image id to a 1-D embedding of
                length ``self.embedding_dim``.

        Raises:
            ValueError: if the mapping is empty or the vectors do not form a
                ``(n, embedding_dim)`` matrix. Checked up front so the
                failure is readable instead of an internal FAISS error.
        """
        print("Building FAISS index...")

        if not image_embeddings:
            raise ValueError("Cannot build index: no image embeddings were provided.")

        # Convert embeddings to numpy array, keeping ids in the same order
        image_ids = list(image_embeddings.keys())
        embeddings_array = np.array([image_embeddings[img_id] for img_id in image_ids]).astype('float32')

        if embeddings_array.ndim != 2 or embeddings_array.shape[1] != self.embedding_dim:
            raise ValueError(
                f"Expected embeddings of shape (n, {self.embedding_dim}), "
                f"got {embeddings_array.shape}."
            )

        # Create FAISS index
        self.index = faiss.IndexFlatIP(self.embedding_dim)  # Inner product (cosine similarity)

        # Normalize embeddings for cosine similarity (in place)
        faiss.normalize_L2(embeddings_array)

        # Add embeddings to index
        self.index.add(embeddings_array)
        self.image_ids = image_ids

        print(f"Built index with {len(image_ids)} images")

    def search(self, query_embedding, k=Config.TOP_K_CANDIDATES):
        """Search for most similar images.

        Args:
            query_embedding: 1-D vector or ``(1, dim)`` array.
            k: maximum number of neighbours to return.

        Returns:
            List of ``{'image_id', 'similarity_score'}`` dicts for the first
            query row, best match first; at most ``min(k, index size)`` items.

        Raises:
            ValueError: if ``build_index`` has not been called.
        """
        if self.index is None:
            raise ValueError("Index not built. Call build_index first.")

        # Asking for more neighbours than stored vectors only yields padded
        # -1 entries, so clamp k to the index size.
        k = min(int(k), self.index.ntotal)
        if k <= 0:
            return []

        # astype copies, so the caller's array is not normalised in place
        query_embedding = query_embedding.astype('float32')
        if len(query_embedding.shape) == 1:
            query_embedding = query_embedding.reshape(1, -1)

        faiss.normalize_L2(query_embedding)

        # Search
        scores, indices = self.index.search(query_embedding, k)

        # FAISS marks missing neighbours with index -1; drop them
        return [
            {'image_id': self.image_ids[idx], 'similarity_score': float(score)}
            for score, idx in zip(scores[0], indices[0])
            if idx != -1
        ]

In [5]:
class NewsImageRetrievalPipeline:
    """Complete pipeline for news image retrieval.

    Typical order of calls: ``load_model`` -> ``setup_datasets`` ->
    ``prepare_retrieval_index`` -> ``process_evaluation_set``.
    """

    def __init__(self):
        self.model = None
        self.news_dataset = None
        self.yfcc_handler = None
        self.retrieval_engine = None

    def load_model(self, model_path=None):
        """Create the retrieval model and load weights from ``model_path`` if it exists"""
        self.model = NewsImageRetrieval()

        if model_path and os.path.exists(model_path):
            self.model.load_state_dict(torch.load(model_path, map_location=device))
            print(f"Loaded model from {model_path}")
        else:
            print("Using pre-trained CLIP model without fine-tuning")

        self.model.eval()
        self.model = self.model.to(device)

    def setup_datasets(self, news_csv_path, images_path, yfcc_path):
        """Setup news and YFCC datasets"""
        self.news_dataset = NewsDataset(news_csv_path, images_path)
        self.yfcc_handler = YFCC100MHandler(yfcc_path)

    def prepare_retrieval_index(self, embeddings_path=None):
        """Prepare FAISS index for efficient retrieval.

        Cached embeddings at ``embeddings_path`` are loaded when the file
        exists; otherwise they are generated with ``self.model`` and, if a
        path was given, saved there.
        """
        if embeddings_path and os.path.exists(embeddings_path):
            self.yfcc_handler.load_embeddings(embeddings_path)
        else:
            # Generate embeddings for YFCC100M dataset
            self.yfcc_handler.generate_image_embeddings(self.model)
            if embeddings_path:
                self.yfcc_handler.save_embeddings(embeddings_path)

        # Build retrieval index
        self.retrieval_engine = EfficientImageRetrieval()
        self.retrieval_engine.build_index(self.yfcc_handler.image_embeddings)

    def retrieve_image_for_article(self, article_text, top_k=1):
        """Retrieve best matching image for an article.

        Returns:
            The top result dict from the retrieval engine, or None when the
            search returns nothing.
        """
        # Encode article text
        with torch.no_grad():
            text_embedding = self.model.encode_text([article_text])

        # Convert to numpy
        text_embedding_np = text_embedding.cpu().numpy()[0]

        # Search for similar images
        results = self.retrieval_engine.search(text_embedding_np, k=top_k)

        return results[0] if results else None

    def _article_positions(self):
        """Map each article_id to the position of its first row in the dataset.

        Positions (not index labels) are returned because
        ``NewsDataset.get_article_text`` looks rows up with ``.iloc``.
        """
        positions = {}
        for position, article_id in enumerate(self.news_dataset.df['article_id']):
            positions.setdefault(article_id, position)
        return positions

    def process_evaluation_set(self, article_ids, output_dir, subtask="LARGE", group_name="CodingSoft"):
        """Process articles for evaluation submission.

        For every article id the best matching YFCC image is retrieved,
        resized and written to ``output_dir`` as
        ``{article_id}_{group_name}_CLIP.png``.

        Returns:
            Dict mapping article id to image id, similarity score and output
            path, for the articles that were written successfully.
        """
        os.makedirs(output_dir, exist_ok=True)

        results = {}
        successful_retrievals = 0

        # Built once: avoids rescanning the whole table for every article
        positions = self._article_positions()

        for article_id in tqdm(article_ids, desc=f"Processing {subtask} articles"):
            idx = positions.get(article_id)
            if idx is None:
                print(f"Article {article_id} not found in dataset")
                continue

            # Get basic article text (title + tags only)
            article_text = self.news_dataset.get_article_text(idx)

            # Retrieve best matching image
            best_match = self.retrieve_image_for_article(article_text)
            if not best_match:
                print(f"No matching image found for article {article_id}")
                continue

            # Copy and resize image for submission
            output_filename = f"{article_id}_{group_name}_CLIP.png"
            output_path = os.path.join(output_dir, output_filename)

            success = self.yfcc_handler.copy_image_for_submission(
                best_match['image_id'], output_path
            )
            if not success:
                print(f"Failed to copy image for article {article_id}")
                continue

            results[article_id] = {
                'image_id': best_match['image_id'],
                'similarity_score': best_match['similarity_score'],
                'output_path': output_path
            }
            successful_retrievals += 1

        print(f"Successfully processed {successful_retrievals}/{len(article_ids)} articles for {subtask}")
        return results

In [6]:
def create_submission_structure(group_name="CodingSoft"):
    """Create the submission folder tree under a directory named after the group.

    One sub-folder is created per (approach, subtask) run; folders that
    already exist are left untouched.
    """
    base_dir = f"{group_name}"

    run_folders = (
        "RET_CLIP_LARGE",
        "RET_CLIP_SMALL",
        "RET_ENHANCED_LARGE",
        "RET_ENHANCED_SMALL",
    )

    targets = [os.path.join(base_dir, folder) for folder in run_folders]
    for target in targets:
        os.makedirs(target, exist_ok=True)
        print(f"Created directory: {target}")

def resize_and_save_image(image_path, output_path, target_size=Config.TARGET_IMG_SIZE):
    """Write an RGB, ``target_size`` PNG copy of ``image_path`` to ``output_path``.

    Returns:
        True when the file was written, False if anything failed (the error
        is printed rather than raised).
    """
    try:
        with Image.open(image_path) as source:
            # PNG output is always RGB; convert only when the mode differs
            rgb_image = source if source.mode == 'RGB' else source.convert('RGB')

            # Stretch to the exact target dimensions and store as PNG
            scaled = rgb_image.resize(target_size, Image.Resampling.LANCZOS)
            scaled.save(output_path, 'PNG')
    except Exception as e:
        print(f"Error processing image {image_path}: {e}")
        return False
    return True

def generate_submission_files(results, group_name, approach_name, subtask):
    """Report the submission file names that would be produced for ``results``.

    This is a dry run: nothing is written, each planned path is only printed.

    Args:
        results: mapping of article id to a retrieval result.
        group_name: team name used for the root folder and in file names.
            (Previously this argument was overwritten with "CodingSoft".)
        approach_name: approach label, e.g. "CLIP".
        subtask: subtask label, e.g. "SMALL" or "LARGE".
    """
    output_dir = os.path.join(group_name, f"RET_{approach_name}_{subtask}")

    for article_id, _result in results.items():
        # Create output filename
        output_filename = f"{article_id}_{group_name}_{approach_name}.png"
        output_path = os.path.join(output_dir, output_filename)

        # In practice, you would retrieve the actual image from YFCC100M
        # and resize it to the required dimensions
        # For demo purposes, we'll create a placeholder

        print(f"Would save: {output_path}")
        # resize_and_save_image(source_image_path, output_path)

def setup_yfcc_dataset(yfcc_path, max_images=400000):
    """Build a YFCC100M handler for ``yfcc_path`` and scan it for images.

    Args:
        yfcc_path: root directory of the image collection.
        max_images: cap forwarded to the directory scan.

    Returns:
        The handler, with image paths and directory metadata populated.
    """
    print("Setting up YFCC100M dataset...")

    handler = YFCC100MHandler(yfcc_path)
    discovered = handler.scan_yfcc_directory(max_images=max_images)

    # No official metadata file ships with the data, so nothing else is loaded
    print("Skipping official metadata loading as no metadata file is provided.")

    print(f"YFCC100M setup complete: {len(discovered)} images ready")
    return handler

def generate_yfcc_embeddings(yfcc_handler, model, embeddings_path, max_images=300000, batch_size=16):
    """Generate embeddings for YFCC100M images and save them to disk.

    Args:
        yfcc_handler: handler exposing ``generate_image_embeddings`` and
            ``save_embeddings``.
        model: image encoder passed through to the handler.
        embeddings_path: file the embeddings are saved to.
        max_images: upper bound on the number of images embedded.
        batch_size: images per encoder call; lower it when GPU memory is
            tight. Defaults to the previously hard-coded value of 16.

    Returns:
        The embeddings mapping returned by the handler.
    """
    print("Generating YFCC100M embeddings...")

    # Generate embeddings using actual images
    embeddings = yfcc_handler.generate_image_embeddings(
        model=model,
        max_images=max_images,
        batch_size=batch_size
    )

    # Save embeddings
    yfcc_handler.save_embeddings(embeddings_path)

    return embeddings

def main_inference():
    """Run retrieval for the SMALL and LARGE evaluation sets.

    Loads the saved model, wires up the news and YFCC datasets, prepares the
    FAISS index (from cached embeddings when present) and writes one image
    per article under ``Config.RESULTS_PATH``.

    Returns:
        Tuple ``(small_results, large_results)``.
    """
    print("Starting NewsImages Retrieval Inference...")

    pipeline = NewsImageRetrievalPipeline()

    # Saved checkpoint; load_model falls back to the plain model if it is absent
    checkpoint = os.path.join("/kaggle/input/news_retrieval_model_1/pytorch/default/1", "news_retrieval_model.pth")
    pipeline.load_model(checkpoint)

    pipeline.setup_datasets(
        news_csv_path="/kaggle/input/newsimagedataset-v2/newsarticles.csv",
        images_path="/kaggle/input/newsimagedataset-v2/newsimages",
        yfcc_path="/kaggle/input/yfcc100m-dataset/OANet/yfcc100m",  # Replace with actual YFCC100M path
    )

    index_cache = os.path.join(Config.YFCC_EMBEDDINGS_PATH, "yfcc_embeddings.pkl")
    pipeline.prepare_retrieval_index(index_cache)

    # Evaluation ids will be provided by the organizers; until then the first
    # rows of the dataset stand in for each subtask.
    article_ids = pipeline.news_dataset.df['article_id']
    eval_ids = {
        "SMALL": list(article_ids.head(50)),
        "LARGE": list(article_ids.head(8500)),
    }

    outcomes = {}
    for subtask in ("SMALL", "LARGE"):
        print(f"Processing {subtask} evaluation set...")
        outcomes[subtask] = pipeline.process_evaluation_set(
            article_ids=eval_ids[subtask],
            output_dir=os.path.join(Config.RESULTS_PATH, f"RET_CLIP_{subtask}"),
            subtask=subtask,
        )

    print("Inference completed!")
    return outcomes["SMALL"], outcomes["LARGE"]

def full_pipeline_example():
    """Complete example of the full pipeline without training.

    Steps performed, in order:
      1. Create the submission directory structure for "CodingSoft".
      2. (Skipped) training, since a saved model is assumed to exist.
      3. Set up the YFCC100M handler via ``setup_yfcc_dataset``.
      4. Generate YFCC100M embeddings, but only when the embeddings file is
         not already on disk.
      5. Run ``main_inference()``.

    Returns:
        Whatever ``main_inference()`` returns.
    """
    print("NewsImages 2025 - Complete Pipeline Example (Skipping Training)")
    print("=" * 50)

    # Step 1: Setup directories
    create_submission_structure("CodingSoft")

    # Skip Step 2: Training (assuming model is already saved)
    print("\n🔹 Skipping Step 2: Model training (using saved model)")

    # Step 3: Setup YFCC100M dataset
    print("\n🔹 Step 3: Setting up YFCC100M dataset...")
    yfcc_path = "/kaggle/input/yfcc100m-dataset/OANet/yfcc100m"  # Update this path
    yfcc_handler = setup_yfcc_dataset(yfcc_path, max_images=400000)

    # Step 4: Generate embeddings
    print("\n🔹 Step 4: Generating embeddings...")
    embeddings_path = os.path.join(Config.YFCC_EMBEDDINGS_PATH, "yfcc_embeddings.pkl")

    if not os.path.exists(embeddings_path):
        # Build the model only on a cache miss: when the embeddings file
        # already exists the model is never used, so constructing it (and
        # moving it to the device) up front was wasted work.
        # NOTE(review): this model is freshly constructed and does not load
        # the checkpoint that main_inference() loads. If that checkpoint
        # changes the image-embedding weights, the cached gallery embeddings
        # and the query embeddings come from different weights. Confirm.
        model = NewsImageRetrieval().to(device)
        generate_yfcc_embeddings(yfcc_handler, model, embeddings_path, max_images=300000)
        # Drop the reference so the model can be reclaimed before inference
        # builds its own pipeline.
        del model

    # Step 5: Run inference
    print("\n🔹 Step 5: Running inference...")
    results = main_inference()

    print("\n🎉 Pipeline completed successfully!")
    return results

if __name__ == "__main__":
    # Startup banner, written in one call; the emitted text is identical to
    # printing each entry on its own line.
    print("\n".join([
        "NewsImages 2025 - Image Retrieval System",
        "Directory structure detected:",
        "natural_history_museum/",
        "├── train/",
        "│ ├── images/",
        "│ │ ├── 11737074_...",
        "│ │ ├── 12308432_...",
        "│ │ └── ...",
        "│ └── calibration/",
        "└── test/",
        "\n" + "=" * 50,
    ]))

    # Update these paths according to your setup.
    # They are only echoed below; full_pipeline_example() is called without
    # arguments, so it does not receive them.
    YFCC_PATH = "/kaggle/input/yfcc100m-dataset/OANet/yfcc100m"  # Update this
    NEWS_CSV_PATH = "/kaggle/input/newsimagedataset-v2/newsarticles.csv"  # Update this

    print(
        f"Expected YFCC100M path: {YFCC_PATH}",
        f"Expected news CSV path: {NEWS_CSV_PATH}",
        "\nRunning the inference pipeline:",
        sep="\n",
    )
    full_pipeline_example()

NewsImages 2025 - Image Retrieval System
Directory structure detected:
natural_history_museum/
├── train/
│ ├── images/
│ │ ├── 11737074_...
│ │ ├── 12308432_...
│ │ └── ...
│ └── calibration/
└── test/

Expected YFCC100M path: /kaggle/input/yfcc100m-dataset/OANet/yfcc100m
Expected news CSV path: /kaggle/input/newsimagedataset-v2/newsarticles.csv

Running the inference pipeline:
NewsImages 2025 - Complete Pipeline Example (Skipping Training)
Created directory: CodingSoft/RET_CLIP_LARGE
Created directory: CodingSoft/RET_CLIP_SMALL
Created directory: CodingSoft/RET_ENHANCED_LARGE
Created directory: CodingSoft/RET_ENHANCED_SMALL

🔹 Skipping Step 2: Model training (using saved model)

🔹 Step 3: Setting up YFCC100M dataset...
Setting up YFCC100M dataset...
Scanning YFCC100M directory structure...
Found 36179 images in YFCC100M dataset
Extracting location metadata from directory structure...
Extracted metadata for 36179 images
Skipping official metadata loading as no metadata file is provide

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

Generating YFCC100M embeddings...
Generating embeddings for up to 300000 YFCC100M images...


Generating embeddings: 100%|██████████| 2262/2262 [12:35<00:00,  2.99it/s]


Generated embeddings for 36179 images
Saved 36179 embeddings and metadata to /kaggle/working/yfcc_embeddings/yfcc_embeddings.pkl

🔹 Step 5: Running inference...
Starting NewsImages Retrieval Inference...
Loaded model from /kaggle/input/news_retrieval_model_1/pytorch/default/1/news_retrieval_model.pth
Loaded 8500 news articles
Images folder: /kaggle/input/newsimagedataset-v2/newsimages
Columns: ['article_id', 'article_url', 'article_title', 'article_tags', 'image_id', 'image_url']
Found 8500 image files in folder
Dataset has 8500 unique image IDs
Available images for dataset: 8500
Missing images: 0
Loaded 36179 embeddings and metadata from /kaggle/working/yfcc_embeddings/yfcc_embeddings.pkl
Building FAISS index...
Built index with 36179 images
Processing SMALL evaluation set...


Processing SMALL articles: 100%|██████████| 50/50 [00:03<00:00, 12.69it/s]


Successfully processed 50/50 articles for SMALL
Processing LARGE evaluation set...


Processing LARGE articles: 100%|██████████| 8500/8500 [10:11<00:00, 13.91it/s]


Successfully processed 8500/8500 articles for LARGE
Inference completed!

🎉 Pipeline completed successfully!


In [7]:
import shutil

# Folder whose contents are packed into the archive.
source_dir = "/kaggle/working/results"

# base_name is the destination path without the ".zip" suffix; make_archive
# appends the extension itself and returns the final archive path.
shutil.make_archive(base_name="/kaggle/working/CodingSoft", format="zip", root_dir=source_dir)


'/kaggle/working/CodingSoft.zip'