# LeMiL-ViT Project Setup

This notebook sets up the environment and dependencies for the LeMiL-ViT project on Kaggle.
It installs required packages, configures GPU support, and verifies the dataset structure.


In [None]:
# ====================
# Install Dependencies
# ====================
import subprocess
import sys

def install_package(package):
    """Install ``package`` with pip, using the interpreter that runs this notebook.

    Invoking pip as ``<sys.executable> -m pip`` targets the environment of the
    current Python process rather than whichever ``pip`` is first on PATH.

    Raises:
        subprocess.CalledProcessError: if the pip process exits with a non-zero status.
    """
    pip_command = [sys.executable, "-m", "pip", "install", package]
    subprocess.check_call(pip_command)

# pip distribution names of everything the project needs
required_packages = [
    "torch",
    "torchvision",
    "timm",
    "transformers",
    "pandas",
    "scikit-learn",
    "matplotlib",
    "Pillow"
]

# Distributions whose importable module name differs from the pip name.
# Probing the pip name for these always raises ImportError, which would
# trigger a pointless reinstall on every run of this cell.
import_names = {
    "scikit-learn": "sklearn",
    "Pillow": "PIL",
}

print("Installing required packages...")
for package in required_packages:
    # Fall back to the pip name when the module is named the same way
    module_name = import_names.get(package, package)
    try:
        __import__(module_name)
        print(f"✓ {package} already installed")
    except ImportError:
        print(f"Installing {package}...")
        install_package(package)
        print(f"✓ {package} installed successfully")

# ====================
# Import Libraries
# ====================
import os
import random
import warnings
import numpy as np
import torch
import torch.nn as nn
import torchvision
import pandas as pd
from pathlib import Path
# Suppress every warning category raised after this point to keep cell output short.
# NOTE(review): this also hides deprecation and numerical warnings from the
# imported libraries — confirm that a blanket filter is really wanted.
warnings.filterwarnings('ignore')

# Reaching this line means none of the imports above raised.
print("\nAll required packages imported successfully!")

In [None]:
# ====================
# Configure Settings
# ====================

# Input and output locations on the Kaggle filesystem
DATASET_ROOT = Path("/kaggle/input/lemit-vit")
OUTPUT_ROOT = Path("/kaggle/working/lemit-vit")

# Each split sits in a folder nested inside a folder of the same name
TRAIN_DIR = DATASET_ROOT.joinpath("train_small/train_small")
VAL_DIR = DATASET_ROOT.joinpath("val_small/val_small")

# Make sure the output location is present; no error if it already is
OUTPUT_ROOT.mkdir(parents=True, exist_ok=True)

print(
    "Project directories configured:",
    f"Dataset root: {DATASET_ROOT}",
    f"Output root: {OUTPUT_ROOT}",
    sep="\n",
)

In [None]:
# ====================
# Set Random Seeds
# ====================

SEED = 42

def set_seeds(seed):
    """Seed the Python, NumPy and PyTorch random number generators.

    When CUDA is available the CUDA generators are seeded as well, and cuDNN
    is switched to deterministic mode with benchmarking turned off.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    # Nothing GPU-specific to configure on a CPU-only machine
    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

set_seeds(SEED)
print(f"Random seeds set to {SEED} for reproducibility")

In [None]:
# ====================
# Check GPU
# ====================

def get_gpu_info():
    """Return a one-line description of the GPU situation.

    Reports the name of CUDA device 0 together with the number of visible
    devices, or a fixed CPU-only message when CUDA is not available.
    """
    if not torch.cuda.is_available():
        return "No GPU available, running on CPU"

    device_total = torch.cuda.device_count()
    first_device_name = torch.cuda.get_device_name(0)
    return f"GPU available: {first_device_name} (Count: {device_total})"

# Use CUDA when it is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("\nDevice configuration:")
print(f"PyTorch device: {device}")
print(get_gpu_info())
print(f"CUDA available: {torch.cuda.is_available()}")

In [None]:
# ====================
# Verify Dataset
# ====================

def check_dataset_structure(train_dir=None, val_dir=None, dataset_root=None,
                            image_pattern="*.png"):
    """Verify the dataset layout and print a short summary.

    Called with no arguments it checks the module-level TRAIN_DIR, VAL_DIR and
    DATASET_ROOT, exactly as before. The optional arguments allow the same
    check to be pointed at another location.

    Args:
        train_dir: Directory holding the training images (default: TRAIN_DIR).
        val_dir: Directory holding the validation images (default: VAL_DIR).
        dataset_root: Directory whose tree is printed (default: DATASET_ROOT).
        image_pattern: Glob pattern used to count images in each split.

    Raises:
        FileNotFoundError: if a split directory is missing or is not a
            directory, or if ``train_small.csv`` / ``val_small.csv`` is not
            present next to the corresponding split directory.
    """
    train_dir = Path(TRAIN_DIR if train_dir is None else train_dir)
    val_dir = Path(VAL_DIR if val_dir is None else val_dir)
    dataset_root = Path(DATASET_ROOT if dataset_root is None else dataset_root)

    # is_dir() rather than exists(): a stray file at the path must not pass
    # as a split directory and then be reported as "0 images".
    if not train_dir.is_dir():
        raise FileNotFoundError(f"Training directory not found at {train_dir}")
    if not val_dir.is_dir():
        raise FileNotFoundError(f"Validation directory not found at {val_dir}")

    # Count images without keeping the full path lists around
    train_count = sum(1 for _ in train_dir.glob(image_pattern))
    val_count = sum(1 for _ in val_dir.glob(image_pattern))

    # The label files sit next to each split directory, not inside it
    train_csv = train_dir.parent / "train_small.csv"
    val_csv = val_dir.parent / "val_small.csv"

    if not train_csv.is_file():
        raise FileNotFoundError(f"train_small.csv not found at {train_csv}")
    if not val_csv.is_file():
        raise FileNotFoundError(f"val_small.csv not found at {val_csv}")

    print("\nDataset Structure Verification:")
    print(f"Training images found: {train_count}")
    print(f"Validation images found: {val_count}")
    print(f"Training CSV: {train_csv.exists()}")
    print(f"Validation CSV: {val_csv.exists()}")

    # An empty split usually means the pattern does not match the image
    # format on disk; say so instead of silently reporting zero.
    for split_dir, count in ((train_dir, train_count), (val_dir, val_count)):
        if count == 0:
            print(f"Warning: no files matching {image_pattern} found in {split_dir}")

    def print_tree(path, level=0, max_level=2):
        """Print ``path`` and its children down to ``max_level`` levels deep."""
        if level > max_level:
            return
        prefix = "│   " * level
        print(f"{prefix}├── {path.name}")
        if path.is_dir():
            # iterdir() yields entries in arbitrary order; sort for stable output
            for item in sorted(path.iterdir()):
                print_tree(item, level + 1, max_level)

    print("\nDataset Directory Structure:")
    print_tree(dataset_root)

# Run the verification now: a missing split directory or CSV raises
# FileNotFoundError here and stops the notebook before any later cell runs.
check_dataset_structure()

In [None]:
# ====================
# Setup Complete
# ====================

# Assemble the whole summary first, then emit it with a single print
banner = "=" * 50
summary_lines = [
    "\n" + banner,
    "Environment Setup Complete!",
    banner,
    "\nSummary:",
    f"• Device: {device}",
    f"• GPU Support: {torch.cuda.is_available()}",
    f"• Random Seed: {SEED}",
    f"• Dataset Root: {DATASET_ROOT}",
    f"• Output Root: {OUTPUT_ROOT}",
    banner,
]
print("\n".join(summary_lines))