In [None]:
import os
import pandas as pd
import numpy as np
from PIL import Image
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import timm
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import precision_score, recall_score, f1_score
from skimage.feature import local_binary_pattern
import json
import ast
import random
import logging
from typing import List, Dict, Tuple, Any, Optional, Union
from transformers import PreTrainedModel, PretrainedConfig
from huggingface_hub import HfApi, create_repo, upload_folder, login
from datasets import load_dataset
import warnings
from tqdm import tqdm
import io


# Setup logging
def setup_logging(level: str = 'INFO', log_file: Optional[str] = None) -> logging.Logger:
    """Setup logging configuration."""
    logger = logging.getLogger('medical_cui_pipeline')
    logger.setLevel(getattr(logging, level.upper()))
    
    # Clear existing handlers
    logger.handlers.clear()
    
    # Console handler
    console_handler = logging.StreamHandler()
    console_formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    )
    console_handler.setFormatter(console_formatter)
    logger.addHandler(console_handler)
    
    # File handler if specified
    if log_file:
        file_handler = logging.FileHandler(log_file)
        file_handler.setFormatter(console_formatter)
        logger.addHandler(file_handler)
    
    return logger

class MedicalImageDataset(Dataset):
    """Custom Dataset for medical images with CUI labels."""
    
    def __init__(
        self, 
        parquet_path: str, 
        transform: Optional[Any] = None, 
        is_training: bool = True,
        device: str = 'cpu',
        logger: Optional[logging.Logger] = None
    ):

        self.df = pd.read_parquet(parquet_path)

        self.transform = transform
        self.is_training = is_training
        self.device = device
        self.logger = logger or logging.getLogger(__name__)
        
        if is_training:
            self._process_cui_labels()
        
        self.logger.info(f"Loaded dataset with {len(self.df)} samples from {parquet_path}")
        
    def _process_cui_labels(self) -> None:
        all_cuis = []
        for cui_list in self.df['cui']:
            if isinstance(cui_list, str):
                try:
                    cui_list = ast.literal_eval(cui_list) if cui_list.startswith('[') else [cui_list]
                except (ValueError, SyntaxError):
                    self.logger.warning(f"Failed to parse CUI list: {cui_list}")
                    cui_list = [cui_list]
            all_cuis.extend(cui_list)
        
        unique_cuis = list(set(all_cuis))
        self.mlb = MultiLabelBinarizer()
        self.mlb.fit([unique_cuis])
        
        cui_labels = []
        for cui_list in self.df['cui']:
            if isinstance(cui_list, str):
                try:
                    cui_list = ast.literal_eval(cui_list) if cui_list.startswith('[') else [cui_list]
                except (ValueError, SyntaxError):
                    cui_list = [cui_list]
            cui_labels.append(cui_list)
        
        self.labels = self.mlb.transform(cui_labels)
        self.logger.info(f"Processed {len(unique_cuis)} unique CUI labels")
        
    def set_mlb(self, mlb: MultiLabelBinarizer) -> None:
        self.mlb = mlb
        cui_labels = []
        for cui_list in self.df['cui']:
            if isinstance(cui_list, str):
                try:
                    cui_list = ast.literal_eval(cui_list) if cui_list.startswith('[') else [cui_list]
                except (ValueError, SyntaxError):
                    cui_list = [cui_list]
            cui_labels.append(cui_list)
        self.labels = self.mlb.transform(cui_labels)
    
    def __len__(self) -> int:
        return len(self.df)
    
    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
        max_retries = 3
        for attempt in range(max_retries):
            try:
                image_bytes = self.df.loc[idx]['image']['bytes']
                image = Image.open(io.BytesIO(image_bytes)).convert('RGB')
                img_array = np.array(image)
                
                # Convert to grayscale using standard weights
                gray = np.dot(img_array[..., :3], [0.299, 0.587, 0.114])
                
                # Normalize and convert to uint8 to avoid LBP warning
                gray_uint8 = np.clip(gray, 0, 255).astype(np.uint8)
                
                radius = 3
                n_points = 8 * radius
                lbp = local_binary_pattern(gray_uint8, n_points, radius, method='uniform')
                
                if lbp.max() > lbp.min():
                    lbp_normalized = ((lbp - lbp.min()) / (lbp.max() - lbp.min()) * 255).astype(np.uint8)
                else:
                    lbp_normalized = np.zeros_like(lbp, dtype=np.uint8)
                img_4channel = np.dstack([img_array, lbp_normalized])
                img_4channel = Image.fromarray(img_4channel.astype(np.uint8))
                
                if self.transform:
                    img_tensor = self.transform(img_4channel)
                else:
                    img_tensor = transforms.ToTensor()(img_4channel)
                
                label = torch.FloatTensor(self.labels[idx])
                return img_tensor, label

            except Exception as e:
                self.logger.warning(f"Failed to load image at index {idx} (attempt {attempt + 1}): {e}")
                if attempt == max_retries - 1:
                    self.logger.error(f"Skipping sample at index {idx} after {max_retries} attempts")
                    dummy_tensor = torch.zeros((4, 224, 224))
                    dummy_label = torch.zeros(len(self.mlb.classes_))
                    return dummy_tensor, dummy_label
                idx = (idx + 1) % len(self.df)


class FourChannelTransform:
    """Custom transform for 4-channel images."""
    
    def __init__(self, size: Tuple[int, int] = (224, 224)):
        self.size = size
        # ImageNet stats for RGB channels, mean for LBP channel
        self.mean = [0.485, 0.456, 0.406, 0.5]
        self.std = [0.229, 0.224, 0.225, 0.25]
    
    def __call__(self, img: Image.Image) -> torch.Tensor:
        # Resize
        img = img.resize(self.size, Image.Resampling.BILINEAR)
        
        # Convert to tensor
        img_array = np.array(img)
        img_tensor = torch.from_numpy(img_array).permute(2, 0, 1).float() / 255.0
        
        # Normalize
        for i in range(4):
            img_tensor[i] = (img_tensor[i] - self.mean[i]) / self.std[i]
        
        return img_tensor  # ✅ stays on CPU


class ViTConfig(PretrainedConfig):
    """Configuration class for ViT model."""
    
    model_type = "vit_medical_cui"
    
    def __init__(
        self,
        num_labels: int = 100,
        model_name: str = "vit_base_patch16_224",
        **kwargs
    ):
        super().__init__(**kwargs)
        self.num_labels = num_labels
        self.model_name = model_name

class MedicalViTModel(PreTrainedModel):
    """Modified ViT model for 4-channel input and multi-label classification."""
    
    config_class = ViTConfig
    
    def __init__(self, config: ViTConfig):
        super().__init__(config)
        self.config = config
        
        # Load pretrained ViT
        self.vit = timm.create_model(
            config.model_name, 
            pretrained=True,
            num_classes=0  # Remove classification head
        )
        
        # Modify first conv layer for 4 channels
        original_conv = self.vit.patch_embed.proj
        self.vit.patch_embed.proj = nn.Conv2d(
            4,  # 4 input channels (RGB + LBP)
            original_conv.out_channels,
            kernel_size=original_conv.kernel_size,
            stride=original_conv.stride,
            padding=original_conv.padding,
            bias=original_conv.bias is not None
        )
        
        # Initialize new weights
        with torch.no_grad():
            # Copy RGB weights and average them for the 4th channel
            rgb_weights = original_conv.weight.data
            self.vit.patch_embed.proj.weight.data[:, :3, :, :] = rgb_weights
            self.vit.patch_embed.proj.weight.data[:, 3:4, :, :] = rgb_weights.mean(dim=1, keepdim=True)
        
        # Add classification head
        self.classifier = nn.Sequential(
            nn.Linear(self.vit.num_features, 512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, config.num_labels)
        )
        
    def forward(
        self, 
        pixel_values: torch.Tensor, 
        labels: Optional[torch.Tensor] = None
    ) -> Dict[str, torch.Tensor]:
        # Extract features using ViT
        features = self.vit(pixel_values)
        
        # Classification
        logits = self.classifier(features)
        
        output = {"logits": logits}
        
        if labels is not None:
            loss_fn = nn.BCEWithLogitsLoss()
            loss = loss_fn(logits, labels)
            output["loss"] = loss
            
        return output

class MedicalImageTrainer:
    """Trainer class for medical image classification."""

    def __init__(
        self,
        model,
        train_loader: DataLoader,
        val_loader: DataLoader,
        device: str = 'cuda' if torch.cuda.is_available() else 'cpu',
        logger: Optional[logging.Logger] = None,
        checkpoint_dir: str = "./checkpoints",
        threshold: float = 0.5,
        grad_accum_steps: int = 1,
        early_stop_patience: int = 3,
        use_amp: bool = True,
    ):
        self.model = model.to(device)
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.device = device
        self.logger = logger or logging.getLogger(__name__)
        self.checkpoint_dir = checkpoint_dir
        self.threshold = threshold
        self.grad_accum_steps = grad_accum_steps
        self.early_stop_patience = early_stop_patience
        self.use_amp = use_amp and (device != 'cpu')  # AMP only on GPU

        os.makedirs(checkpoint_dir, exist_ok=True)

        self.optimizer = optim.AdamW(
            self.model.parameters(),
            lr=3e-5,
            weight_decay=0.01
        )
        self.scheduler = optim.lr_scheduler.CosineAnnealingLR(
            self.optimizer,
            T_max=len(train_loader) * 10  # Assuming 10 epochs
        )

        self.scaler = torch.amp.GradScaler() if self.use_amp else None

        self.train_losses = []
        self.val_losses = []
        self.val_metrics = []
        self.best_f1 = 0.0

        self.early_stop_counter = 0

    def train_epoch(self) -> float:
        """Train for one epoch."""
        self.model.train()
        total_loss = 0.0
        num_batches = len(self.train_loader)
        self.optimizer.zero_grad()

        pbar = tqdm(total=num_batches, desc="Training", leave=False)

        for batch_idx, (images, labels) in enumerate(self.train_loader):
            images, labels = images.to(self.device), labels.to(self.device)

            if self.use_amp:
                with torch.amp.autocast(device_type=self.device):
                    outputs = self.model(pixel_values=images, labels=labels)
                    loss = outputs["loss"] / self.grad_accum_steps
                self.scaler.scale(loss).backward()
            else:
                outputs = self.model(pixel_values=images, labels=labels)
                loss = outputs["loss"] / self.grad_accum_steps
                loss.backward()

            if (batch_idx + 1) % self.grad_accum_steps == 0 or (batch_idx + 1) == num_batches:
                if self.use_amp:
                    self.scaler.step(self.optimizer)
                    self.scaler.update()
                else:
                    self.optimizer.step()
                self.scheduler.step()
                self.optimizer.zero_grad()

            total_loss += loss.item() * self.grad_accum_steps  # multiply back for logging

            pbar.update(1)
            pbar.set_postfix({'Loss': f'{loss.item() * self.grad_accum_steps:.4f}'})

            if batch_idx % 10 == 0:
                self.logger.info(f'Batch {batch_idx}/{num_batches}, Loss: {loss.item() * self.grad_accum_steps:.4f}')

        pbar.close()
        avg_loss = total_loss / num_batches
        self.train_losses.append(avg_loss)
        return avg_loss

    def validate(self) -> Dict[str, float]:
        """Validate the model."""
        self.model.eval()
        total_loss = 0.0
        all_preds = []
        all_labels = []

        with torch.no_grad():
            for images, labels in self.val_loader:
                images, labels = images.to(self.device), labels.to(self.device)

                outputs = self.model(pixel_values=images, labels=labels)
                loss = outputs["loss"]

                total_loss += loss.item()

                preds = torch.sigmoid(outputs["logits"]) > self.threshold
                all_preds.append(preds.cpu().numpy())
                all_labels.append(labels.cpu().numpy())

        all_preds = np.vstack(all_preds)
        all_labels = np.vstack(all_labels)

        precision = precision_score(all_labels, all_preds, average='macro', zero_division=0)
        recall = recall_score(all_labels, all_preds, average='macro', zero_division=0)
        f1 = f1_score(all_labels, all_preds, average='macro', zero_division=0)

        avg_loss = total_loss / len(self.val_loader)
        self.val_losses.append(avg_loss)

        metrics = {
            'val_loss': avg_loss,
            'precision': precision,
            'recall': recall,
            'f1_score': f1
        }
        self.val_metrics.append(metrics)

        # Save checkpoint if F1 improved
        if f1 > self.best_f1:
            self.best_f1 = f1
            self.save_checkpoint(metrics, is_best=True)
            self.logger.info(f"New best F1 score: {f1:.4f} - Model checkpoint saved")
            self.early_stop_counter = 0
        else:
            self.early_stop_counter += 1

        return metrics

    def save_checkpoint(self, metrics: Dict[str, float], is_best: bool = False) -> None:
        """Save model checkpoint."""
        checkpoint = {
            'model_state_dict': self.model.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
            'scheduler_state_dict': self.scheduler.state_dict(),
            'metrics': metrics,
            'train_losses': self.train_losses,
            'val_losses': self.val_losses,
            'val_metrics': self.val_metrics
        }

        if is_best:
            checkpoint_path = os.path.join(self.checkpoint_dir, 'best_model.pt')
        else:
            checkpoint_path = os.path.join(self.checkpoint_dir, 'latest_model.pt')

        torch.save(checkpoint, checkpoint_path)

    def save_training_history(self, filepath: str = "./training_history.json") -> None:
        """Save training history to JSON file."""
        history = {
            'train_losses': self.train_losses,
            'val_losses': self.val_losses,
            'val_metrics': self.val_metrics,
            'best_f1_score': self.best_f1
        }

        with open(filepath, 'w') as f:
            json.dump(history, f, indent=2)

        self.logger.info(f"Training history saved to {filepath}")

    def train(self, num_epochs: int = 10) -> None:
        """Full training loop."""
        self.logger.info(f"Training on {self.device}")

        for epoch in range(num_epochs):
            self.logger.info(f"\nEpoch {epoch+1}/{num_epochs}")
            self.logger.info("-" * 50)

            # Train
            train_loss = self.train_epoch()
            self.logger.info(f"Training Loss: {train_loss:.4f}")

            # Log current LR
            current_lr = self.optimizer.param_groups[0]['lr']
            self.logger.info(f"Learning Rate: {current_lr:.6f}")

            # Validate
            val_metrics = self.validate()
            self.logger.info(f"Validation Loss: {val_metrics['val_loss']:.4f}")
            self.logger.info(f"Precision: {val_metrics['precision']:.4f}")
            self.logger.info(f"Recall: {val_metrics['recall']:.4f}")
            self.logger.info(f"F1-Score: {val_metrics['f1_score']:.4f}")

            # Early stopping check
            if self.early_stop_counter >= self.early_stop_patience:
                self.logger.info(f"Early stopping triggered after {self.early_stop_patience} epochs with no improvement.")
                break

        # Save final training history
        self.save_training_history()


def set_seeds(seed: int = 42) -> None:
    """Set seeds for reproducibility."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

def create_data_loaders(
    train_parquet: str,
    val_parquet: str,
    batch_size: int = 16,
    device: str = 'cuda' if torch.cuda.is_available() else 'cpu',
    seed: Optional[int] = None,
    logger: Optional[logging.Logger] = None
) -> Tuple[DataLoader, DataLoader, MultiLabelBinarizer]:
    """Create training and validation data loaders."""
    
    if logger is None:
        logger = logging.getLogger(__name__)
    
    # Set seeds if provided
    if seed is not None:
        set_seeds(seed)
        logger.info(f"Seeds set to {seed} for reproducibility")
    
    # Transform
    transform = FourChannelTransform(size=(224, 224))
    
    # Datasets
    train_dataset = MedicalImageDataset(
        train_parquet, 
        transform=transform, 
        is_training=True,
        device=device,
        logger=logger
    )
    val_dataset = MedicalImageDataset(
        val_parquet, 
        transform=transform, 
        is_training=False,
        device=device,
        logger=logger
    )
    
    # Share the multilabel binarizer
    val_dataset.set_mlb(train_dataset.mlb)
    
    # Data loaders
    train_loader = DataLoader(
        train_dataset, 
        batch_size=batch_size, 
        shuffle=True, 
        num_workers=4,
        pin_memory=True
    )
    val_loader = DataLoader(
        val_dataset, 
        batch_size=batch_size, 
        shuffle=False, 
        num_workers=4,
        pin_memory=True
    )
    
    logger.info(f"Created data loaders with batch size {batch_size}")
    logger.info(f"Training samples: {len(train_dataset)}, Validation samples: {len(val_dataset)}")
    
    return train_loader, val_loader, train_dataset.mlb

def push_to_huggingface(
    model: MedicalViTModel,
    mlb: MultiLabelBinarizer,
    repo_name: str,
    model_card_data: Dict[str, Any],
    local_dir: str = "./medical_vit_model",
    logger: Optional[logging.Logger] = None
) -> None:
    """Push model to Hugging Face Hub."""
    
    if logger is None:
        logger = logging.getLogger(__name__)
    
    
    # Create local directory
    os.makedirs(local_dir, exist_ok=True)
    
    # Save model
    model.save_pretrained(local_dir)
    logger.info(f"Model saved to {local_dir}")
    
    # Save multilabel binarizer
    mlb_classes = mlb.classes_.tolist()
    with open(os.path.join(local_dir, "cui_classes.json"), "w") as f:
        json.dump({"classes": mlb_classes}, f, indent=2)
    
    # Create model card
    model_card = f"""---
tags:
- medical-imaging
- ViT
- LBP
- multi-label-classification
- CUIs
- vision-transformer
- pytorch
- medical-ai
language:
- en
library_name: transformers
pipeline_tag: image-classification
datasets:
- medical-cui-dataset
metrics:
- precision
- recall
- f1
---

# Medical Image CUI Classification Model

## Model Description

This model is a Vision Transformer (ViT) fine-tuned for multi-label classification of medical images with Concept Unique Identifiers (CUIs). The model uses a 4-channel input (RGB + Local Binary Pattern) for enhanced feature extraction from medical images.

## Model Architecture

- **Base Model**: {model_card_data.get('base_model', 'vit_base_patch16_224')}
- **Input Channels**: 4 (RGB + LBP features)
- **Input Size**: 224x224
- **Number of CUI Classes**: {model_card_data.get('num_classes', 'N/A')}
- **Classification Type**: Multi-label

## Training Details

- **Optimizer**: AdamW (lr=3e-5, weight_decay=0.01)
- **Loss Function**: BCEWithLogitsLoss
- **Scheduler**: CosineAnnealingLR
- **Best F1-Score**: {model_card_data.get('best_f1', 'N/A')}
- **Best Precision**: {model_card_data.get('best_precision', 'N/A')}
- **Best Recall**: {model_card_data.get('best_recall', 'N/A')}

## Usage

```python
from transformers import AutoModel, AutoConfig
import torch
from PIL import Image
import numpy as np
from skimage.feature import local_binary_pattern

# Load model
config = AutoConfig.from_pretrained("{repo_name}")
model = AutoModel.from_pretrained("{repo_name}")

# Preprocess image (implement FourChannelTransform)
# ... your preprocessing code ...

# Inference
with torch.no_grad():
    outputs = model(pixel_values=processed_image)
    predictions = torch.sigmoid(outputs.logits) > 0.5
```

## License

This model is released under the Apache 2.0 License.
"""
    
    with open(os.path.join(local_dir, "README.md"), "w") as f:
        f.write(model_card)
    
    # Create repository and upload
    try:
        api = HfApi()
        api.upload_folder(
        folder_path="/kaggle/working/medical_vit_model",
        repo_id="Gobi2005/medical-vit-cui-classifier",
)
        logger.info(f"Model successfully uploaded to: https://huggingface.co/{repo_name}")
    except Exception as e:
        logger.error(f"Error uploading to Hugging Face: {e}")


def load_and_save_dataset(
    repo_id: str = "eltorio/ROCOv2-radiology",
    train_pct: float = 0.10,
    valid_pct: float = 0.10,
    save_dir: str = "dataset"
):
    train_split = f"train[:{int(train_pct * 100)}%]"
    valid_split = f"validation[:{int(valid_pct * 100)}%]"

    # Create directories
    train_dir = os.path.join(save_dir, "train")
    valid_dir = os.path.join(save_dir, "valid")
    os.makedirs(train_dir, exist_ok=True)
    os.makedirs(valid_dir, exist_ok=True)

    print(f"Loading {int(train_pct * 100)}% of training data...")
    train_ds = load_dataset(repo_id, split=train_split)
    train_save_path = os.path.join(train_dir, "train_dataset.parquet")
    train_ds.to_parquet(train_save_path)
    print(f"Saved training dataset to {train_save_path}")

    print(f"Loading {int(valid_pct * 100)}% of validation data...")
    valid_ds = load_dataset(repo_id, split=valid_split)
    valid_save_path = os.path.join(valid_dir, "valid_dataset.parquet")
    valid_ds.to_parquet(valid_save_path)
    print(f"Saved validation dataset to {valid_save_path}")

    print("✅ Dataset loading and saving complete.")
    return train_ds, valid_ds




def main() -> None:
    """Main training pipeline."""

    login(token='MY TOKEN')

    #get parquet files from HuggingFace
    load_and_save_dataset(train_pct=0.30,valid_pct=0.30)

    # Setup logging
    logger = setup_logging(level='INFO', log_file='training.log')
    
    # Configuration
    config = {
        'train_parquet': '/kaggle/working/dataset/train/train_dataset.parquet',
        'val_parquet': '/kaggle/working/dataset/valid/valid_dataset.parquet',
        'batch_size': 16,
        'num_epochs': 15,
        'repo_name': 'Gobi2005/medical-vit-cui-classifier',
        'seed': 42,
        'device': 'cuda' if torch.cuda.is_available() else 'cpu'
    }
    
    logger.info("Starting medical image CUI classification pipeline")
    logger.info(f"Configuration: {config}")
    
    logger.info("Creating data loaders...")
    train_loader, val_loader, mlb = create_data_loaders(
        config['train_parquet'],
        config['val_parquet'],
        config['batch_size'],
        config['device'],
        config['seed'],
        logger
    )
    
    logger.info(f"Number of CUI classes: {len(mlb.classes_)}")
    
    # Create model
    model_config = ViTConfig(num_labels=len(mlb.classes_))
    model = MedicalViTModel(model_config)
    
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    logger.info(f"Model created with {total_params} total parameters ({trainable_params} trainable)")
    
    # Create trainer
    trainer = MedicalImageTrainer(
        model, 
        train_loader, 
        val_loader, 
        config['device'],
        logger
    )
    
    # Train
    logger.info("Starting training...")
    trainer.train(config['num_epochs'])
    
    # Get best metrics
    best_metrics = max(trainer.val_metrics, key=lambda x: x['f1_score'])
    logger.info(f"\nBest validation metrics:")
    logger.info(f"F1-Score: {best_metrics['f1_score']:.4f}")
    logger.info(f"Precision: {best_metrics['precision']:.4f}")
    logger.info(f"Recall: {best_metrics['recall']:.4f}")
    
    # Prepare for Hugging Face upload
    model_card_data = {
        'base_model': 'vit_base_patch16_224',
        'num_classes': len(mlb.classes_),
        'best_f1': f"{best_metrics['f1_score']:.4f}",
        'best_precision': f"{best_metrics['precision']:.4f}",
        'best_recall': f"{best_metrics['recall']:.4f}"
    }
    
    # Push to Hugging Face
    logger.info("Pushing model to Hugging Face...")
    push_to_huggingface(
        model=model,
        mlb=mlb,
        repo_name=config['repo_name'],
        model_card_data=model_card_data,
        logger=logger
    )
    
    logger.info("Training pipeline completed successfully!")

if __name__ == "__main__":
    main()

2025-06-03 19:35:18.579304: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748979318.755885      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748979318.808120      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Loading 30% of training data...


README.md:   0%|          | 0.00/4.50k [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/27 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/27 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/27 [00:00<?, ?files/s]

train-00000-of-00027.parquet:   0%|          | 0.00/497M [00:00<?, ?B/s]

train-00001-of-00027.parquet:   0%|          | 0.00/504M [00:00<?, ?B/s]

train-00002-of-00027.parquet:   0%|          | 0.00/490M [00:00<?, ?B/s]

train-00003-of-00027.parquet:   0%|          | 0.00/485M [00:00<?, ?B/s]

train-00004-of-00027.parquet:   0%|          | 0.00/510M [00:00<?, ?B/s]

train-00005-of-00027.parquet:   0%|          | 0.00/498M [00:00<?, ?B/s]

train-00006-of-00027.parquet:   0%|          | 0.00/532M [00:00<?, ?B/s]

train-00007-of-00027.parquet:   0%|          | 0.00/482M [00:00<?, ?B/s]

train-00008-of-00027.parquet:   0%|          | 0.00/497M [00:00<?, ?B/s]

train-00009-of-00027.parquet:   0%|          | 0.00/489M [00:00<?, ?B/s]

train-00010-of-00027.parquet:   0%|          | 0.00/484M [00:00<?, ?B/s]

train-00011-of-00027.parquet:   0%|          | 0.00/508M [00:00<?, ?B/s]

train-00012-of-00027.parquet:   0%|          | 0.00/490M [00:00<?, ?B/s]

train-00013-of-00027.parquet:   0%|          | 0.00/499M [00:00<?, ?B/s]

train-00014-of-00027.parquet:   0%|          | 0.00/499M [00:00<?, ?B/s]

train-00015-of-00027.parquet:   0%|          | 0.00/498M [00:00<?, ?B/s]

train-00016-of-00027.parquet:   0%|          | 0.00/496M [00:00<?, ?B/s]

train-00017-of-00027.parquet:   0%|          | 0.00/498M [00:00<?, ?B/s]

train-00018-of-00027.parquet:   0%|          | 0.00/525M [00:00<?, ?B/s]

train-00019-of-00027.parquet:   0%|          | 0.00/486M [00:00<?, ?B/s]

train-00020-of-00027.parquet:   0%|          | 0.00/483M [00:00<?, ?B/s]

train-00021-of-00027.parquet:   0%|          | 0.00/495M [00:00<?, ?B/s]

train-00022-of-00027.parquet:   0%|          | 0.00/493M [00:00<?, ?B/s]

train-00023-of-00027.parquet:   0%|          | 0.00/494M [00:00<?, ?B/s]

train-00024-of-00027.parquet:   0%|          | 0.00/500M [00:00<?, ?B/s]

train-00025-of-00027.parquet:   0%|          | 0.00/511M [00:00<?, ?B/s]

train-00026-of-00027.parquet:   0%|          | 0.00/517M [00:00<?, ?B/s]

validation-00000-of-00006.parquet:   0%|          | 0.00/444M [00:00<?, ?B/s]

validation-00001-of-00006.parquet:   0%|          | 0.00/424M [00:00<?, ?B/s]

validation-00002-of-00006.parquet:   0%|          | 0.00/428M [00:00<?, ?B/s]

validation-00003-of-00006.parquet:   0%|          | 0.00/426M [00:00<?, ?B/s]

validation-00004-of-00006.parquet:   0%|          | 0.00/431M [00:00<?, ?B/s]

validation-00005-of-00006.parquet:   0%|          | 0.00/422M [00:00<?, ?B/s]

test-00000-of-00006.parquet:   0%|          | 0.00/436M [00:00<?, ?B/s]

test-00001-of-00006.parquet:   0%|          | 0.00/426M [00:00<?, ?B/s]

test-00002-of-00006.parquet:   0%|          | 0.00/443M [00:00<?, ?B/s]

test-00003-of-00006.parquet:   0%|          | 0.00/432M [00:00<?, ?B/s]

test-00004-of-00006.parquet:   0%|          | 0.00/425M [00:00<?, ?B/s]

test-00005-of-00006.parquet:   0%|          | 0.00/423M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/59962 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/9904 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/9927 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/180 [00:00<?, ?ba/s]

Saved training dataset to dataset/train/train_dataset.parquet
Loading 30% of validation data...


Resolving data files:   0%|          | 0/27 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/27 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/30 [00:00<?, ?ba/s]

2025-06-03 19:39:33,587 - medical_cui_pipeline - INFO - Starting medical image CUI classification pipeline
2025-06-03 19:39:33,588 - medical_cui_pipeline - INFO - Configuration: {'train_parquet': '/kaggle/working/dataset/train/train_dataset.parquet', 'val_parquet': '/kaggle/working/dataset/valid/valid_dataset.parquet', 'batch_size': 16, 'num_epochs': 15, 'repo_name': 'Gobi2005/medical-vit-cui-classifier', 'seed': 42, 'device': 'cuda'}
2025-06-03 19:39:33,589 - medical_cui_pipeline - INFO - Creating data loaders...
2025-06-03 19:39:33,598 - medical_cui_pipeline - INFO - Seeds set to 42 for reproducibility


Saved validation dataset to dataset/valid/valid_dataset.parquet
✅ Dataset loading and saving complete.


2025-06-03 19:39:42,071 - medical_cui_pipeline - INFO - Processed 19 unique CUI labels
2025-06-03 19:39:42,072 - medical_cui_pipeline - INFO - Loaded dataset with 17989 samples from /kaggle/working/dataset/train/train_dataset.parquet
2025-06-03 19:39:43,566 - medical_cui_pipeline - INFO - Loaded dataset with 2971 samples from /kaggle/working/dataset/valid/valid_dataset.parquet
2025-06-03 19:39:43,574 - medical_cui_pipeline - INFO - Created data loaders with batch size 16
2025-06-03 19:39:43,574 - medical_cui_pipeline - INFO - Training samples: 17989, Validation samples: 2971
2025-06-03 19:39:43,575 - medical_cui_pipeline - INFO - Number of CUI classes: 19


model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

2025-06-03 19:39:46,536 - medical_cui_pipeline - INFO - Model created with 86398739 total parameters (86398739 trainable)
2025-06-03 19:39:46,885 - medical_cui_pipeline - INFO - Starting training...
2025-06-03 19:39:46,887 - medical_cui_pipeline - INFO - Training on cuda
2025-06-03 19:39:46,887 - medical_cui_pipeline - INFO - 
Epoch 1/15
2025-06-03 19:39:46,888 - medical_cui_pipeline - INFO - --------------------------------------------------
Training:   0%|          | 1/1125 [00:09<2:58:17,  9.52s/it, Loss=0.6848]2025-06-03 19:39:56,419 - medical_cui_pipeline - INFO - Batch 0/1125, Loss: 0.6848
Training:   1%|          | 11/1125 [00:19<21:47,  1.17s/it, Loss=0.2544]2025-06-03 19:40:06,299 - medical_cui_pipeline - INFO - Batch 10/1125, Loss: 0.2544
Training:   2%|▏         | 21/1125 [00:35<38:49,  2.11s/it, Loss=0.1724]2025-06-03 19:40:22,732 - medical_cui_pipeline - INFO - Batch 20/1125, Loss: 0.1724
Training:   3%|▎         | 31/1125 [00:47<19:59,  1.10s/it, Loss=0.1920]2025-06-03 19

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

2025-06-04 00:56:20,628 - medical_cui_pipeline - INFO - Model successfully uploaded to: https://huggingface.co/Gobi2005/medical-vit-cui-classifier
2025-06-04 00:56:20,629 - medical_cui_pipeline - INFO - Training pipeline completed successfully!


In [2]:
import zipfile
import os

def zip_folder(folder_path, output_path):
    with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, dirs, files in os.walk(folder_path):
            for file in files:
                file_path = os.path.join(root, file)
                # Add file to zip with relative path
                arcname = os.path.relpath(file_path, folder_path)
                zipf.write(file_path, arcname)

# Example usage:
folder_to_zip = "/kaggle/working/medical_vit_model"
output_zip_file = "/kaggle/working/output.zip"
zip_folder(folder_to_zip, output_zip_file)
