## Load the dataset from Kaggle ('olegbaryshnikov/rsna-roi-512x512-pngs')

In [4]:
import os
import json
import zipfile
from pathlib import Path
import requests
from kaggle.api.kaggle_api_extended import KaggleApi
import logging
from tqdm import tqdm
import shutil

# Logging setup: each record is sent to a log file in the working directory
# and to the console, using the same timestamped format for both.
logging.basicConfig(
    handlers=[logging.FileHandler('kaggle_downloader.log'), logging.StreamHandler()],
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# Module-level logger used by the code below.
logger = logging.getLogger(__name__)

class KaggleDatasetDownloader:
    """Download a Kaggle dataset into a local directory and optionally unpack it.

    The dataset is fetched as a single zip archive through the Kaggle API; if
    that call fails, the files are downloaded one by one instead. Public
    methods report failure by returning False (after logging the reason)
    rather than by raising.
    """

    # Credentials file used when ~/.kaggle/kaggle.json does not exist.
    FALLBACK_KAGGLE_JSON = '/Volumes/KODAK/folder 02/Brest_cancer_prediction/src/kaggle.json'

    def __init__(self, output_dir: str):
        """
        Initialize the Kaggle dataset downloader.

        Args:
            output_dir (str): Directory where the dataset will be saved.

        Raises:
            Exception: Whatever error made the output directory impossible to
                create or write to (re-raised after being logged).
        """
        self.output_dir = Path(output_dir)

        # Point the Kaggle client at the fallback credentials when the default
        # location is empty. This is done before the client is constructed so
        # the override is already in the environment at that point.
        # NOTE(review): confirm when the kaggle package reads
        # KAGGLE_CONFIG_DIR; if it is read at import time this has no effect.
        if not (Path.home() / '.kaggle' / 'kaggle.json').exists():
            fallback = Path(self.FALLBACK_KAGGLE_JSON)
            if fallback.exists():
                os.environ['KAGGLE_CONFIG_DIR'] = str(fallback.parent)

        self.api = KaggleApi()

        # Validate and create output directory
        self._prepare_output_directory()

    def _prepare_output_directory(self) -> None:
        """Ensure the output directory exists and is writable.

        Raises:
            Exception: Any error from creating the directory or from creating
                and removing a probe file inside it (logged, then re-raised).
        """
        try:
            self.output_dir.mkdir(parents=True, exist_ok=True)
            # Test write permission by creating and removing a probe file
            test_file = self.output_dir / '.permission_test'
            test_file.touch()
            test_file.unlink()
        except Exception as e:
            logger.error(f"Failed to prepare output directory: {e}")
            raise

    def _validate_kaggle_credentials(self) -> bool:
        """Check that a parseable kaggle.json exists.

        The file is looked up in $KAGGLE_CONFIG_DIR when that variable is set,
        otherwise in ~/.kaggle. Only the JSON syntax is checked, not the
        contents.

        Returns:
            bool: True if the file exists and parses as JSON, False otherwise.
        """
        try:
            kaggle_dir = Path(os.environ.get('KAGGLE_CONFIG_DIR', Path.home() / '.kaggle'))
            kaggle_json = kaggle_dir / 'kaggle.json'

            if not kaggle_json.exists():
                logger.error(f"Kaggle credentials not found at {kaggle_json}. Please ensure kaggle.json exists.")
                return False

            with open(kaggle_json) as f:
                json.load(f)  # Validate JSON
            return True
        except Exception as e:
            logger.error(f"Invalid kaggle.json file: {e}")
            return False

    def _download_with_progress(self, dataset_name: str, destination: Path) -> bool:
        """Download every file of a dataset individually (fallback path).

        Args:
            dataset_name (str): Kaggle dataset identifier.
            destination (Path): Directory the files are written to.

        Returns:
            bool: True if the per-file downloads ran without raising, False if
                the dataset lists no files or any API call raised.
        """
        try:
            dataset_files = self.api.dataset_list_files(dataset_name).files
            if not dataset_files:
                logger.error("No files found in dataset")
                return False

            with tqdm(unit='B', unit_scale=True, unit_divisor=1024, miniters=1) as pbar:
                for entry in dataset_files:
                    logger.info(f"Downloading {entry.name}...")

                    self.api.dataset_download_file(
                        dataset=dataset_name,
                        file_name=entry.name,
                        path=destination,
                        force=True,
                        quiet=True
                    )

                    # The API call reports no progress, so the bar is advanced
                    # by each file's size once it is on disk. The total is
                    # accumulated so it stays correct across several files.
                    file_path = destination / entry.name
                    if file_path.exists():
                        size = file_path.stat().st_size
                        pbar.total = (pbar.total or 0) + size
                        pbar.update(size)
                    else:
                        logger.warning(f"Expected file not found after download: {file_path}")

            return True
        except Exception as e:
            logger.error(f"Download failed: {e}")
            return False

    def download_dataset(self, dataset_name: str, unzip: bool = True, delete_zip: bool = True) -> bool:
        """
        Download a dataset from Kaggle.

        Args:
            dataset_name (str): Kaggle dataset identifier in format 'owner/dataset-name'
            unzip (bool): Whether to unzip the downloaded file
            delete_zip (bool): Whether to delete the zip file after extraction

        Returns:
            bool: True if download and processing succeeded, False otherwise
                (including when extraction of the archive fails)
        """
        if not self._validate_kaggle_credentials():
            return False

        try:
            logger.info("Initializing Kaggle API connection...")
            self.api.authenticate()

            logger.info(f"Downloading dataset: {dataset_name}")

            # Archive name used locally: '<owner>_<dataset-name>.zip'
            zip_path = self.output_dir / f"{dataset_name.replace('/', '_')}.zip"

            # First try the standard download method
            try:
                self.api.dataset_download_files(
                    dataset=dataset_name,
                    path=self.output_dir,
                    quiet=False,  # Let Kaggle show its progress
                    force=True,
                    unzip=False
                )
            except Exception as e:
                logger.warning(f"Standard download failed, trying alternative method: {e}")
                if not self._download_with_progress(dataset_name, self.output_dir):
                    raise RuntimeError("Both download methods failed")
            else:
                # Rename the downloaded file to a consistent format. This runs
                # outside the try above so that a problem here is not mistaken
                # for a failed download (which would trigger a re-download).
                name_parts = dataset_name.split('/')
                slug = name_parts[1] if len(name_parts) > 1 else name_parts[0]
                temp_zip = self.output_dir / f"{slug}.zip"
                if temp_zip.exists():
                    # replace() overwrites a stale archive on every platform
                    temp_zip.replace(zip_path)

            if not zip_path.exists():
                # Check if files were downloaded without zip.
                # NOTE: any file already present in output_dir satisfies this.
                contents = list(self.output_dir.glob('*'))
                if contents:
                    logger.info(f"Files downloaded directly without zip: {contents}")
                    return True
                raise FileNotFoundError(f"Downloaded files not found at {self.output_dir}")

            logger.info(f"Successfully downloaded dataset to {zip_path}")

            # An explicit False from extraction means the data is incomplete
            if unzip and self._unzip_file(zip_path, delete_zip) is False:
                return False

            return True

        except requests.exceptions.HTTPError as e:
            logger.error(f"HTTP Error occurred: {e}")
            # The exception may carry no response object at all
            status = getattr(e.response, 'status_code', None)
            if status == 403:
                logger.error("Authentication failed. Please check your Kaggle API token.")
            elif status == 404:
                logger.error("Dataset not found. Please check the dataset name.")
        except Exception as e:
            logger.error(f"An error occurred while downloading dataset: {e}")

        return False

    def _unzip_file(self, zip_path: Path, delete_zip: bool = True) -> bool:
        """
        Unzip a downloaded dataset into the output directory.

        Members that fail to extract are logged and skipped. If any member
        failed, the archive is kept (even when delete_zip is True) so the
        extraction can be retried.

        Args:
            zip_path (Path): Path to the zip file
            delete_zip (bool): Whether to delete the zip file after extraction

        Returns:
            bool: True if every member was extracted, False otherwise
        """
        try:
            logger.info(f"Extracting {zip_path.name}...")

            failed = []
            # Open the archive once; the handle is closed before any unlink
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                members = zip_ref.infolist()
                # Uncompressed total drives the progress bar
                total_size = sum(member.file_size for member in members)

                with tqdm(total=total_size, unit='B', unit_scale=True, desc="Extracting") as pbar:
                    for member in members:
                        try:
                            zip_ref.extract(member, self.output_dir)
                        except Exception as e:
                            logger.warning(f"Failed to extract {member.filename}: {e}")
                            failed.append(member.filename)
                        else:
                            pbar.update(member.file_size)

            if failed:
                logger.error(
                    f"{len(failed)} file(s) could not be extracted; keeping {zip_path.name}"
                )
                return False

            logger.info(f"Extraction complete to {self.output_dir}")

            if delete_zip:
                zip_path.unlink()
                logger.info(f"Deleted zip file: {zip_path.name}")

            return True

        except zipfile.BadZipFile:
            logger.error(f"File is not a zip file or is corrupted: {zip_path}")
        except Exception as e:
            logger.error(f"Error during extraction: {e}")

        return False

def main():
    """Download and unpack the configured dataset.

    Builds a KaggleDatasetDownloader for OUTPUT_DIR and downloads
    DATASET_NAME with the downloader's default options.

    Raises:
        SystemExit: With status 1 when the download reports failure or when
            constructing/running the downloader raises an exception.
    """
    # Configuration
    DATASET_NAME = "olegbaryshnikov/rsna-roi-512x512-pngs"
    OUTPUT_DIR = "/Volumes/KODAK/folder 02/Brest_cancer_prediction/data/raw_data"

    # SystemExit is raised directly instead of calling exit(): that helper is
    # installed by the `site` module for interactive use and is not meant to
    # be relied on inside programs.
    try:
        downloader = KaggleDatasetDownloader(OUTPUT_DIR)
        success = downloader.download_dataset(DATASET_NAME)
    except Exception as e:
        logger.error(f"Fatal error in main execution: {e}")
        raise SystemExit(1) from e

    if not success:
        logger.error("Dataset download failed.")
        raise SystemExit(1)

    logger.info("Dataset download and processing completed successfully!")
        

# Entry point: run main() only when this module is executed directly.
if __name__ == "__main__":
    main()

2025-06-22 09:11:14,958 - INFO - Initializing Kaggle API connection...
2025-06-22 09:11:14,960 - INFO - Downloading dataset: olegbaryshnikov/rsna-roi-512x512-pngs


Dataset URL: https://www.kaggle.com/datasets/olegbaryshnikov/rsna-roi-512x512-pngs
Downloading rsna-roi-512x512-pngs.zip to /Volumes/KODAK/folder 02/Brest_cancer_prediction/data/raw_data


100%|██████████| 9.21G/9.21G [01:22<00:00, 120MB/s] 
2025-06-22 10:38:13,530 - INFO - Successfully downloaded dataset to /Volumes/KODAK/folder 02/Brest_cancer_prediction/data/raw_data/olegbaryshnikov_rsna-roi-512x512-pngs.zip
2025-06-22 10:38:13,535 - INFO - Extracting olegbaryshnikov_rsna-roi-512x512-pngs.zip...





Extracting: 100%|██████████| 12.1G/12.1G [17:22<00:00, 11.6MB/s] 
2025-06-22 10:55:36,087 - INFO - Extraction complete to /Volumes/KODAK/folder 02/Brest_cancer_prediction/data/raw_data
2025-06-22 10:55:36,101 - INFO - Deleted zip file: olegbaryshnikov_rsna-roi-512x512-pngs.zip
2025-06-22 10:55:36,203 - INFO - Dataset download and processing completed successfully!


## Exploring the CSV file

In [6]:
import os
import pandas as pd
from pathlib import Path

def check_for_csv_files(dataset_path):
    """
    Locate every CSV file under a dataset directory and print a report on each.

    The search is recursive and skips files whose name starts with '._'
    (macOS metadata files). For each remaining file the report shows its
    shape, its column names, the columns whose lower-cased name is one of
    image_path / label / diagnosis / patient_id / image_id (plus value counts
    for the first of them), and the first three rows. Any error while
    reading or reporting on a file is printed and the scan moves on.

    Args:
        dataset_path (str): Path to the raw dataset directory.
    """
    root = Path(dataset_path)
    print(f"Checking for CSV files in: {root}")

    # Gather candidates, leaving out macOS "._*" hidden files
    csv_files = []
    for candidate in root.rglob("*.csv"):
        if not candidate.name.startswith('._'):
            csv_files.append(candidate)

    if len(csv_files) == 0:
        print(" No valid CSV files found in the dataset directory.")
        return

    print(f"Found {len(csv_files)} CSV file(s):")
    for found in csv_files:
        print(f"  - {found.relative_to(root)}")

    # Column names of interest; compared against lower-cased column names
    wanted = {
        name.lower()
        for name in ('image_path', 'label', 'diagnosis', 'patient_id', 'image_id')
    }

    # Report on each file in turn
    for csv_file in csv_files:
        print(f"\nAnalyzing: {csv_file.name}")
        try:
            # Default encoding first, latin1 as the fallback
            try:
                frame = pd.read_csv(csv_file)
            except UnicodeDecodeError:
                frame = pd.read_csv(csv_file, encoding='latin1')

            # Basic info
            print(f"  - Shape: {frame.shape} (rows, columns)")
            print(f"  - Columns: {list(frame.columns)}")

            # Columns matching the names of interest, in file order
            matches = []
            for column in frame.columns:
                if column.lower() in wanted:
                    matches.append(column)

            if not matches:
                print("   No standard label columns found")
                print("     Expected columns like: 'image_path', 'label', 'diagnosis'")
            else:
                first = matches[0]
                print(f"  - Found relevant columns: {matches}")
                print(f"  - Value counts for '{first}':\n{frame[first].value_counts()}")

            # First three rows as a preview
            print("\n  Sample data:")
            print(frame.head(3).to_string())

        except Exception as e:
            message = str(e)
            print(f"   Error reading {csv_file.name}: {message}")
            if "No columns to parse" in message:
                print("     This might be a corrupted or empty CSV file.")

# Entry point: report on the CSV files of the raw dataset when executed directly.
if __name__ == "__main__":
    # Root directory that is searched recursively for CSV files
    dataset_path = "/Volumes/KODAK/folder 02/Brest_cancer_prediction/data/raw_data"
    check_for_csv_files(dataset_path)

Checking for CSV files in: /Volumes/KODAK/folder 02/Brest_cancer_prediction/data/raw_data
Found 1 CSV file(s):
  - unrecognized_images.csv

Analyzing: unrecognized_images.csv
  - Shape: (89, 3) (rows, columns)
  - Columns: ['Unnamed: 0', 'patient_id', 'image_id']
  - Found relevant columns: ['patient_id', 'image_id']
  - Value counts for 'patient_id':
33581    4
36584    4
2738     4
13095    3
52509    3
39850    3
59101    3
20008    3
60669    3
65471    2
735      2
7010     2
4073     2
4659     2
51985    2
53879    2
32292    2
3768     2
16497    2
26102    2
26576    1
8421     1
7330     1
822      1
17111    1
43368    1
44259    1
46373    1
489      1
2086     1
50601    1
51028    1
16124    1
15503    1
53470    1
15237    1
54713    1
20302    1
6637     1
29768    1
21827    1
31065    1
26530    1
33150    1
33208    1
35039    1
25578    1
36847    1
1511     1
38571    1
38703    1
23419    1
33084    1
36590    1
23251    1
22573    1
5509     1
Name: patient_id, d

In [20]:
import shutil
from pathlib import Path

def move_csv_file(source_dir, target_dir, csv_filename="unrecognized_images.csv"):
    """
    Copy a CSV file from the source directory into the target directory.

    Despite the function's name, the file is copied (with shutil.copy2, which
    also preserves metadata); the original is left in place in source_dir.
    The target directory is created if it does not exist.

    Args:
        source_dir (str): Path to source directory containing the CSV
        target_dir (str): Path to target directory
        csv_filename (str): Name of the CSV file to copy

    Returns:
        bool: True if the file was copied; False if the source file does not
            exist or a filesystem error occurred (the reason is printed).
    """
    source_path = Path(source_dir) / csv_filename
    target_path = Path(target_dir) / csv_filename

    try:
        # Check if source exists
        if not source_path.exists():
            print(f" Source file not found: {source_path}")
            return False

        # Create target directory if needed
        target_path.parent.mkdir(parents=True, exist_ok=True)

        # Copy the file (use shutil.copy2 to preserve metadata)
        shutil.copy2(source_path, target_path)
        print(f"✅ Successfully copied:\n"
              f"  From: {source_path}\n"
              f"  To: {target_path}")
        return True

    except OSError as e:
        # shutil.Error (e.g. copying a file onto itself) derives from OSError
        print(f" Error copying file: {e}")
        return False

# Entry point: copy the CSV out of the raw dataset directory when executed directly.
if __name__ == "__main__":

    # Define paths: where the CSV currently lives and where the copy goes
    source_directory = "/Volumes/KODAK/folder 02/Brest_cancer_prediction/data/raw_data"
    target_directory = "/Volumes/KODAK/folder 02/Brest_cancer_prediction/data"

    # Execute the copy (move_csv_file uses shutil.copy2, so the source file stays in place)
    move_csv_file(source_directory, target_directory)

✅ Successfully copied:
  From: /Volumes/KODAK/folder 02/Brest_cancer_prediction/data/raw_data/unrecognized_images.csv
  To: /Volumes/KODAK/folder 02/Brest_cancer_prediction/data/unrecognized_images.csv
