#### Directory Path

In [None]:
import os

In [None]:
# Current folder path
%pwd

In [None]:
# Go to the root directory (the parent of the current folder) and show the path.
# NOTE: the path is relative, so re-running this cell moves up one more level.
os.chdir("../")
%pwd

In [None]:
from utils import logger

In [None]:
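# The keys in the config YAML and the fields of this entity are the same.
# The entity is the return type of the configuration getter function.
# A dataclass lets fields be declared at class level without writing an
# __init__ that assigns them through self.
# == Entity ==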
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataPreprocessingConfig:
    """Immutable settings for the data-preprocessing stage.

    Instances are built by ``ConfigurationManager.get_data_preprocess_config``
    from the ``data_preprocessing`` section of the config file.
    """

    # Value of the ``root_dir`` key in the ``data_preprocessing`` section.
    root_dir: Path
    # Directory whose class sub-folders are scanned for images to resize.
    unzip_dir: Path
    # Directory the resized images are written to.
    reshape_dir: Path
    # Target size handed to ``reshape_image`` (after a ``tuple(...)`` conversion).
    image_size: tuple

In [None]:
# The constants module holds the locations of the config files.
# Update the configuration manager in src config.

from utils.base_utils import read_yaml, create_directories
from constants import *


class ConfigurationManager:
    """Load the project configuration files and build typed stage configs."""

    def __init__(self, config_filepath=CONFIG_FILE_PATH, param_path=PARAMS_FILE_PATH):
        """Read the config and params YAML files and create the artifacts root.

        Args:
            config_filepath: Location of the main configuration YAML.
            param_path: Location of the parameters YAML.
        """
        # The code below relies on attribute-style access to the loaded
        # mappings (e.g. ``self.config.artifacts_root``).
        self.config = read_yaml(config_filepath)
        self.param = read_yaml(param_path)

        create_directories([self.config.artifacts_root])

    def get_data_preprocess_config(self) -> DataPreprocessingConfig:
        """Build the configuration entity for the data-preprocessing stage.

        Besides the artifacts root, the stage's own ``root_dir`` is created so
        later steps can write into it without extra setup.

        Returns:
            A ``DataPreprocessingConfig`` whose directory fields are ``Path``
            objects and whose ``image_size`` is a tuple, matching the types
            declared on the entity.
        """
        section = self.config.data_preprocessing

        # The original only re-created the artifacts root (already handled in
        # __init__); the directory this stage actually needs is its root_dir.
        create_directories([self.config.artifacts_root, section.root_dir])

        return DataPreprocessingConfig(
            root_dir=Path(section.root_dir),
            unzip_dir=Path(section.unzip_dir),
            reshape_dir=Path(section.reshape_dir),
            image_size=tuple(section.image_size),
        )

In [None]:
import rarfile
import zipfile
import gdown
from utils import logger

from utils.image_utils import reshape_image, save_image
from pathlib import Path


# components
class DataPreprocessing:
    """Resize extracted images into a parallel, class-structured directory tree."""

    # Lower-cased suffixes treated as images. Defined once at class level so
    # the collection is not rebuilt for every file visited.
    IMAGE_EXTENSIONS = frozenset({".jpg", ".jpeg", ".png"})

    def __init__(self, config: DataPreprocessingConfig):
        """Store the stage configuration used by the processing methods."""
        self.config = config

    def reshape_extracted_data(self):
        """Resize every image under ``unzip_dir`` and save it under ``reshape_dir``.

        Each immediate sub-directory of ``unzip_dir`` is treated as a class
        folder; its (possibly nested) layout is reproduced under
        ``reshape_dir``. Images that fail to process are logged and skipped so
        one bad file does not abort the run.

        Raises:
            FileNotFoundError: If ``unzip_dir`` does not exist.
            NotADirectoryError: If ``unzip_dir`` exists but is not a directory.
        """
        source_dir = Path(self.config.unzip_dir)
        target_dir = Path(self.config.reshape_dir)

        # Fail fast with a clear message before creating any output folders.
        if not source_dir.exists():
            raise FileNotFoundError(f"Source directory does not exist: {source_dir}")
        if not source_dir.is_dir():
            raise NotADirectoryError(f"Source path is not a directory: {source_dir}")

        target_dir.mkdir(parents=True, exist_ok=True)

        total_images = 0
        total_failed = 0

        # Sorted so that processing order (and therefore the log) is deterministic.
        for class_dir in sorted(source_dir.iterdir()):
            if not class_dir.is_dir():
                logger.info(f"Skipping non-directory item: {class_dir}")
                continue

            logger.info(f"Processing class directory: {class_dir.name}")

            # Create corresponding class folder in target_dir
            class_subdir = target_dir / class_dir.name
            class_subdir.mkdir(parents=True, exist_ok=True)

            image_count, failed_count = self._reshape_class_dir(class_dir, class_subdir)
            total_images += image_count
            total_failed += failed_count

            logger.info(f"Processed {image_count} images for class '{class_dir.name}'")
            if failed_count:
                logger.warning(
                    f"Failed to process {failed_count} images for class '{class_dir.name}'"
                )

        logger.info(f"Total images resized and saved: {total_images}")
        if total_failed:
            logger.warning(f"Total images that failed to process: {total_failed}")

    def _reshape_class_dir(self, class_dir, class_subdir):
        """Resize all images found recursively in one class folder.

        Args:
            class_dir: Source class directory to scan.
            class_subdir: Destination directory mirroring ``class_dir``.

        Returns:
            A ``(saved, failed)`` pair of image counts.
        """
        saved = 0
        failed = 0

        for img_file in class_dir.rglob("*"):
            if not img_file.is_file():
                continue
            if img_file.suffix.lower() not in self.IMAGE_EXTENSIONS:
                continue

            try:
                resized_image = reshape_image(
                    image_path=img_file,
                    image_size=tuple(self.config.image_size),
                )

                # Preserve nested subfolder structure
                save_path = class_subdir / img_file.relative_to(class_dir)
                save_path.parent.mkdir(parents=True, exist_ok=True)

                save_image(resized_image, save_path)
            except Exception as e:
                # Best effort: a single unreadable image must not stop the run.
                failed += 1
                logger.error(f"Error processing image {img_file}: {e}")
                continue

            saved += 1
            logger.info(f"Resized and saved image to {save_path}")

        return saved, failed

In [None]:
# Pipeline: load the configuration, build the preprocessing component, run it.
try:
    # Initialize the ConfigurationManager
    config = ConfigurationManager()
    # Get the data-preprocessing details from the config YAML file
    data_preprocessing_config = config.get_data_preprocess_config()
    # Initialize the DataPreprocessing component
    data_processing = DataPreprocessing(config=data_preprocessing_config)
    # Call reshape extracted data
    data_processing.reshape_extracted_data()
except Exception as e:
    # Record the failure, then re-raise with a bare ``raise`` so the original
    # traceback is propagated unchanged.
    logger.error(f"Data preprocessing pipeline failed: {e}")
    raise