# DATA INGESTION

In [1]:
import os

In [2]:
%pwd

'e:\\Project\\MLOPs\\Chicken_disease_classification\\research'

Change the working directory from `research/` to the project root.

In [3]:
# Step up one level from the current working directory.
# NOTE: the path is relative, so every re-run of this cell moves up one more
# level; restart the kernel (or check %pwd) before executing it again.
os.chdir("../")

In [4]:
%pwd

'e:\\Project\\MLOPs\\Chicken_disease_classification'

## 1. Entity (return type) of the data ingestion component

In [5]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataIngestionConfig:
    """Immutable settings consumed by the data ingestion step.

    Instances are frozen, so fields cannot be reassigned after construction.

    Attributes:
        root_dir: Working directory of the data ingestion stage.
        source_URL: Remote location the dataset archive is downloaded from.
        local_data_file: Local path of the downloaded archive.
        unzip_dir: Directory the archive is extracted into.

    NOTE(review): the fields are annotated as ``Path``, but the dataclass does
    not convert or validate its inputs; values taken straight from a parsed
    YAML file are presumably plain strings at runtime -- confirm.
    """

    # Directory of the data ingestion stage.
    root_dir: Path
    # URL of the archive to download.
    source_URL: str
    # Where the downloaded archive is stored locally.
    local_data_file: Path
    # Target directory for the extracted archive contents.
    unzip_dir: Path

## 2. Creating configuration for Data Ingestion

In [6]:
from chicken_disease.constants import *
from chicken_disease.utils import read_yaml, create_directories

In [7]:
class ConfigurationManager:
    """
    Loads the project's configuration and parameter files and builds the
    typed config objects that individual pipeline stages consume.

    Attributes:
        config: Parsed contents of the configuration file, as returned by
            ``read_yaml`` (accessed with attribute syntax).
        params: Parsed contents of the parameters file, as returned by
            ``read_yaml``.
    """

    def __init__(self,
                 config_filepath=CONFIG_FILE_PATH,
                 params_filepath=PARAMS_FILE_PATH):
        """
        Read both YAML files and make sure the artifacts root directory exists.

        Args:
            config_filepath (optional): Location of the configuration file.
                Defaults to CONFIG_FILE_PATH.
            params_filepath (optional): Location of the parameters file.
                Defaults to PARAMS_FILE_PATH.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        # Every stage writes below this directory, so create it up front.
        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """
        Build the settings object for the data ingestion stage.

        The stage's root directory is created as a side effect.

        Returns:
            DataIngestionConfig: Values copied from the ``data_ingestion``
            section of the configuration file.
        """
        section = self.config.data_ingestion

        create_directories([section.root_dir])

        return DataIngestionConfig(
            root_dir=section.root_dir,
            source_URL=section.source_URL,
            local_data_file=section.local_data_file,
            unzip_dir=section.unzip_dir,
        )

## 3. Functions for Data Ingestion

In [8]:
import os
import urllib.request as request
import zipfile
from chicken_disease import logger
from chicken_disease.utils import get_size

In [9]:
class DataIngestion:
    """Downloads the dataset archive and unpacks it, driven by a DataIngestionConfig."""

    def __init__(self, config: "DataIngestionConfig"):
        """
        Initializes a DataIngestion instance with the provided configuration.

        Args:
            config (DataIngestionConfig): Settings for this stage; the methods
                read ``source_URL``, ``local_data_file`` and ``unzip_dir``.
        """

        self.config = config

    def download_file(self):
        """
        Downloads the archive from ``source_URL`` to ``local_data_file``.

        If the destination already exists, nothing is downloaded and the size
        of the existing file is logged instead.

        The download is written to a temporary ``<local_data_file>.part`` file
        and only moved into place once it completed, so an interrupted
        transfer can never be mistaken for a finished download on the next
        run. The destination's parent directory is created when missing.

        Returns:
            None

        Raises:
            urllib.error.URLError: If the source cannot be reached.
            urllib.error.ContentTooShortError: If fewer bytes than announced
                were received.
            OSError: If the destination cannot be written.
        """

        destination = self.config.local_data_file

        if os.path.exists(destination):
            logger.info(f"File already exists of size: {get_size(Path(destination))}")
            return

        parent_dir = os.path.dirname(destination)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        partial_path = f"{destination}.part"
        try:
            _, headers = request.urlretrieve(
                url=self.config.source_URL,
                filename=partial_path,
            )
            # Promote the finished download in a single step.
            os.replace(partial_path, destination)
        finally:
            # Only present if the download or the rename failed.
            if os.path.exists(partial_path):
                os.remove(partial_path)

        logger.info(f"{destination} download! with following info: \n{headers}")

    def extract_zip_file(self):
        """
        Extracts the archive at ``local_data_file`` into ``unzip_dir``.

        The target directory is created if it does not exist yet.

        Returns:
            None

        Raises:
            FileNotFoundError: If the archive does not exist.
            zipfile.BadZipFile: If the file is not a valid zip archive.
        """

        unzip_path = self.config.unzip_dir
        os.makedirs(unzip_path, exist_ok=True)
        with zipfile.ZipFile(self.config.local_data_file, 'r') as zip_ref:
            zip_ref.extractall(unzip_path)

## 4. Creating Data Ingestion Pipeline

In [10]:
import sys
from chicken_disease import CDCException
import yaml

In [11]:
# Run the data ingestion stage end to end: build the configuration, then
# download the archive and extract it.
try:
    config = ConfigurationManager()
    data_ingestion_config = config.get_data_ingestion_config()
    data_ingestion = DataIngestion(config=data_ingestion_config)
    data_ingestion.download_file()
    data_ingestion.extract_zip_file()

except Exception as e:
    # Wrap any failure in the project's exception type; "from e" records the
    # original error as the direct cause in the traceback.
    raise CDCException(error_message=e, error_detail=sys) from e