In [6]:
import os

In [7]:
%pwd

'd:\\Data Science Projects\\DL-Project\\End-to-End-Chest-Cancer-Classification-using-MLflow-DVC-\\research'

##### move back a folder, so that data is donwloaded at correct location

In [8]:
os.chdir('../')

In [9]:
%pwd

'd:\\Data Science Projects\\DL-Project\\End-to-End-Chest-Cancer-Classification-using-MLflow-DVC-'

#### 1.1 updated config.yaml file

#### 1.2 updated params.yaml file

#### 1.3 update entity

In [10]:
from dataclasses import dataclass
from pathlib import Path

## entity
@dataclass(frozen=True) ## frozen=True: This makes instances of the class immutable. Once an object of this class is created, its attributes cannot be modified. If you try to change an attribute, Python will raise an error.
class DataIngestionConfig:
    root_dir: Path
    source_URL: str
    local_data_file: Path
    unzip_dir: Path ## all these class variables are same as of config file.
## these are the return type of a method. like how ConfigBox is a returntype for few methods, this can also be used like that.
    
## Note: You don't see self because the dataclass decorator generates the __init__ method automatically, including the handling of self.
## when using the @dataclass decorator, you don't need to manually define the __init__() method. The dataclass decorator automatically generates it for you, 
## and it handles the initialization of attributes based on the fields you define in the class.

#### 1.4 update of config.yaml using configuration manager

In [11]:
from cnnClassifier.constants import *
from cnnClassifier.utils.common import read_yaml, create_directories


class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH, ##paths from constants
        params_filepath = PARAMS_FILE_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root]) ##creates directories for artifacts which is artifact_root.
        
    def get_data_ingestion_config(self) -> DataIngestionConfig:
        config = self.config.data_ingestion ## data_ingestion from config.yaml
        create_directories([config.root_dir])
        data_ingestion_config = DataIngestionConfig(
            root_dir = config.root_dir,
            source_URL=config.source_URL,
            local_data_file=config.local_data_file,
            unzip_dir=config.unzip_dir
        )
        return data_ingestion_config


#### 1.5 data ingestion COMPONENT creation

In [14]:
import os
import zipfile
import gdown
from cnnClassifier import logger
from cnnClassifier.utils.common import get_size

class DataIngestion:
    def __init__(self, config: DataIngestionConfig):
        self.config = config
     
    def download_file(self)-> str:
        '''
        Fetch data from the url
        '''

        try: 
            dataset_url = self.config.source_URL
            zip_download_dir = self.config.local_data_file
            os.makedirs("artifacts/data_ingestion", exist_ok=True)
            logger.info(f"Downloading data from {dataset_url} into file {zip_download_dir}")

            file_id = dataset_url.split("/")[-2] ## extracting drive download file_id from source_url
            prefix = 'https://drive.google.com/uc?/export=download&id='
            gdown.download(prefix+file_id,zip_download_dir)

            logger.info(f"Downloaded data from {dataset_url} into file {zip_download_dir}")

        except Exception as e:
            raise e
        
    
    def extract_zip_file(self):
        """
        zip_file_path: str
        Extracts the zip file into the data directory
        Function returns None
        """
        unzip_path = self.config.unzip_dir
        os.makedirs(unzip_path, exist_ok=True)
        with zipfile.ZipFile(self.config.local_data_file, 'r') as zip_ref:
            zip_ref.extractall(unzip_path)

#### 1.6 Pipe Line

In [17]:
try:
    config = ConfigurationManager() ##revoking constructor from entity ConfigurationManager
    data_ingestion_config = config.get_data_ingestion_config() ##config.yaml is connected
    data_ingestion = DataIngestion(config = data_ingestion_config) ## componed instance
    data_ingestion.download_file()
    data_ingestion.extract_zip_file()
except Exception as e:
    raise e

[2024-09-06 17:32:35,299: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-09-06 17:32:35,299: INFO: common: yaml file: params.yaml loaded successfully]
[2024-09-06 17:32:35,308: INFO: common: created directory at: artifacts]
[2024-09-06 17:32:35,311: INFO: common: created directory at: artifacts/data_ingestion]
[2024-09-06 17:32:35,314: INFO: 2511976084: Downloading data from https://drive.google.com/file/d/1z0mreUtRmR-P-magILsDR3T7M6IkGXtY/view?usp=sharing into file artifacts/data_ingestion/data.zip]


Downloading...
From (original): https://drive.google.com/uc?/export=download&id=1z0mreUtRmR-P-magILsDR3T7M6IkGXtY
From (redirected): https://drive.google.com/uc?%2Fexport=download&id=1z0mreUtRmR-P-magILsDR3T7M6IkGXtY&confirm=t&uuid=abce784b-8ce7-4428-9e96-e88fcbbf9d1f
To: d:\Data Science Projects\DL-Project\End-to-End-Chest-Cancer-Classification-using-MLflow-DVC-\artifacts\data_ingestion\data.zip
100%|██████████| 49.0M/49.0M [00:07<00:00, 6.37MB/s]

[2024-09-06 17:32:46,712: INFO: 2511976084: Downloaded data from https://drive.google.com/file/d/1z0mreUtRmR-P-magILsDR3T7M6IkGXtY/view?usp=sharing into file artifacts/data_ingestion/data.zip]



