## Data Ingestion Stage

In [20]:
from collections import namedtuple
import os

In [21]:
# Immutable record of the data-ingestion settings.
# The typename given to namedtuple() now matches the variable it is bound to,
# so repr() output and pickling refer to the same name the notebook uses.
DataIngestionConfig = namedtuple("DataIngestionConfig", [
    "root_dir",         # directory created for this stage's outputs
    "source_URL",       # URL the dataset archive is downloaded from
    "local_data_file",  # path the downloaded archive is saved to
    "unzip_dir",        # directory the archive is extracted into
])

In [22]:
# The dataclass below serves the same purpose as the namedtuple above.

from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataIngestionConfig:
    """Immutable (frozen) settings for the data-ingestion stage.

    Instances cannot be modified after construction because the dataclass is
    declared with ``frozen=True``. The field annotations are not enforced at
    runtime; values are stored exactly as they are passed in.
    """
    # Directory that holds this stage's outputs.
    root_dir: Path
    # URL the dataset archive is downloaded from.
    source_URL: str
    # Path the downloaded archive is saved to (and later opened from).
    local_data_file: Path
    # Directory the archive's image files are extracted into.
    unzip_dir: Path

In [23]:
## The cells above are experiments for the entity definition.

In [24]:
# Next, we build the configuration manager.

In [25]:
from cnnClassifier.constants import *


from cnnClassifier.utils import read_yaml, create_directories     # We do this only after coding in utils.__init__.py , or else it wont work this way

In [26]:
class ConfigurationManager:
    """Loads the project's YAML settings and builds per-stage config objects."""

    def __init__(
            self,
            config_filepath = CONFIG_FILE_PATH,
            params_filepath = PARAMS_FILE_PATH):
        """Read both YAML files and create the top-level artifacts directory.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the parameters YAML.
        """
        # NOTE(review): read_yaml is assumed to return an object that supports
        # attribute access (e.g. `.artifacts_root`) - confirm in utils.
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build the data-ingestion settings from the `data_ingestion` section.

        Creates the stage's root directory as a side effect.

        Returns:
            A DataIngestionConfig whose filesystem fields are `Path` objects,
            matching the types declared on the entity; `source_URL` is passed
            through unchanged.
        """
        section = self.config.data_ingestion

        create_directories([section.root_dir])

        # The YAML values arrive as plain strings; wrap the filesystem ones in
        # Path so the object honours its declared field types.
        return DataIngestionConfig(
            root_dir=Path(section.root_dir),
            source_URL=section.source_URL,
            local_data_file=Path(section.local_data_file),
            unzip_dir=Path(section.unzip_dir),
        )

In [27]:
import os
import urllib.request as request    # To dowload the dataset
from zipfile import ZipFile         # To unzip the dataset

In [None]:
# Below we are creating the components

In [53]:
class DataIngestion:
    """Downloads the dataset archive and extracts the usable images from it.

    The `config` object must expose `source_URL`, `local_data_file` and
    `unzip_dir` attributes.
    """

    def __init__(self, config: "DataIngestionConfig"):
        self.config = config

    def download_file(self):
        """Download the archive to `config.local_data_file` if it is not there yet.

        An existing file is never re-downloaded.
        """
        local_file = self.config.local_data_file
        if os.path.exists(local_file):
            return

        # urlretrieve cannot create missing directories, so make sure the
        # destination folder exists before downloading.
        parent_dir = os.path.dirname(local_file)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        request.urlretrieve(url=self.config.source_URL, filename=local_file)

    def _get_updated_list_of_files(self, list_of_files):
        """Return only the archive members that are Cat/Dog `.jpg` images.

        Each entry is a path inside the archive. An entry is kept when it ends
        with ".jpg" and contains "Cat" or "Dog" (the class folder names), which
        drops folder entries and stray non-image files.
        """
        return [
            f for f in list_of_files
            if f.endswith(".jpg") and ("Cat" in f or "Dog" in f)
        ]

    def _preprocess(self, zf: ZipFile, f: str, working_dir: str):
        """Extract one member into `working_dir` and delete it if it is empty.

        Args:
            zf: The open archive.
            f: Name of the member inside the archive.
            working_dir: Directory to extract into. It does not have to exist
                beforehand: ZipFile.extract creates missing directories.
        """
        target_filepath = os.path.join(working_dir, f)
        if not os.path.exists(target_filepath):
            # extract() sanitises member names (e.g. strips a leading "/" or
            # ".." components), so the file may not land at the joined path
            # computed above. Use the path extract() reports instead.
            target_filepath = zf.extract(f, working_dir)

        # Zero-byte images are unusable, so drop them.
        if os.path.getsize(target_filepath) == 0:
            os.remove(target_filepath)

    def unzip_and_clean(self):
        """Extract every Cat/Dog image from the archive, discarding empty files."""
        with ZipFile(file=self.config.local_data_file, mode="r") as zf:
            for f in self._get_updated_list_of_files(zf.namelist()):
                self._preprocess(zf, f, self.config.unzip_dir)

In [54]:
!pwd

/c/Users/harik/Desktop/All Folder/PGDA/Interships & Projects/NLP/End-to-End Workflow/cnnClassifier


In [None]:
# /c/Users/harik/Desktop/All Folder/PGDA/Interships & Projects/NLP/End-to-End Workflow/cnnClassifier/research

# Here we need to change the working directory to the project root (cnnClassifier); we use os.chdir for that.

In [55]:
# Switch the working directory to the notebook's `research` folder.
# NOTE(review): this is a hard-coded absolute path from one machine, so the
# cell fails anywhere that folder does not exist. Replace it with a path
# derived from the project location before sharing the notebook.
os.chdir("C:\\Users\\harik\\Desktop\\All Folder\\PGDA\\Interships & Projects\\NLP\\End-to-End Workflow\\cnnClassifier\\research")

In [57]:
# Move up one level from the current working directory.
# NOTE: this is relative to wherever the kernel currently is, so every re-run
# of this cell moves up one more directory.
os.chdir("../")   # This will go back one folder

In [58]:
!pwd

/c/Users/harik/Desktop/All Folder/PGDA/Interships & Projects/NLP/End-to-End Workflow/cnnClassifier


In [None]:
# Below is the pipeline

In [60]:
# Run the ingestion stage end to end: load the configuration, build the
# ingestion component, download the archive, then extract and clean it.
try:
    config= ConfigurationManager()
    data_ingestion_config= config.get_data_ingestion_config()
    data_ingestion= DataIngestion(config= data_ingestion_config)
    data_ingestion.download_file()
    data_ingestion.unzip_and_clean()

except Exception:
    # A bare `raise` re-raises the active exception with its original
    # traceback; `raise e` would add a redundant frame for this line.
    raise

[2023-09-21 11:06:36,482: INFO: common: yaml file: config\config.yaml loaded succesfully]
[2023-09-21 11:06:36,482: INFO: common: yaml file: params.yaml loaded succesfully]
[2023-09-21 11:06:36,482: INFO: common: created directory at: artifacts]
[2023-09-21 11:06:36,498: INFO: common: created directory at: artifacts/data_ingestion]


We did all of this in this .ipynb file, rather than writing it directly into the entity and component modules, to check that everything works. Once that is confirmed, we proceed to write the code in a modular way.