## Testing Data Ingestion Stage

* Testing basic functionality of data ingestion pipeline

In [2]:
%pwd

'd:\\ImageClassifier-Pipeline\\research_env'

In [3]:
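### Move up to the project root directory (run this cell only once - `%cd ..` is relative, so re-running it climbs another level)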
%cd ..

# Running from here creates a log file inside research_env - remove it at the end

d:\ImageClassifier-Pipeline


In [4]:
## unmanaged code for DataIngestionConfig
import os
from collections import namedtuple


# Plain-tuple flavour of the config: four ordered fields, readable by attribute name.
DataIngestionConfig = namedtuple(
    typename="DataIngestionConfig",
    field_names="root_dir source_URL local_data_file unzip_dir",
)
DataIngestionConfig  # bare last expression so the notebook displays the new class

__main__.DataIngestionConfig

In [5]:
## Managed code for DataIngestionConfig
from pathlib import Path
from dataclasses import dataclass


@dataclass(frozen=True)
class DataIngestionConfig:
    """Immutable bundle of the settings consumed by the data-ingestion stage.

    ``frozen=True`` makes instances read-only: assigning to a field after
    construction raises ``dataclasses.FrozenInstanceError``.

    NOTE(review): the type hints below are not enforced at runtime. In this
    notebook ``ConfigurationManager.get_data_ingestion_config`` passes the
    values through exactly as they were read from the config file, so the
    ``Path`` fields may actually hold plain strings - confirm before relying
    on ``Path``-only methods.
    """
    root_dir: Path  # stage directory; created before this config is built
    source_URL: str  # URL the archive is downloaded from
    local_data_file: Path  # where the downloaded archive is written and later opened as a zip
    unzip_dir: Path  # directory the selected archive members are extracted into

### Defining Configuration Manager

In [6]:
from imageClassifier.constants import *
from imageClassifier.utils import read_yaml, create_directories

   
        
class ConfigurationManager:
    """Load the project's YAML settings and build typed per-stage config objects.

    Args:
        config_filepath: YAML file holding paths/URLs for the pipeline stages
            (defaults to ``CONFIG_FILE_PATH``).
        params_filepath: YAML file holding parameters (defaults to
            ``PARAMS_FILE_PATH``).

    Side effects:
        Creating an instance asks ``create_directories`` to create the
        ``artifacts_root`` directory named in the config file.
    """

    def __init__(self, config_filepath=CONFIG_FILE_PATH, params_filepath=PARAMS_FILE_PATH):
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        # create_directories takes a list, so several directories can be
        # created in a single call.
        create_directories([self.config.artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Return the data-ingestion settings as a ``DataIngestionConfig``.

        Reads the ``data_ingestion`` section of the loaded config, asks
        ``create_directories`` to create its ``root_dir``, and wraps the
        filesystem entries in ``Path`` so the returned object matches the
        field types declared on ``DataIngestionConfig``. ``source_URL`` is
        passed through unchanged.
        """
        config = self.config.data_ingestion
        create_directories([config.root_dir])
        return DataIngestionConfig(
            root_dir=Path(config.root_dir),
            source_URL=config.source_URL,
            local_data_file=Path(config.local_data_file),
            unzip_dir=Path(config.unzip_dir),
        )

### Now writing data ingestion class

In [17]:
# Logging and a progress bar will be implemented in the data_ingestion.py file
import os
import requests
import urllib.request
from zipfile import ZipFile



class DataIngestion:
    """Download the dataset archive and extract only the usable Cat/Dog images.

    ``config`` must expose ``source_URL``, ``local_data_file`` and
    ``unzip_dir`` (the fields of ``DataIngestionConfig``).
    """

    def __init__(self, config: "DataIngestionConfig"):
        # The annotation is a string (forward reference) so that defining this
        # class does not require ``DataIngestionConfig`` to exist yet.
        self.config = config

    # NOTE: a streaming ``requests.get(..., stream=True)`` download was kept
    # here as a commented-out fallback in case ``urlretrieve`` does not work.
    def download_file(self):
        """Fetch ``config.source_URL`` into ``config.local_data_file``.

        Nothing is downloaded when the destination already exists. Otherwise
        the destination's parent directory is created if missing and the data
        is written to ``<destination>.part`` first; it is renamed onto the
        destination only after the transfer finished, so an interrupted
        download is never mistaken for a complete archive on the next run.

        Errors raised by ``urllib.request.urlretrieve`` propagate unchanged.
        """
        destination = os.fspath(self.config.local_data_file)
        if os.path.exists(destination):
            return

        parent_dir = os.path.dirname(destination)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        partial_path = destination + ".part"
        try:
            urllib.request.urlretrieve(url=self.config.source_URL, filename=partial_path)
            os.replace(partial_path, destination)
        finally:
            # Only still present when the download or the rename failed.
            if os.path.exists(partial_path):
                os.remove(partial_path)

    def _get_updated_list_of_files(self, list_of_files):
        """Return the members that are ``.jpg`` files with ``Cat`` or ``Dog`` in their path.

        The check is case-sensitive and looks at the whole member path, so
        ``"File_0/File_1/Dog/7.jpg"`` is kept while ``"File_0/File_1/7.jpg"``
        is dropped.
        """
        return [
            file
            for file in list_of_files
            if file.endswith(".jpg") and ("Cat" in file or "Dog" in file)
        ]

    def _preprocess(self, zf: ZipFile, f: str, working_dir: str):
        """Extract member ``f`` into ``working_dir`` (unless present) and delete it if empty."""
        target_filepath = os.path.join(working_dir, f)
        if not os.path.exists(target_filepath):
            # ``extract`` returns the path it really wrote. ZipFile normalises
            # member names (e.g. drops ".." components), so that path can
            # differ from the naive join above.
            target_filepath = zf.extract(f, working_dir)
        # Zero-byte images are unusable, so they are removed again.
        if os.path.getsize(target_filepath) == 0:
            os.remove(target_filepath)

    def unzip_and_clean(self):
        """Extract the relevant images from the downloaded archive and drop empty ones."""
        with ZipFile(file=self.config.local_data_file, mode="r") as zf:
            list_of_files = zf.namelist()
            # Keep only the image members this pipeline cares about.
            updated_list_of_files = self._get_updated_list_of_files(list_of_files)

            for f in updated_list_of_files:
                self._preprocess(zf, f, self.config.unzip_dir)

**`NOTE:`**
* The folder structure of the saved images will look like this: "File_0/File_1/Dog/7.jpg"
* The functions above will not work for a structure like "File_0/File_1/7.jpg", because only paths containing `Cat` or `Dog` are kept

Methods such as `_get_updated_list_of_files` and `_preprocess` are "private" helper methods
* `Single leading underscore (_)`: This signals that a method is intended for internal use within the class and shouldn't be called directly from outside

To use the classes and methods above, the notebook's working directory must be the project root

In [8]:
%pwd

'd:\\ImageClassifier-Pipeline'

In [18]:
# Run the ingestion stage end to end: build the config, download the archive,
# then extract and clean the images. Any failure propagates with its original
# traceback, so no catch-and-re-raise wrapper is needed around these calls.
config = ConfigurationManager()
data_ingestion_config = config.get_data_ingestion_config()
data_ingestion = DataIngestion(config=data_ingestion_config)
data_ingestion.download_file()
data_ingestion.unzip_and_clean()

[2023-12-23 15:07:14,268: INFO: common: yaml file: config\config.yaml loaded successfully]
[2023-12-23 15:07:14,274: INFO: common: yaml file: params.yaml loaded successfully]
[2023-12-23 15:07:14,277: INFO: common: Created directory at: artifacts]
[2023-12-23 15:07:14,284: INFO: common: Created directory at: artifacts/data_ingestion]
