In [1]:
import os
import sys
from pathlib import Path

# Make the project root (the parent of the notebook's working directory)
# importable, so `src.ds_end_to_end` can be imported from this notebook.
current_notebook_location = Path(os.getcwd())
package_container_dir = current_notebook_location.parent

path_to_add = str(package_container_dir.resolve())

# A later cell reads sys.path[0] as the project root, so the entry has to sit
# at index 0 -- being present somewhere further down the list is not enough.
if path_to_add not in sys.path:
    sys.path.insert(0, path_to_add)
    print(f"Added '{path_to_add}' to sys.path.")
elif sys.path[0] != path_to_add:
    # Already importable (e.g. via PYTHONPATH) but not first: promote it.
    sys.path.remove(path_to_add)
    sys.path.insert(0, path_to_add)
    print(f"Moved '{path_to_add}' to the front of sys.path.")
else:
    print(f"'{path_to_add}' is already in sys.path.")

# Dump the resulting search path so import problems are easy to diagnose.
print("\n--- Current sys.path for debugging: ---")
for path in sys.path:
    print(f"- {path}")
print("---------------------------------------")

Added '/home/mushfiq/Desktop/End-to-End-MLOPS/ds_end_to_end' to sys.path.

--- Current sys.path for debugging: ---
- /home/mushfiq/Desktop/End-to-End-MLOPS/ds_end_to_end
- /home/mushfiq/anaconda3/envs/mlops/lib/python312.zip
- /home/mushfiq/anaconda3/envs/mlops/lib/python3.12
- /home/mushfiq/anaconda3/envs/mlops/lib/python3.12/lib-dynload
- 
- /home/mushfiq/anaconda3/envs/mlops/lib/python3.12/site-packages
---------------------------------------


In [2]:
import os
from dataclasses import dataclass
from pathlib import Path
import urllib.request as request
import zipfile

from src.ds_end_to_end.constants import CONFIG_FILE_PATH, PARAMS_FILE_PATH, SCHEMA_FILE_PATH
from src.ds_end_to_end.utils.common import read_yaml, create_directories
from src.ds_end_to_end import logger

In [3]:
@dataclass
class DataIngestionConfig:
    """Paths and source URL used by the data-ingestion step (download, then unzip)."""
    # Working directory for this step; created by
    # ConfigurationManager.get_data_ingestion_config before the config is returned.
    root_dir: Path
    # URL passed to urllib's urlretrieve by DataIngestion.download_file.
    source_URL: str
    # Destination of the downloaded archive; also the file opened as a zip
    # by DataIngestion.extract_zip_file.
    local_data_file: Path
    # Directory the archive's contents are extracted into.
    unzip_dir: Path

In [4]:
class ConfigurationManager:
    """Load the project's YAML settings and build per-stage config objects.

    Every path handled by this class (the three YAML files and the
    directories named inside the main config) is joined onto ``base_path``.
    """

    def __init__(
        self,
        config_filepath: Path = CONFIG_FILE_PATH,
        params_filepath: Path = PARAMS_FILE_PATH,
        schema_filepath: Path = SCHEMA_FILE_PATH,
        base_path: Path | None = None,
    ):
        """Read the config, params and schema files and create the artifacts root.

        Args:
            config_filepath: Location of the main config YAML, relative to
                ``base_path``.
            params_filepath: Location of the params YAML, relative to
                ``base_path``.
            schema_filepath: Location of the schema YAML, relative to
                ``base_path``.
            base_path: Directory that all of the above (and every path read
                from the config) is resolved against. When omitted, the first
                entry of ``sys.path`` is used, as before; pass it explicitly
                to avoid depending on import-path ordering.
        """
        # Default keeps the historical behaviour: whatever is first on
        # sys.path is treated as the project root.
        self.base_path = Path(sys.path[0]) if base_path is None else Path(base_path)

        self.config = read_yaml(self._anchored(config_filepath))
        self.params = read_yaml(self._anchored(params_filepath))
        self.schema = read_yaml(self._anchored(schema_filepath))

        # The loaded config is accessed by attribute here; presumably
        # read_yaml returns an attribute-style mapping -- verify in utils.common.
        create_directories([self._anchored(self.config.artifacts_root)])

    def _anchored(self, relative_path) -> Path:
        """Return ``relative_path`` joined onto ``self.base_path``."""
        return self.base_path / Path(relative_path)

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build the data-ingestion settings from the ``data_ingestion`` section.

        The ingestion root directory is created as a side effect.

        Returns:
            A ``DataIngestionConfig`` whose path fields are anchored at
            ``base_path`` and whose ``source_URL`` is taken verbatim from the
            config.
        """
        section = self.config.data_ingestion

        ingestion_root_dir = self._anchored(section.root_dir)
        create_directories([ingestion_root_dir])

        return DataIngestionConfig(
            root_dir=ingestion_root_dir,
            source_URL=section.source_URL,
            local_data_file=self._anchored(section.local_data_file),
            unzip_dir=self._anchored(section.unzip_dir),
        )

In [5]:
class DataIngestion:
    """Download the source archive and unpack it, driven by a DataIngestionConfig."""

    def __init__(self, config: DataIngestionConfig):
        """Store the ingestion settings (URL, archive path, extraction directory)."""
        self.config = config

    def download_file(self):
        """Fetch ``source_URL`` into ``local_data_file`` unless that file already exists.

        The payload is first written to a ``.part`` sibling and only renamed
        to its final name once the transfer has finished. Because an existing
        ``local_data_file`` suppresses the download, writing to the final name
        directly would let an interrupted transfer leave a truncated archive
        that every later run would accept as complete.
        """
        target = self.config.local_data_file
        if target.exists():
            logger.info(f"File '{self.config.local_data_file}' already exists!")
            return

        # Make sure the destination directory exists before writing into it.
        target.parent.mkdir(parents=True, exist_ok=True)
        partial = target.with_name(target.name + ".part")
        try:
            _, headers = request.urlretrieve(
                url=self.config.source_URL,
                filename=str(partial),
            )
            # Same-directory rename: the final name appears only when complete.
            partial.replace(target)
        finally:
            # No-op after a successful rename; removes leftovers on failure
            # or interruption.
            partial.unlink(missing_ok=True)

        filename = str(target)
        logger.info(f"{filename} download! with following info: \n{headers}")

    def extract_zip_file(self):
        """
        Extracts the zip file into the data directory
        """
        unzip_path = self.config.unzip_dir
        with zipfile.ZipFile(self.config.local_data_file, 'r') as zip_ref:
            zip_ref.extractall(unzip_path)
        logger.info(f"Zip file extracted to: {unzip_path}")

In [6]:
# Run the data-ingestion stage: load configuration, download the archive,
# then unpack it. Any failure is logged with its traceback and re-raised so
# the cell still fails visibly.
try:
    config = ConfigurationManager()
    data_ingestion_config = config.get_data_ingestion_config()
    data_ingestion = DataIngestion(config=data_ingestion_config)
    data_ingestion.download_file()
    data_ingestion.extract_zip_file()
    logger.info("Data Ingestion process completed successfully!")
except Exception as e:
    logger.exception(f"Error during Data Ingestion: {e}")
    # Bare `raise` re-raises the active exception as-is.
    raise

[2025-07-22 13:36:50,127: INFO: common: yaml file: /home/mushfiq/Desktop/End-to-End-MLOPS/ds_end_to_end/config/config.yaml loaded successfully]
[2025-07-22 13:36:50,128: INFO: common: yaml file: /home/mushfiq/Desktop/End-to-End-MLOPS/ds_end_to_end/params.yaml loaded successfully]
[2025-07-22 13:36:50,130: INFO: common: yaml file: /home/mushfiq/Desktop/End-to-End-MLOPS/ds_end_to_end/schema.yaml loaded successfully]
[2025-07-22 13:36:51,212: INFO: 966646210: /home/mushfiq/Desktop/End-to-End-MLOPS/ds_end_to_end/artifacts/data_ingestion/data.zip download! with following info: 
Connection: close
Content-Length: 23329
Cache-Control: max-age=300
Content-Security-Policy: default-src 'none'; style-src 'unsafe-inline'; sandbox
Content-Type: application/zip
ETag: "c69888a4ae59bc5a893392785a938ccd4937981c06ba8a9d6a21aa52b4ab5b6e"
Strict-Transport-Security: max-age=31536000
X-Content-Type-Options: nosniff
X-Frame-Options: deny
X-XSS-Protection: 1; mode=block
X-GitHub-Request-Id: A904:1CF3A8:1EDB1E: