In [None]:
from pathlib import Path
from utils.base_utils import read_yaml

# Smoke test for read_yaml: load the project config (path is relative to the
# current working directory) and show either its contents or the failure.
yaml_path = Path("..", "config", "config.yaml")

try:
    config = read_yaml(yaml_path)
    print(config)
except Exception as err:
    print(f"Error: {err}")

In [None]:
from pathlib import Path
from utils.base_utils import create_directories

# Smoke test for create_directories: ask it for two sibling folders under ../test.
base_path = Path("../test")
dirs = [base_path / folder_name for folder_name in ("dir1", "dir2")]
create_directories(dirs)

### Initial Testing

In [None]:
import os

In [None]:
%pwd

In [None]:
# Move the working directory one level up (presumably to the project root, which
# the relative paths used further down are resolved against — confirm).
# The directory this cell first ran from is remembered, so re-running the cell
# lands in the same place instead of climbing one more level each time.
NOTEBOOK_START_DIR = globals().get("NOTEBOOK_START_DIR") or os.getcwd()
os.chdir(os.path.join(NOTEBOOK_START_DIR, os.pardir))

In [None]:
%pwd

In [None]:
import os
import zipfile
import gdown
from utils import logger

In [None]:
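# The field names below are the same as the keys in the config YAML.
# This entity is the return type of the function that builds the data-ingestion config.
# A dataclass lets us declare the fields at class level without writing an __init__ that assigns them to self.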
# == Entity ==
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataIngestionConfig:
    """Immutable settings consumed by the data-ingestion step.

    NOTE(review): the path fields are annotated as ``Path``, but
    ``ConfigurationManager.get_data_ingestion_config`` passes the values read
    from the YAML through unchanged, so they may actually be plain strings —
    confirm before relying on ``Path`` methods.
    """

    # Working directory of the ingestion step; removed again by
    # DataIngestion.moved_and_cleanup once the data has been moved.
    root_dir: Path
    # Link to the dataset; DataIngestion.download_file takes the file id from
    # its second-to-last "/"-separated segment.
    source_URL: str
    # Where the downloaded archive is written and later read for extraction.
    local_datafile: Path
    # Directory the archive is extracted into.
    unzip_dir: Path
    # Directory that is moved to ``moved_location``.
    data_folder: Path
    # Final destination of the data; an existing directory there is replaced.
    moved_location: Path

In [None]:
# The constants module holds the locations of the config files.
# Update the configuration manager in src config.

from utils.base_utils import read_yaml, create_directories
from constants import *


class ConfigurationManager:
    """Reads the project YAML files and turns their sections into config entities."""

    def __init__(self, config_filepath=CONFIG_FILE_PATH, param_path=PARAMS_FILE_PATH):
        """Load both YAML files and create the artifacts root directory.

        Args:
            config_filepath: location of the main configuration YAML.
            param_path: location of the parameters YAML.
        """
        # read_yaml is expected to return a ConfigBox-style mapping; its entries
        # are accessed with attribute syntax throughout this class.
        self.config = read_yaml(config_filepath)
        self.param = read_yaml(param_path)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build the ``DataIngestionConfig`` from the ``data_ingestion`` section.

        The artifacts root directory is (re-)created as a side effect.

        Returns:
            A ``DataIngestionConfig`` whose fields carry the section's values
            unchanged.
        """
        ingestion_section = self.config.data_ingestion
        create_directories([self.config.artifacts_root])

        return DataIngestionConfig(
            root_dir=ingestion_section.root_dir,
            source_URL=ingestion_section.source_URL,
            local_datafile=ingestion_section.local_datafile,
            unzip_dir=ingestion_section.unzip_dir,
            data_folder=ingestion_section.data_folder,
            moved_location=ingestion_section.moved_location,
        )

In [None]:
import rarfile
import zipfile
import shutil
import gdown
from utils import logger
from pathlib import Path


# components
class DataIngestion:
    """Downloads the dataset archive, extracts it and moves the data into place.

    ``config`` must expose the ``DataIngestionConfig`` fields. Path-like fields
    may be given either as ``str`` or as ``pathlib.Path``.
    """

    # Google Drive direct-download endpoint; the file id is appended to it.
    _GDRIVE_DOWNLOAD_PREFIX = "https://drive.google.com/uc?export=download&id="

    def __init__(self, config: "DataIngestionConfig"):
        """Keep a reference to the ingestion settings used by every step."""
        self.config = config

    def download_file(self) -> str:
        """Download the archive referenced by ``source_URL`` to ``local_datafile``.

        Returns:
            The path the archive was written to.

        Raises:
            RuntimeError: if gdown reports no output file.
        """
        dataset_url = self.config.source_URL
        download_path = os.fspath(self.config.local_datafile)

        # Create the folder the archive is written into, wherever the config
        # places it, instead of assuming a fixed location.
        download_dir = os.path.dirname(download_path)
        if download_dir:
            os.makedirs(download_dir, exist_ok=True)

        # Assumes a share link shaped like ".../file/d/<id>/view...", where the
        # id is the second-to-last path segment — TODO confirm for other links.
        file_id = dataset_url.split("/")[-2]

        logger.info(f"Downloading data from {dataset_url} into file {download_path}")
        downloaded = gdown.download(self._GDRIVE_DOWNLOAD_PREFIX + file_id, download_path)
        # NOTE(review): some gdown releases appear to signal a failed download by
        # returning None rather than raising; treat that as an error here.
        if downloaded is None:
            raise RuntimeError(
                f"Download from {dataset_url} did not produce a file at {download_path}"
            )
        logger.info(f"Downloaded data from {dataset_url} into file {download_path}")
        return download_path

    def extract_zip_file(self):
        """Extract ``local_datafile`` (.zip or .rar) into ``unzip_dir``.

        The extension check is case-insensitive and works for both ``str`` and
        ``Path`` values.

        Raises:
            ValueError: if the file is neither a .zip nor a .rar archive.
        """
        unzip_path = self.config.unzip_dir
        os.makedirs(unzip_path, exist_ok=True)
        # Normalise to str so the suffix test works for Path objects as well.
        archive_path = os.fspath(self.config.local_datafile)
        lowered_name = archive_path.lower()

        if lowered_name.endswith(".zip"):
            with zipfile.ZipFile(archive_path, "r") as zip_ref:
                zip_ref.extractall(unzip_path)
        elif lowered_name.endswith(".rar"):
            with rarfile.RarFile(archive_path) as rar_ref:
                rar_ref.extractall(unzip_path)
        else:
            raise ValueError(
                "File format not supported for extraction. Only .zip and .rar are supported."
            )

    @staticmethod
    def _is_inside(path, parent) -> bool:
        """Return True when ``path`` equals ``parent`` or lies somewhere below it."""
        path_abs = os.path.normcase(os.path.abspath(path))
        parent_abs = os.path.normcase(os.path.abspath(parent))
        try:
            return os.path.commonpath([path_abs, parent_abs]) == parent_abs
        except ValueError:
            # Raised e.g. for paths on different drives: cannot be nested.
            return False

    def moved_and_cleanup(self):
        """Move ``data_folder`` to ``moved_location`` and delete ``root_dir``.

        An existing directory at the target is replaced, but only after the
        source has been confirmed to exist. ``root_dir`` is left in place when
        the target lies inside it, because deleting it would delete the data
        that was just moved.

        Raises:
            FileNotFoundError: if ``data_folder`` does not exist.
        """
        root_dir = os.fspath(self.config.root_dir)
        source_dir = os.fspath(self.config.data_folder)
        target_dir = os.fspath(self.config.moved_location)

        try:
            # Check first: otherwise a missing source would cost us the old
            # target directory and leave nothing in its place.
            if not os.path.exists(source_dir):
                raise FileNotFoundError(f"Source data folder does not exist: {source_dir}")

            # dirname is "" for a bare folder name, and os.makedirs("") fails.
            target_parent = os.path.dirname(target_dir)
            if target_parent:
                os.makedirs(target_parent, exist_ok=True)

            if os.path.exists(target_dir):
                shutil.rmtree(target_dir)
                logger.info(f"Removed existing directory at target location: {target_dir}")

            shutil.move(source_dir, target_dir)
            logger.info(f"Moved data from {source_dir} to {target_dir}")

            if self._is_inside(target_dir, root_dir):
                logger.info(
                    f"Skipped removing {root_dir} because it contains the target {target_dir}"
                )
            elif os.path.exists(root_dir):
                shutil.rmtree(root_dir)
                logger.info(f"Removed root ingestion directory: {root_dir}")

        except Exception:
            logger.error("Error during move and cleanup")
            raise
    

In [None]:
# Pipeline: run the data-ingestion stage end to end.
# Any exception is left to propagate as-is; wrapping these calls in a
# try/except that only re-raises would add nothing.

# Initialize the ConfigurationManager
config = ConfigurationManager()
# Get the data-ingestion settings from the config YAML
data_ingestion_config = config.get_data_ingestion_config()
# Initialize the DataIngestion component
data_ingestion = DataIngestion(config=data_ingestion_config)
# Download the archive
data_ingestion.download_file()
# Extract the archive
data_ingestion.extract_zip_file()
# Move the data into place and clean up
data_ingestion.moved_and_cleanup()