In [1]:
from dataclasses import dataclass
from pathlib import Path

In [2]:
# The dataclass decorator generates __init__ from the annotated fields,
# so the classes need no hand-written constructor or self.attribute assignments.

@dataclass(frozen=True)
class DataIngestionConfig:
    """Read-only settings for the data-ingestion stage.

    Instances are frozen: fields are set once at construction and any later
    assignment raises ``dataclasses.FrozenInstanceError``.
    """

    # Directory that is created before the ingestion stage runs.
    root_dir: Path
    # URL of the source dataset that gets downloaded.
    source_URL: str
    # Folder path for downloads (not read by the code in this notebook).
    download_folder_path: Path
    # Destination of the downloaded raw file; later read back as CSV.
    local_data_file: Path
    # Destination of the processed CSV.
    processed_data: Path

@dataclass(frozen=True)
class DatabaseCredentials:
    """Read-only connection settings for the target database.

    Every field is declared as ``str`` (including the port). Instances are
    frozen, so the values cannot be reassigned after construction.
    """

    # Login name used in the connection URL.
    db_user: str
    # Password used in the connection URL; treat as a secret.
    db_password: str
    # Host name or address of the database server.
    db_host: str
    # Port of the database server, kept as text.
    db_port: str
    # Name of the database to connect to.
    db_name: str


In [3]:
from regression.constants import constant
from regression.utils.common_func import read_yaml, create_dir

In [4]:
class ConfigurationManager:
    """Reads the project's YAML files and builds typed configuration objects."""

    def __init__(
        self,
        config_filepath=constant.CONFIG_FILE_PATH,
        param_filepath=constant.PARAMS_FILE_PATH,
        secret_filepath=constant.SECRET_FILE_PATH,
    ):
        """Load the three YAML files and create the top-level artifact directory.

        Args:
            config_filepath: Path of the main configuration YAML.
            param_filepath: Path of the parameters YAML.
            secret_filepath: Path of the secrets YAML.

        Any exception raised by ``read_yaml`` or ``create_dir`` propagates.
        """
        # The loaded objects are accessed by attribute below, so read_yaml
        # presumably returns an attribute-style mapping -- TODO confirm.
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(param_filepath)
        self.secret = read_yaml(secret_filepath)

        create_dir([self.config.root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build the ingestion settings from the ``data_paths`` section.

        Creates the section's ``root_dir`` before returning.

        Returns:
            A ``DataIngestionConfig`` populated from the loaded configuration.
        """
        paths = self.config.data_paths

        create_dir([paths.root_dir])

        return DataIngestionConfig(
            root_dir=paths.root_dir,
            source_URL=paths.source_URL,
            download_folder_path=paths.download_folder_path,
            local_data_file=paths.local_data_file,
            processed_data=paths.processed_data,
        )

    def get_database_credentials(self) -> DatabaseCredentials:
        """Build the database credentials from the ``database_cred`` secrets section.

        Returns:
            A ``DatabaseCredentials`` populated from the loaded secrets.
        """
        section = self.secret.database_cred

        return DatabaseCredentials(
            db_user=section.db_user,
            db_password=section.db_password,
            db_host=section.db_host,
            db_port=section.db_port,
            db_name=section.db_name,
        )


In [5]:
import os
from regression import logger
import gdown
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sqlalchemy import create_engine

In [6]:
class DataIngestion:
    """Downloads the raw dataset, writes a processed CSV, and loads it into PostgreSQL."""

    # Annotations are quoted so the class can be defined without the config
    # dataclasses being evaluated at definition time.
    def __init__(self, config: "DataIngestionConfig", db_config: "DatabaseCredentials"):
        """Store the settings used by the ingestion steps.

        Args:
            config: Source URL and file locations for the ingestion stage.
            db_config: Connection settings for the target database.
        """
        self.config = config
        self.db_config = db_config

    def download_data(self) -> None:
        """Download the source file to ``config.local_data_file``.

        ``config.root_dir`` is created first if it does not exist. The file id
        is taken from the second-to-last ``/``-separated segment of
        ``config.source_URL``.

        Raises:
            IndexError: If the URL has fewer than two ``/``-separated segments.
            Exception: Anything raised by ``os.makedirs`` or ``gdown.download``
                propagates unchanged.
        """
        dataset_url = self.config.source_URL
        download_file = self.config.local_data_file
        os.makedirs(self.config.root_dir, exist_ok=True)
        logger.info(f"Downloading Data from {dataset_url} into {download_file}")

        # assumes a share link shaped like .../d/<file_id>/view... -- TODO confirm
        file_id = dataset_url.split("/")[-2]
        prefix = 'https://drive.google.com/uc?/export=download&id='
        gdown.download(prefix + file_id, download_file)

        logger.info(f'Downloaded data from {dataset_url} into file {download_file}')

    def preprocess_data(self, rawdata_filepath, table_name: str = 'whine_quality') -> None:
        """Write the processed CSV and load it into a database table.

        Args:
            rawdata_filepath: Path of the raw CSV to read.
            table_name: Table to (re)create in the database. The default keeps
                the original hard-coded name.
                NOTE(review): 'whine_quality' looks like a typo for
                'wine_quality'; left as-is so existing readers of the table
                keep working -- confirm before renaming.

        Raises:
            Exception: Errors from pandas or SQLAlchemy propagate unchanged.
        """
        processed_data_filepath = self.config.processed_data
        dataframe = pd.read_csv(rawdata_filepath)
        logger.info("Dataset Loaded in Dataframe")

        # Placeholder: the actual preprocessing steps go here.

        logger.info("Preprocessing Completed")
        dataframe.to_csv(processed_data_filepath, index=False)
        logger.info(f'Processed Data Saved into file {processed_data_filepath}')

        db = self.db_config
        connection_string = (
            f'postgresql+psycopg2://{db.db_user}:{db.db_password}'
            f'@{db.db_host}:{db.db_port}/{db.db_name}'
        )
        # Log the target with the password masked so the secret never lands in log files.
        safe_target = f'postgresql+psycopg2://{db.db_user}:***@{db.db_host}:{db.db_port}/{db.db_name}'
        logger.info(f'connecting to database at {safe_target}')

        # create_engine() only builds the engine; per the SQLAlchemy docs the
        # first real connection is opened lazily, here by to_sql().
        engine = create_engine(connection_string)
        logger.info("connection made to local pgadmin server")

        try:
            # Load from the saved CSV so the table mirrors the file on disk.
            preprocessed_data = pd.read_csv(processed_data_filepath)
            preprocessed_data.to_sql(table_name, engine, if_exists='replace', index=False)
        finally:
            # Release pooled connections even when the load fails.
            engine.dispose()
        logger.info("Successfully created table into Local Database")

    def transform_data(self) -> None:
        """Run ``preprocess_data`` on the downloaded file when it holds data.

        A missing file, a zero-byte file, or a file with no data rows is
        logged and skipped instead of raising.
        """
        rawdata_filepath = self.config.local_data_file

        if not os.path.exists(rawdata_filepath):
            logger.info(f'No file present in the {rawdata_filepath}')
            return

        try:
            # One row is enough to tell whether there is anything to process.
            # (A DataFrame has no truth value, so `.empty` is used explicitly.)
            has_rows = not pd.read_csv(rawdata_filepath, nrows=1).empty
        except pd.errors.EmptyDataError:
            has_rows = False

        if not has_rows:
            logger.info(f'No data found in {rawdata_filepath}')
            return

        logger.info('preprocessing data ...')
        self.preprocess_data(rawdata_filepath)



        

In [7]:
# Run the ingestion stage end to end: load the configuration, build the
# ingestion object, download the raw file, then preprocess and store it.
# Exceptions propagate as-is; the former `except Exception as e: raise e`
# wrapper changed nothing and has been removed.
# NOTE(review): the recorded run failed with "ValueError: yaml file is empty"
# right after config.yaml and params.yaml loaded -- presumably the secrets
# YAML was empty; confirm it is populated before re-running.
config_manager = ConfigurationManager()
data_ingestion_config = config_manager.get_data_ingestion_config()
db_credentials_config = config_manager.get_database_credentials()
data_ingestion_preprocess = DataIngestion(data_ingestion_config, db_credentials_config)
data_ingestion_preprocess.download_data()
data_ingestion_preprocess.transform_data()

[2024-09-13 15:54:59,982:INFO:common_func:yaml file: ..\config\config.yaml loaded successfully]
[2024-09-13 15:54:59,985:INFO:common_func:yaml file: ..\params.yaml loaded successfully]


ValueError: yaml file is empty