In [1]:
import os
# Step one directory up from wherever the kernel started so that the relative
# paths used by later cells resolve from the parent directory.
# NOTE(review): this is a relative move, so re-running the cell climbs one more
# level each time — restart the kernel before re-executing.
os.chdir('../')
# Display the resulting working directory as a sanity check.
%pwd

'/home/paladin/Downloads/Sensor-Fault-Detection'

In [2]:
from dataclasses import dataclass, field
from pathlib import Path


@dataclass(frozen=True)
class DataIngestionConfig:
    """Immutable bundle of settings consumed by the data-ingestion stage.

    Instances are frozen: assigning to a field after construction raises
    ``dataclasses.FrozenInstanceError``. Field order is part of the positional
    constructor signature and must not be rearranged.
    """

    # Directory for this stage's artifacts.
    root_dir: Path
    # Name of the S3 bucket that holds the raw dataset.
    s3_bucket: str
    # AWS access key id handed to the S3 client.
    s3_key: str
    # AWS secret access key. repr=False keeps it out of repr()/str(), so that
    # printing or logging the config object cannot leak the credential.
    s3_secret_key: str = field(repr=False)
    # Key of the object inside the bucket. It is an S3 key string (passed as
    # ``Key=`` to the S3 client), not a filesystem path.
    object_key: str
    # Local path where the downloaded CSV is stored.
    local_data_file: Path
    # Fraction handed to the splitter as ``test_size`` (i.e. the test share).
    train_test_ratio: float
    # Output path of the training split.
    train_data_file: Path
    # Output path of the test split.
    test_data_file: Path
    # Column names removed from the raw data right after download.
    drop_columns: list

In [3]:
from sensorFaultDetection.constants import *
from sensorFaultDetection.utils import read_yaml, create_directories

In [4]:
class ConfigurationManager:
    """Read the project's YAML files and build stage-level config objects.

    Construction parses the config, secret, schema and params files with
    ``read_yaml``, remembers the saved-model path, and hands the configured
    ``artifacts_root`` to ``create_directories``.
    """

    def __init__(self,
                 config_filepath=CONFIG_FILE_PATH,
                 secret_filepath=SECRET_FILE_PATH,
                 schema_filepath=SCHEMA_FILE_PATH,
                 params_filepath=PARAMS_FILE_PATH,
                 saved_modelpath=SAVED_MODEL_PATH,
                 ):
        """Load every YAML source and prepare the artifacts root.

        Args:
            config_filepath: Location of the main configuration YAML.
            secret_filepath: Location of the YAML holding credentials.
            schema_filepath: Location of the data-schema YAML.
            params_filepath: Location of the parameters YAML.
            saved_modelpath: Path kept on the instance as ``saved_modelpath``.
        """
        # Parse the four YAML documents in a fixed order.
        self.config = read_yaml(config_filepath)
        self.secret = read_yaml(secret_filepath)
        self.schema = read_yaml(schema_filepath)
        self.params = read_yaml(params_filepath)
        self.saved_modelpath = saved_modelpath

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Assemble the settings object for the data-ingestion stage.

        Passes the stage's ``ROOT_DIR`` to ``create_directories`` before
        building the result.

        Returns:
            A ``DataIngestionConfig`` populated from the ``data_ingestion``
            section of the config, the ``aws_credential`` section of the
            secrets, the split ratio from the params and the ``drop_columns``
            entry of the schema.
        """
        ingestion = self.config.data_ingestion
        aws = self.secret.aws_credential

        create_directories([ingestion.ROOT_DIR])

        return DataIngestionConfig(
            root_dir=ingestion.ROOT_DIR,
            s3_bucket=aws.S3_BUCKET,
            s3_key=aws.S3_KEY,
            s3_secret_key=aws.S3_SECRET_KEY,
            object_key=aws.OBJECT_KEY,
            local_data_file=ingestion.LOCAL_DATA_FILE,
            train_test_ratio=self.params.TRAIN_TEST_RATIO,
            train_data_file=ingestion.TRAIN_DATA_FILE,
            test_data_file=ingestion.TEST_DATA_FILE,
            drop_columns=self.schema.drop_columns,
        )

In [5]:
import numpy as np
import os
import pandas as pd
from io import StringIO
import boto3
from sensorFaultDetection.utils import get_size
from sensorFaultDetection.logger import logging
from sklearn.model_selection import train_test_split

In [6]:
class DataIngestion:
    """Fetch the raw CSV from S3 and split it into train and test CSV files.

    Typical use is ``download_file()`` followed by ``train_test_creation()``.
    """

    def __init__(self, config: DataIngestionConfig):
        """Keep the stage settings and create the S3 client.

        Args:
            config: Settings for this stage; its ``s3_key`` and
                ``s3_secret_key`` are used as the client's credentials.
        """
        self.config = config
        self.client = boto3.client(
            's3',
            aws_access_key_id=self.config.s3_key,
            aws_secret_access_key=self.config.s3_secret_key,
        )

    def download_file(self):
        """Download the dataset once and store a cleaned copy locally.

        When ``local_data_file`` already exists nothing is fetched and only the
        size of the existing file is logged. Otherwise the object is read from
        S3, an ``_id`` column (if present) is removed, the literal string
        ``'na'`` is replaced by NaN, the configured ``drop_columns`` are
        removed, and the result is written as CSV. The cleaned frame is also
        kept on ``self.df`` (only when a download actually happened).

        Raises:
            KeyError: If a name in ``drop_columns`` is not a column of the data.
        """
        if os.path.exists(self.config.local_data_file):
            logging.info(f"File already exists of size : {get_size(Path(self.config.local_data_file))}")
            return

        csv_obj = self.client.get_object(Bucket=self.config.s3_bucket, Key=self.config.object_key)
        body = csv_obj['Body']
        try:
            csv_string = body.read().decode('utf-8')
        finally:
            # Release the underlying HTTP connection even if read/decode fails.
            body.close()

        df = pd.read_csv(StringIO(csv_string))
        if "_id" in df.columns:
            df = df.drop(columns="_id")
        # The source marks missing readings with the literal string 'na'.
        df = df.replace({'na': np.nan})
        df = df.drop(columns=self.config.drop_columns)

        df.to_csv(self.config.local_data_file, index=False, header=True)
        logging.info(f'{self.config.local_data_file} is downloaded!')
        self.df = df

    # Backward-compatible alias: existing callers use the original misspelling.
    dowload_file = download_file

    def train_test_creation(self, target_feature='class', random_state=None):
        """Split the local CSV into train and test files, target column first.

        Args:
            target_feature: Name of the label column; it becomes the first
                column of both output files while the remaining columns keep
                their original relative order.
            random_state: Seed forwarded to ``train_test_split``. The default
                ``None`` keeps the previous unseeded behaviour; pass an int for
                a reproducible split.

        Raises:
            KeyError: If ``target_feature`` is not a column of the local file.
        """
        df = pd.read_csv(self.config.local_data_file)
        if target_feature not in df.columns:
            raise KeyError(target_feature)

        # Reorder once up front and split the whole frame; this avoids writing
        # into the frames returned by the splitter.
        ordered_columns = [target_feature] + [
            column for column in df.columns if column != target_feature
        ]
        train_df, test_df = train_test_split(
            df[ordered_columns],
            test_size=self.config.train_test_ratio,
            shuffle=True,
            random_state=random_state,
        )

        train_df.to_csv(self.config.train_data_file, index=False, header=True)
        logging.info(f'Train data is created and saved at {self.config.train_data_file}!')
        test_df.to_csv(self.config.test_data_file, index=False, header=True)
        logging.info(f'Test data is created and saved at {self.config.test_data_file}!')

In [7]:
import sys
from sensorFaultDetection.exception import CustomException

In [8]:
# Run the ingestion stage end to end: load configuration, download the raw
# data, then write the train/test split.
try:
    config = ConfigurationManager()
    data_ingestion_config = config.get_data_ingestion_config()
    data_ingestion = DataIngestion(config=data_ingestion_config)
    data_ingestion.dowload_file()
    data_ingestion.train_test_creation()
except Exception as e:
    # Raise the wrapped error: only constructing it would discard the failure
    # and let the cell finish as if everything had succeeded.
    # NOTE(review): assumes CustomException derives from Exception — confirm.
    raise CustomException(e, sys) from e