# Data Ingestion

In [2]:
# constants
import os 
import sys 
import numpy as np 
import pandas as pd 

""" 
defining common constant variable for training pipeline 
"""
TARGET_COLUMN: str = 'Result'
PIPELINE_NAME: str = 'NetworkSecurity'
ARTIFACT_DIR: str = 'Artifacts'
# Spelling kept as-is: the value is used verbatim as a file name in paths.
FILE_NAME: str = 'phisingData.csv'
TRAIN_FILE_NAME: str = 'train.csv'
TEST_FILE_NAME: str = 'test.csv'


"""
Data ingestion related constants; every name starts with the DATA_INGESTION prefix
"""
DATA_INGESTION_COLLECTION_NAME: str = 'NetworkData'
DATA_INGESTION_DATABASE_NAME: str = 'IMMORTALPI'
DATA_INGESTION_DIR_NAME: str = 'data_ingestion'
DATA_INGESTION_FEATURE_STORE_DIR: str = 'feature_store'
DATA_INGESTION_INGESTED_DIR: str = 'ingested'
# Value handed to train_test_split as test_size (0.2 -> 20% of rows in the test set).
DATA_INGESTION_TRAIN_TEST_SPLIT_RATIO: float = 0.2
# Backward-compatible alias for the original, misspelled constant name, which
# existing code still imports. Prefer DATA_INGESTION_TRAIN_TEST_SPLIT_RATIO.
DATA_INGESION_TAIN_TEST_SPLIT_RATIO: float = DATA_INGESTION_TRAIN_TEST_SPLIT_RATIO

In [None]:
# artifacts entity

from dataclasses import dataclass

@dataclass
class DataIngestionArtifact:
    """Locations of the two CSV files produced by the data-ingestion stage."""

    # Path of the CSV holding the training split.
    trained_file_path: str
    # Path of the CSV holding the test split.
    test_file_path: str

In [7]:
# entity 
from dataclasses import dataclass

@dataclass
class DataIngestionArtifact:
    """Result record of data ingestion: where the train and test CSVs were written."""

    # CSV file containing the training rows.
    trained_file_path: str
    # CSV file containing the held-out test rows.
    test_file_path: str

In [None]:
# config entity 
from datetime import datetime
import os 
from NetworkSecurity.constants import training_pipeline 

class TrainingPipelineConfig:
    """Run-level settings shared by every stage of the training pipeline.

    Attributes:
        pipeline_name: value of ``training_pipeline.PIPELINE_NAME``.
        artifact_name: value of ``training_pipeline.ARTIFACT_DIR``.
        artifact_dir: ``<artifact_name>/<timestamp>``, the per-run output root.
        timestamp: the run timestamp formatted as ``%m_%d_%Y_%H_%M_%S``.
    """

    def __init__(self, timestamp=None):
        """Build the configuration for one pipeline run.

        Args:
            timestamp: ``datetime`` identifying the run. When omitted (or
                ``None``), the current time at construction is used.
        """
        # The default must be resolved here, per call. A ``datetime.now()``
        # default in the signature is evaluated only once, when the class is
        # defined, so every later instance would reuse the same stale
        # timestamp and therefore the same artifact directory.
        if timestamp is None:
            timestamp = datetime.now()
        timestamp = timestamp.strftime('%m_%d_%Y_%H_%M_%S')
        self.pipeline_name = training_pipeline.PIPELINE_NAME
        self.artifact_name = training_pipeline.ARTIFACT_DIR
        self.artifact_dir = os.path.join(self.artifact_name, timestamp)
        self.timestamp: str = timestamp
        

class DataIngestionConfig:
    """Paths and settings used by the data-ingestion stage.

    All paths are rooted at ``<artifact_dir>/<DATA_INGESTION_DIR_NAME>`` of the
    given pipeline configuration.

    Attributes:
        data_ingestion_dir: root directory of this stage's outputs.
        feature_store_file_path: CSV path inside the feature-store directory.
        training_file_path: train CSV path inside the ingested directory.
        test_file_path: test CSV path inside the ingested directory.
        train_test_split_ratio: split ratio taken from the constants module.
        collection_name: name of the source collection.
        database_name: name of the source database.
    """

    def __init__(self,training_pipeline_config:TrainingPipelineConfig):
        """Derive every ingestion path from *training_pipeline_config*.

        Args:
            training_pipeline_config: run-level configuration whose
                ``artifact_dir`` is used as the parent directory.
        """
        self.data_ingestion_dir: str = os.path.join(
            training_pipeline_config.artifact_dir,
            training_pipeline.DATA_INGESTION_DIR_NAME,
        )
        self.feature_store_file_path: str = os.path.join(
            self.data_ingestion_dir,
            training_pipeline.DATA_INGESTION_FEATURE_STORE_DIR,
            training_pipeline.FILE_NAME,
        )
        # The train and test files sit side by side in the ingested directory.
        # (The original assigned training_file_path twice with identical code;
        # the redundant copy is dropped.)
        ingested_dir = os.path.join(
            self.data_ingestion_dir,
            training_pipeline.DATA_INGESTION_INGESTED_DIR,
        )
        self.training_file_path: str = os.path.join(
            ingested_dir, training_pipeline.TRAIN_FILE_NAME
        )
        self.test_file_path: str = os.path.join(
            ingested_dir, training_pipeline.TEST_FILE_NAME
        )
        # The constant name is misspelled in the constants module; it is
        # referenced here exactly as it is defined there.
        self.train_test_split_ratio: float = training_pipeline.DATA_INGESION_TAIN_TEST_SPLIT_RATIO
        self.collection_name: str = training_pipeline.DATA_INGESTION_COLLECTION_NAME
        self.database_name: str = training_pipeline.DATA_INGESTION_DATABASE_NAME




# component
- read the data from mongodb 
- create a feature store 
- split the data into train and test sets


In [8]:
import os 
from NetworkSecurity.exception.exception import NetworkSecurityException
from NetworkSecurity.logging.logger import logging
import os 
import sys 
import numpy as np
import pandas as pd
import pymongo
from typing import List
from sklearn.model_selection import train_test_split 

#config of data ingestion 
#enter the config filepath 

from dotenv import load_dotenv
# load_dotenv() is expected to populate the process environment (typically from
# a .env file) before the connection string is looked up.
load_dotenv()
# None when the MONGODB_URI variable is not set.
MONGO_DB_URL = os.environ.get('MONGODB_URI')

class DataIngestion:
    """Pull the raw records out of MongoDB and turn them into train/test CSVs.

    ``initiate_data_ingestion`` runs the three steps in order:

    1. ``export_collection_as_dataframe`` - read the collection into a DataFrame.
    2. ``export_data_into_feature_store`` - write the full DataFrame to CSV.
    3. ``split_data_as_train_test`` - split it and write train/test CSVs.

    Any exception raised inside a step is re-raised as
    ``NetworkSecurityException`` with the original error chained as its cause.
    """

    def __init__(self, data_ingestion_config: "DataIngestionConfig"):
        """Keep a reference to the ingestion configuration.

        Args:
            data_ingestion_config: object providing the database/collection
                names, output file paths and split ratio read by the methods
                of this class.
        """
        # A plain attribute assignment cannot fail, so it needs no try/except.
        self.data_ingestion_config = data_ingestion_config

    def export_collection_as_dataframe(self):
        """Read the configured MongoDB collection into a DataFrame.

        The ``_id`` column is dropped when present and every cell equal to the
        string ``'na'`` is replaced by ``np.nan``.

        Returns:
            pd.DataFrame: one row per document returned by ``collection.find()``.

        Raises:
            NetworkSecurityException: on any failure while connecting, reading
                or converting the data.
        """
        try:
            database_name = self.data_ingestion_config.database_name
            collection_name = self.data_ingestion_config.collection_name
            # The attribute is still set, as before, for any code that reads it.
            self.mongo_client = pymongo.MongoClient(MONGO_DB_URL)
            try:
                collection = self.mongo_client[database_name][collection_name]
                # Materialise the cursor while the connection is still open.
                df = pd.DataFrame(list(collection.find()))
            finally:
                # Release the connection even when the read fails; the
                # original never closed the client.
                self.mongo_client.close()
            if "_id" in df.columns:
                df = df.drop(columns=['_id'])
            df.replace({'na': np.nan}, inplace=True)
            return df
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e

    def export_data_into_feature_store(self, dataframe: pd.DataFrame):
        """Write *dataframe* to the feature-store CSV, creating its directory.

        Args:
            dataframe: data to persist (written with a header, without the index).

        Returns:
            pd.DataFrame: the same *dataframe* object, unchanged.

        Raises:
            NetworkSecurityException: on any failure while creating the
                directory or writing the file.
        """
        try:
            feature_store_file_path = self.data_ingestion_config.feature_store_file_path
            # Make sure the parent folder exists before writing.
            dir_path = os.path.dirname(feature_store_file_path)
            os.makedirs(dir_path, exist_ok=True)
            dataframe.to_csv(feature_store_file_path, index=False, header=True)
            return dataframe
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e

    def split_data_as_train_test(self, dataframe: pd.DataFrame):
        """Split *dataframe* and write the two parts to the train/test CSV paths.

        The configured ``train_test_split_ratio`` is passed to
        ``train_test_split`` as ``test_size``.

        Args:
            dataframe: full dataset to split.

        Raises:
            NetworkSecurityException: on any failure while splitting or writing.
        """
        try:
            train_set, test_set = train_test_split(
                dataframe, test_size=self.data_ingestion_config.train_test_split_ratio
            )
            logging.info('Performed train test split on the dataframe')
            # Both files share one directory, so creating the train file's
            # parent is enough (same as the original).
            dir_path = os.path.dirname(self.data_ingestion_config.training_file_path)
            os.makedirs(dir_path, exist_ok=True)
            logging.info('Exporting train and test file path')
            train_set.to_csv(
                self.data_ingestion_config.training_file_path, index=False, header=True
            )
            test_set.to_csv(
                self.data_ingestion_config.test_file_path, index=False, header=True
            )
            logging.info('Exported train and test file path')
            # Logged last so the message matches reality: the original emitted
            # it before the export had run.
            logging.info('Exited split_data_as_Train_test method of Data_Ingestion class')
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e

    def initiate_data_ingestion(self):
        """Run the full ingestion: read, store, split.

        Returns:
            DataIngestionArtifact: the configured train and test file paths.

        Raises:
            NetworkSecurityException: if any step fails.
        """
        try:
            dataframe = self.export_collection_as_dataframe()
            dataframe = self.export_data_into_feature_store(dataframe)
            self.split_data_as_train_test(dataframe)
            dataingestionartifact = DataIngestionArtifact(
                trained_file_path=self.data_ingestion_config.training_file_path,
                test_file_path=self.data_ingestion_config.test_file_path,
            )
            return dataingestionartifact
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e


In [10]:
import os 
# Move the working directory one level up from wherever it currently is.
# NOTE(review): the path is relative, so re-running this cell moves up one more
# level each time - run it only once per kernel session.
os.chdir('../')
# IPython magic (not plain Python): display the working directory after the change.
%pwd

'd:\\pythonProjects\\NetworkSecurity'

In [12]:
# pipeline 
# Driver for the data-ingestion stage: build the run configuration, create the
# component, run it and print the resulting artifact.
STAGE_NAME='DATA INGESTION STAGE'
try:
    logging.info(f'{STAGE_NAME} started')
    # Configuration objects: run-level first, then the stage-level one built on it.
    trainingpipelineconfig=TrainingPipelineConfig()
    dataingestionconfig=DataIngestionConfig(trainingpipelineconfig)
    dataingestion=DataIngestion(dataingestionconfig)
    logging.info('INITIATED DATA INGESTION COMPONENT')
    dataingestionartifact=dataingestion.initiate_data_ingestion()
    print(dataingestionartifact)
    logging.info(f'{STAGE_NAME} completed')
except Exception as e:
    # Chain explicitly so the traceback marks the original error as the direct
    # cause of the wrapped exception.
    raise NetworkSecurityException(e,sys) from e


DataIngestionArtifact(trained_file_path='Artifacts\\05_03_2025_18_30_20\\data_ingestion\\ingested\\train.csv', test_file_path='Artifacts\\05_03_2025_18_30_20\\data_ingestion\\ingested\\test.csv')
