In [2]:
import os

In [3]:
%pwd

'e:\\Project\\DS Project\\E to E\\ML\\Credit Utility\\research'

In [4]:
# Move from the notebook's "research" folder up to the project root so the
# relative config/artifact paths used by later cells resolve.
# Guarded so that re-running this cell does not climb one level further each time.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [5]:
%pwd

'e:\\Project\\DS Project\\E to E\\ML\\Credit Utility'

In [6]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable set of filesystem locations used by the data-transformation stage.

    Built by ``ConfigurationManager.get_data_transformation_config`` from the
    ``data_transformation`` section of the YAML config. Values are passed
    through as read from that config; no conversion to ``Path`` happens here.
    Field order defines the positional constructor signature.
    """

    # Stage directory; created by ConfigurationManager before this object is built.
    root_dir: Path
    # CSV read by DataTransformation.train_test_spliting as the full dataset.
    data_path: Path
    # CSV read by DataTransformation.inititate_data_transformation as the train split.
    train_data_path: Path
    # CSV read by DataTransformation.inititate_data_transformation as the test split.
    test_data_path: Path 
    # Directory created by ConfigurationManager (not written to by the code in this notebook).
    Processed_data_path: Path 
    # Directory that receives train.csv / test.csv from DataTransformation.
    Transformed_data_path: Path 
    # joblib dump target for the feature-engineering pipeline object.
    Transformed_data_OBJ_PATH: Path 
    # joblib dump target for the ColumnTransformer preprocessing object.
    Processed_data_OBJ_PATH: Path 

In [7]:
from CREDIT_UTILITY.constants import *
from src.CREDIT_UTILITY.utils.common import read_yaml, create_directories,save_bin

In [8]:
class ConfigurationManager:
    """Reads the project's YAML files and builds per-stage configuration objects.

    Constructing an instance reads the config, params and schema files (in that
    order) and creates the top-level artifacts directory.
    """

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):
        """Load the three YAML files and create ``config.artifacts_root``.

        Args:
            config_filepath: location of the main config YAML.
            params_filepath: location of the params YAML.
            schema_filepath: location of the schema YAML.
        """
        yaml_sources = (
            ("config", config_filepath),
            ("params", params_filepath),
            ("schema", schema_filepath),
        )
        # Same read order as attribute order: config, then params, then schema.
        for attribute_name, yaml_path in yaml_sources:
            setattr(self, attribute_name, read_yaml(yaml_path))

        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Create the stage directories and return the data-transformation settings.

        Returns:
            DataTransformationConfig populated from ``config.data_transformation``.
        """
        section = self.config.data_transformation

        stage_directories = [
            section.root_dir,
            section.Processed_data_path,
            section.Transformed_data_path,
        ]
        create_directories(stage_directories)

        # Every dataclass field is copied verbatim from the same-named config key.
        field_names = (
            "root_dir",
            "data_path",
            "train_data_path",
            "test_data_path",
            "Processed_data_path",
            "Transformed_data_path",
            "Transformed_data_OBJ_PATH",
            "Processed_data_OBJ_PATH",
        )
        return DataTransformationConfig(
            **{name: getattr(section, name) for name in field_names}
        )

In [9]:
import os, sys
import pandas as pd
import joblib
import numpy as np
from CREDIT_UTILITY.logger import logger
from CREDIT_UTILITY.Exception import CustomException
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.cluster import KMeans

In [10]:
class Feature_Engineering(BaseEstimator, TransformerMixin):
    """Replace multi-valued discrete features with per-feature KMeans cluster labels.

    ``fit`` learns one KMeans model per listed feature; ``transform`` drops the
    ``id`` column (when present) and replaces each listed feature with the
    cluster label predicted by the model learned in ``fit``. Learning the
    clusters once means train data, test data and later inference data all
    share the same label assignment, and the fitted models are persisted when
    the object is dumped.

    Args:
        n_clusters: upper bound on the number of clusters per feature.
        random_state: seed forwarded to every KMeans model.
    """

    # Discrete columns that take more than two distinct values in the dataset.
    DISCRETE_FEATURES_MORE_THAN_2_CATEGORIES = [
        'NumDots', 'SubdomainLevel', 'PathLevel', 'NumDash',
        'NumDashInHostname', 'NumUnderscore', 'NumPercent',
        'NumQueryComponents', 'NumAmpersand', 'NumSensitiveWords',
        'SubdomainLevelRT', 'UrlLengthRT', 'PctExtResourceUrlsRT',
        'AbnormalExtFormActionR', 'ExtMetaScriptLinkRT',
        'PctExtNullSelfRedirectHyperlinksRT',
    ]

    def __init__(self, n_clusters=4, random_state=42):
        logger.info("******************feature Engineering started******************")
        # Stored under the parameter names so BaseEstimator.get_params/clone work.
        self.n_clusters = n_clusters
        self.random_state = random_state

    @staticmethod
    def _log_failure(exc):
        """Log ``exc`` through the project's CustomException formatting."""
        error = CustomException(exc, sys)
        logger.info(error.error_message)

    def _fit_models(self, X):
        """Fit and store one KMeans model per discrete feature of ``X``."""
        models = {}
        for feature in self.DISCRETE_FEATURES_MORE_THAN_2_CATEGORIES:
            # Never ask for more clusters than there are distinct values;
            # doing so makes KMeans emit a ConvergenceWarning per feature.
            n_clusters = min(self.n_clusters, X[feature].nunique())
            models[feature] = KMeans(
                n_clusters=n_clusters, random_state=self.random_state
            ).fit(X[[feature]])
        self.kmeans_models_ = models

    def fit(self, X, y=None):
        """Learn the per-feature cluster models from ``X``.

        Args:
            X: DataFrame containing every column in
                ``DISCRETE_FEATURES_MORE_THAN_2_CATEGORIES``.
            y: ignored; present for scikit-learn API compatibility.

        Returns:
            self

        Raises:
            Exception: any failure is logged and then re-raised.
        """
        try:
            self._fit_models(X)
            return self
        except Exception as e:
            self._log_failure(e)
            raise

    def transform_data(self, data):
        """Return a copy of ``data`` without ``id`` and with clustered discrete features.

        The input frame is not modified. If the transformer has not been fitted
        yet, the cluster models are learned from ``data`` first so that
        stateless callers of the previous implementation keep working.

        Args:
            data: DataFrame containing the discrete feature columns.

        Returns:
            A new DataFrame with cluster labels in place of the original values.

        Raises:
            Exception: any failure is logged and then re-raised instead of
                silently returning ``None``.
        """
        try:
            if not hasattr(self, "kmeans_models_"):
                logger.info("Feature_Engineering used before fit; fitting KMeans on the data being transformed")
                self._fit_models(data)

            # drop() returns a new frame, so the caller's DataFrame stays intact.
            transformed = data.drop(columns='id', errors='ignore')

            for feature, model in self.kmeans_models_.items():
                transformed[feature] = model.predict(transformed[[feature]])

            logger.info("Applying KMeans to reduce the Cardinality")
            return transformed

        except Exception as e:
            self._log_failure(e)
            raise

    def transform(self, X: pd.DataFrame, y=None):
        """scikit-learn entry point; delegates to ``transform_data``."""
        return self.transform_data(X)



In [36]:
class DataTransformation:
    """Runs the data-transformation stage: split, feature engineering, preprocessing.

    All input and output locations come from the ``DataTransformationConfig``
    passed to the constructor.
    """

    def __init__(self, config: DataTransformationConfig):
        self.config = config

    @staticmethod
    def _log_failure(exc):
        """Log ``exc`` through the project's CustomException formatting."""
        error = CustomException(exc, sys)
        logger.info(error.error_message)

    @staticmethod
    def _to_dense(features):
        """Return ``features`` as a dense array.

        ColumnTransformer may return a scipy sparse matrix when the one-hot
        output is sparse enough; ``np.c_`` needs dense input.
        """
        return features.toarray() if hasattr(features, "toarray") else features

    def get_data_transformation_obj(self):
        """Build the unfitted ColumnTransformer used for preprocessing.

        Continuous columns are scaled; discrete columns are one-hot encoded and
        then scaled (``with_mean=False`` in both cases).

        Returns:
            An unfitted ``ColumnTransformer``.

        Raises:
            Exception: any failure is logged and then re-raised.
        """
        try:
            continuous_features = ['UrlLength', 'NumNumericChars', 'HostnameLength', 'PathLength', 'QueryLength',
                                   'PctExtHyperlinks', 'PctExtResourceUrls', 'PctNullSelfRedirectHyperlinks']

            Discrete_features = ['NumDots', 'SubdomainLevel', 'PathLevel', 'NumDash', 'NumDashInHostname', 'AtSymbol', 'TildeSymbol', 'NumUnderscore',
                                 'NumPercent', 'NumQueryComponents', 'NumAmpersand', 'NumHash', 'NoHttps', 'RandomString', 'IpAddress',
                                 'DomainInSubdomains', 'DomainInPaths', 'HttpsInHostname', 'DoubleSlashInPath', 'NumSensitiveWords', 'EmbeddedBrandName',
                                 'ExtFavicon', 'InsecureForms', 'RelativeFormAction', 'ExtFormAction', 'AbnormalFormAction', 'FrequentDomainNameMismatch', 'FakeLinkInStatusBar',
                                 'RightClickDisabled', 'PopUpWindow', 'SubmitInfoToEmail', 'IframeOrFrame', 'MissingTitle', 'ImagesOnlyInForm',
                                 'SubdomainLevelRT', 'UrlLengthRT', 'PctExtResourceUrlsRT', 'AbnormalExtFormActionR', 'ExtMetaScriptLinkRT',
                                 'PctExtNullSelfRedirectHyperlinksRT']

            # Numerical pipeline
            numerical_pipeline = Pipeline(steps=[
                ('scaler', StandardScaler(with_mean=False))])

            # Categorical pipeline
            categorical_pipeline = Pipeline(steps=[
                ('onehot', OneHotEncoder(handle_unknown='ignore')),
                ('scaler', StandardScaler(with_mean=False))])

            preprocessor = ColumnTransformer([
                ('numerical_pipeline', numerical_pipeline, continuous_features),
                ('categorical_pipeline', categorical_pipeline, Discrete_features)])

            logger.info("Pipeline Steps Completed")
            return preprocessor

        except Exception as e:
            self._log_failure(e)
            raise

    def get_feature_engineering_object(self):
        """Return an unfitted single-step Pipeline wrapping ``Feature_Engineering``.

        Raises:
            Exception: any failure is logged and then re-raised.
        """
        try:
            return Pipeline(steps=[("fe", Feature_Engineering())])

        except Exception as e:
            self._log_failure(e)
            raise

    def train_test_spliting(self):
        """Split ``config.data_path`` 80/20 and write train.csv / test.csv.

        Both files are written into ``config.Transformed_data_path``; the
        shapes are logged and printed.
        """
        data = pd.read_csv(self.config.data_path)

        train, test = train_test_split(data, test_size=0.20, random_state=42)

        train.to_csv(os.path.join(self.config.Transformed_data_path, "train.csv"), index=False)
        test.to_csv(os.path.join(self.config.Transformed_data_path, "test.csv"), index=False)

        logger.info("Splited data into training and test sets")
        logger.info(train.shape)
        logger.info(test.shape)

        print(train.shape)
        print(test.shape)

    def inititate_data_transformation(self):
        """Feature-engineer and preprocess the train/test splits, then persist the results.

        Steps:
            1. Read ``config.train_data_path`` and ``config.test_data_path``.
            2. Fit the feature-engineering pipeline on train, apply it to both.
            3. Fit the preprocessing ColumnTransformer on train features, apply
               it to both, and append the ``CLASS_LABEL`` target as last column.
            4. Write the resulting CSVs and dump both fitted objects with joblib.

        Returns:
            Tuple ``(train_arr, test_arr, config.Processed_data_OBJ_PATH)``.

        Raises:
            Exception: any failure is logged and then re-raised instead of
                silently returning ``None``.
        """
        try:
            train = pd.read_csv(self.config.train_data_path)
            test = pd.read_csv(self.config.test_data_path)

            logger.info("Obtaining FE steps object")

            fe_obj = self.get_feature_engineering_object()

            train = fe_obj.fit_transform(train)
            test = fe_obj.transform(test)

            train.to_csv(os.path.join(self.config.Transformed_data_path, "train.csv"), index=False, mode='w')
            test.to_csv(os.path.join(self.config.Transformed_data_path, "test.csv"), index=False, mode='w')

            logger.info("Data Saved after feature_engineering")

            processing_obj = self.get_data_transformation_obj()

            target_column_name = "CLASS_LABEL"

            X_train = train.drop(columns=target_column_name)
            y_train = train[target_column_name]

            X_test = test.drop(columns=target_column_name)
            y_test = test[target_column_name]

            logger.info(X_train.columns)
            # Densify so the column-stack below also works for sparse output.
            X_train = self._to_dense(processing_obj.fit_transform(X_train))
            X_test = self._to_dense(processing_obj.transform(X_test))

            train_arr = np.c_[X_train, np.array(y_train)]
            test_arr = np.c_[X_test, np.array(y_test)]

            df_train = pd.DataFrame(train_arr)
            df_test = pd.DataFrame(test_arr)

            # NOTE(review): these writes overwrite the feature-engineered CSVs
            # saved a few lines above (same directory, same file names), while
            # config.Processed_data_path is never written to. Left unchanged
            # because later stages may read from this location - confirm.
            df_train.to_csv(os.path.join(self.config.Transformed_data_path, "train.csv"), index=False)
            df_test.to_csv(os.path.join(self.config.Transformed_data_path, "test.csv"), index=False)

            logger.info("Processed and Feature Engineered data")
            logger.info(df_train.shape)
            logger.info(df_test.shape)

            joblib.dump(processing_obj, self.config.Processed_data_OBJ_PATH)

            joblib.dump(fe_obj, self.config.Transformed_data_OBJ_PATH)

            # Report both dumped objects (the second path used to repeat the first).
            logger.info("Successfully dump {} and, {}".format(self.config.Processed_data_OBJ_PATH, self.config.Transformed_data_OBJ_PATH))
            logger.info("Data Transformation completed")
            return (train_arr,
                    test_arr,
                    self.config.Processed_data_OBJ_PATH)

        except Exception as e:
            self._log_failure(e)
            raise

                   
    
    
    

In [37]:
# Run the data-transformation stage end to end: load config, split, transform.
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransformation(config=data_transformation_config)
    data_transformation.train_test_spliting()
    data_transformation.inititate_data_transformation()
except Exception as e:
    error = CustomException(e, sys)
    logger.info(error.error_message)
    # Re-raise so a failed stage stops "Run All" instead of looking like a
    # successful cell with only a log line.
    raise


[2024-02-27 17:50:35,256: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-02-27 17:50:35,256: INFO: common: yaml file: params.yaml loaded successfully]
[2024-02-27 17:50:35,276: INFO: common: yaml file: schema.yaml loaded successfully]
[2024-02-27 17:50:35,284: INFO: common: created directory at: artifacts]
[2024-02-27 17:50:35,284: INFO: common: created directory at: artifacts/data_transformation]
[2024-02-27 17:50:35,284: INFO: common: created directory at: artifacts/data_transformation/Processed]
[2024-02-27 17:50:35,292: INFO: common: created directory at: artifacts/data_transformation/Transformed]
[2024-02-27 17:50:35,585: INFO: 2953761869: Splited data into training and test sets]
[2024-02-27 17:50:35,585: INFO: 2953761869: (8000, 50)]
[2024-02-27 17:50:35,593: INFO: 2953761869: (2000, 50)]
(8000, 50)
(2000, 50)
[2024-02-27 17:50:35,702: INFO: 2953761869: Obtaining FE steps object]
[2024-02-27 17:50:35,702: INFO: 1487739946: ******************feature Engine

  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)


[2024-02-27 17:50:36,197: INFO: 2953761869: Data Saved after feature_engineering]
[2024-02-27 17:50:36,197: INFO: 2953761869: Pipeline Steps Completed]
[2024-02-27 17:50:36,207: INFO: 2953761869: Index(['NumDots', 'SubdomainLevel', 'PathLevel', 'UrlLength', 'NumDash',
       'NumDashInHostname', 'AtSymbol', 'TildeSymbol', 'NumUnderscore',
       'NumPercent', 'NumQueryComponents', 'NumAmpersand', 'NumHash',
       'NumNumericChars', 'NoHttps', 'RandomString', 'IpAddress',
       'DomainInSubdomains', 'DomainInPaths', 'HttpsInHostname',
       'HostnameLength', 'PathLength', 'QueryLength', 'DoubleSlashInPath',
       'NumSensitiveWords', 'EmbeddedBrandName', 'PctExtHyperlinks',
       'PctExtResourceUrls', 'ExtFavicon', 'InsecureForms',
       'RelativeFormAction', 'ExtFormAction', 'AbnormalFormAction',
       'PctNullSelfRedirectHyperlinks', 'FrequentDomainNameMismatch',
       'FakeLinkInStatusBar', 'RightClickDisabled', 'PopUpWindow',
       'SubmitInfoToEmail', 'IframeOrFrame', 'Mis