In [2]:
import os

In [3]:
%pwd

'e:\\Project\\DS Project\\E to E\\ML\\Credit Utility\\research'

In [4]:
os.chdir("../")

In [5]:
%pwd

'e:\\Project\\DS Project\\E to E\\ML\\Credit Utility'

In [6]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataTransformationConfig:
    root_dir: Path
    data_path: Path
    train_data_path: Path
    test_data_path: Path 
    Processed_data_path: Path 
    Transformed_data_path: Path 
    Transformed_data_OBJ_PATH: Path 
    Processed_data_OBJ_PATH: Path 

In [7]:
from CREDIT_UTILITY.constants import *
from src.CREDIT_UTILITY.utils.common import read_yaml, create_directories,save_bin

In [8]:
class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])


    
    def get_data_transformation_config(self) -> DataTransformationConfig:
        config = self.config.data_transformation

        create_directories([config.root_dir,config.Processed_data_path,config.Transformed_data_path])

        data_transformation_config = DataTransformationConfig(
            root_dir=config.root_dir,
            data_path=config.data_path,
            train_data_path=config.train_data_path,
            test_data_path=config.test_data_path,
            Processed_data_path=config.Processed_data_path,
            Transformed_data_path=config.Transformed_data_path,
            Transformed_data_OBJ_PATH= config.Transformed_data_OBJ_PATH,
            Processed_data_OBJ_PATH= config.Processed_data_OBJ_PATH
        
        )

        return data_transformation_config

In [9]:
import os, sys
import pandas as pd
import joblib
import numpy as np
from CREDIT_UTILITY.logger import logger
from CREDIT_UTILITY.Exception import CustomException
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.cluster import KMeans

In [10]:
class Feature_Engineering(BaseEstimator, TransformerMixin):
    
    def __init__(self):
        logger.info("******************feature Engineering started******************")


    def transform_data(self,data):

        try:

            #data = pd.read_csv(self.config.data_path)
            data.drop('id', axis=1, inplace=True)
            Discrete_features_more_than_2_categories=['NumDots','SubdomainLevel','PathLevel','NumDash',
                                                      'NumDashInHostname','NumUnderscore','NumPercent',
                                                      'NumQueryComponents','NumAmpersand','NumSensitiveWords',
                                                      'SubdomainLevelRT','UrlLengthRT','PctExtResourceUrlsRT',
                                                      'AbnormalExtFormActionR','ExtMetaScriptLinkRT',
                                                      'PctExtNullSelfRedirectHyperlinksRT']
        
            for feature in Discrete_features_more_than_2_categories:
                kmeans = KMeans(n_clusters=4, random_state=42)
                cluster_labels = kmeans.fit_predict(data[[feature]])
                data[feature] = cluster_labels

            logger.info(f"Applying KNN to reduce the Cardinality")
            return(data)
        
        except Exception as e:
                error = CustomException(e, sys)
                logger.info(error.error_message)



    def fit(self,X,y=None):
        return self 
    

    def transform(self,X:pd.DataFrame,y=None):
        try:    
            transformed_df=self.transform_data(X)
            return transformed_df
        
        except Exception as e:
                 error = CustomException(e, sys)
                 logger.info(error.error_message)



In [19]:
class DataTransformation:


    def __init__(self, config: DataTransformationConfig):
        self.config = config


    def get_data_transformation_obj(self):
        
      try:  
            
            
            continuous_features=['UrlLength', 'NumNumericChars', 'HostnameLength', 'PathLength', 'QueryLength',
                                  'PctExtHyperlinks', 'PctExtResourceUrls', 'PctNullSelfRedirectHyperlinks']
            
            Discrete_features=['NumDots','SubdomainLevel','PathLevel','NumDash','NumDashInHostname','AtSymbol','TildeSymbol','NumUnderscore',
                            'NumPercent','NumQueryComponents','NumAmpersand','NumHash','NoHttps','RandomString','IpAddress',
                            'DomainInSubdomains','DomainInPaths','HttpsInHostname','DoubleSlashInPath','NumSensitiveWords','EmbeddedBrandName',
                            'ExtFavicon','InsecureForms','RelativeFormAction','ExtFormAction','AbnormalFormAction','FrequentDomainNameMismatch','FakeLinkInStatusBar',
                            'RightClickDisabled','PopUpWindow','SubmitInfoToEmail','IframeOrFrame','MissingTitle','ImagesOnlyInForm',
                            'SubdomainLevelRT','UrlLengthRT','PctExtResourceUrlsRT','AbnormalExtFormActionR','ExtMetaScriptLinkRT',
                            'PctExtNullSelfRedirectHyperlinksRT']
            
            # Numerical pipeline
            numerical_pipeline = Pipeline(steps = [
                 ('scaler', StandardScaler(with_mean=False))])

            # Categorical Pipeline
            categorical_pipeline = Pipeline(steps = [
                ('onehot', OneHotEncoder(handle_unknown = 'ignore')),
                ('scaler', StandardScaler(with_mean=False))])
            
            preprocssor = ColumnTransformer([
                ('numerical_pipeline', numerical_pipeline,continuous_features ),
                ('categorical_pipeline', categorical_pipeline,Discrete_features )])
            
            logger.info("Pipeline Steps Completed")
            return preprocssor
      
      except Exception as e:
                 error = CustomException(e, sys)
                 logger.info(error.error_message)


    def get_feature_engineering_object(self):
        try:
            feature_engineering = Pipeline(steps = [("fe",Feature_Engineering())])

            return feature_engineering 
        
        except Exception as e:
            error = CustomException(e, sys)
            logger.info(error.error_message)


    def train_test_spliting(self):
        data = pd.read_csv(self.config.data_path)
        
        train, test = train_test_split(data, test_size = 0.20, random_state= 42)
        
        train.to_csv(os.path.join(self.config.Transformed_data_path, "train.csv"),index = False)
        test.to_csv(os.path.join(self.config.Transformed_data_path, "test.csv"),index = False)

        logger.info("Splited data into training and test sets")
        logger.info(train.shape)
        logger.info(test.shape)

        print(train.shape)
        print(test.shape)
  
    def inititate_data_transformation(self):
        try:
            train = pd.read_csv(self.config.train_data_path)
            test = pd.read_csv(self.config.test_data_path)

            logger.info("Obtaining FE steps object")
            fe_obj = self.get_feature_engineering_object()

            logger.info("Applying feature engineering on train and test sets")
            train = fe_obj.fit_transform(train)
            test = fe_obj.transform(test)

            train.to_csv("train_data.csv")
            test.to_csv("test_data.csv")

            logger.info("Data Saved after feature_engineering")

            processing_obj = self.get_data_transformation_obj()

            target_column_name = "CLASS_LABEL"

            X_train = train.drop(columns=[target_column_name], axis=1)
            y_train = train[target_column_name]

            X_test = test.drop(columns=[target_column_name], axis=1)
            y_test = test[target_column_name]

            logger.info("Applying data transformation on train and test sets")
            X_train = processing_obj.fit_transform(X_train)
            X_test = processing_obj.transform(X_test)

            # Get transformed column names
            transformed_columns = []
            for name, transformer, features in processing_obj.transformers_:
                if hasattr(transformer, 'get_feature_names_out'):
                    if hasattr(transformer, 'categories_'):
                        transformed_columns.extend(transformer.get_feature_names_out())
                    else:
                        transformed_columns.extend(transformer.get_feature_names_out(features))
                else:
                    transformed_columns.extend(features)

            # Save transformed arrays with column names
            np.savetxt(os.path.join(self.config.Transformed_data_path, "X_train.csv"), X_train, delimiter=",", header=",".join(transformed_columns))
            np.savetxt(os.path.join(self.config.Transformed_data_path, "X_test.csv"), X_test, delimiter=",", header=",".join(transformed_columns))
            np.savetxt(os.path.join(self.config.Transformed_data_path, "y_train.csv"), y_train, delimiter=",", header=target_column_name)
            np.savetxt(os.path.join(self.config.Transformed_data_path, "y_test.csv"), y_test, delimiter=",", header=target_column_name)

            logger.info("Processed and Feature Engineered data")
            logger.info("X_train shape: {}".format(X_train.shape))
            logger.info("X_test shape: {}".format(X_test.shape))

            joblib.dump(processing_obj, self.config.Processed_data_OBJ_PATH)
            joblib.dump(fe_obj, self.config.Transformed_data_OBJ_PATH)

            logger.info("Successfully dumped {} and, {}".format(self.config.Processed_data_OBJ_PATH, self.config.Processed_data_OBJ_PATH))
            logger.info("Data Transformation completed")
            return (X_train, y_train), (X_test, y_test), self.config.Processed_data_OBJ_PATH

        except Exception as e:
            error = CustomException(e, sys)
            logger.info(error.error_message)




    
    

In [20]:
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransformation(config=data_transformation_config)
    data_transformation.train_test_spliting()
    data_transformation.inititate_data_transformation()
except Exception as e:
        error = CustomException(e, sys)
        logger.info(error.error_message)



[2024-03-04 07:39:11,835: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-03-04 07:39:11,846: INFO: common: yaml file: params.yaml loaded successfully]
[2024-03-04 07:39:11,867: INFO: common: yaml file: schema.yaml loaded successfully]
[2024-03-04 07:39:11,867: INFO: common: created directory at: artifacts]
[2024-03-04 07:39:11,875: INFO: common: created directory at: artifacts/data_transformation]
[2024-03-04 07:39:11,875: INFO: common: created directory at: artifacts/data_transformation/Processed]
[2024-03-04 07:39:11,875: INFO: common: created directory at: artifacts/data_transformation/Transformed]


[2024-03-04 07:39:12,319: INFO: 917735424: Splited data into training and test sets]
[2024-03-04 07:39:12,319: INFO: 917735424: (8000, 50)]
[2024-03-04 07:39:12,332: INFO: 917735424: (2000, 50)]
(8000, 50)
(2000, 50)
[2024-03-04 07:39:12,445: INFO: 917735424: Obtaining FE steps object]
[2024-03-04 07:39:12,445: INFO: 1487739946: ******************feature Engineering started******************]
[2024-03-04 07:39:12,445: INFO: 917735424: Applying feature engineering on train and test sets]
[2024-03-04 07:39:12,615: INFO: 1487739946: Applying KNN to reduce the Cardinality]


  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)


[2024-03-04 07:39:12,748: INFO: 1487739946: Applying KNN to reduce the Cardinality]
[2024-03-04 07:39:13,003: INFO: 917735424: Data Saved after feature_engineering]
[2024-03-04 07:39:13,003: INFO: 917735424: Pipeline Steps Completed]
[2024-03-04 07:39:13,011: INFO: 917735424: Applying data transformation on train and test sets]
[2024-03-04 07:39:13,171: INFO: 917735424: Error occurred in execution of:
        [C:\Users\neera\AppData\Local\Temp\ipykernel_24296\917735424.py] at
        try block line number: [103]
        and exception block line number : [129]
        error message: ['list' object has no attribute 'get_feature_names_out']
        ]
