In [4]:
import sys
from dataclasses import dataclass

import numpy as np 
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder,StandardScaler


import os


@dataclass
class DataTransformationConfig:
    preprocessor_obj_file_path=os.path.join('artifacts','preprocessor.pkl')


            logging.info('Data Transformation initiated')
            # Define which columns should be ordinal-encoded and which should be scaled
            categorical_cols = ['cut', 'color','clarity']
            numerical_cols = ['carat', 'depth','table', 'x', 'y', 'z']
            
            # Define the custom ranking for each ordinal variable
            cut_categories = ['Fair', 'Good', 'Very Good','Premium','Ideal']
            color_categories = ['D', 'E', 'F', 'G', 'H', 'I', 'J']
            clarity_categories = ['I1','SI2','SI1','VS2','VS1','VVS2','VVS1','IF']
            
            logging.info('Pipeline Initiated')

            ## Numerical Pipeline
            num_pipeline=Pipeline(
                steps=[
                ('imputer',SimpleImputer(strategy='median')),
                ('scaler',StandardScaler())

                ]

            )

            # Categorigal Pipeline
            cat_pipeline=Pipeline(
                steps=[
                ('imputer',SimpleImputer(strategy='most_frequent')),
                ('ordinalencoder',OrdinalEncoder(categories=[cut_categories,color_categories,clarity_categories])),
                ('scaler',StandardScaler())
                ]

            )

            preprocessor=ColumnTransformer([
            ('num_pipeline',num_pipeline,numerical_cols),
            ('cat_pipeline',cat_pipeline,categorical_cols)
            ])
            
            return preprocessor

            logging.info('Pipeline Completed')

        except Exception as e:
            logging.info("Error in Data Trnasformation")
            raise CustomException(e,sys)
        
    def initaite_data_transformation(self,train_path,test_path):
        try:
            # Reading train and test data
            train_df = pd.read_csv(train_path)
            test_df = pd.read_csv(test_path)

            logging.info('Read train and test data completed')
            logging.info(f'Train Dataframe Head : \n{train_df.head().to_string()}')
            logging.info(f'Test Dataframe Head  : \n{test_df.head().to_string()}')

            logging.info('Obtaining preprocessing object')

            preprocessing_obj = self.get_data_transformation_object()

            target_column_name = 'price'
            drop_columns = [target_column_name,'id']

            input_feature_train_df = train_df.drop(columns=drop_columns,axis=1)
            target_feature_train_df=train_df[target_column_name]

            input_feature_test_df=test_df.drop(columns=drop_columns,axis=1)
            target_feature_test_df=test_df[target_column_name]
            
            ## Trnasformating using preprocessor obj
            input_feature_train_arr=preprocessing_obj.fit_transform(input_feature_train_df)
            input_feature_test_arr=preprocessing_obj.transform(input_feature_test_df)

            logging.info("Applying preprocessing object on training and testing datasets.")
            

            train_arr = np.c_[input_feature_train_arr, np.array(target_feature_train_df)]
            test_arr = np.c_[input_feature_test_arr, np.array(target_feature_test_df)]

            save_object(

                file_path=self.data_transformation_config.preprocessor_obj_file_path,
                obj=preprocessing_obj

            )
            logging.info('Preprocessor pickle file saved')

            return (
                train_arr,
                test_arr,
                self.data_transformation_config.preprocessor_obj_file_path,
            )
            
        except Exception as e:
            logging.info("Exception occured in the initiate_datatransformation")

            raise CustomException(e,sys)

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 61)

In [8]:
df

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969
...,...,...,...,...,...,...,...,...,...,...,...,...
550063,1006033,P00372445,M,51-55,13,B,1,1,20,,,368
550064,1006035,P00375436,F,26-35,1,C,3,0,20,,,371
550065,1006036,P00375436,F,26-35,15,B,4+,1,20,,,137
550066,1006038,P00375436,F,55+,1,C,2,0,20,,,365


In [12]:
import pandas as pd
df=pd.read_csv('train.csv')
for i in column:
    t=set(df[i].unique())
    print(i,t)


Product_ID {'P00270142', 'P00355742', 'P00018242', 'P00219842', 'P00170042', 'P00345642', 'P00023842', 'P00259242', 'P00130342', 'P00202942', 'P00025242', 'P00011242', 'P00306142', 'P00323642', 'P00260242', 'P00239842', 'P00356642', 'P00287142', 'P00171742', 'P00286742', 'P00224142', 'P00006842', 'P00125542', 'P00169342', 'P00051142', 'P00142942', 'P00167742', 'P00023742', 'P00047042', 'P00293042', 'P00060942', 'P00079342', 'P00132842', 'P00055042', 'P00169842', 'P00100442', 'P00278242', 'P00299542', 'P00169142', 'P00364542', 'P00070442', 'P00221642', 'P00349842', 'P00070742', 'P00067942', 'P00319242', 'P00149342', 'P00128942', 'P00076142', 'P00037742', 'P00342642', 'P00216342', 'P00221842', 'P00249542', 'P00084442', 'P00056342', 'P00261042', 'P00361142', 'P00312042', 'P00226642', 'P00112242', 'P00028142', 'P00202442', 'P00200942', 'P00145842', 'P00176042', 'P00259942', 'P00241042', 'P00363942', 'P00175142', 'P00167142', 'P00115242', 'P00341642', 'P00191742', 'P00156642', 'P00168842', 

In [15]:
len(set(df['Product_ID'].unique()))

3631

In [17]:
df['Product_ID']

0         P00069042
1         P00248942
2         P00087842
3         P00085442
4         P00285442
            ...    
550063    P00372445
550064    P00375436
550065    P00375436
550066    P00375436
550067    P00371644
Name: Product_ID, Length: 550068, dtype: object

In [16]:
len(df)

550068

In [11]:
column=['Product_ID','Gender','Age','Occupation','City_Category','Stay_In_Current_City_Years','Marital_Status','Product_Category_1',	'Product_Category_2','Product_Category_3']

In [9]:
import pandas as pd
import logging
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer

# Read the dataset
data = pd.read_csv('train.csv')

# Define the categorical and numerical columns
categorical_cols = ['Gender', 'Age', 'Occupation', 'City_Category', 'Stay_In_Current_City_Years', 'Marital_Status', 'Product_Category_1', 'Product_Category_2', 'Product_Category_3']
numerical_cols = ['Purchase']

# Define the custom ranking for each ordinal variable
age_categories = ['0-17', '18-25', '26-35', '36-45', '46-50', '51-55', '55+']
occupation_categories = list(range(21))
city_categories = ['A', 'B', 'C']
stay_years_categories = ['0', '1', '2', '3', '4+']
marital_status_categories = [0, 1]
product_categories = list(range(1, 21))
product_categories.append(None)  # Adding 'nan' as a category

logging.info('Pipeline Initiated')

# Numerical Pipeline
num_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ]
)

# Categorical Pipeline
cat_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('ordinalencoder', OrdinalEncoder(categories=[['M', 'F'], age_categories, occupation_categories, city_categories, stay_years_categories, marital_status_categories, product_categories, product_categories, product_categories])),
        ('scaler', StandardScaler())
    ]
)

preprocessor = ColumnTransformer([
    ('num_pipeline', num_pipeline, numerical_cols),
    ('cat_pipeline', cat_pipeline, categorical_cols)
])

# Apply the preprocessor to the dataset
processed_data = preprocessor.fit_transform(data)

# You can now use the processed_data for further analysis or modeling


In [10]:
processed_data

array([[-0.1779729 ,  1.74651251, -1.84424754, ..., -0.61080871,
        -0.29360542,  0.36879231],
       [ 1.1817558 ,  1.74651251, -1.84424754, ..., -1.11891197,
        -0.75936196, -0.36117636],
       [-1.56119326,  1.74651251, -1.84424754, ...,  1.67565594,
        -0.29360542,  0.36879231],
       ...,
       [-1.81701338,  1.74651251, -0.36673935, ...,  3.70806897,
        -0.29360542,  0.36879231],
       [-1.77162273,  1.74651251,  2.58827703, ...,  3.70806897,
        -0.29360542,  0.36879231],
       [-1.7467375 ,  1.74651251,  1.11076884, ...,  3.70806897,
        -0.29360542,  0.36879231]])

In [None]:
target_column_name = 'Purchase'
drop_columns = [target_column_name,'User_ID']

input_feature_train_df = train_df.drop(columns=drop_columns,axis=1)

target_feature_train_df=train_df[target_column_name]

input_feature_test_df=test_df.drop(columns=drop_columns,axis=1)
target_feature_test_df=test_df[target_column_name]

## Trnasformating using preprocessor obj
input_feature_train_arr=preprocessing_obj.fit_transform(input_feature_train_df)
input_feature_test_arr=preprocessing_obj.transform(input_feature_test_df)