In [1]:
import pandas as pd
from utils.data_preprocessor import DataPreprocessor

In [2]:
# Read the preprocessed dataset produced upstream; the path is relative to the notebook's directory.
preprocessed_csv_path = '../data/preprocessed_data.csv'
df = pd.read_csv(preprocessed_csv_path)

In [3]:
# The two ordered (ordinal) categorical columns; they are encoded separately
# from the remaining categorical columns.
ordered_categorical_cols = ['Size', 'Frequency of Purchases']

# Every object-dtype column except the ordered ones above is treated as a
# plain categorical column (original column order is kept).
object_cols = df.select_dtypes(include=['object']).columns.tolist()
categorical_cols = [name for name in object_cols if name not in ordered_categorical_cols]

# Numerical columns are those stored as float64 or int64.
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns.tolist()

# Rearrange the frame: ordered categoricals first, then the other
# categoricals, then the numerical columns. Columns of any other dtype are
# not selected and therefore dropped here.
cols = [*ordered_categorical_cols, *categorical_cols, *numerical_cols]
df = df.loc[:, cols]

In [4]:
# Ordinal encoding of the two ordered categorical columns.

# Sizes listed from smallest to largest; the position in this list becomes
# the integer code (S=0, M=1, L=2, XL=3).
# Original author's note: the mapping here differs from the one used in
# clustering_cleaning.
size_order = ['S', 'M', 'L', 'XL']
size_mapping = {size: index for index, size in enumerate(size_order)}

# Purchase-frequency codes: the larger the code, the more frequent the purchases.
# NOTE(review): 'Fortnightly' and 'Bi-Weekly' receive distinct codes -- confirm
# they are meant to be different cadences.
frequency_mapping = {
    'Weekly': 6,
    'Fortnightly': 5,
    'Bi-Weekly': 4,
    'Monthly': 3,
    'Quarterly': 2,             # 'Quarterly' and 'Every 3 Months' share one code
    'Every 3 Months': 2,        # 'Quarterly' and 'Every 3 Months' share one code
    'Annually': 1
}

for column, mapping in (('Size', size_mapping),
                        ('Frequency of Purchases', frequency_mapping)):
    # Series.map silently turns every value that is not a key of the mapping
    # into NaN. That would hide unexpected labels, and re-running this cell on
    # already-encoded data would wipe the whole column. Fail loudly instead;
    # values that are already missing are left alone, exactly as before.
    unmapped = set(df[column].dropna().unique()) - set(mapping)
    if unmapped:
        raise ValueError(
            f"Column {column!r} contains values with no ordinal code: "
            f"{sorted(unmapped, key=str)}. Reload the data before re-running "
            f"this cell, or extend the mapping."
        )
    df[column] = df[column].map(mapping)

In [5]:
# Run the project's DataPreprocessor over the reordered frame.
# The steps are applied in sequence on the same preprocessor object.
preprocessor = DataPreprocessor(df)
# Encode the remaining (non-ordered) categorical columns.
# NOTE(review): the displayed output shows integer codes for these columns,
# so this looks like label encoding -- confirm in utils.data_preprocessor.
preprocessor.encode_categorical(columns = categorical_cols)
# Normalize the numerical columns.
# NOTE(review): the displayed values lie between 0 and 1, so this is
# presumably min-max scaling -- confirm in utils.data_preprocessor.
preprocessor.normalization(columns = numerical_cols)
# Retrieve the processed result and display it as the cell output.
pca_cleaned_data = preprocessor.get_preprocessed_data()
pca_cleaned_data

Unnamed: 0,Size,Frequency of Purchases,Gender,Category,Location,Color,Season,Subscription Status,Payment Method,Shipping Type,Discount Applied,Preferred Payment Method,Age,Purchase Amount (USD),Review Rating,Previous Purchases
0,2,5,1,1,16,7,3,1,2,1,1,5,0.711538,0.4125,0.24,0.265306
1,2,5,1,1,18,12,3,1,0,1,1,1,0.019231,0.5500,0.24,0.020408
2,0,6,1,1,20,12,1,1,1,2,1,2,0.615385,0.6625,0.24,0.448980
3,1,6,1,2,38,12,1,1,4,3,1,4,0.057692,0.8750,0.40,0.979592
4,1,1,1,1,36,21,1,1,1,2,1,4,0.519231,0.3625,0.08,0.612245
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3895,2,6,0,1,45,21,2,0,1,0,0,5,0.423077,0.1000,0.68,0.632653
3896,2,4,0,0,14,23,1,0,4,5,0,0,0.653846,0.3625,0.80,0.816327
3897,2,2,0,0,29,8,1,0,2,4,0,5,0.538462,0.1625,0.16,0.469388
3898,0,6,0,2,22,3,2,0,4,1,0,5,0.500000,0.7125,0.52,0.469388


In [6]:
# Persist the encoded and normalized frame for the PCA step; the row index is
# not written to the file.
pca_output_path = '../data/pca_cleaned_data.csv'
pca_cleaned_data.to_csv(pca_output_path, index=False)