In [61]:
import pandas as pd
from utils.data_preprocessor import DataPreprocessor

In [62]:
df = pd.read_csv('../data/preprocessed_data.csv')

In [63]:
df.shape

(3900, 16)

### Transformation of dataset before clustering:

1. **Label Encoding** for Ordinal Columns: Preserves the order of the categories.
2. **One-Hot Encoding** for Nominal Columns: Ensures no ordinal relationship is imposed.
3. **Normalization** of Numerical Columns: Standardizes the numerical data.

## Define ordered, categorical and numerical columns

In [64]:
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns.tolist()

# check if the length of categorical and numerical columns is equal to the total number of columns
len(categorical_cols) + len(numerical_cols) == df.shape[1]

True

In [65]:
# look for some ordered categorical columns
for col in categorical_cols:
    print('-' * 50)
    print(col)
    print()
    print(df[col].value_counts())

--------------------------------------------------
Gender

Male      2652
Female    1248
Name: Gender, dtype: int64
--------------------------------------------------
Category

Clothing       1737
Accessories    1240
Footwear        599
Outerwear       324
Name: Category, dtype: int64
--------------------------------------------------
Location

Montana           96
California        95
Idaho             93
Illinois          92
Alabama           89
Minnesota         88
Nebraska          87
New York          87
Nevada            87
Maryland          86
Delaware          86
Vermont           85
Louisiana         84
North Dakota      83
Missouri          81
West Virginia     81
New Mexico        81
Mississippi       80
Indiana           79
Georgia           79
Kentucky          79
Arkansas          79
North Carolina    78
Connecticut       78
Virginia          77
Ohio              77
Tennessee         77
Texas             77
Maine             77
South Carolina    76
Colorado          75
Ok

In [66]:
# declare 2 ordered categorical columns which need to be handled
ordered_categorical_cols = ['Size', 'Frequency of Purchases']

# remove ordered categorical columns from the list of categorical columns
categorical_cols = [col for col in categorical_cols if col not in ordered_categorical_cols]

In [67]:
cols = ordered_categorical_cols + categorical_cols + numerical_cols

In [68]:
# change the order of columns to have numerical first and categorical last
df = df[cols]

## Transform variables

### Transform ordered columns - Size

In [71]:
ordered_categorical_cols

['Size', 'Frequency of Purchases']

In [69]:
df['Size'].value_counts()

M     1755
L     1053
S      663
XL     429
Name: Size, dtype: int64

In [70]:
# Define the order of the sizes
size_order = ['S', 'M', 'L', 'XL']

# Create a dictionary to map each size to an integer
size_mapping = {size: index + 1 for index, size in enumerate(size_order)}

# Map the size column using the defined order
df['Size'] = df['Size'].map(size_mapping)

### Transform ordered columns - Frequency of Purchases

We can observe that there are 2 values which means the same: Quarterly and Every 3 Months, we combine them to 1 category

In [73]:
df['Frequency of Purchases'].value_counts()

Every 3 Months    584
Annually          572
Quarterly         563
Monthly           553
Bi-Weekly         547
Fortnightly       542
Weekly            539
Name: Frequency of Purchases, dtype: int64

In [74]:
frequency_mapping = {
    'Weekly': 1,
    'Fortnightly': 2,
    'Bi-Weekly': 3,
    'Monthly': 4,
    'Quarterly': 5,             # combine 'Quarterly' and 'Every 3 Months'       
    'Every 3 Months': 5,        # combine 'Quarterly' and 'Every 3 Months'
    'Annually': 6
}

df['Frequency of Purchases'] = df['Frequency of Purchases'].map(frequency_mapping)

### Transform categorical columns

In [79]:
df

Unnamed: 0,Size,Frequency of Purchases,Gender,Category,Location,Color,Season,Subscription Status,Payment Method,Shipping Type,Discount Applied,Preferred Payment Method,Age,Purchase Amount (USD),Review Rating,Previous Purchases
0,3,2,Male,Clothing,Kentucky,Gray,Winter,Yes,Credit Card,Express,Yes,Venmo,55,53,3.1,14
1,3,2,Male,Clothing,Maine,Maroon,Winter,Yes,Bank Transfer,Express,Yes,Cash,19,64,3.1,2
2,1,1,Male,Clothing,Massachusetts,Maroon,Spring,Yes,Cash,Free Shipping,Yes,Credit Card,50,73,3.1,23
3,2,1,Male,Footwear,Rhode Island,Maroon,Spring,Yes,PayPal,Next Day Air,Yes,PayPal,21,90,3.5,49
4,2,6,Male,Clothing,Oregon,Turquoise,Spring,Yes,Cash,Free Shipping,Yes,PayPal,45,49,2.7,31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3895,3,1,Female,Clothing,Virginia,Turquoise,Summer,No,Cash,2-Day Shipping,No,Venmo,40,28,4.2,32
3896,3,3,Female,Accessories,Iowa,White,Spring,No,PayPal,Store Pickup,No,Bank Transfer,52,49,4.5,41
3897,3,5,Female,Accessories,New Jersey,Green,Spring,No,Credit Card,Standard,No,Venmo,46,33,2.9,24
3898,1,1,Female,Footwear,Minnesota,Brown,Summer,No,PayPal,Express,No,Venmo,44,77,3.8,24


In [11]:
preprocessor = DataPreprocessor(df)

In [12]:
preprocessor.one_hot_encode(columns=['Location'])
one_hot = preprocessor.get_preprocessed_data()
one_hot

Unnamed: 0,Age,Purchase Amount (USD),Review Rating,Previous Purchases,Gender,Category,Size,Color,Season,Subscription Status,...,Location_South Dakota,Location_Tennessee,Location_Texas,Location_Utah,Location_Vermont,Location_Virginia,Location_Washington,Location_West Virginia,Location_Wisconsin,Location_Wyoming
0,55,53,3.1,14,Male,Clothing,L,Gray,Winter,Yes,...,0,0,0,0,0,0,0,0,0,0
1,19,64,3.1,2,Male,Clothing,L,Maroon,Winter,Yes,...,0,0,0,0,0,0,0,0,0,0
2,50,73,3.1,23,Male,Clothing,S,Maroon,Spring,Yes,...,0,0,0,0,0,0,0,0,0,0
3,21,90,3.5,49,Male,Footwear,M,Maroon,Spring,Yes,...,0,0,0,0,0,0,0,0,0,0
4,45,49,2.7,31,Male,Clothing,M,Turquoise,Spring,Yes,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3895,40,28,4.2,32,Female,Clothing,L,Turquoise,Summer,No,...,0,0,0,0,0,1,0,0,0,0
3896,52,49,4.5,41,Female,Accessories,L,White,Spring,No,...,0,0,0,0,0,0,0,0,0,0
3897,46,33,2.9,24,Female,Accessories,L,Green,Spring,No,...,0,0,0,0,0,0,0,0,0,0
3898,44,77,3.8,24,Female,Footwear,S,Brown,Summer,No,...,0,0,0,0,0,0,0,0,0,0


In [9]:
data = preprocessor.get_preprocessed_data()
data

Unnamed: 0,Age,Gender,Category,Purchase Amount (USD),Location,Size,Color,Season,Review Rating,Subscription Status,Payment Method,Shipping Type,Discount Applied,Previous Purchases,Preferred Payment Method,Frequency of Purchases
0,55,1,1,53,16,0,7,3,3.1,1,2,1,1,14,5,3
1,19,1,1,64,18,0,12,3,3.1,1,0,1,1,2,1,3
2,50,1,1,73,20,2,12,1,3.1,1,1,2,1,23,2,6
3,21,1,2,90,38,1,12,1,3.5,1,4,3,1,49,4,6
4,45,1,1,49,36,1,21,1,2.7,1,1,2,1,31,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3895,40,0,1,28,45,0,21,2,4.2,0,1,0,0,32,5,6
3896,52,0,0,49,14,0,23,1,4.5,0,4,5,0,41,0,1
3897,46,0,0,33,29,0,8,1,2.9,0,2,4,0,24,5,5
3898,44,0,2,77,22,2,3,2,3.8,0,4,1,0,24,5,6
