In [1]:
import os
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [2]:
# Walk through "data" folder, printing directories and files 
# to use their names for file paths

for root, dirs, files in os.walk("data"):
    print("Current Directory:", root)
    print()
    print("Subdirectories:", dirs)
    print()
    print("Files:", files)

Current Directory: data

Subdirectories: ['.ipynb_checkpoints', 'Processed', 'Raw']

Files: ['customer_behavior.ipynb']
Current Directory: data\.ipynb_checkpoints

Subdirectories: []

Files: ['customer_behavior-checkpoint.ipynb']
Current Directory: data\Processed

Subdirectories: ['.ipynb_checkpoints', 'customer_segmentation']

Files: []
Current Directory: data\Processed\.ipynb_checkpoints

Subdirectories: []

Files: []
Current Directory: data\Processed\customer_segmentation

Subdirectories: []

Files: ['customer_segmentation.csv']
Current Directory: data\Raw

Subdirectories: []

Files: ['E-commerce Customer Behavior - Sheet1.csv']


In [3]:
main_dir = 'data/raw/'
file_name = 'E-commerce Customer Behavior - Sheet1.csv'
file_path = os.path.join(main_dir, file_name)

In [4]:
df = pd.read_csv(file_path, index_col='Customer ID')

In [5]:
df.head()

Unnamed: 0_level_0,Gender,Age,City,Membership Type,Total Spend,Items Purchased,Average Rating,Discount Applied,Days Since Last Purchase,Satisfaction Level
Customer ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
101,Female,29,New York,Gold,1120.2,14,4.6,True,25,Satisfied
102,Male,34,Los Angeles,Silver,780.5,11,4.1,False,18,Neutral
103,Female,43,Chicago,Bronze,510.75,9,3.4,True,42,Unsatisfied
104,Male,30,San Francisco,Gold,1480.3,19,4.7,False,12,Satisfied
105,Male,27,Miami,Silver,720.4,13,4.0,True,55,Unsatisfied


In [6]:
# df.info()

# process categorical data

In [7]:
categorical_columns = ['Discount Applied', 'Gender_Female', 'Gender_Male', 'City_Chicago', 'City_Houston', 
    'City_Los Angeles', 'City_Miami', 'City_New York', 'City_San Francisco', 
    'Membership Type_Bronze', 'Membership Type_Gold', 'Membership Type_Silver']

In [8]:
# Label Encoding (for ordinal data like Satisfaction Level)
label_encoder = LabelEncoder()
df['Satisfaction Level'] = label_encoder.fit_transform(df['Satisfaction Level'])

# One-Hot Encoding (for nominal data like Gender, City, Membership Type)
df = pd.get_dummies(df, columns=['Gender', 'City', 'Membership Type'])

# Convert the one-hot encoded columns to integers
df[categorical_columns] = df[categorical_columns].astype(int)

In [9]:
df

Unnamed: 0_level_0,Age,Total Spend,Items Purchased,Average Rating,Discount Applied,Days Since Last Purchase,Satisfaction Level,Gender_Female,Gender_Male,City_Chicago,City_Houston,City_Los Angeles,City_Miami,City_New York,City_San Francisco,Membership Type_Bronze,Membership Type_Gold,Membership Type_Silver
Customer ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
101,29,1120.20,14,4.6,1,25,1,1,0,0,0,0,0,1,0,0,1,0
102,34,780.50,11,4.1,0,18,0,0,1,0,0,1,0,0,0,0,0,1
103,43,510.75,9,3.4,1,42,2,1,0,1,0,0,0,0,0,1,0,0
104,30,1480.30,19,4.7,0,12,1,0,1,0,0,0,0,0,1,0,1,0
105,27,720.40,13,4.0,1,55,2,0,1,0,0,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
446,32,660.30,10,3.8,1,42,2,0,1,0,0,0,1,0,0,0,0,1
447,36,470.50,8,3.0,0,27,0,1,0,0,1,0,0,0,0,1,0,0
448,30,1190.80,16,4.5,1,28,1,1,0,0,0,0,0,1,0,0,1,0
449,34,780.20,11,4.2,0,21,0,0,1,0,0,1,0,0,0,0,0,1


# process numerical

In [10]:
numerical_columns = ['Age', 'Total Spend', 'Items Purchased', 'Average Rating', 
                     'Days Since Last Purchase', 'Satisfaction Level']

In [11]:
# Apply Min-Max Normalization
scaler = MinMaxScaler()

# Normalize the numerical columns
final_df = pd.DataFrame(scaler.fit_transform(df), columns=df.columns, index=df.index)

In [12]:
final_df

Unnamed: 0_level_0,Age,Total Spend,Items Purchased,Average Rating,Discount Applied,Days Since Last Purchase,Satisfaction Level,Gender_Female,Gender_Male,City_Chicago,City_Houston,City_Los Angeles,City_Miami,City_New York,City_San Francisco,Membership Type_Bronze,Membership Type_Gold,Membership Type_Silver
Customer ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
101,0.176471,0.639502,0.500000,0.842105,1.0,0.296296,0.333333,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
102,0.470588,0.333273,0.285714,0.578947,0.0,0.166667,0.000000,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
103,1.000000,0.090102,0.142857,0.210526,1.0,0.611111,0.666667,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
104,0.235294,0.964122,0.857143,0.894737,0.0,0.055556,0.333333,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
105,0.058824,0.279095,0.428571,0.526316,1.0,0.851852,0.666667,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
446,0.352941,0.224917,0.214286,0.421053,1.0,0.611111,0.666667,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
447,0.588235,0.053818,0.071429,0.000000,0.0,0.333333,0.000000,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
448,0.235294,0.703146,0.642857,0.789474,1.0,0.351852,0.333333,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
449,0.470588,0.333003,0.285714,0.631579,0.0,0.222222,0.000000,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


# save the data

In [13]:
# Directory path where files will be saved
save_dir = 'data/processed/customer_segmentation'
cleaned_file_name = 'customer_segmentation.csv'
cleaned_file_path = os.path.join(save_dir, cleaned_file_name)

In [16]:
# Save the cleaned data
final_df.to_csv(cleaned_file_path, index=True)
print(f"File saved successfully at {cleaned_file_path}.")

File saved successfully at data/processed/customer_segmentation\customer_segmentation.csv.
