## Imports

In [1]:
# Libraries
import pandas as pd
import numpy as np
import os
import hashlib

In [17]:
# Input files
# Root folder of the project; every data file below is resolved relative to it.
path = os.path.expanduser('~/Desktop/CareerFoundry/2.4/4.10.')

# Orders table
df_orders = pd.read_csv(
    filepath_or_buffer=os.path.join(path, '02 Data', 'Original Data', 'orders.csv')
)

# Customers table
df_customers = pd.read_csv(
    filepath_or_buffer=os.path.join(path, '02 Data', 'Original Data', 'customers.csv')
)

## PII Cleaning and Anonymization

- **First/Last Names** were dropped as direct identifiers.
- **User IDs** were SHA-256 hashed to pseudonymize them while preserving the ability to track behavior across datasets.
- **Age** was converted into broad age groups.
- **Income** was bucketed using quartiles to preserve relative distribution without exposing raw values.
- **Date Joined** was coarsened from `%m/%d/%Y` to `%m/%Y` to remove exact-date traceability.

These steps follow data privacy best practices while preserving analytical value for segmentation and modeling.

## Drop direct identifiers 

In [3]:
# Show column names, dtypes and non-null counts to identify the identifier columns.
df_customers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206209 entries, 0 to 206208
Data columns (total 10 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   user_id       206209 non-null  int64 
 1   First Name    194950 non-null  object
 2   Surnam        206209 non-null  object
 3   Gender        206209 non-null  object
 4   STATE         206209 non-null  object
 5   Age           206209 non-null  int64 
 6   date_joined   206209 non-null  object
 7   n_dependants  206209 non-null  int64 
 8   fam_status    206209 non-null  object
 9   income        206209 non-null  int64 
dtypes: int64(4), object(6)
memory usage: 15.7+ MB


In [4]:
# Remove the name columns (direct identifiers). 'Surnam' is how the column is
# spelled in the source file.
df_customers = df_customers.drop(['First Name', 'Surnam'], axis='columns')

## Hash user_id

In [5]:
# Show the orders columns and dtypes; user_id is the column hashed further down.
df_orders.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3421083 entries, 0 to 3421082
Data columns (total 7 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   order_id                int64  
 1   user_id                 int64  
 2   eval_set                object 
 3   order_number            int64  
 4   order_dow               int64  
 5   order_hour_of_day       int64  
 6   days_since_prior_order  float64
dtypes: float64(1), int64(5), object(1)
memory usage: 182.7+ MB


In [6]:
def hash_id(val, salt=None):
    """Return the SHA-256 hex digest of an identifier, optionally salted.

    Parameters
    ----------
    val : any
        Identifier to hash; it is converted with ``str()`` before hashing.
    salt : str, optional
        Secret text prepended to the identifier before hashing. When omitted,
        the ``USER_ID_HASH_SALT`` environment variable is used; if that is
        unset, no salt is applied and the digest is that of ``str(val)`` alone.

    Returns
    -------
    str
        64-character lowercase hexadecimal digest.

    Notes
    -----
    Without a salt, digests of small integer IDs can be reversed by hashing
    every candidate ID and comparing. Set a secret salt (kept out of the
    notebook) whenever the hashed output is shared. The same salt must be used
    for every dataset whose hashed IDs need to match.
    """
    if salt is None:
        # Resolve at call time so the salt never has to appear in the notebook.
        salt = os.environ.get('USER_ID_HASH_SALT', '')
    return hashlib.sha256((salt + str(val)).encode()).hexdigest()

In [7]:
# Replace every customer user_id with its hash_id() digest.
df_customers['user_id'] = df_customers['user_id'].map(hash_id)

In [8]:
# Apply the same hash_id() to the orders table so its user_id values keep
# matching the hashed customer IDs.
df_orders['user_id'] = df_orders['user_id'].map(hash_id)

## Bucket ages into age groups (roughly 10-year ranges for adults)

In [9]:
# pd.cut uses right-closed intervals by default, so the bins are
# (0, 18], (18, 25], ..., (55, 65], (65, inf]. Age 65 therefore belongs to
# '56-65', and for whole-number ages the open-ended top bin starts at 66.
df_customers['age_group'] = pd.cut(
    df_customers['Age'],
    bins=[0, 18, 25, 35, 45, 55, 65, np.inf],
    labels=['0-18', '19-25', '26-35', '36-45', '46-55', '56-65', '66+']
)

In [10]:
# The exact age is removed now that age_group carries the bucketed value.
df_customers = df_customers.drop('Age', axis='columns')

## Bucket yearly income into quartiles

In [11]:
# Split income at its quartiles and label the four resulting buckets.
df_customers['income_bracket'] = pd.qcut(x=df_customers['income'], q=4,
                                         labels=['Low', 'Mid-low', 'Mid-high', 'High'])

In [12]:
# The raw income is removed now that income_bracket carries the bucketed value.
df_customers = df_customers.drop('income', axis='columns')

## Change date_joined format to remove exact traceability

In [13]:
# Parse the date_joined strings into datetimes so the .dt accessor can be used on them.
df_customers['date_joined'] = pd.to_datetime(arg=df_customers['date_joined'])

In [14]:
# Keep only month and year (%m/%Y); the day from the original %m/%d/%Y value is discarded.
df_customers['date_joined'] = df_customers['date_joined'].dt.strftime(date_format='%m/%Y')

In [15]:
# Save the hashed orders to orders.csv, the file this frame was loaded from.
# index=False keeps the RangeIndex from being written as an extra unnamed column.
df_orders.to_csv(os.path.join(path, '02 Data', 'Original Data', 'orders.csv'), index=False)

In [16]:
# Save the cleaned customers to customers.csv, the file this frame was loaded from.
# index=False keeps the RangeIndex from being written as an extra unnamed column.
df_customers.to_csv(os.path.join(path, '02 Data', 'Original Data', 'customers.csv'), index=False)