# Data Collection and Pre-Processing Lab – E-Commerce Dataset

## Step 1 – Hello, Data!

In [None]:
import pandas as pd

# Load the raw sales export; later cells derive new frames from df_raw rather than editing it.
df_raw = pd.read_csv('data/sales_500.csv')
# Preview only the first few rows: head(400) rendered far more rows than a sanity check needs.
df_raw.head()

Unnamed: 0,date,customer_id,product,price,quantity,coupon_code,shipping_city
0,2024-03-13,CUST1000,Monitor,276.34,3,WELCOME15,Calgary
1,2024-03-09,CUST1001,Keyboard,662.10,1,DISCOUNT5,Toronto
2,2024-02-25,CUST1002,Headphones,609.79,4,WELCOME15,Toronto
3,2024-03-08,CUST1003,Keyboard,931.46,1,OFF20,Ottawa
4,2024-06-05,CUST1004,Keyboard,959.94,1,DISCOUNT5,Toronto
...,...,...,...,...,...,...,...
395,2024-01-07,CUST1395,Phone,1228.57,4,SAVE10,Toronto
396,2024-03-04,CUST1396,Headphones,669.92,1,SAVE10,Toronto
397,2024-03-27,CUST1397,Phone,577.88,4,,Montreal
398,2024-01-18,CUST1398,Laptop,704.77,1,,Ottawa


## Step 2 – Pick the Right Container
A dictionary is appropriate because it allows fast key-based lookups 
(e.g., customer_id → transactions) and flexible mutation during cleaning.
Namedtuples are immutable and better suited for fixed schemas, while sets
are useful only for uniqueness checks, not structured records.

## Step 3 – Implement Functions and Data Structure

In [None]:
class TransactionCleaner:
    """Holds a private copy of a transactions frame and applies basic cleaning rules."""

    def __init__(self, df):
        """Store a copy of ``df`` so the caller's frame is never modified."""
        self.df = df.copy()

    def clean(self):
        """Drop rows missing price or quantity, then keep only positive quantities.

        The cleaned frame replaces ``self.df`` and is also returned.
        """
        complete = self.df.dropna(subset=['price', 'quantity'])
        sold = complete['quantity'] > 0
        self.df = complete[sold]
        return self.df

    def total(self):
        """Return the sum of price * quantity over the rows currently held."""
        unit_price = self.df['price']
        units = self.df['quantity']
        return (unit_price * units).sum()

# Wrap the raw frame; the cleaner works on its own copy (see __init__).
cleaner = TransactionCleaner(df_raw)
# df_step3 is the cleaned frame: rows with price and quantity present and quantity > 0.
df_step3 = cleaner.clean()
# Displayed as the cell result: sum of price * quantity over the cleaned rows.
cleaner.total()

np.float64(940632.29)

## Step 4 – Bulk Loaded

In [None]:
# orient='records' yields a list with one plain dict per cleaned row (column name -> value).
transactions_dict = df_step3.to_dict(orient='records')
# Show a small sample only; slicing [:499] printed nearly every record into the cell output.
transactions_dict[:5]

[{'date': '2024-03-13',
  'customer_id': 'CUST1000',
  'product': 'Monitor',
  'price': 276.34,
  'quantity': 3,
  'coupon_code': 'WELCOME15',
  'shipping_city': 'Calgary'},
 {'date': '2024-03-09',
  'customer_id': 'CUST1001',
  'product': 'Keyboard',
  'price': 662.1,
  'quantity': 1,
  'coupon_code': 'DISCOUNT5',
  'shipping_city': 'Toronto'},
 {'date': '2024-02-25',
  'customer_id': 'CUST1002',
  'product': 'Headphones',
  'price': 609.79,
  'quantity': 4,
  'coupon_code': 'WELCOME15',
  'shipping_city': 'Toronto'},
 {'date': '2024-03-08',
  'customer_id': 'CUST1003',
  'product': 'Keyboard',
  'price': 931.46,
  'quantity': 1,
  'coupon_code': 'OFF20',
  'shipping_city': 'Ottawa'},
 {'date': '2024-06-05',
  'customer_id': 'CUST1004',
  'product': 'Keyboard',
  'price': 959.94,
  'quantity': 1,
  'coupon_code': 'DISCOUNT5',
  'shipping_city': 'Toronto'},
 {'date': '2024-06-14',
  'customer_id': 'CUST1005',
  'product': 'Phone',
  'price': 87.05,
  'quantity': 4,
  'coupon_code': 'OF

## Step 5 – Quick Profiling

In [None]:
# Price range and average over the cleaned rows.
prices = df_step3['price']
min_price, mean_price, max_price = prices.min(), prices.mean(), prices.max()

# Distinct shipping cities, collected into a set for the uniqueness count.
unique_cities = set(df_step3['shipping_city'])
(min_price, mean_price, max_price, len(unique_cities))

(np.float64(26.86), np.float64(761.9980400000001), np.float64(1499.58), 5)

## Step 6 – Spot the Grime
Data-quality issues to check for: missing values (for example, blank coupon codes), zero or negative quantities, and inconsistently formatted coupon codes.

## Step 7 – Cleaning Rules

In [None]:
# Row count before cleaning, for the before/after comparison shown below.
before_rows = len(df_raw)
# Same rules as TransactionCleaner.clean: require price and quantity, keep quantity > 0.
df_clean = df_raw.dropna(subset=['price', 'quantity'])
# .copy() makes df_clean an independent frame; later cells add columns to it, and
# assigning into a boolean-filtered result can trigger pandas' chained-assignment warning.
df_clean = df_clean[df_clean['quantity'] > 0].copy()
after_rows = len(df_clean)
before_rows, after_rows

(500, 500)

## Step 8 – Transformations

In [None]:
def parse_discount(code):
    """Extract the discount percentage embedded in a coupon code.

    Every decimal digit in the code is concatenated and read as one integer,
    e.g. 'WELCOME15' -> 15.

    Args:
        code: Coupon code value; may be missing (None/NaN).

    Returns:
        int: The parsed number, or 0 when the code is missing or has no digits.
    """
    if pd.isna(code):
        return 0
    # str.isdecimal rather than str.isdigit: isdigit also accepts characters such
    # as superscript '²' that int() cannot parse, which would raise ValueError.
    digits = ''.join(ch for ch in str(code) if ch.isdecimal())
    return int(digits) if digits else 0

# Derive a numeric discount column from the coupon text, one value per row.
coupon_codes = df_clean['coupon_code']
df_clean['discount_percent'] = coupon_codes.apply(parse_discount)
# Side-by-side preview of the original code and the parsed percentage.
preview_columns = ['coupon_code', 'discount_percent']
df_clean[preview_columns].head()

Unnamed: 0,coupon_code,discount_percent
0,WELCOME15,15
1,DISCOUNT5,5
2,WELCOME15,15
3,OFF20,20
4,DISCOUNT5,5


## Step 9 – Feature Engineering

In [None]:
# Parse the text dates once and reuse the parsed series for both new columns.
order_dates = pd.to_datetime(df_clean['date'])
df_clean['order_date'] = order_dates
# Recency is measured against the most recent order in the data, not against today.
latest_date = order_dates.max()
elapsed = latest_date - order_dates
df_clean['days_since_purchase'] = elapsed.dt.days

## Step 10 – Mini-Aggregation

In [None]:
# Revenue per order line, then summed per shipping city.
df_clean['revenue'] = df_clean['price'] * df_clean['quantity']
city_revenue = df_clean.groupby('shipping_city')['revenue'].sum()
revenue_by_city = city_revenue.to_dict()
# Show at most five (city, revenue) pairs.
list(revenue_by_city.items())[:5]

[('Calgary', 161850.34),
 ('Montreal', 184951.42),
 ('Ottawa', 217806.6),
 ('Toronto', 198684.23),
 ('Vancouver', 177339.7)]

## Step 11 – Serialization Checkpoint

In [None]:
# Persist the cleaned frame in both formats; index=False keeps the row index out of the CSV.
df_clean.to_csv('data/cleaned_sales.csv', index=False)
# date_format='iso' writes the datetime column order_date as ISO-8601 text instead of
# relying on the implicit epoch-milliseconds default.
# NOTE(review): the echoed source line in this cell's stored output looks like a warning
# pointing at this call - presumably pandas' deprecation of the implicit epoch default;
# confirm on re-run.
df_clean.to_json('data/cleaned_sales.json', orient='records', indent=2, date_format='iso')

  df_clean.to_json('data/cleaned_sales.json', orient='records', indent=2)


## Step 12 – Soft Interview Reflection
Functions and classes helped modularize the data-cleaning logic, making the workflow reusable, readable, and easier to maintain.

## Data Dictionary
| Field | Type | Description | Source |
|------|------|------------|--------|
| date | Date | Order date | Primary CSV |
| customer_id | String | Customer identifier | Primary CSV |
| product | String | Product name | Primary CSV |
| price | Float | Unit price | Primary CSV |
| quantity | Integer | Quantity purchased | Primary CSV |
| coupon_code | String | Promo code | Primary CSV |
| discount_percent | Integer | Parsed discount | Derived |
| shipping_city | String | Delivery city | Primary CSV |
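| days_since_purchase | Integer | Days between the order and the most recent order date in the dataset | Synthetic |
| order_date | Datetime | Order date parsed from `date` | Derived |
| revenue | Float | price × quantity for the order line | Derived |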