# Data Collection and Pre-Processing - Lab

**Name:** Jarius Bedward

**Course:** PROG8245 Machine Learning Programming

**Date:** 2025-09-30

### Imports & Setup

In [None]:
import pandas as pd
import numpy as np
from dataclasses import dataclass, field
from datetime import datetime
import re, os, json


# Paths

RAW_CSV = "data/1000 Sales Records.csv"
META_CSV = "data/shop-product-catalog.csv"
CLEAN_CSV = "data/transaction_clean.csv"
CLEAN_JSON = "data/transaction_clean.json"

# Load the first 500 rows from the primary file
df_raw = pd.read_csv(RAW_CSV).head(500)


### Step 1: Hello, Data!

Load the primary transactions file and display the first 3 rows

In [None]:
df_raw.head(3)

### Step 2: Pick the right container

For this pandas.DataFrame would be efficient for tabular work.
For row level logic like pre-order cleaning and totals a small python class would be sufficient

### Step 3: Order Class

In [None]:
@dataclass
class Order:
    order_date: str
    region: str
    country: str
    item_type: str
    sales_channel: str
    order_priority: str
    order_id: int
    ship_date: str
    units_sold: int
    unit_price: float
    unit_cost: float
    total_revenue: float
    total_cost: float
    total_profit: float
    product_id: str = None # links to catalog
    discount_pct: float = field(default=0.0, init=False)

    def clean(self):
        # Ensure numeric
        self.units_sold = int(self.units_sold)
        self.unit_price = float(self.unit_price)
        self.unit_cost = float(self.unit_cost)

    def total(self):
        gross = self.units_sold * self.unit_price
        net = gross * (1 - self.discount_pct)
        return round (net, 2)

row = df_raw.iloc[0].to_dict()
order = Order(**row)
order.clean()
order.total()

### Step 4: Bulk Load Orders

In [None]:
def df_to_orders(df):
    orders = []
    for _, r in df.iterrows():
        o = Order(**r.to_dict())
        o.clean()
        orders.append(o)
    return orders

orders = df_to_orders(df_raw)
len(orders)

### Step 5: Quick Profiling

In [None]:
min_price = df_raw["Unit Price"].min()
mean_price = df_raw["Unit Price"].mean()
max_price = df_raw["Unit Price"].max()
unique_countries = df_raw["Country"].nunique()

print(f"Price range: {min_price}-{max_price}, mean={mean_price:.2f}")
print("Unique country count:", unique_countries)

### Step 6: Spot the Grime

Identify 3 dirty cases

    1. Inconsitent country names
    2. Negative or zero "Units Sold"
    3. Rounding mismatches between totals vs Unit*Quantity

In [None]:
(df_raw["Units Sold"] <= 0).sum()
rounding_issue = (df_raw["Total Revenue"] != df_raw["Units Sold"] * df_raw["Unit Price"]).sum()
rounding_issue

### Step 7: Clean Rules

In [1]:
df_clean = df_raw.copy()

# Fix the negative units

df_clean.loc[df_clean["Units Sold"] <= 0, "Units Sold"] = 1

# Normalize the names
df_clean["Country"] = df_clean["Country"].str.strio().str.title()

df_clean["Total Revenue"] = df_clean["Units Sold"] * df_clean["Unit Price"]
df_clean["Total Cost"] = df_clean["Units Sold"] * df_clean["Unit Cost"]
df_clean['Total Profit'] = df_clean['Total Revenue'] - df_clean['Total Cost']

df_clean.head(3)

SyntaxError: invalid syntax (2396007248.py, line 12)

### Step 8: Transformations

In [None]:
df_meta = pd.read_csv(META_CSV)
df_meta.head(3)


df_enriched = df_clean.merge(df_meta, how="left", on="Item Type", right_on="product_name")
df_enriched.head(3)

### Step 9: Feature Engineering

In [3]:
# Days since the order has shipped

df_enriched["Order Date"] = pd.to_datetime(df_enriched["Order Date"])
df_enriched["Ship Date"] = pd.to_datetime(df_enriched["Ship Date"])
ref_date = df_enriched["Ship Date"].max()
df_enriched["days_since_ship"] = (ref_date - df_enriched["Ship Date"]).dt.days

# High value falg
df_enriched["is_high_value"] = df_enriched["Total Revenue"] > 10000
df_enriched[["Order Date", "Ship Date", "days_since_ship", "is_high_value"]].head()

NameError: name 'pd' is not defined

### Step 10: Mini-Aggregation