# Data Collection and Pre-Processing - Lab

**Name:** Jarius Bedward

**Course:** PROG8245 Machine Learning Programming

**Date:** 2025-09-30

### Imports & Setup

In [None]:
import pandas as pd
import numpy as np
from dataclasses import dataclass, field



# Paths for primary and secondary data, plus the output files written in Step 11.
# Defined once here so later cells can refer to them by name.
SALES_CSV = "data/1000 Sales Records.csv"
META_CSV = "data/shop-product-catalog.csv"
CLEAN_CSV = "data/sales_enriched.csv"    # output location; adjust as needed
CLEAN_JSON = "data/sales_enriched.json"  # output location; adjust as needed

sales_df = pd.read_csv(SALES_CSV)
catalog_df = pd.read_csv(META_CSV)




### Step 1: Hello, Data!

Load the primary transactions file and display the first 3 rows

In [None]:
# Show the first three rows of the sales data, then of the product catalog.
display(sales_df.head(3), catalog_df.head(3))

### Step 2: Pick the right container

For this dataset we use a pandas DataFrame, which behaves like a dictionary of lists (column name → column values).
Because a DataFrame is built on dictionary-like structures, it is the most appropriate container for this tabular data.

### Step 3: Order Class

In [None]:

class SalesCleaner:
    """Wrap a sales DataFrame and provide cleaning and summary helpers.

    The frame passed to the constructor is copied, so the caller's
    DataFrame is never modified by this class.
    """

    def __init__(self, df):
        """Store a private copy of ``df`` as ``self.df``."""
        self.df = df.copy()

    def clean(self):
        """Strip surrounding whitespace from "Item Type" and return the frame.

        Returns:
            ``self.df`` after cleaning, so the result can be assigned
            directly, e.g. ``cleaned = cleaner.clean()``.
        """
        self.df["Item Type"] = self.df["Item Type"].str.strip()
        # Hand the cleaned frame back; a method without a return statement
        # yields None, which leaves the caller with nothing to work on.
        return self.df

    def total_revenue(self):
        """Return the sum of the "Total Revenue" column."""
        return self.df["Total Revenue"].sum()

cleaner = SalesCleaner(sales_df)
# Run the cleaning step, then read the cleaned frame from cleaner.df so this
# cell does not depend on what clean() returns.
cleaner.clean()
cleaned_sales = cleaner.df
print("Total Revenue:", cleaner.total_revenue())

### Step 4: Bulk Load Orders

In [None]:
# Map each order ID to its total profit (a repeated ID keeps its last value).
profit_map = {
    order_id: profit
    for order_id, profit in zip(cleaned_sales["Order ID"], cleaned_sales["Total Profit"])
}
# Peek at the first five entries.
list(profit_map.items())[:5]

### Step 5: Quick Profiling

In [None]:
# Column labels are case-sensitive: the column is "Unit Price" (capital P).
# Selecting it once keeps the three statistics on the same column.
unit_price = cleaned_sales["Unit Price"]
print("Min price:", unit_price.min())
print("Mean price:", unit_price.mean())
print("Max price:", unit_price.max())
print("Unique countries:", len(set(cleaned_sales["Country"])))

### Step 6: Spot the Grime

Identify 3 dirty cases

    1. Extra whitespace in "Item Type"
    2. Inconsistent date formats in "Order Date"
    3. Negative or zero "Units Sold"

In [None]:
# Count only values with leading or trailing whitespace. Searching for a
# plain " " would also count every value that contains an internal space.
# na=False treats missing values as "no match" so they are not counted.
print("Whitespace check:", cleaned_sales["Item Type"].str.contains(r"^\s|\s$", na=False).sum())
print("date format sample:", cleaned_sales["Order Date"].head())
print("Units sold ≤ 0:", (cleaned_sales["Units Sold"] <= 0).sum())

### Step 7: Clean Rules

We apply the rules inside the `clean_sales` function to normalize the whitespace, convert dates to datetime, and filter out rows with zero or negative units sold.

In [1]:
def clean_sales(df):
    """Return a cleaned copy of the sales data; the input frame is left untouched.

    Steps: trim whitespace around "Item Type", parse "Order Date" and
    "Ship Date" into datetimes, and keep only rows whose "Units Sold"
    is greater than zero.
    """
    tidy = df.copy()
    tidy["Item Type"] = tidy["Item Type"].str.strip()
    for date_col in ("Order Date", "Ship Date"):
        tidy[date_col] = pd.to_datetime(tidy[date_col])
    has_sales = tidy["Units Sold"] > 0
    return tidy[has_sales]

# Run the raw sales frame through the cleaning function, then summarize the
# resulting columns and dtypes.
sales_cleaned = sales_df.pipe(clean_sales)
sales_cleaned.info()

SyntaxError: invalid syntax (2396007248.py, line 12)

### Step 8: Transformations

In [None]:
# Reuse the product catalog already loaded in the setup cell as the metadata table.
df_meta = catalog_df
# display() is needed here: only a cell's last expression is shown automatically.
display(df_meta.head(3))

# Left-join so every cleaned sales row is kept. The key columns have different
# names on each side, so they are given via left_on/right_on; combining `on`
# with `right_on` makes pandas raise a MergeError.
df_enriched = sales_cleaned.merge(
    df_meta, how="left", left_on="Item Type", right_on="product_name"
)
df_enriched.head(3)

### Step 9: Feature Engineering

In [3]:
# Days since the order has shipped
# Parse both date columns first so the subtraction below works on datetimes.
for date_col in ("Order Date", "Ship Date"):
    df_enriched[date_col] = pd.to_datetime(df_enriched[date_col])

# Shipment age is measured against the latest ship date in the data.
ref_date = df_enriched["Ship Date"].max()
ship_age = ref_date - df_enriched["Ship Date"]
df_enriched["days_since_ship"] = ship_age.dt.days

# High value flag
df_enriched["is_high_value"] = df_enriched["Total Revenue"].gt(10000)

preview_cols = ["Order Date", "Ship Date", "days_since_ship", "is_high_value"]
df_enriched[preview_cols].head()

NameError: name 'pd' is not defined

### Step 10: Mini-Aggregation

In [None]:
# Total revenue per country, largest first.
revenue_by_country = (
    df_enriched
    .groupby("Country")["Total Revenue"]
    .sum()
    .sort_values(ascending=False)
)
# Show the ten highest-revenue countries.
revenue_by_country.head(10)

### Step 11: Serialization

In [None]:
# Persist the enriched table in two formats. CLEAN_CSV and CLEAN_JSON are the
# output paths and must be defined before this cell runs.
# CSV: index=False leaves the DataFrame index out of the file.
df_enriched.to_csv(CLEAN_CSV, index=False)
# JSON: orient="records" writes a list of row objects; date_format="iso"
# writes datetime values as ISO 8601 strings.
df_enriched.to_json(CLEAN_JSON, orient="records", date_format="iso")

### Step 12: Data Dictionary