# 01 — Exploration
_Date: 2025-09-12_

Objectives:
- Load Olist CSVs from 'C:/Users/nazar/OneDrive/Documentos/data'
- Inspect schema, shapes, nulls, basic distributions
- Validate key relationships (`order_id`, `customer_id`, `product_id`, `seller_id`)


In [1]:
import os
import pandas as pd
from pathlib import Path

# Root folder that holds the raw Olist CSV exports.
DATA_PATH = Path('C:/Users/nazar/OneDrive/Documentos/data')
# Raise explicitly rather than `assert`: assertions are removed when Python
# runs with -O, which would let a bad path reach the loader unnoticed.
if not DATA_PATH.exists():
    raise FileNotFoundError(f"Raw data path not found: {DATA_PATH}")

# Logical table name -> CSV file name inside DATA_PATH.
files = {
    'customers': 'olist_customers_dataset.csv',
    'orders': 'olist_orders_dataset.csv',
    'order_items': 'olist_order_items_dataset.csv',
    'payments': 'olist_order_payments_dataset.csv',
    'products': 'olist_products_dataset.csv',
    'sellers': 'olist_sellers_dataset.csv',
    'geolocation': 'olist_geolocation_dataset.csv',
    'reviews': 'olist_order_reviews_dataset.csv',
    'categories': 'product_category_name_translation.csv',
}

# Load every CSV that is present. A missing file is reported but does not
# abort the run, so `dfs` may end up with fewer entries than `files`.
dfs = {}
for name, fname in files.items():
    path = DATA_PATH / fname
    if not path.exists():
        print(f"WARNING: Missing {path}")
        continue
    df = pd.read_csv(path)
    dfs[name] = df
    print(f"Loaded {name}: {df.shape}")


Loaded customers: (99441, 5)
Loaded orders: (99441, 8)
Loaded order_items: (112650, 7)
Loaded payments: (103886, 5)
Loaded products: (32951, 9)
Loaded sellers: (3095, 4)
Loaded geolocation: (1000163, 5)
Loaded reviews: (99224, 7)
Loaded categories: (71, 2)


In [2]:
# Names of the tables that were actually loaded, in load order
list(dfs)


['customers',
 'orders',
 'order_items',
 'payments',
 'products',
 'sellers',
 'geolocation',
 'reviews',
 'categories']

In [3]:
# Per-table size and overall null rate.
# NOTE: 'null_pct_mean' is the per-column null fraction averaged across
# columns, i.e. a value between 0 and 1 rather than a 0-100 percentage.
summary = {}
for table, frame in dfs.items():
    n_rows, n_cols = frame.shape
    null_share_per_column = frame.isna().mean()
    summary[table] = {
        'rows': n_rows,
        'cols': n_cols,
        'null_pct_mean': null_share_per_column.mean(),
    }
summary


{'customers': {'rows': 99441, 'cols': 5, 'null_pct_mean': np.float64(0.0)},
 'orders': {'rows': 99441,
  'cols': 8,
  'null_pct_mean': np.float64(0.006169487434760311)},
 'order_items': {'rows': 112650, 'cols': 7, 'null_pct_mean': np.float64(0.0)},
 'payments': {'rows': 103886, 'cols': 5, 'null_pct_mean': np.float64(0.0)},
 'products': {'rows': 32951,
  'cols': 9,
  'null_pct_mean': np.float64(0.008254681193287004)},
 'sellers': {'rows': 3095, 'cols': 4, 'null_pct_mean': np.float64(0.0)},
 'geolocation': {'rows': 1000163, 'cols': 5, 'null_pct_mean': np.float64(0.0)},
 'reviews': {'rows': 99224,
  'cols': 7,
  'null_pct_mean': np.float64(0.21006294560071875)},
 'categories': {'rows': 71, 'cols': 2, 'null_pct_mean': np.float64(0.0)}}

In [4]:
# Check that each table's identifier column has no duplicate values.
# Maps the reported check name to the (table, column) it inspects; tables
# that were not loaded are skipped rather than raising a KeyError.
key_columns = {
    'orders.order_id_unique': ('orders', 'order_id'),
    'customers.customer_id_unique': ('customers', 'customer_id'),
    'products.product_id_unique': ('products', 'product_id'),
    'sellers.seller_id_unique': ('sellers', 'seller_id'),
}
checks = {
    label: dfs[table][column].is_unique
    for label, (table, column) in key_columns.items()
    if table in dfs
}
checks


{'orders.order_id_unique': True,
 'customers.customer_id_unique': True,
 'products.product_id_unique': True,
 'sellers.seller_id_unique': True}

## Notes
- Record any anomalies, missing files, or issues here.
