# Olist E-commerce EDA

This notebook loads the Olist datasets and prepares cleaned versions for analysis.


In [2]:
!pip install pandas

Collecting pandas
  Downloading pandas-2.3.3-cp314-cp314-win_amd64.whl.metadata (19 kB)
Collecting numpy>=1.26.0 (from pandas)
  Downloading numpy-2.4.1-cp314-cp314-win_amd64.whl.metadata (6.6 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2025.3-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.3.3-cp314-cp314-win_amd64.whl (11.1 MB)
   ---------------------------------------- 0.0/11.1 MB ? eta -:--:--
   --- ------------------------------------ 1.0/11.1 MB 24.0 MB/s eta 0:00:01
   --- ------------------------------------ 1.0/11.1 MB 24.0 MB/s eta 0:00:01
   --------------------- ------------------ 6.0/11.1 MB 9.5 MB/s eta 0:00:01
   ---------------------------------------  11.0/11.1 MB 13.2 MB/s eta 0:00:01
   ---------------------------------------- 11.1/11.1 MB 13.0 MB/s  0:00:00
Downloading numpy-2.4.1-cp314-cp314-win_amd64.whl (12.4 MB)
   ------------


[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
# 1) Setup and imports
import os
import glob
import pandas as pd
import numpy as np

# Notebook-wide pandas display settings: untruncated cell text,
# up to 200 rows and up to 50 columns per rendered frame.
for _option_name, _option_value in (
    ('display.max_colwidth', None),
    ('display.max_rows', 200),
    ('display.max_columns', 50),
):
    pd.set_option(_option_name, _option_value)


In [7]:
# 2) Load data
# The dataset folder can be overridden with the OLIST_DATA_DIR environment variable;
# when the variable is unset, the original local path is used.
DATA_DIR = os.environ.get(
    "OLIST_DATA_DIR",
    r"C:\Users\dbswl\OneDrive\바탕 화면\project\olist_eda\data",
)

def load_csv(filename):
    """Read one CSV file from DATA_DIR into a DataFrame.

    Parameters
    ----------
    filename : str
        File name, joined onto DATA_DIR with os.path.join.

    Returns
    -------
    pandas.DataFrame
        The result of pd.read_csv called with its default settings.

    Raises
    ------
    FileNotFoundError
        If the joined path is not an existing file.
    """
    path = os.path.join(DATA_DIR, filename)
    if not os.path.isfile(path):
        # Same exception type pd.read_csv would raise, with a hint on how to fix it.
        raise FileNotFoundError(
            f"{path} not found; set OLIST_DATA_DIR to the folder holding the Olist CSV files"
        )
    return pd.read_csv(path)

orders = load_csv("olist_orders_dataset.csv")
customers = load_csv("olist_customers_dataset.csv")
order_items = load_csv("olist_order_items_dataset.csv")
payments = load_csv("olist_order_payments_dataset.csv")
reviews = load_csv("olist_order_reviews_dataset.csv")
products = load_csv("olist_products_dataset.csv")
sellers = load_csv("olist_sellers_dataset.csv")
geolocation = load_csv("olist_geolocation_dataset.csv")

# Quick sanity check: (rows, columns) of every raw table
for _table_name, _table in (
    ("orders", orders),
    ("customers", customers),
    ("order_items", order_items),
    ("payments", payments),
    ("reviews", reviews),
    ("products", products),
    ("sellers", sellers),
    ("geolocation", geolocation),
):
    print(_table_name, _table.shape)


orders (99441, 8)
customers (99441, 5)
order_items (112650, 7)
payments (103886, 5)
reviews (99224, 7)
products (32951, 9)
sellers (3095, 4)
geolocation (1000163, 5)


In [8]:

# 3) Review cleanup (simple version)
# Drop rows that share the same order id, review score, comment title and comment message.
reviews_dedup = reviews.copy()

# Make sure timestamps are proper datetime (values that cannot be parsed become NaT)
reviews_dedup['review_answer_timestamp'] = pd.to_datetime(
    reviews_dedup['review_answer_timestamp'], errors='coerce'
)

# Sort so the latest review stays when duplicates exist.
# na_position='first' stops a row whose timestamp failed to parse (NaT) from being
# treated as the "latest" one, and the stable sort resolves equal timestamps in the
# original row order so the kept row is the same on every run.
reviews_dedup = reviews_dedup.sort_values(
    'review_answer_timestamp', na_position='first', kind='stable'
)

# Remove exact duplicates by order + content + score, keeping the last (latest) row
reviews_dedup = reviews_dedup.drop_duplicates(
    subset=['order_id','review_score','review_comment_title','review_comment_message'],
    keep='last'
)

print("reviews before:", len(reviews))
print("reviews after dedup:", len(reviews_dedup))


reviews before: 99224
reviews after dedup: 98997


In [9]:
# 4) Order date cleanup (simple version)
# - Detect orders approved later than the carrier hand-over.
# - Detect orders handed to the carrier later than the customer delivery.
# - Set the offending later-stage timestamp of those orders to NaT.
# - Compute how long each fulfilment stage took, in hours.
orders_fix = orders.copy()

# Parse dates (values that cannot be parsed become NaT)
for date_col in (
    'order_purchase_timestamp',
    'order_approved_at',
    'order_delivered_carrier_date',
    'order_delivered_customer_date',
    'order_estimated_delivery_date',
):
    orders_fix[date_col] = pd.to_datetime(orders_fix[date_col], errors='coerce')


def _elapsed(start_col, end_col, seconds_per_unit):
    """Time from start_col to end_col in orders_fix, expressed in units of seconds_per_unit seconds."""
    gap = orders_fix[end_col] - orders_fix[start_col]
    return gap.dt.total_seconds() / seconds_per_unit


# Fix impossible sequences (tolerate <= 3 day inversions).
# Both gaps are measured on the parsed data before either column is modified.
approved_to_carrier_days = _elapsed('order_approved_at', 'order_delivered_carrier_date', 86400)
carrier_to_delivered_days = _elapsed('order_delivered_carrier_date', 'order_delivered_customer_date', 86400)

orders_fix.loc[approved_to_carrier_days < -3, 'order_delivered_carrier_date'] = pd.NaT
orders_fix.loc[carrier_to_delivered_days < -3, 'order_delivered_customer_date'] = pd.NaT


# Lead times (hours), derived from the cleaned timestamps: new column -> (start, end)
for lead_col, (start_col, end_col) in {
    'lead_purchase_to_approved_hr': ('order_purchase_timestamp', 'order_approved_at'),
    'lead_approved_to_carrier_hr': ('order_approved_at', 'order_delivered_carrier_date'),
    'lead_carrier_to_delivered_hr': ('order_delivered_carrier_date', 'order_delivered_customer_date'),
    'lead_purchase_to_delivered_hr': ('order_purchase_timestamp', 'order_delivered_customer_date'),
}.items():
    orders_fix[lead_col] = _elapsed(start_col, end_col, 3600)

# Whole-day component of (actual delivery - estimated delivery); negative means early
delivered_vs_estimate = (
    orders_fix['order_delivered_customer_date'] - orders_fix['order_estimated_delivery_date']
)
orders_fix['delay_days'] = delivered_vs_estimate.dt.days


In [10]:
# 5) Product cleanup (simple version)
# Products without a category name take the most common category among products of the
# same seller that have exactly the same weight and dimensions.
products_fix = products.copy()


def _first_mode(values):
    """Return the first value of values.mode(), or pd.NA when the mode is empty."""
    modes = values.mode()  # computed once per group
    return modes.iloc[0] if not modes.empty else pd.NA


# Normalise the category label: missing or blank -> 'Unknown'
products_fix['product_category_name'] = (
    products_fix['product_category_name'].fillna('Unknown').astype(str).str.strip()
)
products_fix.loc[products_fix['product_category_name'] == '', 'product_category_name'] = 'Unknown'

# Ensure numeric weight / dimensions before any numeric comparison is made on them
dim_cols = ['product_weight_g', 'product_length_cm', 'product_height_cm', 'product_width_cm']
for c in dim_cols:
    products_fix[c] = pd.to_numeric(products_fix[c], errors='coerce')

# A non-positive weight is treated as missing (where() leaves NaN wherever the test fails)
products_fix['product_weight_g'] = products_fix['product_weight_g'].where(
    products_fix['product_weight_g'] > 0
)

print("Unknown category:", (products_fix['product_category_name'] == 'Unknown').sum())
print("weight NaN:", products_fix['product_weight_g'].isna().sum())

# Map product -> seller (mode seller per product)
prod_seller = order_items.groupby('product_id')['seller_id'].agg(_first_mode)
products_fix['seller_id'] = products_fix['product_id'].map(prod_seller)

# Build exact signature from the rounded weight / length / height / width
sig = products_fix[dim_cols].astype('float').round(3)
products_fix['size_weight_sig'] = sig.astype(str).agg('|'.join, axis=1)

# A missing measurement is rendered as the text 'nan' inside the signature, which would
# make two products with unknown size/weight look like an exact match. Only rows with
# every measurement present (and a known seller) take part in the inference.
has_all_dims = sig.notna().all(axis=1)
has_seller = products_fix['seller_id'].notna()
mask_unknown = products_fix['product_category_name'] == 'Unknown'

# Build lookup from (seller_id, signature) -> mode category (from known only)
known = products_fix[~mask_unknown & has_all_dims & has_seller]
lookup = known.groupby(['seller_id', 'size_weight_sig'])['product_category_name'].agg(_first_mode)

# Inference for missing categories (single step)
# Rule: same seller + exact weight/size match -> assign most common category
mask_inferable = mask_unknown & has_all_dims & has_seller
if mask_inferable.any() and not lookup.empty:
    inferred = (
        products_fix.loc[mask_inferable]
        .set_index(['seller_id', 'size_weight_sig'])
        .index
        .map(lookup)
        .fillna('Unknown')  # no matching (seller, signature) -> stays 'Unknown'
    )
    products_fix.loc[mask_inferable, 'product_category_name'] = inferred.values

print('Unknown after seller+size/weight (exact) inference:',
      (products_fix['product_category_name'] == 'Unknown').sum())

# Drop helper columns
products_fix = products_fix.drop(columns=['seller_id', 'size_weight_sig'], errors='ignore')


Unknown category: 610
weight NaN: 6
Unknown after seller+size/weight (exact) inference: 498


In [11]:
# 5.1) Industry Classification Mapping
# Portuguese product categories, listed under the standardized
# (Industry_Group, Industry_Type) pair they are assigned to.
_CATEGORIES_BY_INDUSTRY = {
    ('Home_Living', 'Bed_Bath'): ('cama_mesa_banho',),
    ('Home_Living', 'Furniture'): ('moveis_decoracao', 'moveis_escritorio', 'moveis_sala'),
    ('Home_Living', 'Kitchen_Housewares'): ('utilidades_domesticas',),
    ('Home_Living', 'Home_Care'): ('casa_construcao', 'ferramentas_jardim'),
    ('Tech_Electronics', 'Computers_Telephony'): ('informatica_acessorios', 'pcs', 'telefonia'),
    ('Tech_Electronics', 'Appliances'): ('eletrodomesticos', 'eletroportateis', 'climatizacao'),
    ('Tech_Electronics', 'Entertainment'): ('eletronicos', 'consoles_games'),
    ('Health_Beauty', 'Beauty'): ('perfumaria', 'beleza_saude'),
    ('Sports_Leisure', 'Sports_Leisure'): ('esporte_lazer',),
    ('Sports_Leisure', 'Music_Art'): ('instrumentos_musicais', 'artes'),
    ('Fashion_Accessories', 'Accessories_Gifts'): (
        'relogios_presentes',
        'malas_acessorios',
        'fashion_bolsas_e_acessorios',
    ),
    ('Fashion_Accessories', 'Clothing_Shoes'): ('fashion_calcados',),
    ('Kids_Toys', 'Kids_Toys'): ('bebes', 'brinquedos'),
    ('Automotive', 'Automotive'): ('automotivo',),
    ('Others', 'Books_Stationery'): ('papelaria', 'livros_interesse_geral'),
    ('Others', 'Industry_Others'): (
        'agro_industria_e_comercio',
        'pet_shop',
        'construcao_ferramentas_seguranca',
        'sinalizacao_e_seguranca',
        'cool_stuff',
    ),
}

# Flat lookup: category name -> (Industry_Group, Industry_Type)
INDUSTRY_MAP = {
    category: industry
    for industry, categories in _CATEGORIES_BY_INDUSTRY.items()
    for category in categories
}


def classify_industry(category_name):
    """Look up the (Industry_Group, Industry_Type) pair for a category name.

    Any name that is not a key of INDUSTRY_MAP yields ('Others', 'Others').
    """
    try:
        return INDUSTRY_MAP[category_name]
    except KeyError:
        return ('Others', 'Others')

# Apply classification to products_fix
# Each category is mapped to its (group, type) tuple in one pass and the tuples are then
# expanded into two columns at once; this avoids constructing a pd.Series per product row.
industry_pairs = products_fix['product_category_name'].map(classify_industry)
industry_info = pd.DataFrame(
    industry_pairs.tolist(),
    index=products_fix.index,  # keep row alignment with products_fix
    columns=['industry_group', 'industry_type'],
)
products_fix[['industry_group', 'industry_type']] = industry_info

print("✅ Industry classification applied successfully!")
print("Industry Groups:", products_fix['industry_group'].unique().tolist())


✅ Industry classification applied successfully!
Industry Groups: ['Health_Beauty', 'Sports_Leisure', 'Kids_Toys', 'Home_Living', 'Others', 'Tech_Electronics', 'Fashion_Accessories', 'Automotive']


In [12]:
# 6) Payment cleanup (simple version)
# Intent from the original note: zero-value payments made with a voucher are kept;
# only the zero-value 'not_defined' rows are removed.

# A row is kept unless it is both zero-valued and of type 'not_defined'
keep_row = (payments['payment_value'] != 0) | (payments['payment_type'] != 'not_defined')
payments_fix = payments.loc[keep_row].copy()

# Strictly positive payments only
is_positive = payments_fix['payment_value'].gt(0)
payments_pos = payments_fix.loc[is_positive].copy()

print("payments total:", len(payments_fix))
print("payments <= 0:", payments_fix['payment_value'].le(0).sum())
print("payments_pos:", len(payments_pos))


payments total: 103883
payments <= 0: 6
payments_pos: 103877


In [13]:
# 7) Geolocation cleanup (simple version)
# Collapse the raw points to one row per zip-code prefix:
#   - latitude / longitude: mean of the group's points
#   - city / state: most frequent value in the group
# Rationale from the original note: exact positions are not needed for now,
# so the points are simply bundled together.
geo = geolocation.copy()


def _most_common(values):
    """Most frequent value of the group; the group's first value when mode() is empty."""
    modes = values.mode()
    if modes.empty:
        return values.iloc[0]
    return modes.iloc[0]


per_zip_prefix = geo.groupby('geolocation_zip_code_prefix')
geolocation_fix = per_zip_prefix.agg(
    geolocation_lat=('geolocation_lat', 'mean'),
    geolocation_lng=('geolocation_lng', 'mean'),
    geolocation_city=('geolocation_city', _most_common),
    geolocation_state=('geolocation_state', _most_common),
    count=('geolocation_city', 'size'),  # number of raw rows behind each prefix
)
geolocation_fix = geolocation_fix.reset_index()

print("geolocation_fix:", geolocation_fix.shape)


geolocation_fix: (19015, 6)


In [14]:
# 8) Payment vs item totals check (simple version)
# Count orders with large discrepancies (abs delta > 100).
## Per order: how far the summed payments are from item prices + freight.
## Orders whose gap exceeds 100 are flagged (the original note equates this to roughly
## 20,000-30,000 KRW and presumes instalment payments are the cause).
items_tot = (
    order_items.groupby('order_id')[['price', 'freight_value']]
    .sum()
    .rename(columns={'price': 'items_total', 'freight_value': 'freight_total'})
)
items_tot['items_plus_freight'] = items_tot['items_total'].add(items_tot['freight_total'])

pay_tot = payments.groupby('order_id')['payment_value'].sum().to_frame('payment_total')

# Only orders present on both sides are compared
compare = items_tot.join(pay_tot, how='inner')
compare['delta'] = compare['payment_total'].sub(compare['items_plus_freight'])

is_large = compare['delta'].abs().gt(100)
large_delta = compare.loc[is_large]
print("large deltas (abs>100):", len(large_delta))


large deltas (abs>100): 3


In [15]:
# 9) Final cleaned dataset bundle (fixed only)
# Only the tables that went through a cleanup step above are bundled.
clean = dict(
    orders=orders_fix,
    payments=payments_fix,
    reviews=reviews_dedup,
    products=products_fix,
    geolocation=geolocation_fix,
)

# Report which variable backs each bundle entry, together with its shape
for _label, _frame in (
    ('orders = orders_fix', orders_fix),
    ('payments = payments_fix', payments_fix),
    ('reviews = reviews_dedup', reviews_dedup),
    ('products = products_fix', products_fix),
    ('geolocation = geolocation_fix', geolocation_fix),
):
    print(_label, _frame.shape)


orders = orders_fix (99441, 13)
payments = payments_fix (103883, 5)
reviews = reviews_dedup (98997, 7)
products = products_fix (32951, 11)
geolocation = geolocation_fix (19015, 6)


SyntaxError: invalid syntax (683061335.py, line 1)