## Home & Living Growth Strategy Report
### Olist E-commerce Data Analysis

### 본 프로젝트는 Olist 전체 데이터 중
### Home & Living 카테고리를 중심으로 Seller 행동이
### 매출 성장에 어떤 영향을 미치는지 분석하는 것을 목표로 한다.
#
### 분석 단위:
### - Seller 단위 매출
### - Seller 단위 SKU 수
### - Customer 구매 횟수

0. Environment Setup

In [4]:
# Setup and imports
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from pathlib import Path

1. Load Data, Data Preprocessing

In [15]:
# Olist E-commerce EDA (Colab-ready)
# Scope of this cell:
#   1. load the Olist datasets,
#   2. apply basic cleaning / validation,
#   3. bundle the cleaned tables into the `clean` dict.

# Display limits used when inspecting frames in the notebook.
for _option, _value in (
    ("display.max_colwidth", None),
    ("display.max_rows", 200),
    ("display.max_columns", 50),
):
    pd.set_option(_option, _value)

# Project root, expressed relative to the notebooks directory.
BASE_DIR = Path("..")
DATA_DIR = BASE_DIR.joinpath("data")


def load_csv(filename: str) -> pd.DataFrame:
    """Read ``DATA_DIR / filename`` into a DataFrame.

    Args:
        filename: name of a CSV file expected directly under ``DATA_DIR``.

    Returns:
        The parsed CSV as returned by ``pd.read_csv``.

    Raises:
        FileNotFoundError: if the file does not exist under ``DATA_DIR``.
    """
    path = DATA_DIR / filename
    if path.exists():
        return pd.read_csv(path)
    raise FileNotFoundError(
        f"Cannot find {filename} at {path}. "
        "Please check the data/ directory."
    )

# Load the eight raw Olist tables; every file is named olist_<stem>_dataset.csv.
_RAW_STEMS = (
    "orders",
    "customers",
    "order_items",
    "order_payments",
    "order_reviews",
    "products",
    "sellers",
    "geolocation",
)
(
    orders,
    customers,
    order_items,
    payments,
    reviews,
    products,
    sellers,
    geolocation,
) = (load_csv(f"olist_{_stem}_dataset.csv") for _stem in _RAW_STEMS)

# Report the (rows, columns) shape of each raw table, in load order.
for _label, _frame in (
    ("orders", orders),
    ("customers", customers),
    ("order_items", order_items),
    ("payments", payments),
    ("reviews", reviews),
    ("products", products),
    ("sellers", sellers),
    ("geolocation", geolocation),
):
    print(_label, _frame.shape)

# Review cleanup (simple version)
# When one order has several reviews with identical score/title/message,
# keep a single record: the one answered most recently.
reviews_dedup = reviews.copy()

# Parse the answer timestamp; values that cannot be parsed become NaT.
reviews_dedup["review_answer_timestamp"] = pd.to_datetime(
    reviews_dedup["review_answer_timestamp"], errors="coerce"
)

# Sort ascending so that keep="last" retains the latest review.
# NaT rows are placed first: with the default (NaT last) a row whose timestamp
# failed to parse would be kept in preference to a properly dated duplicate.
# A stable sort keeps tied timestamps in their original order, so the result
# is deterministic.
reviews_dedup = reviews_dedup.sort_values(
    "review_answer_timestamp", na_position="first", kind="stable"
)

# Remove exact duplicates by order + content + score.
reviews_dedup = reviews_dedup.drop_duplicates(
    subset=["order_id", "review_score", "review_comment_title", "review_comment_message"],
    keep="last",
)

print("reviews before:", len(reviews))
print("reviews after dedup:", len(reviews_dedup))

# Order date cleanup (simple version)
# Where a later milestone is dated before an earlier one, the later one is set to NaT.
orders_fix = orders.copy()

# Parse all five timestamp columns; unparseable values become NaT.
_DATE_COLS = (
    "order_purchase_timestamp",
    "order_approved_at",
    "order_delivered_carrier_date",
    "order_delivered_customer_date",
    "order_estimated_delivery_date",
)
for _col in _DATE_COLS:
    orders_fix[_col] = pd.to_datetime(orders_fix[_col], errors="coerce")

# Flags — both are evaluated on the parsed dates, before anything is blanked.
orders_fix["flag_approved_gt_carrier"] = orders_fix["order_approved_at"].gt(
    orders_fix["order_delivered_carrier_date"]
)
orders_fix["flag_carrier_gt_delivered"] = orders_fix["order_delivered_carrier_date"].gt(
    orders_fix["order_delivered_customer_date"]
)

# Fix impossible sequences by blanking the later milestone of each flagged pair.
orders_fix["order_delivered_carrier_date"] = orders_fix["order_delivered_carrier_date"].mask(
    orders_fix["flag_approved_gt_carrier"]
)
orders_fix["order_delivered_customer_date"] = orders_fix["order_delivered_customer_date"].mask(
    orders_fix["flag_carrier_gt_delivered"]
)


def _lead_hours(start_col, end_col):
    """Elapsed hours from ``start_col`` to ``end_col`` of ``orders_fix`` (NaN if either is NaT)."""
    elapsed = orders_fix[end_col] - orders_fix[start_col]
    return elapsed.dt.total_seconds() / 3600


# Lead times (hours), computed on the corrected dates.
orders_fix["lead_purchase_to_approved_hr"] = _lead_hours(
    "order_purchase_timestamp", "order_approved_at"
)
orders_fix["lead_approved_to_carrier_hr"] = _lead_hours(
    "order_approved_at", "order_delivered_carrier_date"
)
orders_fix["lead_carrier_to_delivered_hr"] = _lead_hours(
    "order_delivered_carrier_date", "order_delivered_customer_date"
)
orders_fix["lead_purchase_to_delivered_hr"] = _lead_hours(
    "order_purchase_timestamp", "order_delivered_customer_date"
)

# Whole days between actual and estimated delivery (positive = delivered late).
_vs_estimate = (
    orders_fix["order_delivered_customer_date"] - orders_fix["order_estimated_delivery_date"]
)
orders_fix["delay_days"] = _vs_estimate.dt.days

print("approved>carrier flagged:", int(orders_fix["flag_approved_gt_carrier"].sum()))
print("carrier>delivered flagged:", int(orders_fix["flag_carrier_gt_delivered"].sum()))

# Product cleanup (simple version)
# Missing or blank categories become "Unknown"; non-positive weights become NaN.
products_fix = products.copy()

# Normalise the category label: fill missing, cast to str, trim whitespace,
# then treat an empty result as "Unknown" as well.
_category = (
    products_fix["product_category_name"]
    .fillna("Unknown")
    .astype(str)
    .str.strip()
)
products_fix["product_category_name"] = _category.mask(_category == "", "Unknown")

# A weight of zero or less is not a usable measurement.
_weight = products_fix["product_weight_g"]
products_fix["product_weight_g"] = _weight.mask(_weight <= 0)

_n_unknown = (products_fix["product_category_name"] == "Unknown").sum()
_n_weight_nan = products_fix["product_weight_g"].isna().sum()
print("Unknown category:", int(_n_unknown))
print("weight NaN:", int(_n_weight_nan))

# Payment cleanup (simple version)
# Mark non-positive payment values, and expose a view holding positive payments only.
payments_fix = payments.copy()
_non_positive = payments_fix["payment_value"].le(0)
payments_fix["flag_payment_le_0"] = _non_positive

# Filter on "> 0" directly (not on the negated flag) so missing values stay excluded.
payments_pos = payments_fix.loc[payments_fix["payment_value"].gt(0)].copy()

print("payments total:", len(payments_fix))
print("payments <= 0:", int(payments_fix["flag_payment_le_0"].sum()))
print("payments_pos:", len(payments_pos))

# Geolocation cleanup (simple version)
# Collapse to one row per zip prefix: mean coordinates, the most frequent
# city/state, and the number of raw rows behind the prefix.


def _most_common(values):
    """Return the most frequent value of ``values``; fall back to its first entry.

    ``Series.mode`` is evaluated once per group (the previous lambdas evaluated
    it twice).  The fallback applies when the mode is empty.
    """
    modes = values.mode()
    return values.iloc[0] if modes.empty else modes.iloc[0]


geo = geolocation.copy()

geolocation_fix = geo.groupby("geolocation_zip_code_prefix").agg(
    geolocation_lat=("geolocation_lat", "mean"),
    geolocation_lng=("geolocation_lng", "mean"),
    geolocation_city=("geolocation_city", _most_common),
    geolocation_state=("geolocation_state", _most_common),
    count=("geolocation_city", "size"),
).reset_index()

print("geolocation_fix:", geolocation_fix.shape)

# Payment vs item totals check (simple version)
# Count the orders whose paid amount differs from items + freight by more than 100.
items_tot = (
    order_items.groupby("order_id")[["price", "freight_value"]]
    .sum()
    .rename(columns={"price": "items_total", "freight_value": "freight_total"})
)
items_tot["items_plus_freight"] = items_tot["items_total"] + items_tot["freight_total"]

# Total paid per order (uses the raw payments table).
pay_tot = payments.groupby("order_id")["payment_value"].sum().to_frame("payment_total")

# Only orders present on both sides are compared.
compare = items_tot.join(pay_tot, how="inner")
compare["delta"] = compare["payment_total"] - compare["items_plus_freight"]

large_delta = compare.loc[compare["delta"].abs() > 100]
print("large deltas (abs>100):", len(large_delta))

# Final cleaned dataset bundle: one entry per table, keyed by a short name.
clean = dict(
    orders=orders_fix,
    customers=customers,
    order_items=order_items,
    payments=payments_fix,
    payments_pos=payments_pos,
    reviews=reviews_dedup,
    products=products_fix,
    sellers=sellers,
    geolocation=geolocation_fix,
)

# Shape of every bundled table, in insertion order.
for _name, _table in clean.items():
    print(_name, _table.shape)

orders (99441, 8)
customers (99441, 5)
order_items (112650, 7)
payments (103886, 5)
reviews (99224, 7)
products (32951, 9)
sellers (3095, 4)
geolocation (1000163, 5)
reviews before: 99224
reviews after dedup: 98997
approved>carrier flagged: 1359
carrier>delivered flagged: 23
Unknown category: 610
weight NaN: 6
payments total: 103886
payments <= 0: 9
payments_pos: 103877
geolocation_fix: (19015, 6)
large deltas (abs>100): 3
orders (99441, 15)
customers (99441, 5)
order_items (112650, 7)
payments (103886, 6)
payments_pos (103877, 6)
reviews (98997, 7)
products (32951, 9)
sellers (3095, 4)
geolocation (19015, 6)


In [16]:
# Translate product category names from Portuguese to English.
# Load the translation table.
cat_trans = load_csv("product_category_name_translation.csv")

# Show the column names (useful if the header has an encoding issue).
print(cat_trans.columns)

# Left-merge the English name onto products.
# An English column left over from an earlier run of this cell is dropped first;
# otherwise the merge would produce *_x / *_y suffixed columns and the fillna
# below would fail with a KeyError on re-execution.
products = products.drop(
    columns=["product_category_name_english"], errors="ignore"
).merge(
    cat_trans,
    how="left",
    on="product_category_name"
)

# Categories without a translation are labelled "unknown".
products["product_category_name_english"] = (
    products["product_category_name_english"]
    .fillna("unknown")
)

# (Optional) drop the original Portuguese column and keep only the English one:
# products = products.drop(columns=["product_category_name"])


Index(['product_category_name', 'product_category_name_english'], dtype='object')


In [17]:
# Map each (English) product category to an industry group,
# so that sales can be examined per industry.

industry_map = {
    # Home & Living
    "bed_bath_table": "Home & Living",
    "furniture_decor": "Home & Living",
    "office_furniture": "Home & Living",
    "furniture_living_room": "Home & Living",
    "housewares": "Home & Living",
    "home_construction": "Home & Living",
    "garden_tools": "Home & Living",

    # Tech & Electronics
    "computers_accessories": "Tech & Electronics",
    "pcs": "Tech & Electronics",
    "telephony": "Tech & Electronics",
    "home_appliances": "Tech & Electronics",
    "small_appliances": "Tech & Electronics",
    "air_conditioning": "Tech & Electronics",
    "electronics": "Tech & Electronics",
    "consoles_games": "Tech & Electronics",

    # Health & Beauty
    "perfumery": "Health & Beauty",
    "health_beauty": "Health & Beauty",

    # Sports & Leisure
    "sports_leisure": "Sports & Leisure",
    "musical_instruments": "Sports & Leisure",
    "art": "Sports & Leisure",

    # Fashion & Accessories
    "watches_gifts": "Fashion & Accessories",
    "luggage_accessories": "Fashion & Accessories",
    "fashion_bags_accessories": "Fashion & Accessories",
    "fashion_shoes": "Fashion & Accessories",

    # Kids & Toys
    "baby": "Kids & Toys",
    "toys": "Kids & Toys",

    # Automotive
    "auto": "Automotive",

    # Newly split-out groups
    "stationery": "Life Goods",
    "books_general_interest": "Life Goods",
    "cool_stuff": "Life Goods",

    "pet_shop": "Pet & Agro",
    "agro_industry_and_commerce": "Pet & Agro",

    "construction_tools_safety": "Construction & Safety",
    "signaling_and_security": "Construction & Safety",
}

# Tag the translated products table; categories absent from the map fall into "Others".
# The summary at the end of this cell reads products["industry"].
products["industry"] = (
    products["product_category_name_english"]
    .map(industry_map)
    .fillna("Others")
)

# Attach the derived columns to the *cleaned* products table.
# Previously `products_fix = products` rebound the name to the raw table, which
# discarded the category/weight cleanup and left clean["products"] without an
# industry column.  Dropping the derived columns first keeps the cell re-runnable.
_derived_cols = ["product_category_name_english", "industry"]
products_fix = (
    clean["products"]
    .drop(columns=_derived_cols, errors="ignore")
    .merge(
        # One row per product_id so the left merge cannot multiply rows.
        products[["product_id", *_derived_cols]].drop_duplicates("product_id"),
        on="product_id",
        how="left",
    )
)
clean["products"] = products_fix

products.value_counts("industry")

industry
Home & Living            9464
Tech & Electronics       4332
Health & Beauty          3312
Sports & Leisure         3211
Others                   2871
Fashion & Accessories    2700
Kids & Toys              2330
Automotive               1900
Life Goods               1854
Pet & Agro                793
Construction & Safety     184
Name: count, dtype: int64

2. Home & Living EDA& Visualizations

In [22]:
# Delivered orders only — just the key column, used below as an inner-join filter.
_all_orders = clean["orders"]
orders_del = _all_orders.loc[_all_orders["order_status"].eq("delivered"), ["order_id"]].copy()

# Payment total per order (positive payments only).
pay_order = (
    clean["payments_pos"]
    .groupby("order_id", as_index=False)["payment_value"]
    .sum()
    .rename(columns={"payment_value": "payment_total"})
)

# Item-level base table: GMV of a line = price + freight.
items = clean["order_items"].assign(item_gmv=lambda d: d["price"] + d["freight_value"])

# Item GMV summed per order (the denominator of the allocation below).
items_sum = (
    items.groupby("order_id", as_index=False)["item_gmv"]
    .sum()
    .rename(columns={"item_gmv": "order_item_gmv"})
)

# Industry label per product, taken from products_fix.
prod = products_fix.loc[:, ["product_id", "industry"]].copy()

# Joins, applied in this order: order GMV, order payment, delivered filter, industry.
base = items.merge(items_sum, on="order_id", how="left")
base = base.merge(pay_order, on="order_id", how="left")
base = base.merge(orders_del, on="order_id", how="inner")
base = base.merge(prod, on="product_id", how="left")

# Revenue allocation: split the order's payment across its lines by GMV share.
# Lines with no payment, or whose order GMV is not positive, get NaN.
_gmv_share = base["item_gmv"] / base["order_item_gmv"]
_allocatable = (base["order_item_gmv"] > 0) & base["payment_total"].notna()
base["item_revenue_alloc"] = (base["payment_total"] * _gmv_share).where(_allocatable)

# Keep Home & Living lines only.
hl = base.loc[base["industry"].eq("Home & Living")].copy()

print("HL rows:", hl.shape)
for _label, _key in (
    ("HL sellers:", "seller_id"),
    ("HL orders:", "order_id"),
    ("HL products:", "product_id"),
):
    print(_label, hl[_key].nunique())

HL rows: (32935, 12)
HL sellers: 1032
HL orders: 26735
HL products: 9266


In [23]:
# Aggregate revenue per seller to see how strongly revenue is concentrated
# in particular seller groups.
#
# The questions behind this:
# - "Should every seller be grown evenly?"
# - "Is it more efficient to concentrate on growing a specific seller group?"

# Seller KPI aggregation: output column -> (source column, aggregation).
_SELLER_METRICS = {
    "seller_revenue": ("item_revenue_alloc", "sum"),
    "seller_order_cnt": ("order_id", "nunique"),
    "seller_sku_cnt": ("product_id", "nunique"),
    "seller_item_rows": ("order_id", "size"),
    "avg_price": ("price", "mean"),
    "avg_freight": ("freight_value", "mean"),
}
_per_seller = hl.groupby("seller_id", as_index=False).agg(**_SELLER_METRICS)
seller_kpi = _per_seller.sort_values(by="seller_revenue", ascending=False)

# Total revenue across all Home & Living sellers.
total_rev = seller_kpi["seller_revenue"].sum()

# Helper: extract the top-N sellers by revenue.
def get_top_sellers(df, n, total_revenue):
    """Return the ``n`` highest-revenue sellers and their combined revenue share.

    Args:
        df: seller-level table containing a ``seller_revenue`` column.
        n: number of sellers to keep.
        total_revenue: denominator used for both share calculations.

    Returns:
        A tuple ``(top_n, share)``: ``top_n`` is a copy of the ``n`` rows with
        the largest ``seller_revenue`` plus a per-seller ``rev_share`` column;
        ``share`` is their summed revenue divided by ``total_revenue``.
    """
    # Rank here instead of trusting the caller to pass a pre-sorted frame.
    # A stable sort leaves an already-sorted frame (and tied rows) in the same order.
    ranked = df.sort_values("seller_revenue", ascending=False, kind="stable")
    top_n = ranked.head(n).copy()
    top_n["rev_share"] = top_n["seller_revenue"] / total_revenue
    share = top_n["seller_revenue"].sum() / total_revenue
    return top_n, share

# Top 5 / 10 / 30 sellers and the share of total revenue each group holds.
top5, top5_share = get_top_sellers(seller_kpi, 5, total_rev)
top10, top10_share = get_top_sellers(seller_kpi, 10, total_rev)
top30, top30_share = get_top_sellers(seller_kpi, 30, total_rev)

# Print the three shares as percentages (labels padded to equal width).
for _label, _share in (
    ("Top 5 ", top5_share),
    ("Top 10", top10_share),
    ("Top 30", top30_share),
):
    print(f"{_label} revenue share : {_share * 100:.2f}%")

display(top30.head())


Top 5  revenue share : 22.62%
Top 10 revenue share : 31.54%
Top 30 revenue share : 47.77%


Unnamed: 0,seller_id,seller_revenue,seller_order_cnt,seller_sku_cnt,seller_item_rows,avg_price,avg_freight,rev_share
513,7c67e1448b00f6e969d365cea6b010ab,233670.02903,954,191,1331,137.786897,37.752667,0.059197
300,4a3ca9315b744ce9f8e9374361493884,197437.757095,1506,317,1623,103.827381,17.754362,0.050018
875,da8622b14eb17ae2831f4ac5b9dab84a,173888.346444,1072,209,1275,119.143529,17.180769,0.044052
66,1025f0e2d44d7041d6cf58b6550e0bfa,151377.954953,842,146,1345,89.961309,22.583234,0.038349
127,1f50f920176fa81dab994f9023523100,136425.697757,1373,7,1877,54.794678,17.880767,0.034562


In [113]:
# Share of total revenue held by the sellers ranked 1-30 (Pareto view).
C_BLACK  = "#0B0B0B"
C_WHITE  = "#FFFFFF"
C_BLUE   = "#3E84DF"
C_ORANGE = "#FFA742"
GRID     = "rgba(0,0,0,0.08)"

# Pareto table: sellers ordered by revenue, with individual share,
# cumulative share and a 1-based rank.
pareto = seller_kpi.sort_values("seller_revenue", ascending=False).assign(
    rev_share=lambda d: d["seller_revenue"] / d["seller_revenue"].sum(),
    cum_rev_share=lambda d: d["rev_share"].cumsum(),
    rank=lambda d: np.arange(1, len(d) + 1),
)

TOP_N = 30
# Cumulative share reached at rank TOP_N.
topN_y = float(pareto.loc[pareto["rank"].eq(TOP_N), "cum_rev_share"].iloc[0])

# Rows for ranks 1..TOP_N, used for the highlighted band.
_head = pareto.loc[pareto["rank"].le(TOP_N)]

# Trace 1: filled area under the curve for the top-N sellers (concentration zone).
_band_trace = go.Scatter(
    x=_head["rank"],
    y=_head["cum_rev_share"],
    mode="lines",
    line={"color": C_ORANGE, "width": 5},
    fill="tozeroy",
    fillcolor="rgba(255,167,66,0.18)",
    hoverinfo="skip",
    showlegend=False,
)

# Trace 2: the full Pareto curve.
_curve_trace = go.Scatter(
    x=pareto["rank"],
    y=pareto["cum_rev_share"],
    mode="lines",
    line={"color": C_BLUE, "width": 3},
    hovertemplate="Seller Rank: %{x}<br>Cumulative Revenue: %{y:.1%}<extra></extra>",
    name="Cumulative Revenue",
)

# Trace 3: marker on the top-N point.
_marker_trace = go.Scatter(
    x=[TOP_N],
    y=[topN_y],
    mode="markers",
    marker={"size": 14, "color": C_ORANGE},
    hovertemplate=f"Top {TOP_N}<br>{topN_y:.1%} Revenue<extra></extra>",
    showlegend=False,
)

# Figure
fig = go.Figure(data=[_band_trace, _curve_trace, _marker_trace])

# Reference lines: 50% of revenue, and the top-N cut-off.
fig.add_hline(
    y=0.5,
    line_dash="dash",
    line_color=C_BLUE,
    annotation_text="50% Revenue",
    annotation_position="bottom right",
)
fig.add_vline(
    x=TOP_N,
    line_dash="dash",
    line_color=C_ORANGE,
    annotation_text=f"Top {TOP_N} Sellers",
    annotation_position="top left",
)

_layout = {
    "width": 800,
    "height": 400,
    "title": {
        "text": "Home & Living Seller Revenue Concentration (Pareto)",
        "x": 0.02,
        "font": {"size": 20, "color": C_BLACK},
    },
    "xaxis_title": "Seller Rank",
    "yaxis_title": "Cumulative Revenue Share",
    "plot_bgcolor": "white",
    "paper_bgcolor": "white",
    "font": {"color": C_BLACK},
    "margin": {"l": 90, "r": 40, "t": 90, "b": 80},
}
fig.update_layout(**_layout)

# Both axes share the same grid styling; the y axis is shown as percentages.
_grid_style = {"showgrid": True, "gridcolor": GRID, "zeroline": False}
fig.update_xaxes(**_grid_style)
fig.update_yaxes(tickformat=".0%", range=[0, 1.05], **_grid_style)

fig.show()


### Î∂ÑÏÑù Í≤∞Í≥º,
-Îß§Ï∂ú Í∏∞Ï§Ä ÏÉÅÏúÑ 1~30ÏúÑ SellerÍ∞Ä Ï†ÑÏ≤¥ Îß§Ï∂úÏùò ÏïΩ 47.77%Î•º Ï∞®ÏßÄÌïúÎã§.

-Ïù¥Îäî Seller ÏÑ±Ïû• Ï†ÑÎûµÏùÑ ÏÑ§Í≥ÑÌï† Îïå,
'ÏÉÅÏúÑ Ïû†Ïû¨ Seller ÏßëÏ§ë Ïú°ÏÑ±'Ïù¥ Ïú†Ìö®Ìïú Ï†ëÍ∑ºÏûÑÏùÑ ÏãúÏÇ¨ÌïúÎã§.

### Ïú°ÏÑ±ÏùÑ ÏúÑÌïú ÏÉÅÏúÑ SellerÎì§Ïùò ÌåêÎß§ ÌñâÎèô Î∂ÑÏÑù
 Ï†ÑÏ≤¥ vs Top30 SKU/Ï£ºÎ¨∏/Îß§Ï∂ú ÏöîÏïΩ ÎπÑÍµê

 ‚ÄúÏÉÅÏúÑ ÏÖÄÎü¨Îì§Ïù¥ SKU ÎßéÏù¥ ÌåêÎã§‚Äù ÌôïÏù∏ (Î∂ÑÌè¨/ÏöîÏïΩ)

In [None]:
def _seller_profile(label, frame):
    """One summary row for ``frame``: seller count plus mean/median SKUs, orders and revenue."""
    sku = frame["seller_sku_cnt"]
    order_cnt = frame["seller_order_cnt"]
    revenue = frame["seller_revenue"]
    return {
        "group": label,
        "seller_cnt": frame["seller_id"].nunique(),
        "avg_sku": sku.mean(),
        "med_sku": sku.median(),
        "avg_orders": order_cnt.mean(),
        "med_orders": order_cnt.median(),
        "avg_rev": revenue.mean(),
        "med_rev": revenue.median(),
    }


# All sellers vs the top-30 sellers, side by side.
summary = pd.DataFrame(
    [
        _seller_profile("All sellers", seller_kpi),
        _seller_profile("Top30 sellers", top30),
    ]
)
summary

Unnamed: 0,group,seller_cnt,avg_sku,med_sku,avg_orders,med_orders,avg_rev,med_rev
0,All sellers,1032,9.187016,3.0,26.400194,5.0,3824.93121,659.76
1,Top30 sellers,30,95.933333,76.0,427.9,286.0,62850.452356,38219.486941


In [25]:
# Compare SKU counts of the top-30 sellers with all sellers (log-scale y axis).
C_BLACK  = "#0B0B0B"
C_BLUE   = "#3E84DF"
C_ORANGE = "#FFA742"
GRID     = "rgba(0,0,0,0.08)"

# Data prep.
# The statistics are computed here from seller_kpi / top30 rather than read from
# the notebook-level `summary`: later cells rebind `summary` to other objects,
# which made this cell fail when re-run after them.  Labels and values are the
# same as in the All-vs-Top30 summary table.
_sku_stats = pd.DataFrame(
    {
        "group": ["All sellers", "Top30 sellers"],
        "avg_sku": [seller_kpi["seller_sku_cnt"].mean(), top30["seller_sku_cnt"].mean()],
        "med_sku": [seller_kpi["seller_sku_cnt"].median(), top30["seller_sku_cnt"].median()],
    }
)
sku_df = _sku_stats.melt(
    id_vars="group",
    var_name="metric",
    value_name="value"
)

# Colour per group family.
group_color = {
    "All": C_BLUE,
    "Top30": C_ORANGE
}

# Figure
fig = go.Figure()

for g in sku_df["group"].unique():
    df_g = sku_df[sku_df["group"] == g]

    # Any group whose label contains "Top" is drawn in the highlight colour.
    color = group_color["Top30"] if "Top" in str(g) else group_color["All"]

    fig.add_trace(
        go.Bar(
            x=df_g["metric"],
            y=df_g["value"],
            name=g,
            marker_color=color,
            # Round to the nearest integer; int() truncated (95.93 was shown as 95).
            text=[f"{v:.0f}" for v in df_g["value"]],
            textposition="outside"
        )
    )

fig.update_layout(
    width=600,
    height=600,
    title=dict(
        text="SKU Comparison (All vs Top30)",
        x=0.02,
        font=dict(size=20, color=C_BLACK)
    ),
    xaxis_title="",
    yaxis_title="Number of SKUs (log scale)",
    yaxis_type="log",          # log scale
    plot_bgcolor="white",
    paper_bgcolor="white",
    font=dict(color=C_BLACK),
    margin=dict(l=80, r=40, t=80, b=70),
    legend=dict(
        title="",
        orientation="h",
        yanchor="bottom",
        y=-0.25,
        xanchor="left",
        x=0
    ),
    bargap=0.35
)

fig.update_xaxes(
    showgrid=False,
    tickfont=dict(size=12)
)

fig.update_yaxes(
    showgrid=True,
    gridcolor=GRID,
    zeroline=False
)

fig.show()



### SellerÎ≥Ñ SKU ÏàòÏôÄ Îß§Ï∂úÏùò Í¥ÄÍ≥ÑÎ•º Î∂ÑÏÑùÌïúÎã§.

Î™©Ï†Å:
-"Ïûò ÌååÎäî SellerÎì§ÏùÄ Î¨¥ÏóáÏùÑ Îã§Î•¥Í≤å ÌïòÎäîÍ∞Ä?"Ïóê ÎåÄÌïú
ÌñâÎèô Îã®ÏúÑÏùò ÌûåÌä∏Î•º ÏñªÍ∏∞ ÏúÑÌï®Ïù¥Îã§.

-SKU vs Îß§Ï∂ú Í¥ÄÍ≥ÑÏÑ± (ÏÉÅÍ¥Ä + Î°úÍ∑∏ÏÉÅÍ¥Ä)

In [26]:
# Sellers that have both an SKU count and a revenue figure.
tmp = (
    seller_kpi
    .dropna(subset=["seller_sku_cnt", "seller_revenue"])
    .copy()
)

# Printed label repaired from mis-encoded text ("number of sellers analysed").
print("분석 대상 셀러 수:", tmp.shape[0])

# Correlation on the original scale.
corr_spearman = tmp[["seller_sku_cnt", "seller_revenue"]].corr(method="spearman")
corr_pearson  = tmp[["seller_sku_cnt", "seller_revenue"]].corr(method="pearson")

print("\n[원본 기준 상관관계]")
print("Spearman")
display(corr_spearman)
print("Pearson")
display(corr_pearson)

# Log transform; log1p keeps zero values finite.
tmp["log_sku"] = np.log1p(tmp["seller_sku_cnt"])
tmp["log_rev"] = np.log1p(tmp["seller_revenue"])

# Correlation on the log scale.
corr_log_spearman = tmp[["log_sku", "log_rev"]].corr(method="spearman")
corr_log_pearson  = tmp[["log_sku", "log_rev"]].corr(method="pearson")

print("\n[로그 변환 기준 상관관계]")
print("Spearman (log)")
display(corr_log_spearman)
print("Pearson (log)")
display(corr_log_pearson)

# One-line summary figures (the off-diagonal entry of each 2x2 matrix).
# Stored as `corr_summary`, not `summary`: reusing `summary` overwrote the
# All-vs-Top30 DataFrame of that name defined in an earlier cell.
corr_summary = {
    "spearman_raw": corr_spearman.iloc[0,1],
    "pearson_raw": corr_pearson.iloc[0,1],
    "spearman_log": corr_log_spearman.iloc[0,1],
    "pearson_log": corr_log_pearson.iloc[0,1],
}

print("\n[요약]")
for k, v in corr_summary.items():
    print(f"{k}: {v:.3f}")


Î∂ÑÏÑù ÎåÄÏÉÅ ÏÖÄÎü¨ Ïàò: 1032

[ÏõêÎ≥∏ Í∏∞Ï§Ä ÏÉÅÍ¥ÄÍ¥ÄÍ≥Ñ]
Spearman


Unnamed: 0,seller_sku_cnt,seller_revenue
seller_sku_cnt,1.0,0.78607
seller_revenue,0.78607,1.0


Pearson


Unnamed: 0,seller_sku_cnt,seller_revenue
seller_sku_cnt,1.0,0.737116
seller_revenue,0.737116,1.0



[Î°úÍ∑∏ Î≥ÄÌôò Í∏∞Ï§Ä ÏÉÅÍ¥ÄÍ¥ÄÍ≥Ñ]
Spearman (log)


Unnamed: 0,log_sku,log_rev
log_sku,1.0,0.78607
log_rev,0.78607,1.0


Pearson (log)


Unnamed: 0,log_sku,log_rev
log_sku,1.0,0.790676
log_rev,0.790676,1.0



[ÏöîÏïΩ]
spearman_raw: 0.786
pearson_raw: 0.737
spearman_log: 0.786
pearson_log: 0.791


Ï†ÑÏ≤¥ SellerÎ•º ÎåÄÏÉÅÏúºÎ°ú SKU ÏàòÏôÄ Îß§Ï∂úÏùÑ ÎπÑÍµêÌïú Í≤∞Í≥º,

Îëê Î≥ÄÏàòÎäî Ï†ÑÎ∞òÏ†ÅÏúºÎ°ú Ï†ïÎπÑÎ°Ä Í¥ÄÍ≥ÑÎ•º Î≥¥Ïù∏Îã§.

Ï¶â, SKU ÏàòÎäî Seller Îß§Ï∂úÏùÑ ÏÑ§Î™ÖÌïòÎäî

Í∞ÄÏû• ÏßÅÍ¥ÄÏ†ÅÏù¥Î©¥ÏÑúÎèÑ Í∞ïÎ†•Ìïú ÏßÄÌëú Ï§ë ÌïòÎÇòÏù¥Îã§.

In [27]:
# SKU count vs revenue: scatter of sellers with a linear trend line.
C_BLACK  = "#0B0B0B"
C_BLUE   = "#3E84DF"
C_ORANGE = "#FFA742"
GRID     = "rgba(0,0,0,0.08)"

# Outlier cut-offs.  The filter, the title and the axis ranges all read these
# two values; the title previously said "Revenue <= 100k" while the filter and
# the y axis used 60,000.
_SKU_CAP = 200
_REV_CAP = 60_000

# Outlier removal
filt = (tmp["seller_sku_cnt"] <= _SKU_CAP) & (tmp["seller_revenue"] <= _REV_CAP)
df = tmp.loc[filt].copy()

x = df["seller_sku_cnt"]
y = df["seller_revenue"]

# Trend line: degree-1 least-squares fit on the filtered points.
coef = np.polyfit(x, y, 1)
trend = np.poly1d(coef)

x_line = np.linspace(x.min(), x.max(), 200)
y_line = trend(x_line)

# Figure
fig = go.Figure()

# Scatter (filtered points)
fig.add_trace(
    go.Scatter(
        x=x,
        y=y,
        mode="markers",
        marker=dict(
            size=7,
            color=C_BLUE,
            opacity=0.45
        ),
        hovertemplate=(
            "SKUs: %{x}<br>"
            "Revenue: %{y:,.0f}<extra></extra>"
        ),
        name="Sellers (Filtered)"
    )
)

# Trend line
fig.add_trace(
    go.Scatter(
        x=x_line,
        y=y_line,
        mode="lines",
        line=dict(color=C_ORANGE, width=3),
        name="Trend"
    )
)

# Layout
fig.update_layout(
    width=650,
    height=650,
    title=dict(
        text=(
            f"SKU vs Revenue (Filtered: SKU ≤ {_SKU_CAP}, "
            f"Revenue ≤ {_REV_CAP / 1000:.0f}k)"
        ),
        x=0.02,
        font=dict(size=20, color=C_BLACK)
    ),
    xaxis_title="Number of SKUs",
    yaxis_title="Revenue",
    plot_bgcolor="white",
    paper_bgcolor="white",
    font=dict(color=C_BLACK),
    margin=dict(l=90, r=40, t=80, b=80),
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=-0.25,
        xanchor="left",
        x=0
    )
)

fig.update_xaxes(
    showgrid=True,
    gridcolor=GRID,
    zeroline=False,
    range=[0, _SKU_CAP]
)

fig.update_yaxes(
    showgrid=True,
    gridcolor=GRID,
    zeroline=False,
    tickformat=",.0f",
    range=[0, _REV_CAP]
)

fig.show()



Îã®ÏàúÌûà 'SKUÍ∞Ä ÎßéÏùÑÏàòÎ°ù Ï¢ãÎã§'Í∞Ä ÏïÑÎãàÎùº,

"Ïñ¥ÎîîÎ∂ÄÌÑ∞ Îß§Ï∂úÏù¥ ÏùòÎØ∏ ÏûàÍ≤å Ï¶ùÍ∞ÄÌïòÎäîÍ∞Ä?"Î•º ÌôïÏù∏ÌïúÎã§.

Ïù¥Î•º ÏúÑÌï¥ SKU Íµ¨Í∞ÑÎ≥Ñ Îß§Ï∂ú Ï§ëÏúÑÍ∞í Î≥ÄÌôîÎ•º Î∂ÑÏÑùÌïúÎã§.

SKU Î∂ÑÏúÑÏàò 10Íµ¨Í∞Ñ(Ï§ëÎ≥µ Íµ¨Í∞Ñ Î∞©ÏßÄ)

In [28]:
# Bin sellers into SKU-count deciles; duplicate bin edges are dropped, so fewer
# than ten bins may result.
tmp2 = seller_kpi.assign(
    sku_bin=lambda d: pd.qcut(d["seller_sku_cnt"], q=10, duplicates="drop")
)

# Per-bin statistics: output column -> (source column, aggregation).
_BIN_METRICS = {
    "sellers": ("seller_id", "nunique"),
    "sku_min": ("seller_sku_cnt", "min"),
    "sku_max": ("seller_sku_cnt", "max"),
    "avg_sku": ("seller_sku_cnt", "mean"),
    "avg_rev": ("seller_revenue", "mean"),
    "med_rev": ("seller_revenue", "median"),
}
sku_bin_table = tmp2.groupby("sku_bin", as_index=False).agg(**_BIN_METRICS)

sku_bin_table






Unnamed: 0,sku_bin,sellers,sku_min,sku_max,avg_sku,avg_rev,med_rev
0,"(0.999, 2.0]",494,1,2,1.327935,549.574391,215.755
1,"(2.0, 3.0]",83,3,3,3.0,989.553076,611.68
2,"(3.0, 4.0]",81,4,4,4.0,2735.607731,877.11
3,"(4.0, 6.0]",90,5,6,5.522222,2557.625554,1219.775
4,"(6.0, 9.0]",81,7,9,7.987654,5531.969227,2465.09
5,"(9.0, 17.0]",102,10,17,12.990196,4956.557938,3559.815
6,"(17.0, 317.0]",101,18,317,57.257426,21666.112984,9781.923802


In [29]:
# Median revenue per SKU-quantile bin, with the bins from the threshold onwards highlighted.
COLOR_BLUE   = "#3E84DF"
COLOR_ORANGE = "#FFA742"
COLOR_BLACK  = "#1f1f1f"
COLOR_GRID   = "rgba(0,0,0,0.08)"

# SKU count treated as the threshold in this chart (named instead of a bare 18).
_SKU_THRESHOLD = 18

# Data prep: bins ordered by their smallest SKU count, labelled by their largest.
t = sku_bin_table.sort_values("sku_min").reset_index(drop=True).copy()
t["sku_label"] = t["sku_max"].apply(lambda x: f"~{int(x)}")

# Threshold bin: the last bin whose minimum SKU count is <= the threshold.
threshold_idx = t[t["sku_min"] <= _SKU_THRESHOLD].index.max()
threshold_label = t.loc[threshold_idx, "sku_label"]

# Highlighted span: from the threshold bin to the last bin.
ramp_start = threshold_label
ramp_end = t["sku_label"].iloc[-1]

# Figure
fig = go.Figure()

# Highlight the span with a vertical band (a range, not a single point).
fig.add_vrect(
    x0=ramp_start,
    x1=ramp_end,
    fillcolor="rgba(255,167,66,0.18)",  # orange with alpha
    line_width=0,
    layer="below"
)

# Line for the bins up to and including the threshold bin (blue).
# .loc slicing is label-based and inclusive, so both lines contain the threshold bin.
fig.add_trace(
    go.Scatter(
        x=t.loc[:threshold_idx, "sku_label"],
        y=t.loc[:threshold_idx, "med_rev"],
        mode="lines+markers",
        line=dict(color=COLOR_BLUE, width=2),
        marker=dict(size=9, color=COLOR_BLUE, opacity=0.65),
        hovertemplate="SKU Range: %{x}<br>Median Revenue: %{y:,.0f}<extra></extra>",
        showlegend=False
    )
)

# Line for the bins from the threshold bin onwards (orange, drawn heavier).
fig.add_trace(
    go.Scatter(
        x=t.loc[threshold_idx:, "sku_label"],
        y=t.loc[threshold_idx:, "med_rev"],
        mode="lines+markers",
        line=dict(color=COLOR_ORANGE, width=4),  # thicker line
        marker=dict(size=11, color=COLOR_ORANGE, opacity=0.95),
        hovertemplate="SKU Range: %{x}<br>Median Revenue: %{y:,.0f}<extra></extra>",
        showlegend=False
    )
)

# Text-only label for the highlighted span (no arrow, no point marker).
# The label text is repaired from mis-encoded characters; it reads
# "revenue surge zone after the threshold".
fig.add_annotation(
    x=ramp_start,
    y=1.03,
    xref="x",
    yref="paper",
    text="임계 이후 매출 급상승 구간",
    showarrow=False,
    font=dict(size=12, color=COLOR_BLACK),
    bgcolor="rgba(255,255,255,0.9)",
    bordercolor="rgba(0,0,0,0.12)",
    borderwidth=1,
    xanchor="left"
)

# Layout (800 x 400, i.e. 2:1)
fig.update_layout(
    width=800,
    height=400,
    title=dict(
        text="Median Revenue by SKU Bin (Home & Living)",
        x=0.01,
        font=dict(size=20, color=COLOR_BLACK)
    ),
    xaxis_title="SKU Range",
    yaxis_title="Median Revenue",
    plot_bgcolor="white",
    paper_bgcolor="white",
    font=dict(color=COLOR_BLACK),
    margin=dict(l=80, r=40, t=70, b=60),
)

# Keep the categorical x axis in bin order.
fig.update_xaxes(
    showgrid=True,
    gridcolor=COLOR_GRID,
    zeroline=False,
    categoryorder="array",
    categoryarray=t["sku_label"].tolist(),
)

fig.update_yaxes(
    showgrid=True,
    gridcolor=COLOR_GRID,
    zeroline=False,
    tickformat=",.0f",
)

fig.show()


### 분석 결과,

SKU 18개 이상 구간에서 매출 중위값이 가장 큰 폭으로 상승한다.

이는 'SKU 18개'가

Seller 성장을 위한 하나의 실질적인 기준선(Threshold)이 될 수 있음을 의미한다. (단, 18은 분위수 구간의 경계값이며, 마지막 구간은 SKU 18~317개로 범위가 넓다는 점에 유의한다.)

In [30]:

# Compare sellers below and at/above the SKU threshold (18).
THRESHOLD = 18

df = seller_kpi.copy()

# Split sellers into two groups by SKU count.
_below_threshold = df["seller_sku_cnt"] < THRESHOLD
df["sku_group"] = _below_threshold.map(
    {True: f"< {THRESHOLD}", False: f">= {THRESHOLD}"}
)

# Key metrics per group: output column -> (source column, aggregation).
_GROUP_METRICS = {
    "sellers": ("seller_id", "nunique"),
    "avg_sku": ("seller_sku_cnt", "mean"),
    "med_sku": ("seller_sku_cnt", "median"),
    "avg_rev": ("seller_revenue", "mean"),
    "med_rev": ("seller_revenue", "median"),
    "avg_orders": ("seller_order_cnt", "mean"),
    "med_orders": ("seller_order_cnt", "median"),
}
# NOTE(review): this rebinds the notebook-level name `summary`.
summary = df.groupby("sku_group", as_index=False).agg(**_GROUP_METRICS)

summary


Unnamed: 0,sku_group,sellers,avg_sku,med_sku,avg_rev,med_rev,avg_orders,med_orders
0,< 18,931,3.972073,2.0,1889.421695,508.63,11.832438,4.0
1,>= 18,101,57.257426,36.0,21666.112984,9781.923802,160.683168,83.0


18Í∞úÏùò SKU Ïñ¥Îñ§ Ïπ¥ÌÖåÍ≥†Î¶¨ÏóêÏÑú Í≥®ÎùºÏïº ÌïòÎäîÍ∞Ä

ÏÉÅÏúÑ 3Í∞úÏùò category ÏóêÏÑú Îß§Ï∂úÏùò 80%Í∞Ä Î∞úÏÉùÌï®

Ïã†Í∑ú ÏßÑÏûÖÏù¥ÎÇò Îß§Ï∂úÏù¥ Ï†ÄÏ°∞Ìïú SellerÎì§ÏóêÍ≤å 

18Í∞ú Ïù¥ÏÉÅÏùò SKU -> Îß§Ï∂ú ÏÉÅÏúÑ 3Í∞úÏùò Category ÎÇ¥ÏóêÏÑú ÏÑ†Ï†ï Ï∂îÏ≤ú

In [31]:
# 1) Order lines tagged with industry and English category, Home & Living only.
_product_tags = products_fix[["product_id", "industry", "product_category_name_english"]]
_tagged_items = order_items.merge(_product_tags, on="product_id", how="left")
hl_items = _tagged_items[_tagged_items["industry"] == "Home & Living"]

# 2) Per-category aggregation: output column -> (source column, aggregation).
_CATEGORY_METRICS = {
    "order_cnt": ("order_id", "nunique"),    # distinct orders
    "item_cnt": ("order_item_id", "count"),  # units sold, counted per line item
    "total_sales": ("price", "sum"),         # revenue as the sum of item prices
}
cat_perf = hl_items.groupby("product_category_name_english", as_index=False).agg(
    **_CATEGORY_METRICS
)

# Sort for readability, by order count (largest first).
cat_perf = cat_perf.sort_values("order_cnt", ascending=False)
cat_perf.head(10)


Unnamed: 0,product_category_name_english,order_cnt,item_cnt,total_sales
0,bed_bath_table,9417,11115,1036988.68
1,furniture_decor,6449,8334,729762.49
5,housewares,5884,6964,632248.66
3,garden_tools,3518,4347,485256.46
6,office_furniture,1273,1691,273960.7
4,home_construction,490,604,83088.12
2,furniture_living_room,422,503,68916.56


In [32]:
# Order volume per Home & Living category (the chart plots order counts,
# not revenue), with the three core categories highlighted.
C_BLACK = "#0B0B0B"
C_BLUE  = "#3E84DF"
C_GRAY  = "#D3D3D3"
GRID    = "rgba(0,0,0,0.15)"

# Highlight categories
highlight_cats = [
    "housewares",
    "furniture_decor",
    "bed_bath_table"
]

# Data prep: up to 30 categories, ascending so the largest bar is drawn at the top.
plot_df = cat_perf.head(30).sort_values("order_cnt", ascending=True)

colors = [
    C_BLUE if c in highlight_cats else C_GRAY
    for c in plot_df["product_category_name_english"]
]

# Figure
fig = go.Figure()

fig.add_trace(
    go.Bar(
        y=plot_df["product_category_name_english"],
        x=plot_df["order_cnt"],
        orientation="h",
        marker_color=colors,
        hovertemplate=(
            "Category: %{y}<br>"
            "Orders: %{x:,.0f}<extra></extra>"
        ),
        showlegend=False
    )
)

# Star annotation just past the end of each highlighted bar.
# The glyph is repaired from mis-encoded characters.
max_x = plot_df["order_cnt"].max()
_starred = plot_df[plot_df["product_category_name_english"].isin(highlight_cats)]

for _category, _orders in zip(
    _starred["product_category_name_english"], _starred["order_cnt"]
):
    fig.add_annotation(
        x=_orders + max_x * 0.02,
        y=_category,
        text="★",
        showarrow=False,
        font=dict(size=16, color=C_BLACK),
        xanchor="left",
        yanchor="middle"
    )

# Layout
fig.update_layout(
    width=700,
    height=700,
    title=dict(
        text="Home & Living: Core Categories Driving Orders",
        x=0.02,
        font=dict(size=20, color=C_BLACK)
    ),
    xaxis_title="Number of Orders",
    yaxis_title="Category (English)",
    plot_bgcolor="white",
    paper_bgcolor="white",
    font=dict(color=C_BLACK),
    margin=dict(l=180, r=40, t=80, b=80),
)

# Axes (emphasised ticks and grid)
fig.update_xaxes(
    showgrid=True,
    gridcolor=GRID,
    zeroline=False,
    ticks="outside",
    ticklen=8,
    tickwidth=1.5,
    tickcolor=C_BLACK,
    tickformat=",.0f"
)

fig.update_yaxes(
    showgrid=False,
    ticks="outside",
    tickfont=dict(size=12)
)

fig.show()

Home & Living 카테고리의 재구매율이 낮게 보이는 이유를

데이터 관점에서 해석한다.

카테고리 특성상 제품 교체 주기가 길어

관측 기간(2년) 내 재구매가 발생하지 않았을 가능성이 높다. (아래 전환율은 산업 구분 없이 전체 주문 기준으로 계산한다.)

In [33]:
# Repeat-purchase customer conversion rate
# =========================
# Join orders with customers so every order carries its customer_unique_id.
# Left join: orders without a matching customer row are kept (with a missing id).
orders_u = pd.merge(
    orders_fix[["order_id", "customer_id", "order_purchase_timestamp"]],
    customers[["customer_id", "customer_unique_id"]],
    how="left",
    on="customer_id",
)

# Parse purchase timestamps; unparseable values become NaT instead of raising.
orders_u["order_purchase_timestamp"] = pd.to_datetime(
    orders_u["order_purchase_timestamp"], errors="coerce"
)

# Purchase count per customer_unique_id: one row of orders_u counts as one
# purchase. No category filter is applied in this step (the original note says
# the count is category-independent).
purchase_cnt = (
    orders_u
    .groupby("customer_unique_id")
    .size()
    .rename("purchase_count")
    .reset_index()
)

# Repeat-purchase conversion rate helper
def conversion_rate(df, n_from, n_to):
    """Return how many customers moved from ``n_from``+ to ``n_to``+ purchases.

    Args:
        df: DataFrame with a ``purchase_count`` column (one row per customer).
        n_from: Minimum purchase count that defines the base group.
        n_to: Minimum purchase count that defines the converted group.

    Returns:
        Tuple ``(base, conv, rate)``: the number of rows with
        ``purchase_count >= n_from``, the number with
        ``purchase_count >= n_to``, and ``conv / base`` (``0`` when the base
        group is empty).
    """
    counts = df["purchase_count"]
    base = int((counts >= n_from).sum())
    conv = int((counts >= n_to).sum())

    # Guard against division by zero when nobody reaches n_from purchases.
    if base <= 0:
        return base, conv, 0
    return base, conv, conv / base

# Step-wise conversion rates: customers with >=1 purchase who reach >=2,
# >=2 who reach >=3, and >=3 who reach >=4.
results = []

for n_from, n_to in [(1, 2), (2, 3), (3, 4)]:
    base, conv, rate = conversion_rate(purchase_cnt, n_from, n_to)
    results.append({
        # Arrow repaired: the literal was mis-encoded and rendered as garbage.
        "from_to": f"{n_from} → {n_to}",
        "base_customers": base,
        "converted_customers": conv,
        "conversion_rate": rate,
    })

# conversion_rate stays a raw fraction here; it is only formatted for display.
conversion_df = pd.DataFrame(results)

# Show the result with the rate as a percentage string, always two decimals
# (e.g. "8.40%" rather than "8.4%").
display(
    conversion_df.assign(
        conversion_rate=lambda frame: frame["conversion_rate"].map("{:.2%}".format)
    )
)


Unnamed: 0,from_to,base_customers,converted_customers,conversion_rate
0,1 → 2,96096,2997,3.12%
1,2 → 3,2997,252,8.41%
2,3 → 4,252,49,19.44%


In [127]:
# Share of repeat-purchase customers

# Color palette (same values as the other charts)
C_BLACK  = "#0B0B0B"
C_BLUE   = "#3E84DF"
C_ORANGE = "#FFA742"

# All customers vs. repeat customers (2 or more purchases)
total_customers = int((purchase_cnt["purchase_count"] >= 1).sum())
repeat_2plus = int((purchase_cnt["purchase_count"] >= 2).sum())

# Fraction of customers who bought at least twice (0 when there are no customers).
rate = repeat_2plus / total_customers if total_customers > 0 else 0
rate_pct = rate * 100

# The second slice is total minus repeat customers, i.e. customers with exactly
# one purchase, so it is labelled as one-time buyers. The previous label
# ("all purchasing customers") did not match the plotted value.
labels = ["재구매 고객(2회 이상)", "1회 구매 고객"]
values = [repeat_2plus, total_customers - repeat_2plus]

# Donut chart (the hole leaves room for the headline number in the center).
# sort=False keeps the slices in the order given by `labels`, so the colors
# [orange, blue] map to [repeat customers, remaining customers].
fig = go.Figure(
    data=[
        go.Pie(
            labels=labels,
            values=values,
            hole=0.65,
            textinfo="none",
            marker=dict(colors=[C_ORANGE, C_BLUE]),
            sort=False
        )
    ]
)

# Show the repeat-customer share in large type at the center of the donut.
fig.add_annotation(
    text=f"<b>{rate_pct:.1f}%</b><br><span style='font-size:12px'>재구매 고객</span>",
    x=0.5, y=0.5,
    showarrow=False,
    font=dict(color=C_BLACK, size=22)
)

# Single layout call: title, colors, margins, legend and the fixed figure size
# (previously split across two consecutive update_layout calls).
fig.update_layout(
    title=dict(
        text="전체 고객 대비 재구매 고객 비중",
        x=0.02,
        font=dict(size=20, color=C_BLACK)
    ),
    plot_bgcolor="white",
    paper_bgcolor="white",
    font=dict(color=C_BLACK),
    margin=dict(l=40, r=40, t=70, b=40),
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=-0.15,
        xanchor="left",
        x=0
    ),
    autosize=False,
    width=600,
    height=400
)

fig.show()

# Text summary of the same numbers shown in the chart.
print(f"전체 구매 고객: {total_customers:,}")
print(f"재구매 고객(2회 이상): {repeat_2plus:,}")
print(f"비중: {rate_pct:.2f}%")


전체 구매 고객: 96,096
재구매 고객(2회 이상): 2,997
비중: 3.12%


In [34]:
# Visualize the repeat-purchase conversion rate for each step
# =====================
# Color Palette
# =====================
C_BLACK  = "#0B0B0B"
C_BLUE   = "#3E84DF"
C_ORANGE = "#FFA742"
GRID     = "rgba(0,0,0,0.08)"

# =====================
# Data
# =====================
# Work on a copy so the percentage column does not leak into conversion_df.
df = conversion_df.copy()
df["rate_pct"] = df["conversion_rate"] * 100

# Colors: only the last step (3 → 4) is orange, the rest are blue.
# The last label is looked up once instead of on every iteration.
last_step = df["from_to"].iloc[-1] if not df.empty else None
bar_colors = [
    C_ORANGE if step == last_step else C_BLUE
    for step in df["from_to"]
]

# =====================
# Figure
# =====================
fig = go.Figure(
    go.Bar(
        x=df["from_to"],
        y=df["rate_pct"],
        marker_color=bar_colors,
        width=0.5,  # thinner bars than the ~0.8 default
        text=[f"{v:.2f}%" for v in df["rate_pct"]],
        textposition="outside",
        hovertemplate="구간: %{x}<br>전환율: %{y:.2f}%<extra></extra>",
        showlegend=False
    )
)

# =====================
# Layout
# =====================
# One layout call: title, axis titles, colors, margins and fixed figure size
# (previously split across two update_layout calls).
fig.update_layout(
    title=dict(
        text="재구매 전환율 (전체 고객 기준)",
        x=0.02,
        font=dict(size=20, color=C_BLACK)
    ),
    xaxis_title="",
    yaxis_title="Conversion Rate (%)",
    plot_bgcolor="white",
    paper_bgcolor="white",
    font=dict(color=C_BLACK),
    margin=dict(l=80, r=40, t=70, b=60),
    autosize=False,
    width=600,
    height=400
)

# One y-axis call (the same settings were previously applied twice).
# The fixed 0-25 range leaves headroom above the tallest bar for its outside
# text label; raise the upper bound if any rate exceeds ~23%.
fig.update_yaxes(
    range=[0, 25],
    showgrid=True,
    gridcolor=GRID,
    zeroline=False,
    rangemode="tozero"
)

fig.update_xaxes(showgrid=False)

fig.show()

다만, 3회 이상 구매 고객의 경우

리텐션이 급격히 증가하는 패턴을 보인다.

이는 Seller 성장 전략과 병행하여

'충성 고객 식별 및 관리' 전략이 유효함을 시사한다.

## ✅ 최종 결론

Olist ÏÑ±Ïû•ÏùÑ ÏúÑÌïú Í∞ÄÏû• Í∞ïÎ†•Ìïú Î†àÎ≤ÑÎäî Îã§ÏùåÍ≥º Í∞ôÏäµÎãàÎã§.

Seller: SKU 18Í∞ú Ïù¥ÏÉÅ

Customer: 3Ìöå Ïù¥ÏÉÅ Íµ¨Îß§ Í≥†Í∞ù Ïû•Í∏∞Ìôî

SellerÎäî ÌÇ§Ïö∞Í≥†,

Ï∂©ÏÑ± CustomerÎäî Ïò§Îûò ÎÇ®Í≤å ÎßåÎì†Îã§.