# Home & Living Growth Strategy Report

**Olist E-commerce Data Analysis**

This report contains data analysis results for establishing growth strategies for Olist's Home & Living category.

## 0. Environment Setup

In [None]:
# Import Libraries
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
# Suppress every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# The status marker was mis-decoded UTF-8 (mojibake); emit the intended emoji.
print("✅ Libraries Loaded")

## 1. Load Data

In [None]:
# Data Path
# Use forward slashes so the path resolves on both Windows and POSIX
# (a backslash is an ordinary filename character on POSIX systems).
# Kept as a plain str so any code that concatenates onto it keeps working.
DATA_PATH = 'data/preprocessed'

# Load CSV Files
# A single pre-merged CSV is read into `df`, which every later cell uses.
df = pd.read_csv(f'{DATA_PATH}/preprocessed_all_merged.csv')

print(df)

## 2. Data Preprocessing

In [None]:
# Industry Mapping
# Categories are listed once per industry and then inverted into the flat
# {category -> industry} lookup that the mapping step expects.
_CATEGORIES_BY_INDUSTRY = {
    "Home & Living": [
        "bed_bath_table",
        "furniture_decor",
        "office_furniture",
        "furniture_living_room",
        "housewares",
        "home_construction",
        "garden_tools",
    ],
    "Tech & Electronics": [
        "computers_accessories",
        "pcs",
        "telephony",
        "home_appliances",
        "small_appliances",
        "air_conditioning",
        "electronics",
        "consoles_games",
    ],
    "Health & Beauty": [
        "perfumery",
        "health_beauty",
    ],
    "Sports & Leisure": [
        "sports_leisure",
        "musical_instruments",
        "art",
    ],
    "Fashion & Accessories": [
        "watches_gifts",
        "luggage_accessories",
        "fashion_bags_accessories",
        "fashion_shoes",
    ],
    "Kids & Toys": [
        "baby",
        "toys",
    ],
    "Automotive": [
        "auto",
    ],
    "Life Goods": [
        "stationery",
        "books_general_interest",
        "cool_stuff",
    ],
    "Pet & Agro": [
        "pet_shop",
        "agro_industry_and_commerce",
    ],
    "Construction & Safety": [
        "construction_tools_safety",
        "signaling_and_security",
    ],
}

industry_map = {
    category: industry
    for industry, categories in _CATEGORIES_BY_INDUSTRY.items()
    for category in categories
}

# Label every row with its industry; categories that are absent from
# industry_map (map() yields NaN for them) are grouped under 'Other'.
df['industry'] = df['product_category_name_english'].map(industry_map).fillna('Other')
# Per-row GMV is the item price plus its freight value.
df['gmv_item'] = df['price'] + df['freight_value']

# The status marker was mis-decoded UTF-8 (mojibake); emit the intended emoji.
print("✅ Industry mapping and GMV calculation complete")

## 3. Why Home & Living?

In [None]:
# Aggregation by Industry
# One row per industry: distinct sellers, distinct orders and summed GMV,
# ordered from the highest-GMV industry downwards.
per_industry = df.groupby('industry').agg(
    seller_count=('seller_id', 'nunique'),
    order_count=('order_id', 'nunique'),
    total_gmv=('gmv_item', 'sum'),
)
industry_summary = per_industry.reset_index().sort_values('total_gmv', ascending=False)

display(industry_summary)

### 3-1. Seller Count by Industry

In [None]:
# Horizontal bar chart of distinct sellers per industry, sorted ascending
# by seller count, with Home & Living drawn in the highlight colour.
chart_data = industry_summary.sort_values('seller_count')
colors = chart_data['industry'].map(
    lambda name: '#ff7f0e' if name == 'Home & Living' else '#1f77b4'
).tolist()

seller_bars = go.Bar(
    x=chart_data['seller_count'],
    y=chart_data['industry'],
    orientation='h',
    marker={'color': colors},
    text=chart_data['seller_count'],
    texttemplate='%{text:,}',
    textposition='outside',
)

fig = go.Figure(data=[seller_bars])
fig.update_layout(
    title='Seller Count by Industry',
    xaxis_title='Seller Count',
    yaxis_title='Industry',
    height=600,
    showlegend=False,
)
fig.show()

### 3-2. Order Count by Industry

In [None]:
# Horizontal bar chart of distinct orders per industry, sorted ascending
# by order count, with Home & Living drawn in the highlight colour.
chart_data = industry_summary.sort_values('order_count')
colors = chart_data['industry'].map(
    lambda name: '#ff7f0e' if name == 'Home & Living' else '#1f77b4'
).tolist()

order_bars = go.Bar(
    x=chart_data['order_count'],
    y=chart_data['industry'],
    orientation='h',
    marker={'color': colors},
    text=chart_data['order_count'],
    texttemplate='%{text:,}',
    textposition='outside',
)

fig = go.Figure(data=[order_bars])
fig.update_layout(
    title='Order Count by Industry',
    xaxis_title='Order Count',
    yaxis_title='Industry',
    height=600,
    showlegend=False,
)
fig.show()

### 3-3. GMV by Industry

In [None]:
# Horizontal bar chart of total GMV per industry, sorted ascending by GMV,
# with Home & Living drawn in the highlight colour.
chart_data = industry_summary.sort_values('total_gmv')
colors = chart_data['industry'].map(
    lambda name: '#ff7f0e' if name == 'Home & Living' else '#1f77b4'
).tolist()

gmv_bars = go.Bar(
    x=chart_data['total_gmv'],
    y=chart_data['industry'],
    orientation='h',
    marker={'color': colors},
    text=chart_data['total_gmv'],
    texttemplate='R$ %{text:,.0f}',
    textposition='outside',
)

fig = go.Figure(data=[gmv_bars])
fig.update_layout(
    title='GMV by Industry (Total Revenue)',
    xaxis_title='GMV (R$)',
    yaxis_title='Industry',
    height=600,
    showlegend=False,
)
fig.show()

## 4. Seller Side: Revenue Concentration

In [None]:
# Filter Home & Living
home_rows = df['industry'] == 'Home & Living'
df_home = df[home_rows].copy()

# Total GMV per seller, ranked from highest to lowest with a fresh 0..n-1 index.
seller_gmv = (
    df_home.groupby('seller_id')['gmv_item']
    .sum()
    .rename('total_gmv')
    .reset_index()
    .sort_values('total_gmv', ascending=False)
    .reset_index(drop=True)
)

home_seller_total = len(seller_gmv)
home_gmv_total = seller_gmv['total_gmv'].sum()
print(f"Home & Living Seller Count: {home_seller_total:,}")
print(f"Total GMV: R$ {home_gmv_total:,.2f}")

### 4-1. Top 30 Sellers Revenue Concentration

In [None]:
# Take (up to) the 30 highest-GMV sellers; seller_gmv is sorted descending.
top_30_sellers = seller_gmv.head(30).copy()
total_home_gmv = seller_gmv['total_gmv'].sum()
top_30_sellers['cumsum_gmv'] = top_30_sellers['total_gmv'].cumsum()
# Cumulative share is measured against ALL Home & Living sellers, not just the top 30.
top_30_sellers['cumsum_pct'] = (top_30_sellers['cumsum_gmv'] / total_home_gmv) * 100
# Guard the empty case so .iloc[-1] cannot raise IndexError.
top_30_concentration = (
    top_30_sellers['cumsum_pct'].iloc[-1] if len(top_30_sellers) else 0.0
)

# Ranks follow the actual number of rows: with fewer than 30 sellers a fixed
# range(1, 31) would no longer line up with the y values.
seller_ranks = list(range(1, len(top_30_sellers) + 1))

# Bars (left axis) show each seller's GMV; the line (right axis) shows the
# running share of total GMV.
fig = make_subplots(specs=[[{"secondary_y": True}]])
fig.add_trace(
    go.Bar(
        x=seller_ranks,
        y=top_30_sellers['total_gmv'],
        name='Individual GMV',
        marker=dict(color='#3498db')
    ),
    secondary_y=False
)
fig.add_trace(
    go.Scatter(
        x=seller_ranks,
        y=top_30_sellers['cumsum_pct'],
        name='Cumulative %',
        mode='lines+markers',
        line=dict(color='#e74c3c', width=3)
    ),
    secondary_y=True
)
fig.update_layout(
    title=f'Top 30 Sellers GMV Concentration (Cumulative {top_30_concentration:.1f}%)',
    xaxis_title='Seller Rank',
    height=600
)
fig.update_yaxes(title_text='GMV (R$)', secondary_y=False)
fig.update_yaxes(title_text='Cumulative %', secondary_y=True)
fig.show()

# The insight marker was mis-decoded UTF-8 (mojibake); emit the intended emoji.
print(f"💡 Top 30 sellers account for {top_30_concentration:.1f}% of total sales.")

### 4-2. Seller GMV Distribution

In [None]:
# Histogram of per-seller GMV with the median marked by a dashed red line.
median_gmv = seller_gmv['total_gmv'].median()

gmv_histogram = go.Histogram(
    x=seller_gmv['total_gmv'],
    nbinsx=50,
    marker={'color': '#3498db'},
)

fig = go.Figure(data=[gmv_histogram])
fig.add_vline(
    x=median_gmv,
    line_dash="dash",
    line_color="red",
    annotation_text=f"Median: R$ {median_gmv:,.0f}",
)
# NOTE(review): the bin count is set via nbinsx while the x axis is switched to
# a log scale below — confirm the bars and the median marker render as intended.
fig.update_layout(
    title='Seller GMV Distribution',
    xaxis_title='GMV (R$)',
    xaxis_type='log',
    yaxis_title='Seller Count',
    height=600,
)
fig.show()

## 5. Relationship between SKU Count and Sales

In [None]:
# Distinct products (SKUs) per seller, joined onto the per-seller GMV table.
seller_sku = (
    df_home.groupby('seller_id')['product_id']
    .nunique()
    .rename('sku_count')
    .reset_index()
)
seller_analysis = pd.merge(seller_gmv, seller_sku, on='seller_id')

average_sku_count = seller_analysis['sku_count'].mean()
print(f"Average SKU Count: {average_sku_count:.1f}")

### 5-1. SKU Count vs GMV

In [None]:
# SKU count highlighted by the chart; named once so the marker line, its
# annotation and the printed takeaway cannot drift apart.
# NOTE(review): the value is fixed here, not derived from the data in this cell.
SKU_THRESHOLD = 18

# One point per seller: SKU count on x, total GMV on a log-scaled y axis,
# with marker colour also encoding GMV.
fig = go.Figure()
fig.add_trace(go.Scatter(
    x=seller_analysis['sku_count'],
    y=seller_analysis['total_gmv'],
    mode='markers',
    marker=dict(
        size=8,
        color=seller_analysis['total_gmv'],
        colorscale='Blues',
        showscale=True
    )
))
fig.add_vline(
    x=SKU_THRESHOLD,
    line_dash="dash",
    line_color="red",
    line_width=3,
    annotation_text=f"Threshold: {SKU_THRESHOLD} SKUs"
)
fig.update_layout(
    title='Relationship between SKU Count and GMV',
    xaxis_title='SKU Count',
    yaxis_title='GMV (R$)',
    yaxis_type='log',
    height=600
)
fig.show()

# The insight marker was mis-decoded UTF-8 (mojibake); emit the intended emoji.
print(f"💡 Sales increase sharply from {SKU_THRESHOLD} SKUs onwards.")

### 5-2. Average GMV by SKU Range

In [None]:
def categorize_sku(sku_count):
    """Return the SKU-range label for a seller's SKU count.

    Buckets are checked in ascending order and the first one whose inclusive
    upper bound is >= ``sku_count`` wins; anything above 50 (or any value that
    compares False against every bound) is labelled ``'51+'``.
    """
    buckets = (
        (5, '1-5'),
        (10, '6-10'),
        (17, '11-17'),
        (25, '18-25'),
        (50, '26-50'),
    )
    for upper_bound, label in buckets:
        if sku_count <= upper_bound:
            return label
    return '51+'

# Bucket every seller by SKU count, then average GMV and count sellers per bucket.
seller_analysis['sku_range'] = seller_analysis['sku_count'].apply(categorize_sku)
sku_range_avg = seller_analysis.groupby('sku_range').agg({
    'total_gmv': 'mean',
    'seller_id': 'count'
}).reset_index()
sku_range_avg.columns = ['sku_range', 'avg_gmv', 'seller_count']

# The labels are strings, so impose the numeric bucket order explicitly
# before sorting (plain string sorting would misplace e.g. '11-17').
range_order = ['1-5', '6-10', '11-17', '18-25', '26-50', '51+']
sku_range_avg['sku_range'] = pd.Categorical(sku_range_avg['sku_range'], categories=range_order, ordered=True)
sku_range_avg = sku_range_avg.sort_values('sku_range')

# Colour each bar from its own label rather than by position: a bucket with no
# sellers is absent from sku_range_avg, and a fixed six-colour list would then
# shift the blue/red split onto the wrong bars.
below_threshold_ranges = {'1-5', '6-10', '11-17'}
colors_range = [
    '#3498db' if sku_range in below_threshold_ranges else '#e74c3c'
    for sku_range in sku_range_avg['sku_range']
]

fig = go.Figure()
fig.add_trace(go.Bar(
    x=sku_range_avg['sku_range'],
    y=sku_range_avg['avg_gmv'],
    marker=dict(color=colors_range),
    text=sku_range_avg['avg_gmv'],
    texttemplate='R$ %{text:,.0f}',
    textposition='outside'
))
fig.update_layout(
    title='Average GMV by SKU Range',
    xaxis_title='SKU Range',
    yaxis_title='Average GMV (R$)',
    height=600
)
fig.show()

## 6. User Side: Repurchase Analysis

In [None]:
# Distinct Home & Living orders per customer.
customer_orders = (
    df_home.groupby('customer_unique_id')['order_id']
    .nunique()
    .rename('order_count')
    .reset_index()
)

# A returning customer is one with at least two distinct orders.
has_repeat_order = customer_orders['order_count'] >= 2
total_customers = customer_orders['customer_unique_id'].nunique()
repeat_customers = customer_orders.loc[has_repeat_order, 'customer_unique_id'].nunique()
repeat_rate = repeat_customers / total_customers * 100

for summary_line in (
    f"Total Customers: {total_customers:,}",
    f"Returning Customers: {repeat_customers:,}",
    f"Repurchase Rate: {repeat_rate:.2f}%",
):
    print(summary_line)

### 6-1. Repurchase Rate

In [None]:
# Gauge of the repurchase rate on a fixed 0-30 axis, with three 10-point
# background bands in progressively darker greys.
band_colors = ["#ecf0f1", "#bdc3c7", "#95a5a6"]
gauge_steps = [
    dict(range=[band_start, band_start + 10], color=shade)
    for band_start, shade in zip((0, 10, 20), band_colors)
]

repurchase_gauge = go.Indicator(
    mode="gauge+number",
    value=repeat_rate,
    title={'text': 'Repurchase Rate (%)'},
    gauge=dict(
        axis=dict(range=[0, 30]),
        bar=dict(color="#3498db"),
        steps=gauge_steps,
    ),
)

fig = go.Figure(data=[repurchase_gauge])
fig.update_layout(height=500)
fig.show()

### 6-2. Purchase Retention Funnel

In [None]:
# Customers who placed at least 1, 2, 3 and 4 distinct orders respectively.
order_counts = customer_orders['order_count']
customers_1, customers_2, customers_3, customers_4 = (
    customer_orders.loc[order_counts >= min_orders, 'customer_unique_id'].nunique()
    for min_orders in (1, 2, 3, 4)
)

# Funnel stages shrink from first-time buyers to customers with 4+ orders;
# each stage is labelled with its count and its share of the first stage.
fig = go.Figure()
fig.add_trace(go.Funnel(
    y=['1st Purchase', '2nd Purchase', '3rd Purchase', '4th Purchase'],
    x=[customers_1, customers_2, customers_3, customers_4],
    textposition="inside",
    texttemplate='%{x:,} Users<br>(%{percentInitial:.1%})',
    marker=dict(color=['#3498db', '#5dade2', '#85c1e9', '#aed6f1'])
))
fig.update_layout(
    title='Purchase Retention Funnel',
    height=600
)
fig.show()

# The insight marker was mis-decoded UTF-8 (mojibake); emit the intended emoji.
print("💡 Conversion from 1st to 2nd purchase is the most critical challenge.")

## 7. Key Insights

### 📊 Analysis Summary

1. **Home & Living Selection**: Top tier in Seller Count, Order Count, and GMV.

2. **Seller Side Revenue Concentration**: Top 30 sellers account for a high portion of sales.

3. **SKU Threshold**: Sales skyrocket from 18 SKUs.

4. **Low Repurchase Rate**: Customer retention strategy needs strengthening.

5. **Purchase Retention**: 1st to 2nd purchase conversion is key.

### 💡 Strategic Recommendations

- Encourage sellers to secure **18+ SKUs**.
- Focus on improving **repurchase conversion** for first-time buyers.
- Run educational programs that share top seller success stories.
- Strengthen promotions specific to Home & Living.