In [None]:
# E-Commerce Data Analysis Project
# Complete pipeline for data simulation, processing, KPI reporting, and forecasting
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
from statsmodels.tsa.arima.model import ARIMA
# Seed NumPy's global random generator so every simulated column is
# reproducible on a fresh Restart & Run All.
SEED = 42
np.random.seed(SEED)

In [None]:
# Simulate the raw dataset with realistic e-commerce attributes.
#
# All random columns draw from NumPy's global generator, so the order of the
# np.random calls below determines the simulated values for a given seed.
n_orders = 10000
data = pd.DataFrame({
    'OrderID': np.arange(1, n_orders + 1),
    'CustomerID': np.random.randint(1000, 2000, size=n_orders),
    # One order per hour. An Hour offset object is passed instead of the 'H'
    # alias string: pandas 2.2 deprecated that spelling in favour of 'h',
    # while the offset object yields the same index on old and new versions.
    'OrderDate': pd.date_range(start='2022-01-01', periods=n_orders, freq=pd.offsets.Hour()),
    'Category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Home & Kitchen'], size=n_orders),
    # Exponential amounts (mean 80) rounded to cents.
    'OrderAmount': np.round(np.random.exponential(scale=80, size=n_orders), 2),
    # 1 = returned, drawn with probability 0.1.
    'IsReturned': np.random.choice([0, 1], size=n_orders, p=[0.9, 0.1])
})

# Bucket each order into its calendar month, stored as the month-start timestamp.
data['Month'] = data['OrderDate'].dt.to_period('M').dt.to_timestamp()

# NOTE(review): the segment is drawn independently for every order, so the
# same CustomerID can appear under several segments - confirm this is intended.
data['CustomerSegment'] = np.random.choice(['Budget', 'Premium', 'Occasional'], size=n_orders)

# Simulate repeat purchases and estimate customer lifetime value.
# NOTE(review): Poisson(2) can yield 0, which makes CLV 0 for that order even
# though the order itself has a positive amount - confirm whether the initial
# order should be counted.
data['RepeatPurchases'] = np.random.poisson(2, size=n_orders)
data['CLV'] = data['OrderAmount'] * data['RepeatPurchases']
data.head()

In [None]:
# Total revenue per (month, category) pair, drawn as one line per category.
monthly_revenue = (
    data
    .groupby(['Month', 'Category'])['OrderAmount']
    .sum()
    .reset_index()
)

fig = px.line(
    monthly_revenue,
    x='Month',
    y='OrderAmount',
    color='Category',
    markers=True,
    title='Monthly Revenue by Product Category',
)
fig.update_layout(template='plotly_white')
fig.show()

In [None]:
# Mean order amount per category, sorted ascending so the horizontal bars
# come out ordered by value.
avg_order = (
    data
    .groupby('Category')['OrderAmount']
    .mean()
    .sort_values()
)

fig = px.bar(
    x=avg_order.values,
    y=avg_order.index,
    orientation='h',
    labels={'x': 'Average Order Value ($)', 'y': 'Category'},
    title='Average Order Value by Category',
)
fig.update_layout(template='plotly_white')
fig.show()

In [None]:
# Share of returned orders per category: IsReturned is a 0/1 flag, so its
# mean is the return rate as a fraction.
return_rate = (
    data
    .groupby('Category')['IsReturned']
    .mean()
    .sort_values()
)

fig = px.bar(
    x=return_rate.values,
    y=return_rate.index,
    orientation='h',
    labels={'x': 'Return Rate', 'y': 'Category'},
    title='Return Rate by Product Category',
)
# Label each bar with its rate rendered as a percentage.
fig.update_traces(texttemplate='%{x:.2%}', textposition='outside')
fig.update_layout(template='plotly_white')
fig.show()

In [None]:
# Total revenue per customer segment, shown as a donut chart.
segment_revenue = (
    data
    .groupby('CustomerSegment')['OrderAmount']
    .sum()
    .reset_index()
)

fig = px.pie(
    segment_revenue,
    names='CustomerSegment',
    values='OrderAmount',
    title='Revenue Contribution by Customer Segment',
    hole=0.4,  # a non-zero hole turns the pie into a donut
)
fig.update_traces(textinfo='percent+label')
fig.update_layout(template='plotly_dark')
fig.show()

In [None]:
# Key Performance Indicators per customer segment.
# Each entry maps an output column to (source column, aggregation).
segment_kpi_spec = {
    'TotalRevenue': ('OrderAmount', 'sum'),
    'AvgOrderValue': ('OrderAmount', 'mean'),
    'ReturnRate': ('IsReturned', 'mean'),   # fraction of orders returned
    'Orders': ('OrderID', 'count'),
    'AvgCLV': ('CLV', 'mean'),
}
kpis = (
    data
    .groupby('CustomerSegment')
    .agg(**segment_kpi_spec)
    .reset_index()
)
kpis

In [None]:
# Plot one bar chart per KPI, with segments ordered from highest to lowest value.
kpi_titles = {
    'TotalRevenue': 'Total Revenue ($)',
    'AvgOrderValue': 'Average Order Value ($)',
    'AvgCLV': 'Customer Lifetime Value ($)',
    'ReturnRate': 'Return Rate (%)'
}
# d3-format spec used for the bar labels of each metric. ReturnRate is the
# mean of a 0/1 flag, i.e. a fraction, so it is rendered with a percent format
# to agree with the "(%)" in its title; metrics not listed use two decimals.
kpi_value_formats = {'ReturnRate': '.2%'}

for metric, title in kpi_titles.items():
    value_format = kpi_value_formats.get(metric, '.2f')
    fig = px.bar(kpis.sort_values(metric, ascending=False),
                 x='CustomerSegment', y=metric, color='CustomerSegment',
                 title=title, text=metric)
    fig.update_traces(texttemplate='%{text:' + value_format + '}', textposition='outside')
    # The x axis already names every segment, so the colour legend is redundant.
    fig.update_layout(template='plotly_white', showlegend=False)
    if value_format.endswith('%'):
        # Keep the axis ticks in the same unit as the bar labels.
        fig.update_yaxes(tickformat='.0%')
    fig.show()

In [None]:
# Forecast revenue for the next 6 months using ARIMA
FORECAST_MONTHS = 6

monthly_sales = data.groupby('Month')['OrderAmount'].sum()
monthly_sales.index = pd.to_datetime(monthly_sales.index)

# If the most recent order does not fall on the last calendar day of its
# month, the final bucket only covers part of a month. Treating that partial
# total as a full month would pull the forecast down, so it is left out.
if not data['OrderDate'].max().is_month_end:
    monthly_sales = monthly_sales.iloc[:-1]

# Give the index an explicit month-start frequency so the model does not have
# to infer it; a month with no orders at all counts as zero revenue.
monthly_sales = monthly_sales.asfreq('MS', fill_value=0)

model = ARIMA(monthly_sales, order=(1, 1, 1))
fit = model.fit()
forecast = fit.forecast(FORECAST_MONTHS)

# Month-start dates immediately following the last month used for fitting.
future_dates = pd.date_range(
    start=monthly_sales.index[-1] + pd.offsets.MonthBegin(1),
    periods=FORECAST_MONTHS,
    freq='MS',
)

# Dedicated figure/axes names keep this cell from clobbering `fig` used by
# the plotly cells.
forecast_fig, forecast_ax = plt.subplots(figsize=(10, 5))
forecast_ax.plot(monthly_sales.index, monthly_sales.values, label='Historical')
forecast_ax.plot(future_dates, forecast.values, label='Forecast', linestyle='--')
forecast_ax.set_title(f'{FORECAST_MONTHS}-Month Revenue Forecast')
forecast_ax.set_xlabel('Month')
forecast_ax.set_ylabel('Revenue ($)')
forecast_ax.legend()
forecast_ax.grid(True)
forecast_fig.tight_layout()
plt.show()