In [1]:
import sys
from pathlib import Path
import pandas as pd

def find_project_root(start: Path, marker: str = "src/data_workflow") -> Path:
    """Locate the project root by walking upward from ``start``.

    Parameters
    ----------
    start : Path
        Directory to begin the search from (normally the kernel's working
        directory).
    marker : str
        Relative path of a directory that must exist directly under the
        project root. The default matches the package this notebook imports
        from ``ROOT / "src"``.

    Returns
    -------
    Path
        The first of ``start`` and its ancestors that contains ``marker``.
        If none does, ``start.parent`` is returned, which is what this
        notebook used unconditionally before.
    """
    for candidate in (start, *start.parents):
        if (candidate / marker).is_dir():
            return candidate
    # Nothing matched: keep the historical "one level above the cwd" behavior.
    return start.parent


# Resolve the root from the repository layout rather than assuming the kernel
# was started exactly one directory below it.
ROOT = find_project_root(Path.cwd())
SRC = ROOT / "src"
# Guard the insert so re-running this cell does not stack duplicate entries.
if str(SRC) not in sys.path:
    sys.path.insert(0, str(SRC))

from data_workflow.config import make_paths
from data_workflow.io import read_parquet
from data_workflow.viz import bar_sorted, time_line, histogram_chart, save_fig
from data_workflow.utils import bootstrap_diff_means

# Project path bundle built from the root; `paths.processed` is read when the
# analytics table is loaded.
paths = make_paths(ROOT)

# Every figure exported by this notebook is written under this directory.
FIG_DIR = ROOT / "reports" / "figures"
# Create it up front so the exports also work on a fresh checkout.
# NOTE(review): save_fig may already create parent directories — this call is
# idempotent either way.
FIG_DIR.mkdir(parents=True, exist_ok=True)

In [2]:
# Load the processed analytics table and report its row count.
analytics_path = paths.processed / "analytics_table.parquet"
df = read_parquet(analytics_path)
print(f"Loaded {len(df)} rows.")

Loaded 5 rows.


In [3]:
# Quick structural check: dtypes of the first five columns, then the five
# columns with the most missing values.
dtype_preview = df.dtypes.head(5)
print("Data Types:")
display(dtype_preview)

missing_per_column = df.isna().sum()
most_missing = missing_per_column.sort_values(ascending=False).head(5)
print("\nMissing Values:")
display(most_missing)

Data Types:


order_id           string[python]
user_id            string[python]
amount                    Float64
quantity                    Int64
created_at    datetime64[ns, UTC]
dtype: object


Missing Values:


quantity             1
amount               1
amount_winsor        1
amount_is_outlier    1
created_at           1
dtype: int64

In [None]:
# Narrative cell: the bare string below is the cell's last expression, so the
# notebook echoes it back as the cell output. It lists the questions answered
# by the cells that follow (revenue by country, monthly revenue trend, and the
# distribution of order amounts).
# NOTE(review): the saved output of this cell also lists a fourth question
# (SA vs AE significance) that is not in this string, and the cell has no
# execution count — re-run the notebook so source and output agree, and
# consider turning this into a markdown cell.
'''EDA Questions:
1. What is the total revenue per country?
2. How is the revenue trending month-over-month?
3. What does the distribution of order amounts look like?'''

'EDA Questions:\n1. What is the total revenue per country?\n2. How is the revenue trending month-over-month?\n3. What does the distribution of order amounts look like?\n4. Is there a statistically significant difference between SA and AE order amounts?'

In [5]:
# Aggregation
q1_data = df.groupby("country").agg(revenue=("amount", "sum")).reset_index()

# Plotly Express Chart
fig1 = bar_sorted(q1_data, x="country", y="revenue", title="Total Revenue by Country")
fig1.show()

# Export
save_fig(fig1, FIG_DIR / "revenue_by_country.png")

In [6]:
# Aggregation
q2_data = df.groupby("month").agg(revenue=("amount", "sum")).reset_index().sort_values("month")

# Plotly Express Chart
fig2 = time_line(q2_data, x="month", y="revenue", title="Monthly Revenue Growth")
fig2.show()

# Export
save_fig(fig2, FIG_DIR / "revenue_trend_monthly.png")

In [7]:
# Plotly Express Chart
fig3 = histogram_chart(df, x="amount_winsor", nbins=30, title="Order Amount Distribution")
fig3.show()

# Export
save_fig(fig3, FIG_DIR / "amount_hist_winsor.png")