In [None]:

# Bootstrap: generate full dataset & figures if they don't exist
import os, numpy as np, pandas as pd
from datetime import datetime, timedelta
import matplotlib.pyplot as plt

# All paths are resolved relative to the parent of the current working directory.
BASE = '..'

# Input/output data files live under <BASE>/data; created if missing.
DATA = os.path.join(BASE, 'data')
os.makedirs(DATA, exist_ok=True)

# Saved figures go to <BASE>/reports/figures; created if missing.
REPORTS = os.path.join(BASE, 'reports', 'figures')
os.makedirs(REPORTS, exist_ok=True)

# Synthetic transactions file: generated once, then reused on later runs.
csv_full = os.path.join(DATA, 'transactions_sample.csv')
if not os.path.exists(csv_full):
    rng = np.random.default_rng(42)  # fixed seed so the generated file is reproducible
    n_users = 15000
    n_orders = 55000
    n_days = 365
    start_date = datetime(2024, 1, 1)

    def sample_date():
        """Return start_date plus a uniformly drawn offset of 0..n_days-1 days."""
        return start_date + timedelta(days=int(rng.integers(0, n_days)))

    def sample_channel():
        """Draw an acquisition channel label using the fixed weights below."""
        return rng.choice(['ads','organic','email','push','referral'], p=[0.35,0.35,0.12,0.10,0.08])

    def sample_device():
        """Draw a device label using the fixed weights below."""
        return rng.choice(['android','ios','web'], p=[0.5,0.3,0.2])

    def revenue_from_device(device):
        """Draw an order revenue: normal base scaled per device, plus noise, floored at 5."""
        base = rng.normal(35, 10)
        if device == 'ios':
            base *= 1.25
        elif device == 'web':
            base *= 1.05
        return max(5, base + rng.normal(0, 8))

    orders = []
    for oid in range(1, n_orders + 1):
        # Generator.integers excludes the upper bound, so +1 is needed for
        # user id n_users to be reachable (ids span 1..n_users inclusive).
        uid = int(rng.integers(1, n_users + 1))
        dt = sample_date()
        ch = sample_channel()
        dev = sample_device()
        rev = round(revenue_from_device(dev), 2)
        is_ret = int(rng.random() < 0.45)  # flag is 1 with probability 0.45
        orders.append([oid, uid, dt.strftime('%Y-%m-%d'), ch, dev, rev, is_ret])

    df_full = pd.DataFrame(
        orders,
        columns=['order_id','user_id','order_date','channel','device','revenue','is_returning'],
    )
    # NOTE(review): this is the calendar month of each order (YYYY-MM), not the
    # month of the user's first order — confirm that is what cohort analysis expects.
    df_full['cohort_month'] = pd.to_datetime(df_full['order_date']).values.astype('datetime64[M]').astype(str)
    df_full.to_csv(csv_full, index=False)

# Quick plots: per-day unique users and revenue, written to REPORTS as PNG files.
df = pd.read_csv(csv_full, parse_dates=['order_date'])
daily = df.groupby('order_date', as_index=False).agg(
    dau=('user_id', 'nunique'),
    revenue=('revenue', 'sum'),
)


def _save_line_chart(column, title, filename):
    """Plot daily[column] against order_date and save it under REPORTS."""
    plt.figure()
    plt.plot(daily['order_date'], daily[column])
    plt.title(title)
    plt.tight_layout()
    plt.savefig(os.path.join(REPORTS, filename))
    plt.close()  # release the figure so nothing is rendered inline


_save_line_chart('dau', 'DAU', 'dau.png')
_save_line_chart('revenue', 'Revenue', 'revenue.png')
print('Dataset and figures are ready.')


# E-commerce Analytics & Forecast
Учебный ноутбук: метрики, когорты, прогноз.

In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the transactions file with order_date parsed as datetimes, then preview it.
df = pd.read_csv(
    '../data/transactions_sample.csv',
    parse_dates=['order_date'],
)
df.head()


In [None]:

# Per-day aggregates: number of distinct users (dau) and summed revenue.
daily = df.groupby('order_date', as_index=False).agg(
    dau=('user_id', 'nunique'),
    revenue=('revenue', 'sum'),
)
daily.head()


In [None]:

# Plot DAU and Revenue: one figure per daily metric, drawn against order_date.
for column, title, y_label in (
    ('dau', 'DAU', 'Users'),
    ('revenue', 'Revenue', 'Revenue'),
):
    plt.figure()
    plt.plot(daily['order_date'], daily[column])
    plt.title(title)
    plt.xlabel('Date')
    plt.ylabel(y_label)
    plt.tight_layout()
    plt.show()


In [None]:

# Simple retention proxy: mean of the is_returning flag over each month's rows.
# Note: this adds a 'month' (monthly Period) column to df in place.
df['month'] = df['order_date'].dt.to_period('M')
retention = (
    df.groupby('month')['is_returning']
    .mean()
    .reset_index()
    # Render the Period values as plain strings for display.
    .assign(month=lambda frame: frame['month'].astype(str))
)
retention


In [None]:

# Tiny forecast demo with Gradient Boosting
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_percentage_error

# Calendar features derived from the order date; rows sorted chronologically
# so the unshuffled split below holds out the latest dates.
daily = daily.sort_values('order_date').assign(
    day_of_week=lambda x: x['order_date'].dt.weekday,
    month=lambda x: x['order_date'].dt.month,
)

# Trailing mean of revenue over up to 7 *preceding* rows. The shift(1) keeps a
# row's own revenue (the prediction target) out of its feature; without it the
# model is partly fed the answer and the reported error is optimistic.
daily['rev_ma7'] = daily['revenue'].shift(1).rolling(7, min_periods=1).mean()

# The first row has no history, so its feature is NaN; drop it rather than
# back-filling (back-filling would copy in a value computed from that row's own
# target). This also replaces the deprecated fillna(method='bfill') call.
model_rows = daily.dropna(subset=['rev_ma7'])
X = model_rows[['day_of_week','month','rev_ma7']]
y = model_rows['revenue'].values

# shuffle=False: train on the first 80% of rows, evaluate on the last 20%.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

model = GradientBoostingRegressor(random_state=42).fit(X_train, y_train)
pred = model.predict(X_test)
# MAPE is returned as a fraction (e.g. 0.1 == 10%), not a percentage.
mape = mean_absolute_percentage_error(y_test, pred)
mape
