# Revenue Forecasting

This notebook demonstrates time series analysis and forecasting techniques for revenue prediction - essential skills for data analysts working on business planning and financial projections.

## What You'll Learn
- Time series data preparation and visualization
- Trend and seasonality decomposition
- Moving average and exponential smoothing methods
- Building and evaluating forecasts
- Communicating forecast results with uncertainty

## Business Context
Accurate revenue forecasting helps businesses with budgeting, resource planning, and setting realistic growth targets.

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
from scipy import stats
import warnings

# Silence library warnings so they do not clutter the rendered cell outputs
warnings.filterwarnings('ignore')

# Plot defaults applied to every figure created after this cell
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams.update({
    'figure.figsize': (14, 7),
    'font.size': 11,
})

print("Libraries loaded successfully!")

## 1. Data Loading & Preparation

In [None]:
# Read the sample revenue file, then report its shape and column names
df = pd.read_csv('../data/samples/revenue_sample.csv')

print(
    f"Dataset shape: {df.shape}",
    f"\nColumns: {df.columns.tolist()}",
    sep="\n",
)
df.head()

In [None]:
# Data preparation - build a daily revenue time series
# Detect the date and amount columns by case-insensitive substring match on the column name.
date_cols = [col for col in df.columns if 'date' in col.lower()]
amount_cols = [col for col in df.columns if 'amount' in col.lower() or 'revenue' in col.lower()]

if date_cols and amount_cols:
    # normalize() resets the time-of-day to midnight so that every row belonging to the
    # same calendar day lands in the same group when we aggregate below.
    df['date'] = pd.to_datetime(df[date_cols[0]]).dt.normalize()
    # Unparseable amounts become NaN and are skipped by the sum below.
    df['revenue'] = pd.to_numeric(df[amount_cols[0]], errors='coerce')
else:
    # No usable columns found: fall back to a synthetic, reproducible daily series
    np.random.seed(42)
    n_days = 365
    dates = pd.date_range(start='2024-01-01', periods=n_days, freq='D')

    # Revenue = upward trend + weekly cycle + ~monthly cycle + Gaussian noise
    trend = np.linspace(1000, 1500, n_days)  # Upward trend
    weekly_seasonality = 200 * np.sin(2 * np.pi * np.arange(n_days) / 7)  # Weekly pattern
    monthly_seasonality = 300 * np.sin(2 * np.pi * np.arange(n_days) / 30)  # Monthly pattern
    noise = np.random.normal(0, 100, n_days)

    revenue = trend + weekly_seasonality + monthly_seasonality + noise
    revenue = np.maximum(revenue, 100)  # Floor at 100 so revenue stays positive

    df = pd.DataFrame({'date': dates, 'revenue': revenue})

# Aggregate to one row per calendar day, ordered by date, indexed by date.
# NOTE(review): days with no rows in the source are absent from the index (not zero-filled);
# confirm whether the rolling/forecast steps further down need a gap-free daily index.
daily_revenue = (
    df.groupby('date')['revenue']
    .sum()
    .sort_index()
    .to_frame()
)

print(f"\nTime series shape: {daily_revenue.shape}")
print(f"Date range: {daily_revenue.index.min()} to {daily_revenue.index.max()}")
print(f"Total days: {len(daily_revenue)}")
daily_revenue.head(10)

## 2. Exploratory Time Series Analysis

In [None]:
# Raw daily revenue with a least-squares straight line overlaid as the trend
revenue_values = daily_revenue['revenue']
day_positions = np.arange(len(daily_revenue))

fig, ax = plt.subplots(figsize=(14, 6))
ax.plot(daily_revenue.index, revenue_values, linewidth=1, alpha=0.8, color='steelblue')
ax.set_xlabel('Date', fontsize=12)
ax.set_ylabel('Daily Revenue ($)', fontsize=12)
ax.set_title('Daily Revenue Time Series', fontsize=14, fontweight='bold')

# Degree-1 polynomial fit against the day position (0, 1, 2, ...)
trend_coeffs = np.polyfit(day_positions, revenue_values, 1)
ax.plot(
    daily_revenue.index,
    np.polyval(trend_coeffs, day_positions),
    linestyle='--', color='red', linewidth=2, label='Trend',
)

ax.legend()
plt.tight_layout()
plt.savefig('../docs/visualizations/revenue_timeseries.png', dpi=150, bbox_inches='tight')
plt.show()

# Headline statistics of the daily series
summary_rows = [
    ('Mean daily revenue', revenue_values.mean()),
    ('Std deviation', revenue_values.std()),
    ('Min', revenue_values.min()),
    ('Max', revenue_values.max()),
]
print("\nSummary Statistics:")
for stat_label, stat_value in summary_rows:
    print(f"  {stat_label}: ${stat_value:,.2f}")

In [None]:
# Resample to weekly and monthly totals.
# Only the revenue column is resampled: a later cell adds helper columns
# (day_of_week, day_name, month) to daily_revenue in place, and summing those
# on a re-run of this cell would be meaningless and wasteful.
revenue_only = daily_revenue[['revenue']]
weekly_revenue = revenue_only.resample('W').sum()
monthly_revenue = revenue_only.resample('M').sum()

fig, axes = plt.subplots(2, 1, figsize=(14, 10))

# Top panel: one bar per week (bar width is in days on the date axis)
ax1 = axes[0]
ax1.bar(weekly_revenue.index, weekly_revenue['revenue'], width=5, color='steelblue', alpha=0.8)
ax1.set_title('Weekly Revenue', fontsize=12, fontweight='bold')
ax1.set_ylabel('Revenue ($)')

# Bottom panel: one bar per month
ax2 = axes[1]
ax2.bar(monthly_revenue.index, monthly_revenue['revenue'], width=20, color='forestgreen', alpha=0.8)
ax2.set_title('Monthly Revenue', fontsize=12, fontweight='bold')
ax2.set_ylabel('Revenue ($)')

plt.tight_layout()
plt.show()

## 3. Time Series Decomposition

Breaking down the time series into its components: Trend, Seasonality, and Residual.

In [None]:
def decompose_time_series(series, period=7):
    """
    Simple additive decomposition of a time series.

    Parameters
    ----------
    series : pd.Series
        Observations in time order.
    period : int, default 7
        Length of the seasonal cycle; also the window of the centered
        moving average used as the trend.

    Returns
    -------
    (trend, seasonal, residual) : tuple of pd.Series
        All aligned to ``series.index``. ``trend`` (and therefore
        ``residual``) is NaN near both ends where the centered window is
        incomplete.
    """
    # Trend: centered moving average over one full seasonal cycle
    trend = series.rolling(window=period, center=True).mean()

    # Detrended series
    detrended = series - trend

    # Seasonal: average detrended value at each position within the cycle.
    # For a weekly cycle on a datetime index the calendar weekday is used, which
    # stays correct even if some days are missing. For any other period (or a
    # non-datetime index) the position is the row number modulo `period`.
    if period == 7 and isinstance(series.index, pd.DatetimeIndex):
        cycle_position = series.index.dayofweek
    else:
        cycle_position = np.arange(len(series)) % period
    seasonal = detrended.groupby(cycle_position).transform('mean')

    # Residual: what is left after removing trend and seasonal components
    residual = series - trend - seasonal

    return trend, seasonal, residual

trend, seasonal, residual = decompose_time_series(daily_revenue['revenue'], period=7)

# One stacked panel per component, all sharing the date axis
fig, axes = plt.subplots(4, 1, figsize=(14, 12), sharex=True)

component_panels = [
    ('Original Time Series', daily_revenue['revenue'], {'color': 'steelblue', 'linewidth': 1}),
    ('Trend Component', trend, {'color': 'red', 'linewidth': 2}),
    ('Seasonal Component (Weekly)', seasonal, {'color': 'green', 'linewidth': 1}),
    ('Residual Component', residual, {'color': 'purple', 'linewidth': 1, 'alpha': 0.7}),
]
for panel_ax, (panel_title, panel_values, line_style) in zip(axes, component_panels):
    panel_ax.plot(daily_revenue.index, panel_values, **line_style)
    panel_ax.set_title(panel_title, fontweight='bold')
    panel_ax.set_ylabel('Revenue ($)')

# Zero reference line on the residual panel
axes[3].axhline(y=0, color='black', linestyle='--', linewidth=0.5)

plt.tight_layout()
plt.savefig('../docs/visualizations/time_series_decomposition.png', dpi=150, bbox_inches='tight')
plt.show()

## 4. Seasonality Analysis

In [None]:
# Tag every day with its weekday number and name, then average revenue per weekday
daily_revenue['day_of_week'] = daily_revenue.index.dayofweek
daily_revenue['day_name'] = daily_revenue.index.day_name()

dow_avg = (
    daily_revenue
    .groupby(['day_of_week', 'day_name'])['revenue']
    .mean()
    .reset_index()
    .sort_values('day_of_week')
)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
dow_ax, month_ax = axes

# Left panel: average revenue per weekday, bars shaded light-to-dark blue
weekday_shades = plt.cm.Blues(np.linspace(0.4, 0.9, 7))
dow_ax.bar(dow_avg['day_name'], dow_avg['revenue'], color=weekday_shades, edgecolor='navy')
dow_ax.set_xlabel('Day of Week', fontsize=12)
dow_ax.set_ylabel('Average Revenue ($)', fontsize=12)
dow_ax.set_title('Revenue by Day of Week', fontsize=12, fontweight='bold')
dow_ax.tick_params(axis='x', rotation=45)

# Reference line: mean of the weekday averages
overall_avg = dow_avg['revenue'].mean()
dow_ax.axhline(y=overall_avg, color='red', linestyle='--', label=f'Average: ${overall_avg:,.0f}')
dow_ax.legend()

# Right panel: average daily revenue per calendar month, shown only with >= 60 days of data
if len(daily_revenue) < 60:
    month_ax.text(0.5, 0.5, 'Insufficient data for monthly analysis',
                  ha='center', va='center', transform=month_ax.transAxes)
else:
    daily_revenue['month'] = daily_revenue.index.month
    month_avg = daily_revenue.groupby('month')['revenue'].mean()
    month_names = 'Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec'.split()
    available_months = [month_names[month_number - 1] for month_number in month_avg.index]
    month_ax.bar(available_months, month_avg.values, color='forestgreen', edgecolor='darkgreen')
    month_ax.set_xlabel('Month', fontsize=12)
    month_ax.set_ylabel('Average Daily Revenue ($)', fontsize=12)
    month_ax.set_title('Revenue by Month', fontsize=12, fontweight='bold')
    month_ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

## 5. Forecasting Methods

In [None]:
# Chronological split: the first 80% of days train the models, the rest is held out
train_size = int(len(daily_revenue) * 0.8)
revenue_series = daily_revenue['revenue']
train = revenue_series.iloc[:train_size]
test = revenue_series.iloc[train_size:]

for split_label, split_part in (("Training set", train), ("Test set", test)):
    print(f"{split_label}: {len(split_part)} days ({split_part.index.min()} to {split_part.index.max()})")

In [None]:
def simple_moving_average(series, window=7):
    """Return the mean of the last `window` observations as a one-step forecast.

    The result is NaN when the series holds fewer than `window` observations.
    """
    rolling_means = series.rolling(window=window).mean()
    return rolling_means.iloc[-1]

def exponential_smoothing(series, alpha=0.3):
    """Return the final smoothed level from simple exponential smoothing.

    The level starts at the first observation; each later observation is
    blended in with weight `alpha`, the previous level with `1 - alpha`.
    """
    smoothed = series.iloc[0]
    for observation in series.iloc[1:]:
        smoothed = alpha * observation + (1 - alpha) * smoothed
    return smoothed

def holt_linear(series, alpha=0.3, beta=0.1, forecast_periods=1):
    """Forecast using Holt's linear (trend-adjusted) exponential smoothing.

    Parameters
    ----------
    series : pd.Series
        Observations in time order (at least one).
    alpha : float
        Smoothing weight for the level.
    beta : float
        Smoothing weight for the trend.
    forecast_periods : int
        Number of future steps to forecast.

    Returns
    -------
    list
        `forecast_periods` values; step h is ``level + h * trend`` using the
        final smoothed level and trend.
    """
    n = len(series)

    # Initialize the level at the first observation and the slope at the first
    # difference. A single observation carries no slope information, so start
    # flat instead of indexing past the end of the series.
    level = series.iloc[0]
    slope = series.iloc[1] - series.iloc[0] if n > 1 else 0.0

    # Update level and slope one observation at a time
    for i in range(1, n):
        previous_level = level
        level = alpha * series.iloc[i] + (1 - alpha) * (previous_level + slope)
        slope = beta * (level - previous_level) + (1 - beta) * slope

    # Extrapolate the final level along the final slope
    return [level + (step + 1) * slope for step in range(forecast_periods)]

# Produce one forecast value per held-out day for each method
forecast_periods = len(test)

# Methods 1 and 2 give a single level, repeated flat across the whole horizon
sma_level = simple_moving_average(train, window=7)
es_level = exponential_smoothing(train, alpha=0.3)
sma_forecast = [sma_level for _ in range(forecast_periods)]
es_forecast = [es_level for _ in range(forecast_periods)]

# Method 3: Holt's linear method extrapolates level plus trend
holt_forecast = holt_linear(
    train,
    alpha=0.3,
    beta=0.1,
    forecast_periods=forecast_periods,
)

print(f"7-Day SMA Forecast: ${sma_forecast[0]:,.2f}")
print(f"Exponential Smoothing Forecast: ${es_forecast[0]:,.2f}")
print(f"Holt's Linear Forecast (next day): ${holt_forecast[0]:,.2f}")

In [None]:
# Evaluate forecasts
def calculate_metrics(actual, forecast):
    """Calculate forecast accuracy metrics.

    Parameters
    ----------
    actual, forecast : array-like of numbers
        Observed and predicted values, aligned element by element.

    Returns
    -------
    dict
        ``{'MAE': ..., 'RMSE': ..., 'MAPE': ...}``. MAPE is a percentage and
        is computed only over observations whose actual value is non-zero
        (a zero actual has no defined percentage error); it is NaN when every
        actual value is zero.
    """
    actual = np.asarray(actual, dtype=float)
    forecast = np.asarray(forecast, dtype=float)

    errors = actual - forecast
    mae = np.mean(np.abs(errors))
    rmse = np.sqrt(np.mean(errors ** 2))

    # Exclude zero actuals so one zero-revenue day cannot turn MAPE into inf/nan
    actual_aligned = np.broadcast_to(actual, errors.shape)
    nonzero = actual_aligned != 0
    if nonzero.any():
        mape = np.mean(np.abs(errors[nonzero] / actual_aligned[nonzero])) * 100
    else:
        mape = np.nan

    return {'MAE': mae, 'RMSE': rmse, 'MAPE': mape}

# Score every method against the held-out actuals
actual_values = test.values
metrics_sma = calculate_metrics(actual_values, sma_forecast)
metrics_es = calculate_metrics(actual_values, es_forecast)
metrics_holt = calculate_metrics(actual_values, holt_forecast)

# One row per method, one column per metric
metrics_df = pd.DataFrame(
    [metrics_sma, metrics_es, metrics_holt],
    index=['Simple Moving Avg', 'Exponential Smoothing', "Holt's Linear"],
)

print("Forecast Accuracy Comparison:")
print(metrics_df.round(2).to_string())
best_by_mape = metrics_df['MAPE'].idxmin()
print(f"\nBest method by MAPE: {best_by_mape}")

In [None]:
# Overlay every forecast on the observed series
fig, ax = plt.subplots(figsize=(14, 7))

# Observed data: training window first, then the held-out actuals
ax.plot(train.index, train.values, label='Training Data', color='steelblue', linewidth=1)
ax.plot(test.index, test.values, label='Actual (Test)', color='black', linewidth=2)

# Each forecast is drawn as a dashed line across the test window
forecast_lines = [
    (sma_forecast, 'Simple MA', 'red'),
    (es_forecast, 'Exp Smoothing', 'green'),
    (holt_forecast, "Holt's Linear", 'orange'),
]
for forecast_values, line_label, line_color in forecast_lines:
    ax.plot(test.index, forecast_values, label=line_label, color=line_color,
            linestyle='--', linewidth=2)

# Mark the last training day
split_date = train.index[-1]
ax.axvline(x=split_date, color='gray', linestyle=':', linewidth=2, label='Train/Test Split')

ax.set_xlabel('Date', fontsize=12)
ax.set_ylabel('Revenue ($)', fontsize=12)
ax.set_title('Revenue Forecast Comparison', fontsize=14, fontweight='bold')
ax.legend(loc='upper left')

plt.tight_layout()
plt.savefig('../docs/visualizations/revenue_forecast.png', dpi=150, bbox_inches='tight')
plt.show()

## 6. Future Forecast with Confidence Intervals

In [None]:
# 30-day forecast from the full history, with uncertainty bands around it
forecast_days = 30
full_series = daily_revenue['revenue']

# Holt's method is refit on every available day
future_forecast = holt_linear(full_series, alpha=0.3, beta=0.1, forecast_periods=forecast_days)

# Band width is driven by the spread of the decomposition residuals
residuals = residual.dropna()
std_error = residuals.std()

# Normal z-scores for the two interval levels
z_95 = 1.96
z_80 = 1.28

# The margin grows with the square root of the horizon (1, 2, ..., forecast_days)
horizon_steps = range(1, forecast_days + 1)
margin_95 = [z_95 * std_error * np.sqrt(step) for step in horizon_steps]
margin_80 = [z_80 * std_error * np.sqrt(step) for step in horizon_steps]

upper_95 = [point + margin for point, margin in zip(future_forecast, margin_95)]
lower_95 = [point - margin for point, margin in zip(future_forecast, margin_95)]
upper_80 = [point + margin for point, margin in zip(future_forecast, margin_80)]
lower_80 = [point - margin for point, margin in zip(future_forecast, margin_80)]

# Forecast dates start the day after the last observation
last_date = full_series.index[-1]
first_forecast_date = last_date + timedelta(days=1)
forecast_dates = pd.date_range(start=first_forecast_date, periods=forecast_days)

fig, ax = plt.subplots(figsize=(14, 7))

# Show only the most recent 60 observations so the forecast stays readable
recent_data = full_series.iloc[-60:]
ax.plot(recent_data.index, recent_data.values, label='Historical', color='steelblue', linewidth=2)

ax.plot(forecast_dates, future_forecast, label='Forecast', color='red', linewidth=2)

# Wider 95% band first, narrower 80% band drawn on top of it
ax.fill_between(forecast_dates, lower_95, upper_95, alpha=0.2, color='red', label='95% CI')
ax.fill_between(forecast_dates, lower_80, upper_80, alpha=0.3, color='red', label='80% CI')

# Marker where history ends and the forecast begins
ax.axvline(x=last_date, color='gray', linestyle='--', linewidth=1)

ax.set_xlabel('Date', fontsize=12)
ax.set_ylabel('Revenue ($)', fontsize=12)
ax.set_title('30-Day Revenue Forecast with Confidence Intervals', fontsize=14, fontweight='bold')
ax.legend(loc='upper left')

plt.tight_layout()
plt.savefig('../docs/visualizations/revenue_forecast_ci.png', dpi=150, bbox_inches='tight')
plt.show()

## 7. Growth Rate Analysis

In [None]:
# Calculate month-over-month growth from monthly revenue totals
# NOTE(review): the first and last calendar months may be only partially covered by the
# data, which would distort their totals and growth rates - confirm the date coverage.
monthly_revenue = daily_revenue['revenue'].resample('M').sum()
monthly_growth = monthly_revenue.pct_change() * 100

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: monthly revenue totals
ax1 = axes[0]
ax1.bar(monthly_revenue.index, monthly_revenue.values, color='steelblue', width=20)
ax1.set_title('Monthly Revenue', fontsize=12, fontweight='bold')
ax1.set_ylabel('Revenue ($)')
ax1.tick_params(axis='x', rotation=45)

# Right panel: month-over-month growth, green for growth and red for decline
ax2 = axes[1]
colors = ['green' if x >= 0 else 'red' for x in monthly_growth.values]
ax2.bar(monthly_growth.index, monthly_growth.values, color=colors, width=20)
ax2.axhline(y=0, color='black', linestyle='-', linewidth=0.5)
ax2.set_title('Month-over-Month Growth (%)', fontsize=12, fontweight='bold')
ax2.set_ylabel('Growth Rate (%)')
ax2.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

print(f"\nAverage monthly growth rate: {monthly_growth.mean():.1f}%")

# The compound rate takes the (n_months - 1)-th root of last/first, so it needs at least
# two months (otherwise the exponent divides by zero) and a positive first month.
n_growth_intervals = len(monthly_revenue) - 1
if n_growth_intervals >= 1 and monthly_revenue.iloc[0] > 0:
    growth_ratio = monthly_revenue.iloc[-1] / monthly_revenue.iloc[0]
    compound_growth = (growth_ratio ** (1 / n_growth_intervals) - 1) * 100
    print(f"Compound monthly growth rate: {compound_growth:.1f}%")
else:
    print("Compound monthly growth rate: n/a (needs at least two months and a positive first month)")

## 8. Key Insights & Recommendations

In [None]:
# Gather every figure quoted in the report before printing anything
total_historical = full_series.sum()
total_forecast = sum(future_forecast)
avg_daily_historical = full_series.mean()
avg_daily_forecast = np.mean(future_forecast)
projected_change = (avg_daily_forecast - avg_daily_historical) / avg_daily_historical * 100

# Weekdays with the highest and lowest average revenue
weekday_revenue = dow_avg['revenue']
best_dow = dow_avg.loc[weekday_revenue.idxmax(), 'day_name']
worst_dow = dow_avg.loc[weekday_revenue.idxmin(), 'day_name']

# Forecasting method with the lowest MAPE on the test set
mape_scores = metrics_df['MAPE']

banner = "=" * 60
print(banner, "REVENUE FORECASTING - KEY FINDINGS", banner, sep="\n")

print("\n1. HISTORICAL PERFORMANCE")
print(f"   - Total revenue: ${total_historical:,.2f}")
print(f"   - Average daily revenue: ${avg_daily_historical:,.2f}")
print(f"   - Standard deviation: ${full_series.std():,.2f}")

print("\n2. FORECAST (Next 30 Days)")
print(f"   - Projected total: ${total_forecast:,.2f}")
print(f"   - Projected daily average: ${avg_daily_forecast:,.2f}")
print(f"   - 95% CI Range: ${min(lower_95):,.0f} to ${max(upper_95):,.0f}")
print(f"   - Projected change: {projected_change:+.1f}%")

print("\n3. SEASONALITY INSIGHTS")
print(f"   - Best day: {best_dow} (${weekday_revenue.max():,.0f} avg)")
print(f"   - Worst day: {worst_dow} (${weekday_revenue.min():,.0f} avg)")

print("\n4. MODEL PERFORMANCE")
print(f"   - Best method: {mape_scores.idxmin()}")
print(f"   - MAPE: {mape_scores.min():.1f}%")

print("\n5. RECOMMENDATIONS")
print("   - Use forecasts for 7-14 day planning (higher accuracy)")
print(f"   - Account for {std_error:.0f} daily variance in budgeting")
print(f"   - Schedule promotions on {worst_dow} to boost weaker days")
print("   - Review forecast weekly and adjust parameters if needed")

print(f"\n{banner}")

In [None]:
# Collect the point forecast and both interval bands into one table and write it to CSV
forecast_columns = dict(
    date=forecast_dates,
    forecast=future_forecast,
    lower_80=lower_80,
    upper_80=upper_80,
    lower_95=lower_95,
    upper_95=upper_95,
)
forecast_df = pd.DataFrame(forecast_columns)

forecast_df.to_csv('../data/samples/revenue_forecast.csv', index=False)
print("Forecast exported to data/samples/revenue_forecast.csv")
forecast_df.head(10)