# Sales Forecast Model V3 - Daily Training Approach

This notebook implements a daily-based forecasting model that:
- Trains on daily transaction data (not weekly aggregations)
- Properly handles duplicates and data quality issues
- Captures day-of-week patterns and daily seasonality
- Aggregates daily predictions to weekly output
- Uses proper time-series validation


In [None]:
# Import all required libraries
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_percentage_error
import os
import platform

# Confirm the imports succeeded and report basic host information.
for status_line in (
    "Libraries imported successfully",
    f"System: {platform.system()} {platform.machine()}",
    f"CPU cores: {os.cpu_count()}",
):
    print(status_line)


In [None]:
# Load all parquet files from data directory and classify each one as
# transactions, products or stores based on the columns it contains.
data_path = "data/"
# Sorted so that the load order (and therefore which file wins when two files
# match the same column signature) does not depend on filesystem listing order.
parquet_files = sorted(f for f in os.listdir(data_path) if f.endswith('.parquet'))
print(f"Found {len(parquet_files)} parquet files")

transactions = None
products = None
stores = None

for file in parquet_files:
    df = pd.read_parquet(os.path.join(data_path, file))
    print(f"{file}: Shape {df.shape}")

    # Each dataset is recognised by a pair of columns that must both be present
    if 'internal_store_id' in df.columns and 'quantity' in df.columns:
        transactions = df
        print("-> Identified as TRANSACTIONS data")
    elif 'produto' in df.columns and 'categoria' in df.columns:
        products = df
        print("-> Identified as PRODUCTS data")
    elif 'pdv' in df.columns and 'premise' in df.columns:
        stores = df
        print("-> Identified as STORES data")

# Fail with an explicit message instead of an AttributeError on None below
missing_datasets = [
    name
    for name, frame in (
        ('transactions', transactions),
        ('products', products),
        ('stores', stores),
    )
    if frame is None
]
if missing_datasets:
    raise ValueError(
        f"Could not identify {', '.join(missing_datasets)} data among the "
        f"parquet files in {data_path!r}"
    )

print(f"\nData loaded:")
print(f"- Transactions: {transactions.shape[0]:,} rows")
print(f"- Products: {products.shape[0]:,} rows")
print(f"- Stores: {stores.shape[0]:,} rows")


In [None]:
# Clean transactions data: remove nulls, duplicates, outliers, and filter to 2022
print("Cleaning transactions data...")

initial_rows = len(transactions)
print(f"Initial rows: {initial_rows:,}")

# Remove rows missing any of the fields needed for aggregation
transactions = transactions.dropna(subset=['internal_store_id', 'internal_product_id', 'quantity', 'transaction_date'])
print(f"After removing nulls: {len(transactions):,} rows")

# Remove duplicates based on key columns
before_dedup = len(transactions)
transactions = transactions.drop_duplicates(subset=['internal_store_id', 'internal_product_id', 'transaction_date', 'quantity', 'gross_value'])
print(f"Removed {before_dedup - len(transactions):,} duplicate rows")

# Keep only positive quantities (this also keeps the unit-price division below finite)
transactions = transactions[transactions['quantity'] > 0]
print(f"After removing zero/negative quantities: {len(transactions):,} rows")

# Remove extreme outliers: keep unit prices between the 0.5th and 99.5th
# percentiles and quantities up to the 99.5th percentile.
# Rows whose unit price is NaN (e.g. a missing gross_value) fail both
# comparisons and are therefore removed here as well.
value_per_unit = transactions['gross_value'] / transactions['quantity']
q01 = value_per_unit.quantile(0.005)
q99 = value_per_unit.quantile(0.995)
valid_value_mask = (value_per_unit >= q01) & (value_per_unit <= q99)

quantity_q99 = transactions['quantity'].quantile(0.995)
valid_qty_mask = transactions['quantity'] <= quantity_q99

before_outliers = len(transactions)
# Explicit copy so the column assignment below writes to an independent frame
transactions = transactions[valid_value_mask & valid_qty_mask].copy()
print(f"Removed {before_outliers - len(transactions):,} outlier rows")

# Convert dates and filter to 2022
transactions['transaction_date'] = pd.to_datetime(transactions['transaction_date'])
transactions = transactions[transactions['transaction_date'].dt.year == 2022]
print(f"Final 2022 data: {len(transactions):,} rows")

# Guard the summary against an empty input, which would divide by zero
if initial_rows:
    reduction_pct = ((initial_rows - len(transactions)) / initial_rows) * 100
else:
    reduction_pct = 0.0
print(f"Total reduction: {reduction_pct:.1f}%")


In [None]:
# Replace missing descriptive attributes in the products and stores tables
print("Cleaning products and stores data...")

# Placeholder written into each product attribute when the value is missing
product_placeholders = {'descricao': 'Unknown', 'categoria': 'Other', 'marca': 'Unknown'}
for column, placeholder in product_placeholders.items():
    products[column] = products[column].fillna(placeholder)

# Same treatment for the store attributes
store_placeholders = {'categoria_pdv': 'Other', 'premise': 'Unknown'}
for column, placeholder in store_placeholders.items():
    stores[column] = stores[column].fillna(placeholder)

for label, frame in (("Products", products), ("Stores", stores)):
    print(f"{label} cleaned: {len(frame):,} rows")


In [None]:
# Merge transactions with products and stores
print("Merging data...")

# A left merge repeats a transaction once per matching right-hand row, so a
# product or store listed more than once would silently duplicate sales and
# inflate quantities. Keep a single row per key on the lookup side so the
# transaction row count is preserved.
# NOTE(review): assumes 'produto' and 'pdv' are meant to be unique keys of
# their tables - confirm against the data dictionary.
product_lookup = products.drop_duplicates(subset='produto')
store_lookup = stores.drop_duplicates(subset='pdv')

# Merge with products
merged_data = transactions.merge(
    product_lookup,
    left_on='internal_product_id',
    right_on='produto',
    how='left'
)
print(f"After product merge: {len(merged_data):,} rows")

# Merge with stores
merged_data = merged_data.merge(
    store_lookup,
    left_on='internal_store_id',
    right_on='pdv',
    how='left'
)
print(f"After store merge: {len(merged_data):,} rows")

print("Data merge completed successfully")


In [None]:
# Create daily aggregations with temporal features
print("Creating daily aggregations...")

# Extract date components
merged_data['date'] = merged_data['transaction_date'].dt.date
merged_data['year'] = merged_data['transaction_date'].dt.year
merged_data['month'] = merged_data['transaction_date'].dt.month
merged_data['day'] = merged_data['transaction_date'].dt.day
merged_data['dayofweek'] = merged_data['transaction_date'].dt.dayofweek  # 0=Monday
merged_data['dayofyear'] = merged_data['transaction_date'].dt.dayofyear
merged_data['week'] = merged_data['transaction_date'].dt.isocalendar().week
merged_data['quarter'] = merged_data['transaction_date'].dt.quarter

# Aggregate by day-store-product.
# dropna=False keeps rows whose product/store attributes are NaN (a
# transaction with no match in the left merges); by default groupby drops any
# row with a NaN key, which would silently discard those sales.
daily_data = merged_data.groupby([
    'date', 'year', 'month', 'day', 'dayofweek', 'dayofyear', 'week', 'quarter',
    'internal_store_id', 'internal_product_id',
    'categoria', 'marca', 'premise', 'categoria_pdv'
], dropna=False).agg({
    'quantity': ['sum', 'mean', 'count'],
    'gross_value': ['sum', 'mean'],
    'net_value': ['sum', 'mean'],
    'gross_profit': ['sum', 'mean']
}).reset_index()

# Flatten the two-level column names: ('quantity', 'sum') -> 'quantity_sum',
# while the group keys (empty second level) keep their plain name
daily_data.columns = ['_'.join(col).strip() if col[1] else col[0] for col in daily_data.columns.values]

# Rename for clarity
daily_data.rename(columns={
    'quantity_sum': 'total_quantity',
    'quantity_mean': 'avg_quantity_per_transaction',
    'quantity_count': 'num_transactions',
    'gross_value_sum': 'total_gross_value',
    'gross_value_mean': 'avg_gross_value',
    'net_value_sum': 'total_net_value',
    'net_value_mean': 'avg_net_value',
    'gross_profit_sum': 'total_gross_profit',
    'gross_profit_mean': 'avg_gross_profit'
}, inplace=True)

# Count distinct (store, product) combinations; DataFrame.nunique() would
# instead report the distinct values of each column separately
unique_pairs = daily_data[['internal_store_id', 'internal_product_id']].drop_duplicates().shape[0]

print(f"Created daily aggregations: {len(daily_data):,} rows")
print(f"Date range: {daily_data['date'].min()} to {daily_data['date'].max()}")
print(f"Unique store-product pairs: {unique_pairs:,}")


In [None]:
# Add calendar-derived features (cyclical encodings and boolean flags)
print("Creating daily temporal features...")

# Dates must be real datetimes so the per-pair chronological sort is correct
daily_data['date'] = pd.to_datetime(daily_data['date'])
daily_data = daily_data.sort_values(['internal_store_id', 'internal_product_id', 'date'])

# Sine/cosine encoding of each calendar component, using the period length
# chosen for it (dayofweek: 0=Monday .. 6=Sunday)
cyclical_periods = {'dayofweek': 7, 'dayofyear': 365, 'month': 12, 'week': 52}
for component, period in cyclical_periods.items():
    daily_data[f'{component}_sin'] = np.sin(2 * np.pi * daily_data[component] / period)
    daily_data[f'{component}_cos'] = np.cos(2 * np.pi * daily_data[component] / period)

# Saturday (5) and Sunday (6)
daily_data['is_weekend'] = daily_data['dayofweek'].ge(5).astype(int)

# First three days of the month / day 28 onwards
daily_data['is_month_start'] = daily_data['day'].le(3).astype(int)
daily_data['is_month_end'] = daily_data['day'].ge(28).astype(int)

for summary_line in (
    "Daily temporal features created:",
    "- Day-of-week sine/cosine encoding",
    "- Day-of-year sine/cosine encoding",
    "- Month sine/cosine encoding",
    "- Week sine/cosine encoding",
    "- Weekend flag",
    "- Month start/end flags",
):
    print(summary_line)


In [None]:
# Create lag features without data leakage (using only past information)
print("Creating lag features...")

# Sort so shifts and windows walk each store-product pair chronologically
daily_data = daily_data.sort_values(['internal_store_id', 'internal_product_id', 'date'])

# Group by store-product pairs
pair_columns = ['internal_store_id', 'internal_product_id']
grouped = daily_data.groupby(pair_columns)

# Lag features. NOTE: shift(k) returns the k-th previous *row* of the pair,
# i.e. the k-th previous day on which the pair had sales. That equals k
# calendar days only for pairs that sold on every day in between.
daily_data['quantity_lag_1'] = grouped['total_quantity'].shift(1)
daily_data['quantity_lag_2'] = grouped['total_quantity'].shift(2)
daily_data['quantity_lag_3'] = grouped['total_quantity'].shift(3)
daily_data['quantity_lag_7'] = grouped['total_quantity'].shift(7)

# Rolling averages (using only past data).
# The groupby shift returns a plain Series, so calling .rolling() directly on
# it would let the window run across pair boundaries and mix in the previous
# pair's sales. Re-group the shifted values so each window stays in its pair.
previous_quantity = grouped['total_quantity'].shift(1)
pair_keys = [daily_data[column] for column in pair_columns]


def _rolling_mean_within_pair(window):
    """Mean of up to `window` prior observations of the same store-product pair."""
    rolled = (
        previous_quantity.groupby(pair_keys)
        .rolling(window=window, min_periods=1)
        .mean()
    )
    # Drop the two group-key index levels so the result aligns with
    # daily_data on its original row index
    return rolled.reset_index(level=[0, 1], drop=True)


daily_data['quantity_rolling_3d'] = _rolling_mean_within_pair(3)
daily_data['quantity_rolling_7d'] = _rolling_mean_within_pair(7)
daily_data['quantity_rolling_30d'] = _rolling_mean_within_pair(30)

# Trend features (growth rates); the +1 keeps the denominator away from zero
daily_data['quantity_growth_1d'] = (daily_data['quantity_lag_1'] - daily_data['quantity_lag_2']) / (daily_data['quantity_lag_2'] + 1)
daily_data['quantity_growth_7d'] = (daily_data['quantity_lag_1'] - daily_data['quantity_lag_7']) / (daily_data['quantity_lag_7'] + 1)

# Days elapsed since the pair's previous row (its previous day with sales)
daily_data['days_since_last_sale'] = grouped['date'].diff().dt.days

# Fill NaN values with 0 for lag features (rows without enough history)
lag_columns = [col for col in daily_data.columns if 'lag' in col or 'rolling' in col or 'growth' in col or 'days_since' in col]
daily_data[lag_columns] = daily_data[lag_columns].fillna(0)

print(f"Created {len(lag_columns)} lag/trend features:")
for col in lag_columns:
    print(f"- {col}")


In [None]:
# Create store and product aggregate features (using only past data)
print("Creating aggregate features...")


def _prior_expanding_mean(frame, key, value_col):
    """Mean of `value_col` over all strictly earlier rows with the same `key`.

    `frame` must already be sorted chronologically within each key. The first
    row of every key has no history and yields NaN.

    The shifted values are grouped again before expanding: calling
    .expanding() directly on the result of a groupby shift would accumulate
    across key boundaries and average in other stores/products/categories.
    """
    previous = frame.groupby(key)[value_col].shift(1)
    running = previous.groupby(frame[key]).expanding(min_periods=1).mean()
    # Drop the group-key index level so the result aligns with `frame`
    return running.reset_index(level=0, drop=True)


# Store-level daily averages (excluding current day)
store_daily_avg = daily_data.groupby(['internal_store_id', 'date'])['total_quantity'].sum().reset_index()
store_daily_avg.columns = ['internal_store_id', 'date', 'store_daily_total']
store_daily_avg = store_daily_avg.sort_values(['internal_store_id', 'date'])
store_daily_avg['store_avg_quantity'] = _prior_expanding_mean(
    store_daily_avg, 'internal_store_id', 'store_daily_total'
)

# Product-level daily averages (excluding current day)
product_daily_avg = daily_data.groupby(['internal_product_id', 'date'])['total_quantity'].sum().reset_index()
product_daily_avg.columns = ['internal_product_id', 'date', 'product_daily_total']
product_daily_avg = product_daily_avg.sort_values(['internal_product_id', 'date'])
product_daily_avg['product_avg_quantity'] = _prior_expanding_mean(
    product_daily_avg, 'internal_product_id', 'product_daily_total'
)

# Category-level daily averages (excluding current day)
category_daily_avg = daily_data.groupby(['categoria', 'date'])['total_quantity'].sum().reset_index()
category_daily_avg.columns = ['categoria', 'date', 'category_daily_total']
category_daily_avg = category_daily_avg.sort_values(['categoria', 'date'])
category_daily_avg['category_avg_quantity'] = _prior_expanding_mean(
    category_daily_avg, 'categoria', 'category_daily_total'
)

# Merge aggregate features back onto the daily rows
daily_data = daily_data.merge(
    store_daily_avg[['internal_store_id', 'date', 'store_avg_quantity']],
    on=['internal_store_id', 'date'], how='left'
)

daily_data = daily_data.merge(
    product_daily_avg[['internal_product_id', 'date', 'product_avg_quantity']],
    on=['internal_product_id', 'date'], how='left'
)

daily_data = daily_data.merge(
    category_daily_avg[['categoria', 'date', 'category_avg_quantity']],
    on=['categoria', 'date'], how='left'
)

# Fill NaN values with 0 (first day of each store/product/category has no history)
daily_data[['store_avg_quantity', 'product_avg_quantity', 'category_avg_quantity']] = (
    daily_data[['store_avg_quantity', 'product_avg_quantity', 'category_avg_quantity']].fillna(0)
)

print("Created aggregate features:")
print("- Store average quantity (historical)")
print("- Product average quantity (historical)")
print("- Category average quantity (historical)")


In [None]:
# Prepare training data with proper time-series split
print("Preparing training data...")

# Define feature columns
categorical_features = ['categoria', 'marca', 'premise', 'categoria_pdv']
# Same-day aggregates (num_transactions, avg_quantity_per_transaction and the
# gross/net value and profit totals/means) are deliberately NOT used as
# features: they are computed from the very transactions that make up the
# target (total_quantity == num_transactions * avg_quantity_per_transaction),
# so they leak the answer during training and are unknown for a future date.
numerical_features = [
    'dayofweek', 'dayofyear', 'month', 'day', 'week', 'quarter',
    'dayofweek_sin', 'dayofweek_cos', 'dayofyear_sin', 'dayofyear_cos',
    'month_sin', 'month_cos', 'week_sin', 'week_cos',
    'is_weekend', 'is_month_start', 'is_month_end',
    'quantity_lag_1', 'quantity_lag_2', 'quantity_lag_3', 'quantity_lag_7',
    'quantity_rolling_3d', 'quantity_rolling_7d', 'quantity_rolling_30d',
    'quantity_growth_1d', 'quantity_growth_7d', 'days_since_last_sale',
    'store_avg_quantity', 'product_avg_quantity', 'category_avg_quantity'
]

all_features = categorical_features + numerical_features

# Ensure categorical features are strings
for col in categorical_features:
    daily_data[col] = daily_data[col].astype(str)

# Filter training data (skip first few days due to lag features)
min_date = daily_data['date'].min() + pd.Timedelta(days=7)  # Skip first 7 days
train_data = daily_data[daily_data['date'] >= min_date].copy()

# Time-based split: everything from split_date onwards is held out for validation
max_date = train_data['date'].max()
split_date = max_date - pd.Timedelta(days=30)

train_mask = train_data['date'] < split_date
val_mask = train_data['date'] >= split_date

train_part = train_data[train_mask]
val_part = train_data[val_mask]

X_train = train_part[all_features]
y_train = train_part['total_quantity']
X_val = val_part[all_features]
y_val = val_part['total_quantity']

# Apply log transformation to target (log1p; inverted with expm1 after prediction)
y_train_log = np.log1p(y_train)
y_val_log = np.log1p(y_val)

print(f"Training data: {len(X_train):,} samples")
print(f"Validation data: {len(X_val):,} samples")
print(f"Training date range: {train_part['date'].min()} to {train_part['date'].max()}")
print(f"Validation date range: {val_part['date'].min()} to {val_part['date'].max()}")
print(f"Features: {len(all_features)} ({len(categorical_features)} categorical, {len(numerical_features)} numerical)")


In [None]:
# Train CatBoost model optimized for daily forecasting
print("Training CatBoost model...")

# Get categorical feature indices (positions within all_features)
cat_feature_indices = [all_features.index(col) for col in categorical_features]

# Use about 90% of the available cores. os.cpu_count() returns None when the
# count cannot be determined, so fall back to a single thread in that case.
n_threads = max(1, int((os.cpu_count() or 1) * 0.9))
print(f"Using {n_threads} CPU threads")

# Model parameters optimized for daily data
model_params = {
    'iterations': 2000,
    'learning_rate': 0.02,
    'depth': 8,
    'l2_leaf_reg': 10,
    'bootstrap_type': 'Bayesian',
    'bagging_temperature': 0.8,
    'random_strength': 1.0,
    'border_count': 200,
    'loss_function': 'MAE',
    'eval_metric': 'MAE',
    'random_seed': 42,
    'verbose': 200,
    'early_stopping_rounds': 300,
    'thread_count': n_threads,
    'used_ram_limit': '12GB',
}

print("Model parameters:")
for key, value in model_params.items():
    print(f"  {key}: {value}")

# Initialize and train model
model = CatBoostRegressor(**model_params)

# "\n" (not "\\n") so a blank line is printed rather than a literal backslash-n
print("\nStarting training...")
# The target is in log1p space, so MAE is optimised on the log scale
model.fit(
    X_train, y_train_log,
    eval_set=(X_val, y_val_log),
    cat_features=cat_feature_indices,
    early_stopping_rounds=300,
    verbose=200,
    use_best_model=True
)

print("\nModel training completed!")


In [None]:
# Score the fitted model on both the training and the validation split
print("Evaluating model performance...")

# The model was fitted on log1p(target), so its raw output is on the log scale
train_pred_log = model.predict(X_train)
val_pred_log = model.predict(X_val)

# Ground truth on the original scale, as plain arrays
train_actual = y_train.values
val_actual = y_val.values

# Undo the log transform, then clamp negative quantities to zero
train_pred = np.clip(np.expm1(train_pred_log), 0, None)
val_pred = np.clip(np.expm1(val_pred_log), 0, None)

def calculate_wmape(y_true, y_pred):
    """Return the weighted absolute percentage error, in percent.

    Computed as ``sum(|y_true - y_pred|) / sum(y_true) * 100``.

    Args:
        y_true: Actual values; any array-like (list, NumPy array, Series).
        y_pred: Predicted values, same length as ``y_true``.

    Returns:
        The WMAPE as a percentage, or 0 when the actuals do not sum to a
        positive number (the ratio is undefined in that case).
    """
    # Convert up front so plain lists work and values are compared by position
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    total_actual = np.sum(y_true)
    if not total_actual > 0:
        return 0
    return np.sum(np.abs(y_true - y_pred)) / total_actual * 100

def calculate_mape(y_true, y_pred):
    """Return the mean absolute percentage error, in percent.

    Only observations with a strictly positive actual value are included,
    because the relative error is undefined when the actual is zero.

    Args:
        y_true: Actual values; any array-like (list, NumPy array, Series).
        y_pred: Predicted values, same length as ``y_true``.

    Returns:
        The MAPE as a percentage, or 0 when no actual value is positive.
    """
    # Convert up front so plain lists work and values are compared by position
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    mask = y_true > 0
    if not mask.any():
        return 0
    return np.mean(np.abs((y_true[mask] - y_pred[mask]) / y_true[mask])) * 100

# Compute the error metrics on the original (non-log) scale
train_wmape = calculate_wmape(train_actual, train_pred)
val_wmape = calculate_wmape(val_actual, val_pred)
train_mape = calculate_mape(train_actual, train_pred)
val_mape = calculate_mape(val_actual, val_pred)

train_mae = np.mean(np.abs(train_actual - train_pred))
val_mae = np.mean(np.abs(val_actual - val_pred))

# "\n" (not "\\n") so a blank line is printed rather than a literal backslash-n
print("\n" + "="*50)
print("MODEL PERFORMANCE METRICS")
print("="*50)
print(f"Training WMAPE: {train_wmape:.2f}%")
print(f"Validation WMAPE: {val_wmape:.2f}%")
print(f"Training MAPE: {train_mape:.2f}%")
print(f"Validation MAPE: {val_mape:.2f}%")
print(f"Training MAE: {train_mae:.2f}")
print(f"Validation MAE: {val_mae:.2f}")

# The train/validation gap (in percentage points) is used as an overfitting signal
wmape_diff = abs(val_wmape - train_wmape)
mape_diff = abs(val_mape - train_mape)
print(f"\nOverfitting Check:")
print(f"WMAPE difference: {wmape_diff:.2f}%")
print(f"MAPE difference: {mape_diff:.2f}%")

if wmape_diff < 3 and mape_diff < 15:
    print("Status: Good generalization")
elif wmape_diff < 6 and mape_diff < 25:
    print("Status: Moderate overfitting")
else:
    print("Status: High overfitting risk")

print("="*50)


In [None]:
# Generate daily predictions for January 2023 (5 weeks)
print("Generating daily predictions for January 2023...")

# Get unique store-product combinations from recent data (last 60 days)
recent_date = daily_data['date'].max() - pd.Timedelta(days=60)
recent_data = daily_data[daily_data['date'] >= recent_date]
active_pairs = recent_data.groupby(['internal_store_id', 'internal_product_id']).agg({
    'total_quantity': 'sum',
    'date': 'count'
}).reset_index()
active_pairs.columns = ['internal_store_id', 'internal_product_id', 'total_qty', 'days_active']

# Keep the highest-volume pairs first; 290,000 pairs x 5 weeks = 1.45M weekly rows
max_pairs = 290000  # Stay within 1.5M prediction limit
active_pairs = active_pairs.sort_values(['total_qty', 'days_active'], ascending=[False, False])
selected_pairs = active_pairs.head(max_pairs)[['internal_store_id', 'internal_product_id']]

print(f"Selected {len(selected_pairs):,} active store-product pairs")

# Get latest features for each pair (the row with the most recent date)
latest_features = daily_data.loc[daily_data.groupby(['internal_store_id', 'internal_product_id'])['date'].idxmax()]
latest_features = latest_features.merge(selected_pairs, on=['internal_store_id', 'internal_product_id'], how='inner')

print(f"Latest features available for {len(latest_features):,} pairs")

# Generate predictions for 35 consecutive days starting 2023-01-01 (5 weeks)
january_dates = pd.date_range('2023-01-01', periods=35, freq='D')
all_daily_predictions = []

for i, pred_date in enumerate(january_dates):
    print(f"Predicting for {pred_date.strftime('%Y-%m-%d')} ({i+1}/{len(january_dates)})")

    # Start from each pair's latest observed row.
    # NOTE: only the calendar columns below are refreshed per date; every
    # other feature keeps the value from the pair's last observed day.
    pred_data = latest_features.copy()

    # Update temporal features for prediction date
    pred_data['date'] = pred_date
    pred_data['year'] = pred_date.year
    pred_data['month'] = pred_date.month
    pred_data['day'] = pred_date.day
    pred_data['dayofweek'] = pred_date.dayofweek
    pred_data['dayofyear'] = pred_date.dayofyear
    pred_data['week'] = pred_date.isocalendar().week
    # Derived from the date instead of hard-coded, so it stays correct if the
    # forecast window is ever moved or extended
    pred_data['quarter'] = pred_date.quarter

    # Update cyclical features
    pred_data['dayofweek_sin'] = np.sin(2 * np.pi * pred_data['dayofweek'] / 7)
    pred_data['dayofweek_cos'] = np.cos(2 * np.pi * pred_data['dayofweek'] / 7)
    pred_data['dayofyear_sin'] = np.sin(2 * np.pi * pred_data['dayofyear'] / 365)
    pred_data['dayofyear_cos'] = np.cos(2 * np.pi * pred_data['dayofyear'] / 365)
    pred_data['month_sin'] = np.sin(2 * np.pi * pred_data['month'] / 12)
    pred_data['month_cos'] = np.cos(2 * np.pi * pred_data['month'] / 12)
    pred_data['week_sin'] = np.sin(2 * np.pi * pred_data['week'] / 52)
    pred_data['week_cos'] = np.cos(2 * np.pi * pred_data['week'] / 52)

    # Update binary features
    pred_data['is_weekend'] = (pred_data['dayofweek'] >= 5).astype(int)
    pred_data['is_month_start'] = (pred_data['day'] <= 3).astype(int)
    pred_data['is_month_end'] = (pred_data['day'] >= 28).astype(int)

    # Make predictions: undo the log1p transform, clamp at zero, round to units
    X_pred = pred_data[all_features]
    pred_log = model.predict(X_pred)
    predictions = np.expm1(pred_log)
    predictions = np.maximum(0, predictions).round().astype(int)

    # Store daily predictions
    daily_pred_df = pd.DataFrame({
        'date': pred_date,
        'pdv': pred_data['internal_store_id'].astype(int),
        'produto': pred_data['internal_product_id'].astype(int),
        'quantidade_diaria': predictions
    })

    all_daily_predictions.append(daily_pred_df)

# Combine all daily predictions
daily_predictions_df = pd.concat(all_daily_predictions, ignore_index=True)

# "\n" (not "\\n") so a blank line is printed rather than a literal backslash-n
print(f"\nGenerated {len(daily_predictions_df):,} daily predictions")
print(f"Date range: {daily_predictions_df['date'].min()} to {daily_predictions_df['date'].max()}")
print(f"Total predicted quantity: {daily_predictions_df['quantidade_diaria'].sum():,}")
print(f"Average daily quantity per store-product: {daily_predictions_df['quantidade_diaria'].mean():.2f}")


In [None]:
# Roll the daily forecasts up into weekly totals per store-product
print("Aggregating daily predictions to weekly format...")

# Week number counted in 7-day blocks from 2023-01-01 (days 0-6 -> week 1, ...)
daily_predictions_df['week'] = ((daily_predictions_df['date'] - pd.Timestamp('2023-01-01')).dt.days // 7) + 1

# Aggregate by week-store-product
weekly_predictions = daily_predictions_df.groupby(['week', 'pdv', 'produto']).agg({
    'quantidade_diaria': 'sum'
}).reset_index()

# Rename for final output format
weekly_predictions.rename(columns={
    'week': 'semana',
    'quantidade_diaria': 'quantidade'
}, inplace=True)

# Ensure proper data types
weekly_predictions['semana'] = weekly_predictions['semana'].astype(int)
weekly_predictions['pdv'] = weekly_predictions['pdv'].astype(int)
weekly_predictions['produto'] = weekly_predictions['produto'].astype(int)
weekly_predictions['quantidade'] = weekly_predictions['quantidade'].astype(int)

# Sort by week, store, product
weekly_predictions = weekly_predictions.sort_values(['semana', 'pdv', 'produto'])

# "\n" (not "\\n") so a blank line is printed rather than a literal backslash-n
print(f"\nWeekly aggregation completed:")
print(f"Total weekly predictions: {len(weekly_predictions):,}")
print(f"Weeks covered: {weekly_predictions['semana'].min()} to {weekly_predictions['semana'].max()}")
print(f"Unique stores: {weekly_predictions['pdv'].nunique():,}")
print(f"Unique products: {weekly_predictions['produto'].nunique():,}")
print(f"Total predicted quantity: {weekly_predictions['quantidade'].sum():,}")

# Show sample of predictions
print(f"\nSample predictions:")
print(weekly_predictions.head(10))

# Weekly summary: number of rows, total and mean quantity per week
weekly_summary = weekly_predictions.groupby('semana').agg({
    'quantidade': ['count', 'sum', 'mean']
}).round(2)
weekly_summary.columns = ['predictions_count', 'total_quantity', 'avg_quantity']
print(f"\nWeekly summary:")
print(weekly_summary)


In [None]:
# Save predictions to CSV and Parquet files
print("Saving predictions...")

# Save weekly predictions (main output); the CSV is semicolon-separated
csv_filename = "sales_predictions_v3.csv"
parquet_filename = "sales_predictions_v3.parquet"

weekly_predictions.to_csv(csv_filename, sep=';', index=False, encoding='utf-8')
weekly_predictions.to_parquet(parquet_filename, index=False)

print(f"Weekly predictions saved:")
print(f"- CSV: {csv_filename}")
print(f"- Parquet: {parquet_filename}")
print(f"- Rows: {len(weekly_predictions):,}")
print(f"- Columns: {list(weekly_predictions.columns)}")

# Save daily predictions for analysis (optional)
daily_csv_filename = "sales_predictions_daily_v3.csv"
daily_predictions_df.to_csv(daily_csv_filename, sep=';', index=False, encoding='utf-8')
# "\n" (not "\\n") so a blank line is printed rather than a literal backslash-n
print(f"\nDaily predictions also saved to: {daily_csv_filename}")

print("\nForecasting completed successfully.")
print("Model trained on daily data with proper time-series validation.")
print("Predictions generated for 5 weeks of January 2023.")
