# Baseline Supply Chain Demand Forecasting

This is a simplified baseline notebook that uses no feature engineering beyond basic calendar features (see Section 3), in order to establish a performance baseline.

In [None]:
# Import basic libraries: pandas/numpy for data handling, warnings to control
# diagnostic output.
import pandas as pd
import numpy as np
# NOTE(review): `datetime` is not referenced in the cells visible here —
# confirm it is needed before keeping it.
from datetime import datetime
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation notices; consider narrowing the filter.
warnings.filterwarnings('ignore')

# For modeling: the regressor used as the baseline and the three metrics
# (MAE, MSE, R²) used to score it.
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Confirmation that the cell ran to completion.
print("Libraries imported successfully!")

## 1. Load Data

In [None]:
# Read the train and test tables from the working directory
train_df, test_df = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

# Report table sizes, then peek at the first five and last three column names
print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")
print(f"\nTrain columns: {list(train_df.columns[:5])}... {list(train_df.columns[-3:])}")

## 2. Convert Wide to Long Format

In [None]:
def melt_wide_to_long(df, value_name='units_sold'):
    """Reshape a wide table (one column per date) into one row per date.

    Every column other than ``store_id`` and ``product_id`` is treated as a
    date column: its header becomes a value in the ``date`` column and its
    cells go into the ``value_name`` column.

    Args:
        df: Wide DataFrame containing ``store_id`` and ``product_id`` plus
            the columns to unpivot.
        value_name: Name of the output column that holds the melted values.

    Returns:
        DataFrame with columns ``store_id``, ``product_id``, ``date``
        (parsed with ``pd.to_datetime``) and ``value_name`` (numeric, with
        unparseable entries coerced to NaN).
    """
    key_columns = ['store_id', 'product_id']
    # Anything that is not a key column is unpivoted
    wide_columns = [name for name in df.columns if name not in key_columns]

    long_df = pd.melt(
        df,
        id_vars=key_columns,
        value_vars=wide_columns,
        var_name='date',
        value_name=value_name,
    )

    # Former column headers become timestamps; values become numbers
    long_df['date'] = pd.to_datetime(long_df['date'])
    long_df[value_name] = pd.to_numeric(long_df[value_name], errors='coerce')

    return long_df

# Convert train and test from wide to long; the melted values are stored in
# 'units_sold' for train and in 'target' for test.
train_long = melt_wide_to_long(train_df, 'units_sold')
test_long = melt_wide_to_long(test_df, 'target')

# Sanity checks on the reshaped frames: row/column counts, the earliest and
# latest date in each frame, and the first rows of the training data.
print(f"Train long shape: {train_long.shape}")
print(f"Test long shape: {test_long.shape}")
print(f"\nTrain date range: {train_long['date'].min()} to {train_long['date'].max()}")
print(f"Test date range: {test_long['date'].min()} to {test_long['date'].max()}")
print(f"\nSample train data:")
print(train_long.head())

## 3. Minimal Feature Engineering (Only Basic Time Features)

In [None]:
# Derive calendar fields only (no lag, no rolling, no complex features)
def add_basic_time_features(df):
    """Return a copy of ``df`` extended with calendar columns.

    Reads the datetime column ``date`` and appends ``year``, ``month``,
    ``day``, ``dayofweek`` and ``dayofyear``. The input frame is left
    unmodified.
    """
    stamps = df['date'].dt
    return df.assign(
        year=stamps.year,
        month=stamps.month,
        day=stamps.day,
        dayofweek=stamps.dayofweek,
        dayofyear=stamps.dayofyear,
    )

# Apply to both datasets so train and test carry the same calendar columns
train_features = add_basic_time_features(train_long)
test_features = add_basic_time_features(test_long)

print(f"Added basic time features")
# List every train column except the two IDs, the date and the target,
# i.e. the columns that were just derived.
print(f"Features: {[c for c in train_features.columns if c not in ['store_id', 'product_id', 'date', 'units_sold']]}")

## 4. Prepare Data for Training

In [None]:
# Encode categorical variables
from sklearn.preprocessing import LabelEncoder

le_store = LabelEncoder()
le_product = LabelEncoder()

# Fit on train only, so the integer codes are defined by the training data
train_features['store_id_encoded'] = le_store.fit_transform(train_features['store_id'])
train_features['product_id_encoded'] = le_product.fit_transform(train_features['product_id'])

# For test, handle unseen categories with a lookup table built once per
# encoder. LabelEncoder assigns code i to classes_[i], so the dict yields the
# same codes as transform() without calling transform() and scanning classes_
# for every single row of the (long) test frame.
store_codes = {label: code for code, label in enumerate(le_store.classes_)}
product_codes = {label: code for code, label in enumerate(le_product.classes_)}

# IDs missing from the lookup map to NaN; replace those with the sentinel -1
# and restore an integer dtype.
test_features['store_id_encoded'] = (
    test_features['store_id'].map(store_codes).fillna(-1).astype('int64')
)
test_features['product_id_encoded'] = (
    test_features['product_id'].map(product_codes).fillna(-1).astype('int64')
)

print("Categorical variables encoded")

In [None]:
# Model inputs: the two encoded IDs plus calendar fields (no lag, no rolling)
feature_cols = [
    'store_id_encoded', 'product_id_encoded',
    'year', 'month', 'day', 'dayofweek', 'dayofyear',
]

print(f"Using {len(feature_cols)} features: {feature_cols}")

# Keep only the rows whose target value is present
train_clean = train_features[train_features['units_sold'].notna()]

X = train_clean.loc[:, feature_cols]
y = train_clean.loc[:, 'units_sold']

print(f"\nTraining samples: {len(X)}")
print(f"Target range: {y.min():.0f} to {y.max():.0f}")

## 5. Train-Validation Split (Time-based)

In [None]:
# Hold out the final 30 days of the labelled period for validation
max_date = train_clean['date'].max()
val_cutoff = max_date - pd.Timedelta(days=30)

# Rows on or before the cutoff train the model; later rows validate it
train_mask = train_clean['date'].le(val_cutoff)
val_mask = train_clean['date'].gt(val_cutoff)

X_train, y_train = X.loc[train_mask], y.loc[train_mask]
X_val, y_val = X.loc[val_mask], y.loc[val_mask]

print(f"Training period: {train_clean.loc[train_mask, 'date'].min()} to {train_clean.loc[train_mask, 'date'].max()}")
print(f"Validation period: {train_clean.loc[val_mask, 'date'].min()} to {train_clean.loc[val_mask, 'date'].max()}")
print(f"\nTraining samples: {len(X_train)}")
print(f"Validation samples: {len(X_val)}")

## 6. Train Baseline Model

In [None]:
# Fit a deliberately small Random Forest as the baseline
print("Training baseline Random Forest model...")

# 50 trees keeps training fast; fit() returns the estimator itself
model = RandomForestRegressor(
    n_estimators=50, max_depth=10, random_state=42, n_jobs=-1
).fit(X_train, y_train)
print("Model trained!")

## 7. Evaluate Baseline Model

In [None]:
# Make predictions on both splits so train and validation error can be compared
train_pred = model.predict(X_train)
val_pred = model.predict(X_val)

# Calculate metrics
train_mae = mean_absolute_error(y_train, train_pred)
val_mae = mean_absolute_error(y_val, val_pred)

train_rmse = np.sqrt(mean_squared_error(y_train, train_pred))
val_rmse = np.sqrt(mean_squared_error(y_val, val_pred))

train_r2 = r2_score(y_train, train_pred)
val_r2 = r2_score(y_val, val_pred)

# Validation-to-training error ratio, computed once. A zero training MAE
# (perfect fit) would otherwise divide by zero; treat it as an infinite ratio
# unless the validation error is zero as well.
if train_mae > 0:
    mae_ratio = val_mae / train_mae
elif val_mae > 0:
    mae_ratio = float('inf')
else:
    mae_ratio = 1.0

print("="*60)
print("BASELINE MODEL PERFORMANCE (NO FEATURE ENGINEERING)")
print("="*60)
print(f"\nTraining Set:")
print(f"  MAE:  {train_mae:.4f}")
print(f"  RMSE: {train_rmse:.4f}")
print(f"  R²:   {train_r2:.4f}")

print(f"\nValidation Set:")
print(f"  MAE:  {val_mae:.4f}")
print(f"  RMSE: {val_rmse:.4f}")
print(f"  R²:   {val_r2:.4f}")

# Heuristic: flag the model when validation error exceeds training error by
# more than 20%.
print(f"\nOverfitting Check:")
print(f"  Val MAE / Train MAE = {mae_ratio:.2f}")
if mae_ratio > 1.2:
    print("  ⚠️  Model may be overfitting")
else:
    print("  ✅ Good generalization")

## 8. Feature Importance

In [None]:
# Pair each input column with the fitted model's feature_importances_ value
importance_df = pd.DataFrame({
    'feature': feature_cols,
    'importance': model.feature_importances_,
})
# Most important feature first
importance_df = importance_df.sort_values(by='importance', ascending=False)

print("Feature Importance:")
print(importance_df.to_string(index=False))

## 9. Generate Predictions for Test Set

In [None]:
# Refit on every labelled row (training + validation periods) for submission
print("Training on full dataset...")
model.fit(X, y)

# Score the test rows using the same feature columns as training
X_test = test_features.loc[:, feature_cols]

# Predict, then clamp negative forecasts to zero
test_predictions = np.clip(model.predict(X_test), 0, None)

print(f"Generated {len(test_predictions)} predictions")
print(f"Prediction range: {test_predictions.min():.2f} to {test_predictions.max():.2f}")
print(f"Mean prediction: {test_predictions.mean():.2f}")

## 10. Create Submission

In [None]:
# Create submission dataframe: one row per (store, product, date) in the test
# set, carrying the model's prediction
submission = test_features[['store_id', 'product_id', 'date']].copy()
submission['units_sold'] = test_predictions

# Create ID column as "<store_id>_<product_id>_<YYYY-MM-DD>". The IDs are cast
# to str first: read_csv infers an integer dtype for purely numeric IDs, and
# adding '_' to an integer column raises a TypeError.
submission['id'] = (
    submission['store_id'].astype(str) + '_' +
    submission['product_id'].astype(str) + '_' +
    submission['date'].dt.strftime('%Y-%m-%d')
)

# Round predictions to two decimals
submission['units_sold'] = submission['units_sold'].round(2)

# Keep only required columns
submission = submission[['id', 'units_sold']]

print(f"Submission shape: {submission.shape}")
print("\nFirst 5 rows:")
print(submission.head())

# Save submission (without the DataFrame index)
submission.to_csv('submission_baseline.csv', index=False)
print("\nSaved to: submission_baseline.csv")

## Summary

This baseline model:
- Uses only 7 basic features (store_id, product_id, year, month, day, dayofweek, dayofyear)
- NO lag features
- NO rolling statistics
- NO external data (prices, weather, promotions, etc.)
- NO advanced feature engineering

**Baseline Performance:**
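- Validation MAE: see the `MAE` line under "Validation Set" in the Section 7 output (Markdown cells do not evaluate `{val_mae:.4f}`)
- Validation R²: see the `R²` line under "Validation Set" in the Section 7 output (Markdown cells do not evaluate `{val_r2:.4f}`)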

This is your starting point. Now you can:
1. Add lag features and see if MAE improves
2. Add rolling statistics
3. Include external data (prices, discounts, etc.)
4. Use more sophisticated models

Keep a change only if it lowers the validation MAE relative to this baseline; not every added feature or model will help.