# Enhanced Baseline with Multiple Models & Ensemble

This notebook tests multiple algorithms on baseline features to find the best performer.

In [None]:
# Import libraries
import pandas as pd
import numpy as np
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Models
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder

# XGBoost and LightGBM
try:
    import xgboost as xgb
    XGBOOST_AVAILABLE = True
except ImportError:
    XGBOOST_AVAILABLE = False
    print("XGBoost not installed. Install with: pip install xgboost")

try:
    import lightgbm as lgb
    from lightgbm import LGBMRegressor
    LIGHTGBM_AVAILABLE = True
except ImportError:
    LIGHTGBM_AVAILABLE = False
    print("LightGBM not installed. Install with: pip install lightgbm")

print("Libraries imported!")

## 1. Load & Prepare Data

In [None]:
# Read the wide-format train and test tables from the working directory
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv')
)

def melt_wide_to_long(df, value_name='units_sold', id_cols=None):
    """Convert a wide table (one column per date) to long format.

    Every column that is not an identifier column is treated as a date
    column: its header is parsed with ``pd.to_datetime`` and its cells are
    coerced to numbers (unparseable cells become NaN).

    Args:
        df: Wide DataFrame containing the identifier columns plus one
            column per date.
        value_name: Name given to the melted value column.
        id_cols: Identifier column names. Defaults to
            ``['store_id', 'product_id']`` when omitted.

    Returns:
        DataFrame with the identifier columns, a datetime ``date`` column
        and a numeric ``value_name`` column.

    Raises:
        KeyError: If any identifier column is missing from ``df``.
    """
    id_cols = ['store_id', 'product_id'] if id_cols is None else list(id_cols)

    # Fail early with a readable message instead of a melt-internal error
    missing = [col for col in id_cols if col not in df.columns]
    if missing:
        raise KeyError(f"Identifier columns missing from DataFrame: {missing}")

    date_cols = [col for col in df.columns if col not in id_cols]
    melted = df.melt(id_vars=id_cols, value_vars=date_cols,
                     var_name='date', value_name=value_name)
    melted['date'] = pd.to_datetime(melted['date'])
    # errors='coerce' turns non-numeric cells into NaN rather than raising
    melted[value_name] = pd.to_numeric(melted[value_name], errors='coerce')
    return melted

# Convert data
train_long = melt_wide_to_long(train_df, 'units_sold')
test_long = melt_wide_to_long(test_df, 'target')

# Add basic calendar features (columns are added in place to both frames)
for df in [train_long, test_long]:
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month
    df['day'] = df['date'].dt.day
    df['dayofweek'] = df['date'].dt.dayofweek
    df['dayofyear'] = df['date'].dt.dayofyear

# Encode categoricals (encoders are fitted on the training ids only)
le_store = LabelEncoder()
le_product = LabelEncoder()
train_long['store_id_encoded'] = le_store.fit_transform(train_long['store_id'])
train_long['product_id_encoded'] = le_product.fit_transform(train_long['product_id'])


def _encode_with_unseen(values, encoder, unseen_code=-1):
    """Map `values` to the fitted encoder's integer codes.

    A label's code is its position in ``encoder.classes_``, so one dictionary
    lookup per row replaces a per-row ``encoder.transform([x])`` call plus a
    linear scan of ``classes_``. Labels the encoder never saw get
    ``unseen_code``.
    """
    code_by_label = {label: code for code, label in enumerate(encoder.classes_)}
    return values.map(code_by_label).fillna(unseen_code).astype('int64')


test_long['store_id_encoded'] = _encode_with_unseen(test_long['store_id'], le_store)
test_long['product_id_encoded'] = _encode_with_unseen(test_long['product_id'], le_product)

# Define features
feature_cols = ['store_id_encoded', 'product_id_encoded', 'year', 'month',
                'day', 'dayofweek', 'dayofyear']

print(f"Data prepared with {len(feature_cols)} features")
print(f"Train: {len(train_long)} samples")
print(f"Test: {len(test_long)} samples")

## 2. Train-Validation Split

In [None]:
# Time-based split: rows with an observed target only, and the most recent
# 30 days (relative to the latest observed date) are held out for validation.
has_target = train_long['units_sold'].notna()
train_clean = train_long[has_target]

max_date = train_clean['date'].max()
val_cutoff = max_date - pd.Timedelta(days=30)

val_mask = train_clean['date'].gt(val_cutoff)
train_mask = train_clean['date'].le(val_cutoff)

X_train = train_clean.loc[train_mask, feature_cols]
X_val = train_clean.loc[val_mask, feature_cols]
y_train = train_clean.loc[train_mask, 'units_sold']
y_val = train_clean.loc[val_mask, 'units_sold']

print(f"Train: {len(X_train)} samples")
print(f"Val: {len(X_val)} samples")

## 3. Train Multiple Models

In [None]:
# Dictionary to store models and results
# `models` maps a registry key to the fitted estimator; `results` collects
# [display name, validation MAE, validation R²] rows for the ranking tables.
models = {}
results = []


def _fit_and_score(label, estimator):
    """Fit `estimator` on the training split and score it on the validation split.

    Appends ``[label, mae, r2]`` to the module-level ``results`` list and
    prints both metrics.

    Args:
        label: Display name recorded in ``results``.
        estimator: Unfitted regressor exposing ``fit`` and ``predict``.

    Returns:
        tuple: ``(val_pred, mae, r2)`` for the validation split.
    """
    estimator.fit(X_train, y_train)
    val_pred = estimator.predict(X_val)
    mae = mean_absolute_error(y_val, val_pred)
    r2 = r2_score(y_val, val_pred)
    results.append([label, mae, r2])
    print(f"   MAE: {mae:.4f}, R²: {r2:.4f}\n")
    return val_pred, mae, r2


print("Training models...\n")

# 1. Linear Regression
print("1. Linear Regression...")
lr = LinearRegression()
lr_val_pred, lr_mae, lr_r2 = _fit_and_score('Linear Regression', lr)
models['LinearRegression'] = lr

# 2. Ridge Regression
print("2. Ridge Regression...")
ridge = Ridge(alpha=1.0)
ridge_val_pred, ridge_mae, ridge_r2 = _fit_and_score('Ridge Regression', ridge)
models['Ridge'] = ridge

# 3. Random Forest
print("3. Random Forest...")
rf = RandomForestRegressor(n_estimators=100, max_depth=15, random_state=42, n_jobs=-1)
rf_val_pred, rf_mae, rf_r2 = _fit_and_score('Random Forest', rf)
models['RandomForest'] = rf

# 4. Gradient Boosting
print("4. Gradient Boosting...")
gb = GradientBoostingRegressor(n_estimators=100, max_depth=5, random_state=42)
gb_val_pred, gb_mae, gb_r2 = _fit_and_score('Gradient Boosting', gb)
models['GradientBoosting'] = gb

In [None]:
# 5. XGBoost (if available)
if XGBOOST_AVAILABLE:
    print("5. XGBoost...")
    xgb_model = xgb.XGBRegressor(
        n_estimators=100,
        max_depth=6,
        learning_rate=0.1,
        random_state=42,
        n_jobs=-1
    )
    xgb_model.fit(X_train, y_train)
    models['XGBoost'] = xgb_model
    xgb_val_pred = xgb_model.predict(X_val)
    xgb_mae = mean_absolute_error(y_val, xgb_val_pred)
    xgb_r2 = r2_score(y_val, xgb_val_pred)
    # Drop any row left by an earlier run of this cell so re-running it does
    # not list XGBoost twice in the ranking (and twice in the top 3).
    results[:] = [row for row in results if row[0] != 'XGBoost']
    results.append(['XGBoost', xgb_mae, xgb_r2])
    print(f"   MAE: {xgb_mae:.4f}, R²: {xgb_r2:.4f}\n")
else:
    print("5. XGBoost - SKIPPED (not installed)\n")

In [None]:
# 6. LightGBM (if available)
if LIGHTGBM_AVAILABLE:
    print("6. LightGBM...")
    lgb_model = LGBMRegressor(
        n_estimators=100,
        max_depth=6,
        learning_rate=0.1,
        random_state=42,
        verbose=-1
    )
    lgb_model.fit(X_train, y_train)
    models['LightGBM'] = lgb_model
    lgb_val_pred = lgb_model.predict(X_val)
    lgb_mae = mean_absolute_error(y_val, lgb_val_pred)
    lgb_r2 = r2_score(y_val, lgb_val_pred)
    # Drop any row left by an earlier run of this cell so re-running it does
    # not list LightGBM twice in the ranking (and twice in the top 3).
    results[:] = [row for row in results if row[0] != 'LightGBM']
    results.append(['LightGBM', lgb_mae, lgb_r2])
    print(f"   MAE: {lgb_mae:.4f}, R²: {lgb_r2:.4f}\n")
else:
    print("6. LightGBM - SKIPPED (not installed)\n")

## 4. Model Comparison

In [None]:
# Create results dataframe
# Rank single models only. Later cells append 'Ensemble (Top 3)' and
# 'Weighted Ensemble' rows to `results`; if this cell is re-run afterwards
# those rows (and any duplicates) must not feed `best_mae` or the top-3 pick.
base_results = [row for row in results if 'Ensemble' not in row[0]]
results_df = pd.DataFrame(base_results, columns=['Model', 'MAE', 'R²'])
results_df = results_df.drop_duplicates('Model', keep='last').sort_values('MAE')

print("="*60)
print("MODEL COMPARISON RESULTS")
print("="*60)
print(results_df.to_string(index=False))

# Best model (first row after sorting by ascending MAE)
best_model_name = results_df.iloc[0]['Model']
best_mae = results_df.iloc[0]['MAE']
print(f"\n🏆 Best Model: {best_model_name} (MAE: {best_mae:.4f})")

## 5. Create Ensemble Model

In [None]:
# Simple ensemble: average of top 3 models
print("Creating Ensemble (Average of Top 3 Models)...\n")

# Display names used in `results` -> keys used in the `models` registry.
# Optional models (XGBoost / LightGBM) are only present in `models` when
# their training cell actually ran, so the lookup below skips them otherwise.
model_key_by_label = {
    'Linear Regression': 'LinearRegression',
    'Ridge Regression': 'Ridge',
    'Random Forest': 'RandomForest',
    'Gradient Boosting': 'GradientBoosting',
    'XGBoost': 'XGBoost',
    'LightGBM': 'LightGBM',
}
fitted_by_label = {
    label: models[key]
    for label, key in model_key_by_label.items()
    if key in models
}

# Get top 3 models by MAE, restricted to models that are actually fitted so
# the names and the predictions below always line up one-to-one.
rankable = results_df[results_df['Model'].isin(list(fitted_by_label))]
top_3_models = rankable.drop_duplicates('Model').head(3)['Model'].tolist()
print(f"Top 3 models: {top_3_models}")

if not top_3_models:
    raise RuntimeError(
        "No fitted base models found in results_df; run the training cells first."
    )

# Get predictions from top 3 models (same order as top_3_models)
ensemble_predictions = [
    fitted_by_label[model_name].predict(X_val) for model_name in top_3_models
]

# Average predictions
ensemble_val_pred = np.mean(ensemble_predictions, axis=0)
ensemble_mae = mean_absolute_error(y_val, ensemble_val_pred)
ensemble_r2 = r2_score(y_val, ensemble_val_pred)

print(f"\nEnsemble Performance:")
print(f"  MAE: {ensemble_mae:.4f}")
print(f"  R²: {ensemble_r2:.4f}")

# Add to results, replacing any row left by an earlier run of this cell
results[:] = [row for row in results if row[0] != 'Ensemble (Top 3)']
results.append(['Ensemble (Top 3)', ensemble_mae, ensemble_r2])

# Compare with best single model
if ensemble_mae < best_mae:
    print(f"\n✅ Ensemble improved by {best_mae - ensemble_mae:.4f} MAE!")
else:
    print(f"\n⚠️  Best single model is better by {ensemble_mae - best_mae:.4f} MAE")

In [None]:
# Weighted ensemble (weight by inverse of MAE)
print("\nCreating Weighted Ensemble...\n")

# Weights are paired positionally with ensemble_predictions, so both lists
# must describe the same models in the same order.
if len(ensemble_predictions) != len(top_3_models):
    raise ValueError(
        f"Have {len(ensemble_predictions)} prediction arrays for "
        f"{len(top_3_models)} model names; re-run the ensemble cell."
    )

# Look up each member's validation MAE by name (rather than by row position)
# so the weights stay aligned with the models that produced the predictions.
mae_by_model = results_df.drop_duplicates('Model').set_index('Model')['MAE']
top_3_mae = mae_by_model.loc[top_3_models].to_numpy(dtype=float)

# Calculate weights (inverse of MAE, normalized). The clamp keeps a perfect
# model (MAE == 0) from producing inf weights and NaN after normalisation.
weights = 1 / np.maximum(top_3_mae, np.finfo(float).eps)
weights = weights / weights.sum()

print(f"Weights: {dict(zip(top_3_models, weights.round(4)))}")

# Weighted average
weighted_val_pred = np.average(ensemble_predictions, axis=0, weights=weights)
weighted_mae = mean_absolute_error(y_val, weighted_val_pred)
weighted_r2 = r2_score(y_val, weighted_val_pred)

print(f"\nWeighted Ensemble Performance:")
print(f"  MAE: {weighted_mae:.4f}")
print(f"  R²: {weighted_r2:.4f}")

# Replace any row left by an earlier run of this cell before recording
results[:] = [row for row in results if row[0] != 'Weighted Ensemble']
results.append(['Weighted Ensemble', weighted_mae, weighted_r2])

## 6. Final Comparison

In [None]:
# Final results: every single model plus both ensembles, ranked by MAE.
# keep='last' retains the most recent score if a training cell was re-run
# and left more than one row for the same model in `results`.
final_results_df = pd.DataFrame(results, columns=['Model', 'MAE', 'R²'])
final_results_df = (
    final_results_df.drop_duplicates('Model', keep='last').sort_values('MAE')
)

print("="*60)
print("FINAL MODEL RANKING")
print("="*60)
print(final_results_df.to_string(index=False))

# Best overall (first row after sorting by ascending MAE)
best_overall = final_results_df.iloc[0]
print(f"\n🏆 BEST MODEL: {best_overall['Model']}")
print(f"   MAE: {best_overall['MAE']:.4f}")
print(f"   R²:  {best_overall['R²']:.4f}")

## 7. Train Best Model on Full Dataset & Create Submission

In [None]:
# Train best model on full dataset
X_full = train_clean[feature_cols]
y_full = train_clean['units_sold']

best_model_type = best_overall['Model']
print(f"Training {best_model_type} on full dataset...")

# One factory per display name, so the single-model path and the ensemble
# path build estimators from the same hyperparameters. Lambdas defer
# construction, and the optional libraries are only referenced when installed.
model_factories = {
    'Linear Regression': lambda: LinearRegression(),
    'Ridge Regression': lambda: Ridge(alpha=1.0),
    'Random Forest': lambda: RandomForestRegressor(
        n_estimators=100, max_depth=15, random_state=42, n_jobs=-1),
    'Gradient Boosting': lambda: GradientBoostingRegressor(
        n_estimators=100, max_depth=5, random_state=42),
}
if XGBOOST_AVAILABLE:
    model_factories['XGBoost'] = lambda: xgb.XGBRegressor(
        n_estimators=100, max_depth=6, learning_rate=0.1, random_state=42, n_jobs=-1)
if LIGHTGBM_AVAILABLE:
    model_factories['LightGBM'] = lambda: LGBMRegressor(
        n_estimators=100, max_depth=6, learning_rate=0.1, random_state=42, verbose=-1)

if 'Ensemble' in best_model_type:
    # For ensemble, train all top 3 models on full data
    print("Training ensemble components on full dataset...")
    # An unrecognised name must stop here: silently skipping it would leave
    # ensemble_models out of step with the weights computed for top_3_models.
    unknown = [name for name in top_3_models if name not in model_factories]
    if unknown:
        raise ValueError(f"Cannot rebuild ensemble components: {unknown}")
    ensemble_models = []
    for model_name in top_3_models:
        m = model_factories[model_name]()
        m.fit(X_full, y_full)
        ensemble_models.append(m)
    final_model = None  # Special case: predictions come from ensemble_models
elif best_model_type in model_factories:
    final_model = model_factories[best_model_type]()
else:
    # Unrecognised winner: fall back to a default-depth random forest
    print(f"Unknown model type '{best_model_type}'; falling back to Random Forest.")
    final_model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)

if final_model is not None:
    final_model.fit(X_full, y_full)
    print("Model trained!")

In [None]:
# Score the test rows with whichever model (or ensemble) ranked best
X_test = test_long[feature_cols]

if 'Ensemble' not in best_model_type:
    # Single-model case
    test_predictions = final_model.predict(X_test)
else:
    # Ensemble case: combine the component predictions, using the
    # validation-derived weights only for the weighted variant
    test_preds = [component.predict(X_test) for component in ensemble_models]
    use_weights = 'Weighted' in best_model_type
    test_predictions = (
        np.average(test_preds, axis=0, weights=weights)
        if use_weights
        else np.mean(test_preds, axis=0)
    )

# Negative predictions are floored at zero
test_predictions = np.maximum(0, test_predictions)

print(f"Generated {len(test_predictions)} predictions")
print(f"Range: {test_predictions.min():.2f} to {test_predictions.max():.2f}")
print(f"Mean: {test_predictions.mean():.2f}")

In [None]:
# Create submission
submission = test_long[['store_id', 'product_id', 'date']].copy()
submission['units_sold'] = test_predictions.round(2)
# Cast ids to str before concatenating: read_csv infers numeric dtypes for
# purely numeric id columns, and adding '_' to an integer column raises.
submission['id'] = (
    submission['store_id'].astype(str) + '_' +
    submission['product_id'].astype(str) + '_' +
    submission['date'].dt.strftime('%Y-%m-%d')
)
submission = submission[['id', 'units_sold']]

# Save with model name in filename
# NOTE: ensemble names keep their parentheses, e.g. 'Ensemble (Top 3)'
# becomes 'submission_ensemble_(top_3).csv'.
filename = f"submission_{best_model_type.replace(' ', '_').lower()}.csv"
submission.to_csv(filename, index=False)

print(f"\nSubmission saved: {filename}")
print(f"Shape: {submission.shape}")
print("\nFirst 5 rows:")
print(submission.head())

## Summary

This notebook compared multiple models on baseline features:

**Models Tested:**
- Linear Regression
- Ridge Regression
- Random Forest
- Gradient Boosting
- XGBoost (if installed)
- LightGBM (if installed)
- Ensemble (average of top 3)
- Weighted Ensemble (weighted by inverse MAE)

**Results:**
- See the 'FINAL MODEL RANKING' table above
- Best model is automatically selected and used for submission

**Next Steps:**
1. Note the baseline MAE from the best model
2. Add feature engineering (lags, rolling, external data)
3. See if you can beat this baseline score!