In [None]:
"""
BlueBikes Demand Forecasting using Random Forest Regressor
Simplified and optimized version for debugging and fast execution
"""
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import time
import pickle
import warnings
warnings.filterwarnings('ignore')

# Seed NumPy's global RNG so NumPy-generated random data (e.g. the synthetic
# fallback data created in the __main__ guard) is repeatable between runs.
# NOTE: the scikit-learn calls in this file pass random_state=42 explicitly.
np.random.seed(42)

def load_and_check_data(filepath):
    """Read the CSV at ``filepath`` and print basic sanity statistics.

    Reports the frame's shape and (shallow) memory footprint, then summary
    statistics of the ``demand`` target column. Cells holding +/-inf are
    reported and replaced with NaN.

    Args:
        filepath: Path (or anything ``pd.read_csv`` accepts) of the dataset.

    Returns:
        The loaded DataFrame, or ``None`` when the ``demand`` column is
        missing or when any exception is raised while loading/checking
        (the exception is printed, not propagated).
    """
    rule = "=" * 60
    print(rule)
    print("Loading and Checking Data")
    print(rule)

    try:
        frame = pd.read_csv(filepath)
        print("✓ Data loaded successfully")
        print(f"  Shape: {frame.shape}")
        megabytes = frame.memory_usage().sum() / 1024**2
        print(f"  Memory usage: {megabytes:.2f} MB")

        # Without the target column nothing downstream can work.
        if 'demand' not in frame.columns:
            print("ERROR: 'demand' column not found!")
            print(f"Available columns: {frame.columns.tolist()}")
            return None

        # Each statistic is computed right where it is printed so that a
        # failure part-way through leaves the same partial output behind.
        target = frame['demand']
        print("\nTarget variable (demand) statistics:")
        print(f"  Mean: {target.mean():.2f}")
        print(f"  Std: {target.std():.2f}")
        print(f"  Min: {target.min()}")
        print(f"  Max: {target.max()}")
        print(f"  Nulls: {target.isna().sum()}")

        # Per-column flag: does the column contain +inf or -inf anywhere?
        has_inf = frame.isin([np.inf, -np.inf]).any()
        inf_cols = has_inf[has_inf].index.tolist()
        if inf_cols:
            print(f"\nWARNING: Infinite values found in columns: {inf_cols}")
            # Turn infinities into NaN so later imputation can handle them.
            frame = frame.replace([np.inf, -np.inf], np.nan)

        return frame

    except Exception as err:
        print(f"ERROR loading data: {err}")
        return None

def prepare_features(df):
    """Build the numeric feature matrix and the target vector from ``df``.

    The target and metadata columns (``demand``, ``timestamp``,
    ``station_id``, ``weather_category``, ``station_type``) are excluded,
    and of the remaining columns only those with a numeric dtype are kept.
    Missing values are filled with the column median; anything still
    missing afterwards (e.g. an all-NaN column) is filled with 0.

    Args:
        df: DataFrame that contains a ``demand`` column.

    Returns:
        Tuple ``(X, y, numeric_cols)``: a copy of the numeric feature
        columns, a copy of ``df['demand']``, and the list of feature names.
    """
    print("\n" + "="*60)
    print("Preparing Features")
    print("="*60)

    # Target + metadata columns that must never be used as predictors.
    non_features = {'demand', 'timestamp', 'station_id', 'weather_category', 'station_type'}
    candidates = [name for name in df.columns if name not in non_features]
    print(f"Found {len(candidates)} feature columns")

    # Keep only the numeric candidates.
    numeric_only = df[candidates].select_dtypes(include=[np.number])
    numeric_cols = numeric_only.columns.tolist()
    print(f"Using {len(numeric_cols)} numeric features")

    X = df[numeric_cols].copy()
    y = df['demand'].copy()

    # First pass: median imputation, column by column.
    if X.isna().to_numpy().any():
        print("Missing values found. Filling with median...")
        medians = X.median()
        X = X.fillna(medians)

    # Second pass: columns whose median is itself NaN fall back to zero.
    if X.isna().to_numpy().any():
        print("WARNING: Still have NaN values after filling")
        X = X.fillna(0)

    print(f"Final feature matrix shape: {X.shape}")
    print(f"Final target shape: {y.shape}")

    return X, y, numeric_cols

def train_simple_baseline(X_train, y_train, X_test, y_test):
    """Fit a fixed-configuration Random Forest and report train/test error.

    The forest uses 100 trees with ``max_depth=10`` and
    ``min_samples_split=5``, ``random_state=42``, all available cores
    (``n_jobs=-1``) and ``verbose=1`` so progress is shown during fitting.

    Args:
        X_train: Training feature matrix.
        y_train: Training target values.
        X_test: Held-out feature matrix.
        y_test: Held-out target values.

    Returns:
        Tuple ``(model, train_metrics, test_metrics)`` where each metrics
        dict has the keys ``'RMSE'``, ``'MAE'`` and ``'R2'``.
    """
    print("\n" + "="*60)
    print("Training Simple Baseline Random Forest")
    print("="*60)

    # Single source of truth for the hyper-parameters: the banner below is
    # generated from this dict, so the printed values cannot drift from the
    # ones actually used (the message previously claimed n_estimators=10
    # while the model was built with 100 trees).
    params = {
        'n_estimators': 100,
        'max_depth': 10,      # Limited depth
        'min_samples_split': 5,
    }
    print(
        f"Using baseline parameters: n_estimators={params['n_estimators']}, "
        f"max_depth={params['max_depth']}"
    )

    model = RandomForestRegressor(
        **params,
        random_state=42,
        n_jobs=-1,
        verbose=1  # Show progress
    )

    print("Starting training...")
    start_time = time.time()

    # Plain blocking fit; wall-clock time is measured but no timeout applies.
    model.fit(X_train, y_train)

    train_time = time.time() - start_time
    print(f"Training completed in {train_time:.2f} seconds")

    # Make predictions
    print("Making predictions...")
    train_pred = model.predict(X_train)
    test_pred = model.predict(X_test)

    # Calculate metrics
    train_metrics = {
        'RMSE': np.sqrt(mean_squared_error(y_train, train_pred)),
        'MAE': mean_absolute_error(y_train, train_pred),
        'R2': r2_score(y_train, train_pred)
    }

    test_metrics = {
        'RMSE': np.sqrt(mean_squared_error(y_test, test_pred)),
        'MAE': mean_absolute_error(y_test, test_pred),
        'R2': r2_score(y_test, test_pred)
    }

    print(f"\nTrain Metrics: RMSE={train_metrics['RMSE']:.2f}, MAE={train_metrics['MAE']:.2f}, R2={train_metrics['R2']:.3f}")
    print(f"Test Metrics:  RMSE={test_metrics['RMSE']:.2f}, MAE={test_metrics['MAE']:.2f}, R2={test_metrics['R2']:.3f}")

    return model, train_metrics, test_metrics

def train_optimized_model(X_train, y_train, X_test, y_test):
    """Pick the best of three hand-written Random Forest configurations.

    20% of the training data is split off (``random_state=42``) as a
    validation set. Each candidate is fitted on the remaining 80% and scored
    by validation RMSE; the first candidate with the lowest RMSE wins. The
    winning estimator is then re-fitted on the full training set and
    evaluated on both the training and the test data.

    Args:
        X_train: Training feature matrix.
        y_train: Training target values.
        X_test: Held-out feature matrix.
        y_test: Held-out target values.

    Returns:
        Tuple ``(best_model, train_metrics, test_metrics, best_params)``;
        each metrics dict has the keys ``'RMSE'``, ``'MAE'`` and ``'R2'``.
    """

    def score(y_true, y_pred):
        # RMSE / MAE / R2 of one set of predictions.
        return {
            'RMSE': np.sqrt(mean_squared_error(y_true, y_pred)),
            'MAE': mean_absolute_error(y_true, y_pred),
            'R2': r2_score(y_true, y_pred)
        }

    print("\n" + "="*60)
    print("Training Optimized Random Forest")
    print("="*60)

    # Hand-picked candidate configurations.
    candidates = [
        {'n_estimators': 50, 'max_depth': 20, 'min_samples_split': 10},
        {'n_estimators': 100, 'max_depth': 15, 'min_samples_split': 5},
        {'n_estimators': 50, 'max_depth': None, 'min_samples_split': 10, 'max_features': 0.5}
    ]
    n_candidates = len(candidates)

    # Hold out part of the training data for model selection.
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42
    )

    best_score, best_model, best_params = float('inf'), None, None

    for position, config in enumerate(candidates, start=1):
        print(f"\nTesting parameter set {position}/{n_candidates}: {config}")

        forest = RandomForestRegressor(**config, random_state=42, n_jobs=-1)

        began = time.time()
        forest.fit(X_fit, y_fit)
        elapsed = time.time() - began

        val_rmse = np.sqrt(mean_squared_error(y_val, forest.predict(X_val)))

        print(f"  Training time: {elapsed:.2f}s")
        print(f"  Validation RMSE: {val_rmse:.2f}")

        # Strict comparison: on a tie the earlier candidate is kept. Only
        # the current best estimator stays referenced.
        if val_rmse < best_score:
            best_score, best_model, best_params = val_rmse, forest, config

    print(f"\nBest parameters: {best_params}")
    print(f"Best validation RMSE: {best_score:.2f}")

    # The winner has only seen 80% of the training data so far.
    print("\nRetraining best model on full training set...")
    best_model.fit(X_train, y_train)

    train_metrics = score(y_train, best_model.predict(X_train))
    test_metrics = score(y_test, best_model.predict(X_test))

    print(f"\nFinal Train Metrics: RMSE={train_metrics['RMSE']:.2f}, MAE={train_metrics['MAE']:.2f}, R2={train_metrics['R2']:.3f}")
    print(f"Final Test Metrics:  RMSE={test_metrics['RMSE']:.2f}, MAE={test_metrics['MAE']:.2f}, R2={test_metrics['R2']:.3f}")

    return best_model, train_metrics, test_metrics, best_params

def analyze_feature_importance(model, feature_names, top_n=15):
    """Rank features by the model's importances and print the leaders.

    Args:
        model: Object exposing a ``feature_importances_`` attribute.
        feature_names: Names matching ``model.feature_importances_`` in
            order and length.
        top_n: How many of the highest-ranked features to print.

    Returns:
        DataFrame with the columns ``feature`` and ``importance`` holding
        every feature, sorted by importance in descending order.
    """
    rule = "=" * 60
    print("\n" + rule)
    print(f"Top {top_n} Feature Importances")
    print(rule)

    table = pd.DataFrame({
        'feature': feature_names,
        'importance': model.feature_importances_
    })
    importance_df = table.sort_values('importance', ascending=False)

    # Walk the two columns of the leading rows side by side.
    leaders = importance_df.head(top_n)
    for name, weight in zip(leaders['feature'], leaders['importance']):
        print(f"{name:40}: {weight:.4f}")

    return importance_df

def compare_models(baseline_metrics, optimized_metrics):
    """Print a side-by-side table of baseline vs. optimized test metrics.

    For ``RMSE`` and ``MAE`` (lower is better) the improvement is the
    relative reduction in percent of the baseline value. For ``R2`` (higher
    is better) it is the absolute difference multiplied by 100, i.e.
    percentage points. Positive numbers always mean the optimized model is
    better and are printed with a leading ``+``.

    Args:
        baseline_metrics: Dict with the keys ``'RMSE'``, ``'MAE'``, ``'R2'``.
        optimized_metrics: Dict with the same keys for the optimized model.

    Returns:
        None. The table is written to stdout.
    """
    print("\n" + "="*60)
    print("Model Comparison")
    print("="*60)

    print("\n{:<20} {:>15} {:>15} {:>15}".format("Metric", "Baseline", "Optimized", "Improvement"))
    print("-" * 70)

    for metric in ('RMSE', 'MAE', 'R2'):
        baseline_val = baseline_metrics[metric]
        optimized_val = optimized_metrics[metric]

        if metric == 'R2':
            improvement = (optimized_val - baseline_val) * 100
        elif baseline_val == 0:
            # A relative change against a zero baseline is undefined; report
            # that instead of dividing by zero.
            improvement = None
        else:
            improvement = ((baseline_val - optimized_val) / baseline_val) * 100

        if improvement is None:
            improvement_text = "n/a"
        else:
            sign = "+" if improvement > 0 else ""
            improvement_text = f"{sign}{improvement:.1f}%"

        # The improvement is rendered to text first and then right-aligned
        # in the same 15-character column as its header, so rows line up.
        print("{:<20} {:>15.3f} {:>15.3f} {:>15}".format(
            f"Test {metric}:", baseline_val, optimized_val, improvement_text
        ))

def save_model(model, filename):
    """Pickle ``model`` into the file ``filename`` and print a confirmation.

    An existing file of the same name is overwritten.
    """
    with open(filename, mode='wb') as handle:
        pickle.Pickler(handle).dump(model)
    print(f"✓ Model saved to {filename}")

def main(filepath='bluebikes_ml_ready.csv'):
    """Run the whole pipeline: load, split, train, save, compare, rank.

    Steps: load and sanity-check the CSV, build the numeric feature matrix,
    make an 80/20 random train/test split, train the baseline and the
    optimized Random Forest, pickle both models to the working directory,
    print the comparison table and the optimized model's top features.

    Args:
        filepath: CSV file to load. Defaults to the dataset name that was
            previously hard-coded, so ``main()`` behaves as before.

    Returns:
        Tuple ``(baseline_model, optimized_model, feature_importance)``, or
        ``None`` when the data could not be loaded.
    """
    print("="*60)
    print("BlueBikes Random Forest - Simplified Fast Version")
    print("="*60)
    print("This version is optimized for speed and debugging\n")

    # Load and check data
    df = load_and_check_data(filepath)
    if df is None:
        return None

    # Use entire dataset
    print(f"\nUsing entire dataset with {len(df)} rows...")

    # Prepare features
    X, y, feature_names = prepare_features(df)

    # Create train/test split (simple random split for speed)
    print("\n" + "="*60)
    print("Creating Train/Test Split")
    print("="*60)
    print("Using 80/20 random split (not temporal) for speed")

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    print(f"Train size: {len(X_train)}")
    print(f"Test size: {len(X_test)}")

    # Train simple baseline (the training-set metrics are printed by the
    # trainer itself and are not needed again here)
    baseline_model, _, baseline_test = train_simple_baseline(
        X_train, y_train, X_test, y_test
    )

    # Train optimized model (training metrics and chosen parameters are
    # likewise already printed by the trainer)
    optimized_model, _, optimized_test, _ = train_optimized_model(
        X_train, y_train, X_test, y_test
    )

    save_model(baseline_model, "baseline_random_forest.pkl")
    save_model(optimized_model, "optimized_random_forest.pkl")

    # Compare models on the held-out test metrics
    compare_models(baseline_test, optimized_test)

    # Feature importance from optimized model
    feature_importance = analyze_feature_importance(optimized_model, feature_names)

    return baseline_model, optimized_model, feature_importance

# Run the analysis
if __name__ == "__main__":
    try:
        # main() returns None instead of a 3-tuple when the data cannot be
        # loaded. Unpacking that directly raises a TypeError, which used to
        # be reported as a misleading "cannot unpack" error and triggered
        # the synthetic fallback below.
        result = main()
        if result is None:
            print("\nNo models were trained because the data could not be loaded.")
        else:
            baseline, optimized, importance = result
    except Exception as e:
        # Top-level boundary: report the failure, then check that a tiny
        # forest can at least be fitted on synthetic data.
        print(f"\nERROR: {e}")
        print("\nTrying minimal debug version...")

        # Ultra-minimal test
        print("\nCreating synthetic data for testing...")
        X_test = np.random.randn(1000, 10)
        y_test = np.random.randn(1000)

        # First 800 rows are used for fitting, the last 200 for scoring.
        rf_test = RandomForestRegressor(n_estimators=5, max_depth=5, random_state=42)
        rf_test.fit(X_test[:800], y_test[:800])
        pred = rf_test.predict(X_test[800:])
        print(f"Synthetic data test RMSE: {np.sqrt(mean_squared_error(y_test[800:], pred)):.2f}")

BlueBikes Random Forest - Simplified Fast Version
This version is optimized for speed and debugging

Loading and Checking Data
✓ Data loaded successfully
  Shape: (798639, 43)
  Memory usage: 235.35 MB

Target variable (demand) statistics:
  Mean: 2.50
  Std: 2.41
  Min: 1
  Max: 64
  Nulls: 0

Using entire dataset with 798639 rows...

Preparing Features
Found 38 feature columns
Using 33 numeric features
Final feature matrix shape: (798639, 33)
Final target shape: (798639,)

Creating Train/Test Split
Using 80/20 random split (not temporal) for speed
Train size: 638911
Test size: 159728

Training Simple Baseline Random Forest
Using minimal parameters: n_estimators=10, max_depth=10
Starting training...


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  7.7min finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.


Training completed in 461.69 seconds
Making predictions...


[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    2.0s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    4.2s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.4s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    0.9s finished



Train Metrics: RMSE=1.28, MAE=0.77, R2=0.720
Test Metrics:  RMSE=1.32, MAE=0.79, R2=0.702

Training Optimized Random Forest

Testing parameter set 1/3: {'n_estimators': 50, 'max_depth': 20, 'min_samples_split': 10}
  Training time: 348.21s
  Validation RMSE: 1.20

Testing parameter set 2/3: {'n_estimators': 100, 'max_depth': 15, 'min_samples_split': 5}
  Training time: 549.32s
  Validation RMSE: 1.21

Testing parameter set 3/3: {'n_estimators': 50, 'max_depth': None, 'min_samples_split': 10, 'max_features': 0.5}
  Training time: 202.14s
  Validation RMSE: 1.22

Best parameters: {'n_estimators': 50, 'max_depth': 20, 'min_samples_split': 10}
Best validation RMSE: 1.20

Retraining best model on full training set...

Final Train Metrics: RMSE=0.83, MAE=0.51, R2=0.881
Final Test Metrics:  RMSE=1.19, MAE=0.68, R2=0.759
✓ Model saved to baseline_random_forest.pkl
✓ Model saved to optimized_random_forest.pkl

Model Comparison

Metric                      Baseline       Optimized     Improveme