---
## 1. Import Libraries and Load Data

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Sklearn imports
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Visualization settings
plt.style.use('seaborn-whitegrid')
sns.set_palette('husl')
pd.set_option('display.max_columns', None)

print("Libraries imported successfully!")

OSError: 'seaborn-whitegrid' is not a valid package style, path of style file, URL of style file, or library style name (library styles are listed in `style.available`)

In [None]:
# Load dataset
df = pd.read_csv('../../data/data/FloridaBikeRentals.csv')

print(f"Dataset loaded successfully!")
print(f"Shape: {df.shape}")
print(f"\nFirst few rows:")
df.head(10)

---
## 2. Exploratory Data Analysis (EDA)

In [None]:
# Dataset information
print("Dataset Information:")
print("="*60)
df.info()

print("\n" + "="*60)
print("Missing Values:")
print(df.isnull().sum())

print("\n" + "="*60)
print("Statistical Summary:")
df.describe()

In [None]:
# Check column names
print("Column Names:")
for i, col in enumerate(df.columns, 1):
    print(f"{i:2}. {col}")

In [None]:
# Distribution of target variable (bike rentals)
# Assuming the target column is 'cnt', 'count', or 'rentals'
# Adjust column name based on actual data

target_candidates = ['cnt', 'count', 'rentals', 'total', 'bikes']
target_col = None

for col in target_candidates:
    if col in df.columns:
        target_col = col
        break

if target_col is None:
    print("Available columns:", df.columns.tolist())
    print("\nPlease identify the target column for bike rentals.")
else:
    print(f"Target column identified: {target_col}")
    
    plt.figure(figsize=(14, 5))
    
    plt.subplot(1, 3, 1)
    sns.histplot(df[target_col], bins=50, kde=True)
    plt.title('Distribution of Bike Rentals', fontsize=14, fontweight='bold')
    plt.xlabel('Number of Rentals')
    
    plt.subplot(1, 3, 2)
    sns.boxplot(y=df[target_col], color='lightcoral')
    plt.title('Box Plot - Bike Rentals', fontsize=14, fontweight='bold')
    plt.ylabel('Number of Rentals')
    
    plt.subplot(1, 3, 3)
    from scipy import stats
    stats.probplot(df[target_col], dist="norm", plot=plt)
    plt.title('Q-Q Plot', fontsize=14, fontweight='bold')
    
    plt.tight_layout()
    plt.show()
    
    print(f"\nTarget Variable Statistics:")
    print(f"Mean: {df[target_col].mean():.2f}")
    print(f"Median: {df[target_col].median():.2f}")
    print(f"Std: {df[target_col].std():.2f}")
    print(f"Min: {df[target_col].min():.0f}")
    print(f"Max: {df[target_col].max():.0f}")

In [None]:
# Correlation analysis
plt.figure(figsize=(12, 10))

# Select numerical columns
numerical_cols = df.select_dtypes(include=[np.number]).columns
correlation_matrix = df[numerical_cols].corr()

sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm', 
            center=0, square=True, linewidths=1, cbar_kws={"shrink": 0.8})

plt.title('Correlation Matrix - Bike Rental Features', fontsize=16, fontweight='bold', pad=20)
plt.tight_layout()
plt.show()

# Print top correlations with target
if target_col:
    print(f"\nTop Correlations with {target_col}:")
    target_corr = correlation_matrix[target_col].sort_values(ascending=False)
    print(target_corr)

---
## 3. Feature Engineering

In [None]:
# Create a copy for feature engineering
df_features = df.copy()

# Check if datetime column exists
datetime_candidates = ['datetime', 'dteday', 'date', 'timestamp']
datetime_col = None

for col in datetime_candidates:
    if col in df_features.columns:
        datetime_col = col
        break

if datetime_col:
    # Convert to datetime
    df_features[datetime_col] = pd.to_datetime(df_features[datetime_col])
    
    # Extract temporal features
    df_features['year'] = df_features[datetime_col].dt.year
    df_features['month'] = df_features[datetime_col].dt.month
    df_features['day'] = df_features[datetime_col].dt.day
    df_features['hour'] = df_features[datetime_col].dt.hour
    df_features['dayofweek'] = df_features[datetime_col].dt.dayofweek
    df_features['is_weekend'] = (df_features['dayofweek'] >= 5).astype(int)
    
    print(f"Extracted temporal features from {datetime_col}")
    print("New features: year, month, day, hour, dayofweek, is_weekend")
else:
    print("No datetime column found. Checking for separate time components...")
    # Check if time components already exist
    time_components = ['hr', 'hour', 'mnth', 'month', 'weekday']
    existing_components = [col for col in time_components if col in df_features.columns]
    print(f"Existing time components: {existing_components}")

# Display new shape
print(f"\nNew shape: {df_features.shape}")
df_features.head()

In [None]:
# Encode categorical variables if any
categorical_cols = df_features.select_dtypes(include=['object', 'category']).columns.tolist()

if datetime_col and datetime_col in categorical_cols:
    categorical_cols.remove(datetime_col)

if categorical_cols:
    print(f"Categorical columns to encode: {categorical_cols}")
    df_features = pd.get_dummies(df_features, columns=categorical_cols, drop_first=True)
    print(f"After encoding: {df_features.shape}")
else:
    print("No categorical columns to encode.")

print(f"\nFinal feature set shape: {df_features.shape}")

---
## 4. Prepare Data for Modeling

In [None]:
# Select features and target
# Drop datetime column and target from features
columns_to_drop = [target_col]
if datetime_col and datetime_col in df_features.columns:
    columns_to_drop.append(datetime_col)

X = df_features.drop(columns=columns_to_drop)
y = df_features[target_col]

print(f"Feature matrix shape: {X.shape}")
print(f"Target vector shape: {y.shape}")
print(f"\nFeatures ({len(X.columns)}):")
for i, col in enumerate(X.columns, 1):
    print(f"{i:2}. {col}")

In [None]:
# Train-test split (80-20)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"Training set size: {X_train.shape[0]} samples ({X_train.shape[0]/len(X)*100:.1f}%)")
print(f"Test set size: {X_test.shape[0]} samples ({X_test.shape[0]/len(X)*100:.1f}%)")
print(f"\nFeatures: {X_train.shape[1]}")

In [None]:
# Feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Features scaled using StandardScaler")
print(f"Training data shape: {X_train_scaled.shape}")
print(f"Test data shape: {X_test_scaled.shape}")

---
## 5. Model Building

### 5.1 Linear Regression

In [None]:
# Linear Regression
lr_model = LinearRegression()
lr_model.fit(X_train_scaled, y_train)

# Predictions
y_train_pred_lr = lr_model.predict(X_train_scaled)
y_test_pred_lr = lr_model.predict(X_test_scaled)

# Evaluation
train_r2_lr = r2_score(y_train, y_train_pred_lr)
test_r2_lr = r2_score(y_test, y_test_pred_lr)
train_rmse_lr = np.sqrt(mean_squared_error(y_train, y_train_pred_lr))
test_rmse_lr = np.sqrt(mean_squared_error(y_test, y_test_pred_lr))
train_mae_lr = mean_absolute_error(y_train, y_train_pred_lr)
test_mae_lr = mean_absolute_error(y_test, y_test_pred_lr)

print("LINEAR REGRESSION RESULTS")
print("="*60)
print(f"Training R¬≤: {train_r2_lr:.4f}")
print(f"Test R¬≤: {test_r2_lr:.4f}")
print(f"\nTraining RMSE: {train_rmse_lr:.4f}")
print(f"Test RMSE: {test_rmse_lr:.4f}")
print(f"\nTraining MAE: {train_mae_lr:.4f}")
print(f"Test MAE: {test_mae_lr:.4f}")

# Cross-validation
cv_scores_lr = cross_val_score(lr_model, X_train_scaled, y_train, cv=5, scoring='r2')
print(f"\nCross-Validation R¬≤ (5-fold): {cv_scores_lr.mean():.4f} (+/- {cv_scores_lr.std():.4f})")

### 5.2 Polynomial Regression

In [None]:
# Polynomial Regression (degree 2)
poly_features = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly_features.fit_transform(X_train_scaled)
X_test_poly = poly_features.transform(X_test_scaled)

print(f"Polynomial features created (degree 2)")
print(f"Original features: {X_train_scaled.shape[1]}")
print(f"Polynomial features: {X_train_poly.shape[1]}")

# Train polynomial model
poly_model = LinearRegression()
poly_model.fit(X_train_poly, y_train)

# Predictions
y_train_pred_poly = poly_model.predict(X_train_poly)
y_test_pred_poly = poly_model.predict(X_test_poly)

# Evaluation
train_r2_poly = r2_score(y_train, y_train_pred_poly)
test_r2_poly = r2_score(y_test, y_test_pred_poly)
train_rmse_poly = np.sqrt(mean_squared_error(y_train, y_train_pred_poly))
test_rmse_poly = np.sqrt(mean_squared_error(y_test, y_test_pred_poly))
train_mae_poly = mean_absolute_error(y_train, y_train_pred_poly)
test_mae_poly = mean_absolute_error(y_test, y_test_pred_poly)

print("\nPOLYNOMIAL REGRESSION RESULTS (degree 2)")
print("="*60)
print(f"Training R¬≤: {train_r2_poly:.4f}")
print(f"Test R¬≤: {test_r2_poly:.4f}")
print(f"\nTraining RMSE: {train_rmse_poly:.4f}")
print(f"Test RMSE: {test_rmse_poly:.4f}")
print(f"\nTraining MAE: {train_mae_poly:.4f}")
print(f"Test MAE: {test_mae_poly:.4f}")

### 5.3 Ridge Regression (L2 Regularization)

In [None]:
# Ridge Regression with different alpha values
alphas = [0.001, 0.01, 0.1, 1, 10, 100]
ridge_results = []

for alpha in alphas:
    ridge_model = Ridge(alpha=alpha)
    ridge_model.fit(X_train_scaled, y_train)
    
    y_test_pred = ridge_model.predict(X_test_scaled)
    test_r2 = r2_score(y_test, y_test_pred)
    test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
    
    ridge_results.append({
        'alpha': alpha,
        'r2': test_r2,
        'rmse': test_rmse,
        'model': ridge_model
    })

# Find best alpha
ridge_results_df = pd.DataFrame(ridge_results)
best_ridge_idx = ridge_results_df['r2'].idxmax()
best_ridge = ridge_results[best_ridge_idx]

print("RIDGE REGRESSION - Alpha Tuning")
print("="*60)
print(ridge_results_df[['alpha', 'r2', 'rmse']])
print(f"\nBest Alpha: {best_ridge['alpha']}")
print(f"Best Test R¬≤: {best_ridge['r2']:.4f}")
print(f"Best Test RMSE: {best_ridge['rmse']:.4f}")

# Use best model
ridge_model = best_ridge['model']
y_train_pred_ridge = ridge_model.predict(X_train_scaled)
y_test_pred_ridge = ridge_model.predict(X_test_scaled)

train_r2_ridge = r2_score(y_train, y_train_pred_ridge)
test_r2_ridge = best_ridge['r2']
train_rmse_ridge = np.sqrt(mean_squared_error(y_train, y_train_pred_ridge))
test_rmse_ridge = best_ridge['rmse']
train_mae_ridge = mean_absolute_error(y_train, y_train_pred_ridge)
test_mae_ridge = mean_absolute_error(y_test, y_test_pred_ridge)

### 5.4 Lasso Regression (L1 Regularization)

In [None]:
# Lasso Regression with different alpha values
lasso_results = []

for alpha in alphas:
    lasso_model = Lasso(alpha=alpha, max_iter=10000)
    lasso_model.fit(X_train_scaled, y_train)
    
    y_test_pred = lasso_model.predict(X_test_scaled)
    test_r2 = r2_score(y_test, y_test_pred)
    test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
    
    lasso_results.append({
        'alpha': alpha,
        'r2': test_r2,
        'rmse': test_rmse,
        'model': lasso_model
    })

# Find best alpha
lasso_results_df = pd.DataFrame(lasso_results)
best_lasso_idx = lasso_results_df['r2'].idxmax()
best_lasso = lasso_results[best_lasso_idx]

print("LASSO REGRESSION - Alpha Tuning")
print("="*60)
print(lasso_results_df[['alpha', 'r2', 'rmse']])
print(f"\nBest Alpha: {best_lasso['alpha']}")
print(f"Best Test R¬≤: {best_lasso['r2']:.4f}")
print(f"Best Test RMSE: {best_lasso['rmse']:.4f}")

# Use best model
lasso_model = best_lasso['model']
y_train_pred_lasso = lasso_model.predict(X_train_scaled)
y_test_pred_lasso = lasso_model.predict(X_test_scaled)

train_r2_lasso = r2_score(y_train, y_train_pred_lasso)
test_r2_lasso = best_lasso['r2']
train_rmse_lasso = np.sqrt(mean_squared_error(y_train, y_train_pred_lasso))
test_rmse_lasso = best_lasso['rmse']
train_mae_lasso = mean_absolute_error(y_train, y_train_pred_lasso)
test_mae_lasso = mean_absolute_error(y_test, y_test_pred_lasso)

# Feature selection analysis
selected_features = X.columns[lasso_model.coef_ != 0]
print(f"\nFeatures selected by Lasso: {len(selected_features)} out of {len(X.columns)}")
print(f"Features eliminated: {len(X.columns) - len(selected_features)}")

---
## 6. Model Comparison

In [None]:
# Create comparison dataframe
comparison_df = pd.DataFrame({
    'Model': ['Linear Regression', 'Polynomial (deg 2)', 'Ridge', 'Lasso'],
    'Train_R2': [train_r2_lr, train_r2_poly, train_r2_ridge, train_r2_lasso],
    'Test_R2': [test_r2_lr, test_r2_poly, test_r2_ridge, test_r2_lasso],
    'Train_RMSE': [train_rmse_lr, train_rmse_poly, train_rmse_ridge, train_rmse_lasso],
    'Test_RMSE': [test_rmse_lr, test_rmse_poly, test_rmse_ridge, test_rmse_lasso],
    'Train_MAE': [train_mae_lr, train_mae_poly, train_mae_ridge, train_mae_lasso],
    'Test_MAE': [test_mae_lr, test_mae_poly, test_mae_ridge, test_mae_lasso]
})

# Calculate overfitting metric
comparison_df['Overfitting'] = comparison_df['Train_R2'] - comparison_df['Test_R2']

print("\n" + "="*80)
print("MODEL COMPARISON")
print("="*80)
print(comparison_df.to_string(index=False))

# Highlight best model
best_model_idx = comparison_df['Test_R2'].idxmax()
best_model_name = comparison_df.loc[best_model_idx, 'Model']
print(f"\nüèÜ Best Model (highest Test R¬≤): {best_model_name}")
print(f"   Test R¬≤: {comparison_df.loc[best_model_idx, 'Test_R2']:.4f}")
print(f"   Test RMSE: {comparison_df.loc[best_model_idx, 'Test_RMSE']:.4f}")
print(f"   Test MAE: {comparison_df.loc[best_model_idx, 'Test_MAE']:.4f}")

In [None]:
# Visualization of model comparison
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# R¬≤ Comparison
axes[0, 0].barh(comparison_df['Model'], comparison_df['Test_R2'], color='skyblue')
axes[0, 0].set_xlabel('R¬≤ Score', fontweight='bold')
axes[0, 0].set_title('Test R¬≤ Comparison', fontsize=14, fontweight='bold')
axes[0, 0].set_xlim([0, 1])
for i, v in enumerate(comparison_df['Test_R2']):
    axes[0, 0].text(v + 0.01, i, f'{v:.4f}', va='center')

# RMSE Comparison
axes[0, 1].barh(comparison_df['Model'], comparison_df['Test_RMSE'], color='lightcoral')
axes[0, 1].set_xlabel('RMSE', fontweight='bold')
axes[0, 1].set_title('Test RMSE Comparison (Lower is Better)', fontsize=14, fontweight='bold')
for i, v in enumerate(comparison_df['Test_RMSE']):
    axes[0, 1].text(v + v*0.01, i, f'{v:.2f}', va='center')

# MAE Comparison
axes[1, 0].barh(comparison_df['Model'], comparison_df['Test_MAE'], color='lightgreen')
axes[1, 0].set_xlabel('MAE', fontweight='bold')
axes[1, 0].set_title('Test MAE Comparison (Lower is Better)', fontsize=14, fontweight='bold')
for i, v in enumerate(comparison_df['Test_MAE']):
    axes[1, 0].text(v + v*0.01, i, f'{v:.2f}', va='center')

# Overfitting Analysis
axes[1, 1].barh(comparison_df['Model'], comparison_df['Overfitting'], 
                color=['red' if x > 0.05 else 'green' for x in comparison_df['Overfitting']])
axes[1, 1].set_xlabel('Overfitting (Train R¬≤ - Test R¬≤)', fontweight='bold')
axes[1, 1].set_title('Overfitting Analysis (Lower is Better)', fontsize=14, fontweight='bold')
axes[1, 1].axvline(x=0.05, color='orange', linestyle='--', label='Threshold (0.05)')
axes[1, 1].legend()
for i, v in enumerate(comparison_df['Overfitting']):
    axes[1, 1].text(v + 0.001, i, f'{v:.4f}', va='center')

plt.tight_layout()
plt.show()

---
## 7. Feature Importance Analysis

In [None]:
# Feature importance from Linear Regression coefficients
feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': lr_model.coef_
})
feature_importance['Abs_Coefficient'] = np.abs(feature_importance['Coefficient'])
feature_importance = feature_importance.sort_values('Abs_Coefficient', ascending=False)

print("FEATURE IMPORTANCE (Top 20)")
print("="*60)
print(feature_importance.head(20).to_string(index=False))

# Visualization
plt.figure(figsize=(12, 8))
top_features = feature_importance.head(20)
colors = ['red' if x < 0 else 'green' for x in top_features['Coefficient']]
plt.barh(range(len(top_features)), top_features['Coefficient'], color=colors)
plt.yticks(range(len(top_features)), top_features['Feature'])
plt.xlabel('Coefficient Value', fontsize=12, fontweight='bold')
plt.title('Top 20 Feature Importance (Linear Regression)', fontsize=14, fontweight='bold')
plt.axvline(x=0, color='black', linestyle='-', linewidth=0.8)
plt.tight_layout()
plt.show()

In [None]:
# Lasso feature selection
lasso_features = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': lasso_model.coef_
})
lasso_features['Selected'] = lasso_features['Coefficient'] != 0
lasso_selected = lasso_features[lasso_features['Selected']].copy()
lasso_selected['Abs_Coefficient'] = np.abs(lasso_selected['Coefficient'])
lasso_selected = lasso_selected.sort_values('Abs_Coefficient', ascending=False)

print(f"\nLASSO FEATURE SELECTION")
print("="*60)
print(f"Features selected: {len(lasso_selected)} out of {len(X.columns)}")
print(f"\nSelected features (Top 20):")
print(lasso_selected.head(20)[['Feature', 'Coefficient']].to_string(index=False))

---
## 8. Residual Analysis

In [None]:
# Residual analysis for best model (Linear Regression)
residuals_train = y_train - y_train_pred_lr
residuals_test = y_test - y_test_pred_lr

fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# Residual plot - Training
axes[0, 0].scatter(y_train_pred_lr, residuals_train, alpha=0.5)
axes[0, 0].axhline(y=0, color='red', linestyle='--')
axes[0, 0].set_xlabel('Predicted Values', fontweight='bold')
axes[0, 0].set_ylabel('Residuals', fontweight='bold')
axes[0, 0].set_title('Residual Plot - Training Set', fontsize=14, fontweight='bold')

# Residual plot - Test
axes[0, 1].scatter(y_test_pred_lr, residuals_test, alpha=0.5, color='orange')
axes[0, 1].axhline(y=0, color='red', linestyle='--')
axes[0, 1].set_xlabel('Predicted Values', fontweight='bold')
axes[0, 1].set_ylabel('Residuals', fontweight='bold')
axes[0, 1].set_title('Residual Plot - Test Set', fontsize=14, fontweight='bold')

# Histogram of residuals
axes[1, 0].hist(residuals_test, bins=50, edgecolor='black', alpha=0.7)
axes[1, 0].set_xlabel('Residuals', fontweight='bold')
axes[1, 0].set_ylabel('Frequency', fontweight='bold')
axes[1, 0].set_title('Distribution of Residuals (Test Set)', fontsize=14, fontweight='bold')
axes[1, 0].axvline(x=0, color='red', linestyle='--')

# Q-Q plot
from scipy import stats
stats.probplot(residuals_test, dist="norm", plot=axes[1, 1])
axes[1, 1].set_title('Q-Q Plot of Residuals', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()

print("\nRESIDUAL ANALYSIS")
print("="*60)
print(f"Mean of residuals: {residuals_test.mean():.4f} (should be close to 0)")
print(f"Std of residuals: {residuals_test.std():.4f}")
print(f"Min residual: {residuals_test.min():.4f}")
print(f"Max residual: {residuals_test.max():.4f}")

In [None]:
# Actual vs Predicted plot
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# Training set
axes[0].scatter(y_train, y_train_pred_lr, alpha=0.5)
axes[0].plot([y_train.min(), y_train.max()], [y_train.min(), y_train.max()], 
             'r--', lw=2, label='Perfect Prediction')
axes[0].set_xlabel('Actual Values', fontweight='bold')
axes[0].set_ylabel('Predicted Values', fontweight='bold')
axes[0].set_title(f'Training Set: Actual vs Predicted\nR¬≤ = {train_r2_lr:.4f}', 
                  fontsize=14, fontweight='bold')
axes[0].legend()

# Test set
axes[1].scatter(y_test, y_test_pred_lr, alpha=0.5, color='orange')
axes[1].plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 
             'r--', lw=2, label='Perfect Prediction')
axes[1].set_xlabel('Actual Values', fontweight='bold')
axes[1].set_ylabel('Predicted Values', fontweight='bold')
axes[1].set_title(f'Test Set: Actual vs Predicted\nR¬≤ = {test_r2_lr:.4f}', 
                  fontsize=14, fontweight='bold')
axes[1].legend()

plt.tight_layout()
plt.show()

---
## 9. Summary and Recommendations

In [None]:
print("\n" + "="*80)
print("SESSION 5: REGRESSION ANALYSIS - SUMMARY")
print("="*80)

print("\nüìä DATASET:")
print(f"   ‚Ä¢ Total samples: {len(df):,}")
print(f"   ‚Ä¢ Features: {X.shape[1]}")
print(f"   ‚Ä¢ Training samples: {len(X_train):,}")
print(f"   ‚Ä¢ Test samples: {len(X_test):,}")

print("\nüéØ MODELS TESTED:")
print("   1. Linear Regression")
print("   2. Polynomial Regression (degree 2)")
print("   3. Ridge Regression (L2 regularization)")
print("   4. Lasso Regression (L1 regularization)")

print("\nüèÜ BEST MODEL:")
print(f"   ‚Ä¢ Model: {best_model_name}")
print(f"   ‚Ä¢ Test R¬≤: {comparison_df.loc[best_model_idx, 'Test_R2']:.4f}")
print(f"   ‚Ä¢ Test RMSE: {comparison_df.loc[best_model_idx, 'Test_RMSE']:.4f}")
print(f"   ‚Ä¢ Test MAE: {comparison_df.loc[best_model_idx, 'Test_MAE']:.4f}")

print("\nüìà KEY INSIGHTS:")
print(f"   ‚Ä¢ Model explains {comparison_df.loc[best_model_idx, 'Test_R2']*100:.2f}% of variance in bike rentals")
print(f"   ‚Ä¢ Average prediction error: ¬±{comparison_df.loc[best_model_idx, 'Test_MAE']:.2f} rentals")
print(f"   ‚Ä¢ Top predictive features identified from {X.shape[1]} features")

print("\nüí° RECOMMENDATIONS:")
if comparison_df.loc[best_model_idx, 'Overfitting'] > 0.1:
    print("   ‚ö†Ô∏è  Model shows signs of overfitting - consider more regularization")
else:
    print("   ‚úÖ Model generalizes well - low overfitting")

if test_r2_lr > 0.7:
    print("   ‚úÖ Strong predictive power - model is production-ready")
elif test_r2_lr > 0.5:
    print("   ‚ö†Ô∏è  Moderate predictive power - consider feature engineering")
else:
    print("   ‚ö†Ô∏è  Weak predictive power - need better features or different approach")

print("\nüéì SKILLS DEMONSTRATED:")
print("   ‚úÖ Data preprocessing and feature engineering")
print("   ‚úÖ Train-test split and cross-validation")
print("   ‚úÖ Multiple regression algorithms")
print("   ‚úÖ Hyperparameter tuning (Ridge/Lasso alpha)")
print("   ‚úÖ Model evaluation (R¬≤, RMSE, MAE)")
print("   ‚úÖ Feature importance analysis")
print("   ‚úÖ Residual analysis and diagnostics")
print("   ‚úÖ Model comparison and selection")

print("\n" + "="*80)

In [None]:
# Save results
comparison_df.to_csv('../../data/outputs/session_5_model_comparison.csv', index=False)
feature_importance.to_csv('../../data/outputs/session_5_feature_importance.csv', index=False)

print("‚úÖ Results saved to data/outputs/")
print("   ‚Ä¢ session_5_model_comparison.csv")
print("   ‚Ä¢ session_5_feature_importance.csv")