# DA5401 — Assignment 8: Ensemble Learning for Bike Sharing
**Author:** (Your Name)  
**Course:** DA5401 — Mathematical Foundations of Data Science (Assignment A8)  
**Dataset:** UCI Bike Sharing dataset (via `ucimlrepo`, ID=275)

**Contents**
1. Data loading & preprocessing (ucimlrepo)  
2. Baseline models (Decision Tree, Linear Regression)  
3. Bagging  
4. Gradient Boosting  
5. Stacking  
6. Comparative RMSE results & plots  


## Part A — Data Loading and Preprocessing (ucimlrepo)
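We fetch the dataset directly using `ucimlrepo` and apply the following preprocessing steps:
- Drop columns that are not used as predictors (`instant`, `dteday`, `casual`, `registered`)  
- One-hot encode the categorical variables (`season`, `weathersit`, `mnth`, `hr`, `weekday`), dropping the first level of each  
- Split into train and test sets (80/20, `random_state=42`)  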


In [None]:
# Part A: Data loading and preprocessing using ucimlrepo
from ucimlrepo import fetch_ucirepo
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Fetch dataset directly from UCI ML Repo (ID = 275, Bike Sharing)
bike_sharing = fetch_ucirepo(id=275)

# Extract features (X) and targets (y); work on copies so the fetched
# object is never mutated by the steps below
X_full = bike_sharing.data.features.copy()
y_full = bike_sharing.data.targets.copy()

# Combine into a single DataFrame (feature columns first, target column(s) last)
df = pd.concat([X_full, y_full], axis=1)

# Drop irrelevant columns; errors='ignore' tolerates any that are absent
df = df.drop(columns=['instant','dteday','casual','registered'], errors='ignore')

# Resolve the target column BEFORE one-hot encoding. get_dummies appends the
# generated indicator columns at the end of the frame, so a positional
# "last column" lookup performed after encoding would pick a dummy
# indicator rather than the target.
target_col = 'cnt' if 'cnt' in df.columns else df.columns[-1]

# Convert categorical columns (only those present, and never the target)
cat_cols = ['season','weathersit','mnth','hr','weekday']
present_cat_cols = [c for c in cat_cols if c in df.columns and c != target_col]
for c in present_cat_cols:
    df[c] = df[c].astype('category')

# One-hot encoding; drop_first=True removes one level per variable so the
# indicator columns are not perfectly collinear for the linear model
df_enc = pd.get_dummies(df, columns=present_cat_cols, drop_first=True)

# Target and features
y = df_enc[target_col]
X = df_enc.drop(columns=[target_col])

# Split data (80% train / 20% test, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print('Shapes:', X_train.shape, X_test.shape)


### Baseline Models — Decision Tree and Linear Regression

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from math import sqrt

# Baseline 1: a single depth-limited regression tree, scored on the held-out set
dt = DecisionTreeRegressor(max_depth=6, random_state=42).fit(X_train, y_train)
dt_test_pred = dt.predict(X_test)
rmse_dt = sqrt(mean_squared_error(y_test, dt_test_pred))

# Baseline 2: ordinary least squares on the same features
lr = LinearRegression().fit(X_train, y_train)
lr_test_pred = lr.predict(X_test)
rmse_lr = sqrt(mean_squared_error(y_test, lr_test_pred))

# Report both baselines' test RMSE
for label, score in (("Decision Tree", rmse_dt), ("Linear Regression", rmse_lr)):
    print(f"{label} RMSE: {score:.4f}")


## Part B — Bagging and Gradient Boosting

In [None]:
from sklearn.ensemble import BaggingRegressor, GradientBoostingRegressor

# Bagging: 50 bootstrap-trained depth-6 trees whose predictions are averaged.
# The base learner is passed positionally because its keyword was renamed
# from `base_estimator` to `estimator` in scikit-learn 1.2 and the old name
# was removed in 1.4; the positional form works on both sides of the rename.
bag = BaggingRegressor(
    DecisionTreeRegressor(max_depth=6),
    n_estimators=50,
    random_state=42,
    n_jobs=-1,
)
bag.fit(X_train, y_train)
rmse_bag = sqrt(mean_squared_error(y_test, bag.predict(X_test)))

# Gradient Boosting: 200 sequentially fitted depth-3 trees, learning rate 0.1
gbr = GradientBoostingRegressor(random_state=42, n_estimators=200, learning_rate=0.1, max_depth=3)
gbr.fit(X_train, y_train)
rmse_gbr = sqrt(mean_squared_error(y_test, gbr.predict(X_test)))

print(f"Bagging RMSE: {rmse_bag:.4f}")
print(f"Gradient Boosting RMSE: {rmse_gbr:.4f}")


## Part C — Stacking Regressor

In [None]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Ridge
from sklearn.ensemble import StackingRegressor

# Level-0 (base) learners for the stack: KNN, bagged trees, gradient boosting
knn = KNeighborsRegressor(n_neighbors=5, n_jobs=-1)
# The base learner is passed positionally because its keyword was renamed
# from `base_estimator` to `estimator` in scikit-learn 1.2 and the old name
# was removed in 1.4; the positional form works on both sides of the rename.
bag_for_stack = BaggingRegressor(
    DecisionTreeRegressor(max_depth=6),
    n_estimators=50,
    random_state=42,
    n_jobs=-1,
)
gbr_for_stack = GradientBoostingRegressor(random_state=42, n_estimators=200, learning_rate=0.1, max_depth=3)

estimators = [('knn', knn), ('bag', bag_for_stack), ('gbr', gbr_for_stack)]
# Level-1 (meta) learner: ridge regression fitted on the base learners' predictions
meta_learner = Ridge(alpha=1.0)

stack = StackingRegressor(estimators=estimators, final_estimator=meta_learner, n_jobs=-1)
stack.fit(X_train, y_train)
rmse_stack = sqrt(mean_squared_error(y_test, stack.predict(X_test)))
print(f"Stacking RMSE: {rmse_stack:.4f}")


## Part D — Results & Visualization

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Collect each model's test RMSE, then rank from best (lowest) to worst
rmse_by_model = {
    'Decision Tree': rmse_dt,
    'Linear Regression': rmse_lr,
    'Bagging': rmse_bag,
    'Gradient Boosting': rmse_gbr,
    'Stacking': rmse_stack,
}
results = pd.DataFrame(
    list(rmse_by_model.items()),
    columns=['Model', 'RMSE'],
)
results = results.sort_values('RMSE')

print(results)

# Bar chart of the ranked RMSE values
fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(results['Model'], results['RMSE'])
# Slant the model names so the longer labels do not overlap
plt.setp(ax.get_xticklabels(), rotation=30, ha='right')
ax.set_title('RMSE Comparison')
ax.set_ylabel('RMSE')
fig.tight_layout()
plt.show()


### Conclusion — Bias-Variance and Ensemble Insights