In [1]:
from google.colab import files
uploaded = files.upload()


Saving hour.csv to hour.csv


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import warnings
warnings.filterwarnings("ignore")

In [3]:
df = pd.read_csv("hour.csv")

In [4]:
df.head()

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
1,2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
2,3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32
3,4,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13
4,5,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1


In [5]:
df.isnull().sum()

Unnamed: 0,0
instant,0
dteday,0
season,0
yr,0
mnth,0
hr,0
holiday,0
weekday,0
workingday,0
weathersit,0


In [6]:
df = df.drop(columns=["instant", "dteday"])

target = "cnt"

X = df.drop(columns=[target])
y = df[target]


In [7]:
k = 5
kf = KFold(n_splits=k, shuffle=True, random_state=42)

In [8]:
rf_model = RandomForestRegressor(
    n_estimators=200,
    max_depth=15,
    random_state=42
)


subag_model = BaggingRegressor(
    estimator=DecisionTreeRegressor(max_depth=15),
    n_estimators=200,
    max_samples=0.7,
    random_state=42
)


gb_model = GradientBoostingRegressor(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=3,
    random_state=42
)

models = {
    "RandomForest": rf_model,
    "Subagging": subag_model,
    "GradientBoosting": gb_model
}

In [9]:
cv_results = []

for name, model in models.items():
    rmse_scores = []
    mae_scores = []

    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        model.fit(X_train, y_train)
        preds = model.predict(X_test)

        rmse = np.sqrt(mean_squared_error(y_test, preds))
        mae = mean_absolute_error(y_test, preds)

        rmse_scores.append(rmse)
        mae_scores.append(mae)

    print(f"\n{name}")
    print(f"RMSE: {np.mean(rmse_scores):.2f} ± {np.std(rmse_scores):.2f}")
    print(f"MAE : {np.mean(mae_scores):.2f} ± {np.std(mae_scores):.2f}")

    cv_results.append({
        "Model": name,
        "RMSE_mean": np.mean(rmse_scores),
        "RMSE_std": np.std(rmse_scores),
        "MAE_mean": np.mean(mae_scores),
        "MAE_std": np.std(mae_scores)
    })


cv_df = pd.DataFrame(cv_results)
cv_df.to_csv("cv_regression_results.csv", index=False)

print("\nCV results saved as cv_regression_results.csv")


RandomForest
RMSE: 2.64 ± 0.56
MAE : 0.91 ± 0.04

Subagging
RMSE: 2.76 ± 0.58
MAE : 0.94 ± 0.05

GradientBoosting
RMSE: 4.76 ± 0.28
MAE : 2.84 ± 0.10

CV results saved as cv_regression_results.csv


In [10]:
best_model = gb_model
best_model.fit(X, y)

final_preds = best_model.predict(X)

final_output = pd.DataFrame({
    "ActualCnt": y,
    "PredictedCnt": final_preds
})

final_output.to_csv("final_predictions.csv", index=False)

print("Final predictions saved as final_predictions.csv")

Final predictions saved as final_predictions.csv


In [11]:
importances = best_model.feature_importances_

feature_importance_df = pd.DataFrame({
    "Feature": X.columns,
    "Importance": importances
}).sort_values(by="Importance", ascending=False)

print("\nTop 8 Important Features:")
print(feature_importance_df.head(8))


Top 8 Important Features:
       Feature    Importance
13  registered  9.361723e-01
12      casual  6.381058e-02
3           hr  1.049812e-05
2         mnth  2.082642e-06
1           yr  1.640368e-06
8         temp  1.294549e-06
5      weekday  6.079307e-07
10         hum  4.984400e-07


In [12]:
from google.colab import files
files.download("cv_regression_results.csv")
files.download("final_predictions.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [13]:
comparison = pd.DataFrame({
    "Actual": y.values[:10],
    "Predicted": final_preds[:10]
})

print(comparison)


   Actual  Predicted
0      16  15.798690
1      40  39.250620
2      32  30.828812
3      13  12.721796
4       1   1.989212
5       1   1.989212
6       2   3.835332
7       3   3.271058
8       8   7.964374
9      14  15.084842
