In [None]:
#importing necessary libraries
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Seed NumPy's global random generator so anything drawing from it is repeatable.
np.random.seed(42)


# --- Load the final, feature-rich dataset ---
# This file already contains the 17 columns needed for the project.
try:
    bikes_final_df = pd.read_csv('../data/hour.csv')
except FileNotFoundError:
    print("Error: 'hour.csv' not found. Please make sure it is in the correct directory.")
    # Stop with a non-zero status. The bare `exit()` helper is injected by the
    # `site` module for interactive use, may be absent, and reports success (0).
    raise SystemExit(1) from None
else:
    # Only reached when the read succeeded; kept out of the `try` body so the
    # handler guards nothing but the file access itself.
    print("Successfully loaded the final dataset (hour.csv).")


# --- Quick structural overview of the loaded data ---

# Column names, to confirm the expected schema was read.
print("\nColumns in the final dataset:")
print(bikes_final_df.columns)

# A handful of rows for a visual sanity check.
print("\nFirst 5 rows of the final dataset:")
print(bikes_final_df.head())

# .info() writes dtypes and non-null counts directly to stdout, so a single
# call is enough (it was previously issued twice).
print("\nDataset Information:")
bikes_final_df.info()

# Data type of every column in the DataFrame.
print(bikes_final_df.dtypes)

# Distinct values present in the 'season' column.
season_series = bikes_final_df['season']
unique_seasons = season_series.unique()
print("The unique values in the 'season' column are:")
print(unique_seasons)

# Summary statistics. Printed explicitly: a bare expression is only echoed
# when it is the last line of a notebook cell, and is discarded otherwise.
print(bikes_final_df.describe())
# Cleaning the data: remove the columns that are not used as model inputs.
columnsToDrop = ['instant', 'casual', 'registered', 'atemp', 'dteday']

# drop() returns a new DataFrame; bikes_final_df itself is left untouched.
bikesData = bikes_final_df.drop(columnsToDrop, axis=1)
print(bikesData.columns)
# Divide into training / test datasets.

# 'dayCount' is the row position divided by 24 (presumably one row per hour,
# so this is elapsed days -- confirm against the data). A plain array is
# assigned so the values never depend on index alignment with bikesData.
bikesData['dayCount'] = np.arange(bikesData.shape[0]) / 24

train_set, test_set = train_test_split(bikesData, test_size=0.3, random_state=42)

print(len(train_set), "train +", len(test_set), "test")

# Put both subsets back in dayCount order. Rebinding to the sorted result
# (rather than sorting in place) gives each name its own DataFrame, which
# avoids pandas' SettingWithCopyWarning here and on later column assignments.
train_set = train_set.sort_values('dayCount')
test_set = test_set.sort_values('dayCount')
def display_scores(scores):
    """Print the given scores followed by their mean and standard deviation.

    `scores` must provide `.mean()` and `.std()` methods (e.g. a NumPy array).
    Nothing is returned; the three lines are written to stdout.
    """
    # Each entry is evaluated only when its line is printed, so the output
    # order and any failure point match a plain sequence of print calls.
    report_lines = (
        ("Scores:", lambda values: values),
        ("Mean:", lambda values: values.mean()),
        ("Standard deviation:", lambda values: values.std()),
    )
    for label, compute in report_lines:
        print(label, compute(scores))
# Feature Scaling
columnsToScale = ['temp', 'hum', 'windspeed']

scaler = StandardScaler()

# Work on independent copies so the column assignments below write to frames
# this script owns, instead of to objects pandas may treat as slices of
# bikesData (which raises SettingWithCopyWarning on older pandas versions).
train_set = train_set.copy()
test_set = test_set.copy()

# The scaler is fitted on the training rows only and then re-used unchanged
# on the test rows, so no test-set statistics enter the transformation.
train_set[columnsToScale] = scaler.fit_transform(train_set[columnsToScale])
test_set[columnsToScale] = scaler.transform(test_set[columnsToScale])

# Printed explicitly; as a bare expression the summary would be discarded
# when this code runs outside the last line of a notebook cell.
print(train_set[columnsToScale].describe())
# Preparing to Train the Models 
%pip install xgboost
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import xgboost
from xgboost import XGBRegressor

# Separate the target column ('cnt') from the model inputs.
trainingLabels = train_set['cnt']
trainingCols = train_set.drop(columns='cnt')

# Report the resulting shapes and a preview of the feature matrix.
print("\nShape of the training features (trainingCols):", trainingCols.shape)
print("Shape of the training labels (trainingLabels):", trainingLabels.shape)
print("\nFirst 5 rows of training features:")
print(trainingCols.head())
#Train and Analyze the Models
#Train a Decision Tree Regressor

dec_reg = DecisionTreeRegressor(random_state = 42)

# Score both metrics from ONE 10-fold cross-validation. The folds and the
# seeded estimator are deterministic, so the numbers match two separate
# cross_val_score runs while fitting the model half as many times.
dt_cv_results = cross_validate(
    dec_reg,
    trainingCols,
    trainingLabels,
    cv=10,
    scoring=("neg_mean_absolute_error", "neg_mean_squared_error"),
)

# scikit-learn reports errors negated ("higher is better"); flip the sign back.
dt_mae_scores = -dt_cv_results["test_neg_mean_absolute_error"]

display_scores(dt_mae_scores)

# NOTE: despite the name, this holds the per-fold ROOT mean squared error.
dt_mse_scores = np.sqrt(-dt_cv_results["test_neg_mean_squared_error"])

display_scores(dt_mse_scores)
#Train a Linear Regression model

lin_reg = LinearRegression()

# Score both metrics from ONE 10-fold cross-validation instead of repeating
# the identical fits once per metric.
lr_cv_results = cross_validate(
    lin_reg,
    trainingCols,
    trainingLabels,
    cv=10,
    scoring=("neg_mean_absolute_error", "neg_mean_squared_error"),
)

# scikit-learn reports errors negated ("higher is better"); flip the sign back.
lr_mae_scores = -lr_cv_results["test_neg_mean_absolute_error"]

display_scores(lr_mae_scores)

# NOTE: despite the name, this holds the per-fold ROOT mean squared error.
lr_mse_scores = np.sqrt(-lr_cv_results["test_neg_mean_squared_error"])

display_scores(lr_mse_scores)
#Train a Random Forest Regressor

forest_reg = RandomForestRegressor(n_estimators=150, random_state=42)

# Score both metrics from ONE 10-fold cross-validation. Each fold fits a
# 150-tree forest, so avoiding a second identical pass halves the runtime;
# the fixed random_state keeps the scores the same as two separate runs.
rf_cv_results = cross_validate(
    forest_reg,
    trainingCols,
    trainingLabels,
    cv=10,
    scoring=("neg_mean_absolute_error", "neg_mean_squared_error"),
)

# scikit-learn reports errors negated ("higher is better"); flip the sign back.
rf_mae_scores = -rf_cv_results["test_neg_mean_absolute_error"]

display_scores(rf_mae_scores)

# NOTE: despite the name, this holds the per-fold ROOT mean squared error.
rf_mse_scores = np.sqrt(-rf_cv_results["test_neg_mean_squared_error"])

display_scores(rf_mse_scores)
from sklearn.model_selection import GridSearchCV

#Fine-Tuning the Random Forest Regressor
# Candidate values per hyperparameter; GridSearchCV evaluates every combination.
param_grid = [
    {
        'n_estimators': [120, 150],
        'max_features': [10, 12],
        'max_depth': [15, 28],
    },
]

# 5-fold search over the grid, ranked by (negated) mean squared error.
grid_search = GridSearchCV(
    estimator=forest_reg,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
)
grid_search.fit(trainingCols, trainingLabels)

# Report the winning model, its hyperparameters and its feature importances.
print(grid_search.best_estimator_)
print(grid_search.best_params_)
feature_importances = grid_search.best_estimator_.feature_importances_
print(feature_importances)
#Preparing to test the final model on Test dataset
final_model = grid_search.best_estimator_

# Rebind to the sorted result instead of sorting in place: test_set then owns
# its data, so adding the prediction column below does not trigger pandas'
# SettingWithCopyWarning.
test_set = test_set.sort_values('dayCount')

# Take the feature list from the training matrix so X_test always has exactly
# the columns (and order) the model was fitted on -- even when this code is
# re-run after 'predictedCounts_test' has already been added to test_set.
test_x_cols = trainingCols.columns.values
test_y_cols = 'cnt'

X_test = test_set.loc[:, test_x_cols]
y_test = test_set.loc[:, test_y_cols]

# Make Predictions on the Test dataset using Final Model
test_set['predictedCounts_test'] = final_model.predict(X_test)

mse = mean_squared_error(y_test, test_set['predictedCounts_test'])
# NOTE: despite the name, final_mse is the ROOT mean squared error.
final_mse = np.sqrt(mse)
print(final_mse)

# Printed explicitly; as a bare expression the summary would be discarded
# when this code runs outside the last line of a notebook cell.
print(test_set.describe())
# For each selected hour of the day, draw the actual and the predicted counts
# against dayCount on one shared axis (one figure per hour).
times = [9,18]
for hour in times:
    fig = plt.figure(figsize=(8, 6))
    fig.clf()
    ax = fig.gca()
    # Keep only the test rows recorded at this hour.
    hourly_rows = test_set.loc[test_set['hr'] == hour]
    # Actual counts first, then the model's predictions, both on `ax`.
    for series_name in ('cnt', 'predictedCounts_test'):
        hourly_rows.plot(kind='line', x='dayCount', y=series_name, ax=ax)
    plt.show()