In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib

In [2]:
data = pd.read_csv(r'F:\GIT_Projects\Regression model\bike_share\ML_Regression_Model\day.csv')

In [3]:
# Display the first few rows of the dataset
print("Sample of the Bike Sharing Demand dataset:")
print(data.head())


Sample of the Bike Sharing Demand dataset:
   instant      dteday  season  yr  mnth  holiday  weekday  workingday  \
0        1  2011-01-01       1   0     1        0        6           0   
1        2  2011-01-02       1   0     1        0        0           0   
2        3  2011-01-03       1   0     1        0        1           1   
3        4  2011-01-04       1   0     1        0        2           1   
4        5  2011-01-05       1   0     1        0        3           1   

   weathersit      temp     atemp       hum  windspeed  casual  registered  \
0           2  0.344167  0.363625  0.805833   0.160446     331         654   
1           2  0.363478  0.353739  0.696087   0.248539     131         670   
2           1  0.196364  0.189405  0.437273   0.248309     120        1229   
3           1  0.200000  0.212122  0.590435   0.160296     108        1454   
4           1  0.226957  0.229270  0.436957   0.186900      82        1518   

    cnt  
0   985  
1   801  
2  1349  
3  

In [4]:
# Define features (X) and target (y)
X = data.drop(['cnt', 'casual', 'registered', 'dteday'], axis=1)  # Assuming 'count' as the target
y = data['cnt']

In [5]:

# Identify numeric and categorical features
numeric_features = ['temp', 'atemp', 'hum', 'windspeed']
categorical_features = ['season', 'holiday', 'workingday', 'weathersit', 'yr', 'mnth', 'weekday']


In [6]:
# Preprocessing for numeric features: impute missing values and scale
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Preprocessing for categorical features: impute missing values and one-hot encode
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [7]:
# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [8]:
# Define the pipeline with DecisionTreeRegressor
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', DecisionTreeRegressor(random_state=42))
])

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the pipeline
pipeline.fit(X_train, y_train)

# Predict on test data
y_pred = pipeline.predict(X_test)


In [9]:
# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("R-squared (R2) Score:", r2)

Mean Squared Error (MSE): 867814.1836734693
R-squared (R2) Score: 0.7835811325074993


In [10]:
# Hyperparameter Tuning with RandomizedSearchCV
param_dist = {
    'regressor__max_depth': [None, 10, 20, 30, 40, 50],
    'regressor__min_samples_split': [2, 5, 10, 15, 20],
    'regressor__min_samples_leaf': [1, 2, 4, 6, 8]
}

random_search = RandomizedSearchCV(pipeline, param_distributions=param_dist, n_iter=50, cv=5, verbose=1, n_jobs=-1, random_state=42)
random_search.fit(X_train, y_train)

print("\nBest Parameters from RandomizedSearchCV:", random_search.best_params_)
print("Best CV Score from RandomizedSearchCV:", random_search.best_score_)


Fitting 5 folds for each of 50 candidates, totalling 250 fits

Best Parameters from RandomizedSearchCV: {'regressor__min_samples_split': 10, 'regressor__min_samples_leaf': 6, 'regressor__max_depth': 10}
Best CV Score from RandomizedSearchCV: 0.7897952562400541


In [11]:
# Using the best parameters from RandomizedSearchCV to narrow down the GridSearchCV
param_grid = {
    'regressor__max_depth': [random_search.best_params_['regressor__max_depth'] - 10, random_search.best_params_['regressor__max_depth'], random_search.best_params_['regressor__max_depth'] + 10],
    'regressor__min_samples_split': [random_search.best_params_['regressor__min_samples_split'] - 2, random_search.best_params_['regressor__min_samples_split'], random_search.best_params_['regressor__min_samples_split'] + 2],
    'regressor__min_samples_leaf': [random_search.best_params_['regressor__min_samples_leaf'] - 1, random_search.best_params_['regressor__min_samples_leaf'], random_search.best_params_['regressor__min_samples_leaf'] + 1]
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

print("\nBest Parameters from GridSearchCV:", grid_search.best_params_)
print("Best CV Score from GridSearchCV:", grid_search.best_score_)

best_pipeline = grid_search.best_estimator_
y_pred_tuned = best_pipeline.predict(X_test)

mse_tuned = mean_squared_error(y_test, y_pred_tuned)
r2_tuned = r2_score(y_test, y_pred_tuned)

print("\nMean Squared Error (MSE) - Tuned Model:", mse_tuned)
print("R-squared (R2) Score - Tuned Model:", r2_tuned)


Fitting 5 folds for each of 27 candidates, totalling 135 fits

Best Parameters from GridSearchCV: {'regressor__max_depth': 10, 'regressor__min_samples_leaf': 6, 'regressor__min_samples_split': 8}
Best CV Score from GridSearchCV: 0.7897952562400541

Mean Squared Error (MSE) - Tuned Model: 808603.8474187402
R-squared (R2) Score - Tuned Model: 0.7983472358475666


45 fits failed out of a total of 135.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
45 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\hp\AppData\Roaming\Python\Python312\site-packages\sklearn\model_selection\_validation.py", line 890, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\hp\AppData\Roaming\Python\Python312\site-packages\sklearn\base.py", line 1351, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\hp\AppData\Roaming\Python\Python312\site-packages\sklearn\pipeline.py", line 475, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "C:\Users\hp\Ap

In [12]:
# Save the model
joblib.dump(best_pipeline, 'bike_sharing_demand_model.pkl')



['bike_sharing_demand_model.pkl']