# Bike Sharing Demand Prediction

# Import Libraries and Setup

In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scipy.stats import pearsonr, spearmanr, shapiro, normaltest, anderson
from scipy.stats import f_oneway, kruskal
from statsmodels.stats.multicomp import MultiComparison
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import zipfile
import os
# Session-wide setup. Order matters: the warning filter is installed first so
# it also covers anything emitted by the styling calls below, and the palette
# is set after the style sheet so the style does not override it.

# Suppress ALL warnings for the rest of the session. NOTE(review): this also
# hides deprecation/convergence warnings from the modelling libraries imported
# above — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')
# Apply a Matplotlib style sheet by name. NOTE(review): this style name is
# presumably only registered in newer Matplotlib releases — confirm the
# minimum version this notebook targets.
plt.style.use('seaborn-v0_8')
# Default seaborn colour palette for subsequent plots.
sns.set_palette("husl")
# Seed NumPy's global random generator so code drawing from np.random is
# repeatable across runs.
np.random.seed(42)

# Data Loading and Initial Setup

In [9]:
# Source archive and local cache locations for the bike-sharing dataset.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00275/Bike-Sharing-Dataset.zip"
zip_path = "Bike-Sharing-Dataset.zip"
extracted_path = "Bike-Sharing-Dataset"
csv_path = os.path.join(extracted_path, 'day.csv')

# Download the archive only when no local copy exists.
if not os.path.exists(zip_path):
    import requests  # only needed on the first run, when the archive is not cached

    # Stream into a temporary file and rename it on success, so an interrupted
    # download never leaves a truncated archive that the existence check above
    # would accept as a valid cache on the next run.
    partial_path = zip_path + ".part"
    try:
        # The timeout keeps an unresponsive server from hanging the notebook.
        with requests.get(url, stream=True, timeout=30) as r:
            # Fail loudly on 4xx/5xx instead of saving an error page as the zip.
            r.raise_for_status()
            with open(partial_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    if chunk:  # skip keep-alive chunks
                        f.write(chunk)
        os.replace(partial_path, zip_path)
    finally:
        # Clean up the temporary file if the download failed part-way.
        if os.path.exists(partial_path):
            os.remove(partial_path)

# Extract when the CSV we need is missing; checking the file rather than the
# directory also recovers from a previously interrupted extraction.
if not os.path.exists(csv_path):
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extracted_path)

# Daily-aggregated records; `cnt` is the column summarised below as the target.
df_day = pd.read_csv(csv_path)

print(f"Dataset loaded with shape: {df_day.shape}")
print(f"Features: {list(df_day.columns)}")
print("Target variable statistics:")
print(df_day['cnt'].describe())

Dataset loaded with shape: (731, 16)
Features: ['instant', 'dteday', 'season', 'yr', 'mnth', 'holiday', 'weekday', 'workingday', 'weathersit', 'temp', 'atemp', 'hum', 'windspeed', 'casual', 'registered', 'cnt']
Target variable statistics:
count     731.000000
mean     4504.348837
std      1937.211452
min        22.000000
25%      3152.000000
50%      4548.000000
75%      5956.000000
max      8714.000000
Name: cnt, dtype: float64


# Data Exploration

In [10]:
# --- Structural summary: dimensions, memory footprint, column dtypes ---
print("=== DATASET OVERVIEW ===")
memory_mb = df_day.memory_usage(deep=True).sum() / 1024**2
print(f"Dataset Shape: {df_day.shape}")
print(f"Memory Usage: {memory_mb:.2f} MB")
print("Data Types:")
print(df_day.dtypes)

# --- Data-quality checks: per-column nulls and fully duplicated rows ---
print("\n=== MISSING VALUES ===")
missing_per_column = df_day.isnull().sum()
print(missing_per_column)

print("\n=== DUPLICATE ROWS ===")
duplicate_count = df_day.duplicated().sum()
print(f"Number of duplicates: {duplicate_count}")

# --- Descriptive statistics for the numeric columns ---
print("\n=== BASIC STATISTICS ===")
print(df_day.describe())

# --- Frequency of each level of the integer-coded categorical columns ---
print("\n=== CATEGORICAL VARIABLES DISTRIBUTION ===")
categorical_cols = ['season', 'yr', 'mnth', 'weekday', 'workingday', 'holiday', 'weathersit']
for col in categorical_cols:
    print(f"\n{col.upper()}:")
    level_counts = df_day[col].value_counts().sort_index()
    print(level_counts)

# --- Distribution summary of the target column `cnt` ---
print("\n=== TARGET VARIABLE ANALYSIS ===")
cnt_series = df_day['cnt']
print(f"Count Range: {cnt_series.min()} - {cnt_series.max()}")
# (label, value, format spec) — printed in this order, one line each.
target_stats = (
    ("Mean", cnt_series.mean(), ".2f"),
    ("Std", cnt_series.std(), ".2f"),
    ("Skewness", cnt_series.skew(), ".3f"),
    ("Kurtosis", cnt_series.kurtosis(), ".3f"),
)
for stat_label, stat_value, stat_spec in target_stats:
    print(f"Count {stat_label}: {stat_value:{stat_spec}}")

=== DATASET OVERVIEW ===
Dataset Shape: (731, 16)
Memory Usage: 0.13 MB
Data Types:
instant         int64
dteday         object
season          int64
yr              int64
mnth            int64
holiday         int64
weekday         int64
workingday      int64
weathersit      int64
temp          float64
atemp         float64
hum           float64
windspeed     float64
casual          int64
registered      int64
cnt             int64
dtype: object

=== MISSING VALUES ===
instant       0
dteday        0
season        0
yr            0
mnth          0
holiday       0
weekday       0
workingday    0
weathersit    0
temp          0
atemp         0
hum           0
windspeed     0
casual        0
registered    0
cnt           0
dtype: int64

=== DUPLICATE ROWS ===
Number of duplicates: 0

=== BASIC STATISTICS ===
          instant      season          yr        mnth     holiday     weekday  \
count  731.000000  731.000000  731.000000  731.000000  731.000000  731.000000   
mean   366.000000    