In [3]:
!pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.7.2-cp312-cp312-win_amd64.whl.metadata (11 kB)
Collecting scipy>=1.8.0 (from scikit-learn)
  Downloading scipy-1.16.2-cp312-cp312-win_amd64.whl.metadata (60 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.5.2-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.7.2-cp312-cp312-win_amd64.whl (8.7 MB)
   ---------------------------------------- 0.0/8.7 MB ? eta -:--:--
   -- ------------------------------------- 0.5/8.7 MB 4.2 MB/s eta 0:00:02
   ------- -------------------------------- 1.6/8.7 MB 4.6 MB/s eta 0:00:02
   ------------ --------------------------- 2.6/8.7 MB 4.9 MB/s eta 0:00:02
   ------------------- -------------------- 4.2/8.7 MB 5.3 MB/s eta 0:00:01
   ------------------------- -------------- 5.5/8.7 MB 5.6 MB/s eta 0:00:01
   ------------------------------ --------- 


[notice] A new release of pip is available: 24.3.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [5]:
    # Sanity check: confirm scikit-learn imports in this kernel and report its version.
    import sklearn
    print(f"Scikit-learn version: {sklearn.__version__}")

Scikit-learn version: 1.7.2


In [7]:
# ===================================================================
# FINAL SCRIPT - TO BE RUN WITH THE CORRECTED bikes.csv FILE
# ===================================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# --- Configuration: every value a reader might want to tune -----------------
DATA_PATH = 'bikes.csv'
CATEGORICAL_COLUMNS = ['season', 'weekday', 'weather']
TARGET_COLUMN = 'rentals'
# Columns kept out of the feature matrix: 'date' is not used as a feature and
# 'realfeel' is dropped to avoid multicollinearity (rationale from the original
# analysis).
EXCLUDED_COLUMNS = ['realfeel', 'date']
TEST_SIZE = 0.2
RANDOM_STATE = 42   # fixed seed so the split and the forest are reproducible
N_ESTIMATORS = 100


def encode_categoricals(df):
    """Return a copy of ``df`` with the categorical columns one-hot encoded.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data; must contain every column named in ``CATEGORICAL_COLUMNS``.

    Returns
    -------
    pandas.DataFrame
        New frame in which each categorical column is replaced by dummy
        columns. The first level of each category is dropped
        (``drop_first=True``). The input frame is not modified.

    Raises
    ------
    KeyError
        If one of the categorical columns is missing from ``df``.
    """
    return pd.get_dummies(df, columns=CATEGORICAL_COLUMNS, drop_first=True)


def select_features(df_encoded):
    """Split an encoded frame into the feature matrix and the target vector.

    Parameters
    ----------
    df_encoded : pandas.DataFrame
        Output of ``encode_categoricals``; must contain ``TARGET_COLUMN`` and
        every column in ``EXCLUDED_COLUMNS``.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        ``X`` — all columns except the target and the excluded columns;
        ``y`` — the target column.

    Raises
    ------
    KeyError
        If the target or an excluded column is missing.
    """
    X = df_encoded.drop([TARGET_COLUMN] + EXCLUDED_COLUMNS, axis=1)
    y = df_encoded[TARGET_COLUMN]
    return X, y


print("--- Starting Analysis with Confirmed Correct Data ---")

try:
    # 1. Load the dataset
    df = pd.read_csv(DATA_PATH)

    # 2-3. Preprocessing and feature selection
    df_encoded = encode_categoricals(df)
    X, y = select_features(df_encoded)

    # 4. Train-test split (seeded for reproducibility)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
    )

    # 5. Model training
    model = RandomForestRegressor(n_estimators=N_ESTIMATORS, random_state=RANDOM_STATE)
    model.fit(X_train, y_train)
    print("Model trained successfully.")

    # 6. Model evaluation on the held-out test set
    predictions = model.predict(X_test)
    mse = mean_squared_error(y_test, predictions)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, predictions)

    print("\n--- FINAL MODEL PERFORMANCE ---")
    print(f"Mean Squared Error (MSE): {mse:.2f}")
    print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
    # The label previously contained a mis-encoded superscript two.
    print(f"R-squared (R²): {r2:.4f}")

    # 7. Visualization: predicted vs. actual, with the y = x reference line
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.scatter(y_test, predictions, alpha=0.6, edgecolors='k')
    actual_min, actual_max = y_test.min(), y_test.max()
    ax.plot([actual_min, actual_max], [actual_min, actual_max],
            '--', color='red', linewidth=2)
    ax.set_title('Actual vs. Predicted Bike Rentals')
    ax.set_xlabel('Actual Rentals')
    ax.set_ylabel('Predicted Rentals')
    ax.grid(True)
    plt.show()

except FileNotFoundError:
    print(f"ERROR: '{DATA_PATH}' not found. Please make sure the file is in the correct directory.")
except Exception as e:
    # Keep the short message, then re-raise so the notebook shows the exception
    # type and traceback instead of silently continuing past a failed analysis.
    print(f"An unexpected error occurred: {e}")
    raise

ModuleNotFoundError: No module named 'seaborn'