In [None]:
path = '/content/drive/MyDrive/Bike Weather Data Mar 2024 (1).csv'

In [None]:
import shap
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

In [None]:
def preprocess_data(df):
    """
    Preprocess the data:
    - Resample the data hourly by start_station_name.
    - Encode categorical variables.
    - Scale numerical features.

    Args:
    df (pandas.DataFrame): Raw input DataFrame.

    Returns:
    tuple: Preprocessed features (X), target (y), and the preprocessor pipeline.
    """
    try:
        df['started_at'] = pd.to_datetime(df['started_at'], format='%Y-%m-%d %H:%M:%S', errors='coerce')

        # Drop rows where 'started_at' is NaT
        df = df.dropna(subset=['started_at'])

        # Set the datetime column as the index for resampling
        df.set_index('started_at', inplace=True)

        # Resample data by hour for each station
        hourly_station_data = df.groupby('start_station_name').resample('h').agg({
            'month': 'first',                # Keep first value (since it's constant for each hour)
            'hour': 'first',                 # Same as above
            'day_name': 'first',             # Same
            'duration': 'sum',               # Sum durations for the hour
            'distance_km': 'sum',            # Sum distances for the hour
            'Temperature (°F)': 'mean',      # Average temperature
            'Humidity': 'mean',              # Average humidity
            'Wind Speed': 'mean',            # Average wind speed
            'Precip.': 'sum',                # Total precipitation for the hour
            'Condition': 'first',            # Keep first condition as representative
            'bike_undocked': 'sum'            # Sum undocked bikes
        }).reset_index()

        # Prepare features (X) and target (y)
        X = hourly_station_data.drop(columns=['bike_undocked'])
        y = hourly_station_data['bike_undocked']

        # One-hot encode categorical variables
        X = pd.get_dummies(X, columns=['day_name', 'Condition'], drop_first=True)

        # Frequency encode the 'start_station_name' column
        station_freq = X['start_station_name'].value_counts().to_dict()
        X['start_station_name'] = X['start_station_name'].map(station_freq).fillna(0)

        # Drop unnecessary columns
        X = X.drop(columns=['started_at'])

        # Fill any remaining missing values with 0
        X = X.fillna(0)

        # Split the data chronologically into train, validation, and test sets
        train_size = int(0.7 * len(X))  # 70% for training
        val_size = int(0.15 * len(X))   # 15% for validation
        test_size = len(X) - train_size - val_size  # 15% for test

        X_train, y_train = X[:train_size], y[:train_size]
        X_val, y_val = X[train_size:train_size + val_size], y[train_size:train_size + val_size]
        X_test, y_test = X[train_size + val_size:], y[train_size + val_size:]

        return X_train, X_val, X_test, y_train, y_val, y_test

    except Exception as e:
        raise

In [None]:
file_path = '/content/drive/MyDrive/Bike Weather Data Mar 2024 (1).csv'
bike_data = pd.read_csv(file_path)

X_train, X_val, X_test, y_train, y_val, y_test = preprocess_data(bike_data)

In [None]:
# Handle missing values and categorical features with a preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='mean')),  # Impute missing numeric values with mean
            ('scaler', StandardScaler())  # Scale numeric features
        ]), X_train.select_dtypes(include=['float64', 'int64']).columns),

        ('cat', Pipeline([
            ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),  # Impute missing categorical values
            ('onehot', OneHotEncoder(handle_unknown='ignore'))  # One-hot encode categorical features
        ]), X_train.select_dtypes(include=['object']).columns)
    ])

In [None]:
# Apply preprocessor to the features
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Train a Decision Tree model
model = DecisionTreeRegressor(random_state=42)
model.fit(X_train_processed, y_train)

# Predict on the test set and calculate mean squared error
y_pred = model.predict(X_test_processed)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

# Initialize SHAP explainer with the trained Decision Tree model
explainer = shap.Explainer(model, X_train_processed)

# Compute SHAP values for the test set
shap_values = explainer(X_test_processed)

# Extract and print SHAP values for each feature
shap_df = pd.DataFrame(shap_values.values, columns=preprocessor.get_feature_names_out())

# Calculate the mean absolute SHAP value for each feature to get feature importance
shap_importance = shap_df.abs().mean(axis=0).sort_values(ascending=False)

# Display the features and their associated importance values
print("Feature Importance based on SHAP:")
print(shap_importance)

Mean Squared Error: 0.3611535311953611
Feature Importance based on SHAP:
num__month                 0.409370
num__duration              0.242753
num__distance_km           0.114153
num__start_station_name    0.043923
num__hour                  0.037980
num__Temperature (°F)      0.022040
num__Humidity              0.020734
num__Wind Speed            0.017148
num__Precip.               0.000737
dtype: float64
