# Bike Sharing System Demand Prediction

### Load and Explore the Data

In [None]:
import pandas as pd

# Load the dataset
file_path = 'day.csv'
df = pd.read_csv(file_path)
df.head()

 ### Data Preprocessing

In [None]:
# Convert 'season' and 'weathersit' to categorical string values
df['season'] = df['season'].map({1: 'spring', 2: 'summer', 3: 'fall', 4: 'winter'})
df['weathersit'] = df['weathersit'].map({
    1: 'Clear', 
    2: 'Mist', 
    3: 'Light Snow/Rain', 
    4: 'Heavy Rain/Snow'
})

# Display the first few rows to confirm changes
df.head() 



In [None]:
# One-hot encode the categorical variables
df = pd.get_dummies(df, columns=['season', 'weathersit'], drop_first=True)

# Drop the 'instant' and 'dteday' columns
df = df.drop(columns=['instant', 'dteday'])

# Display the first few rows to confirm changes
df.head()

In [None]:
from sklearn.model_selection import train_test_split

# Define the features and target variable
X = df.drop(columns=['casual', 'registered', 'cnt'])
y = df['cnt']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Display the shape of the training and test sets
X_train.shape, X_test.shape, y_train.shape, y_test.shape

### Feature Selection

In [None]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

# Use RFE for feature selection
lm = LinearRegression()
# Choosing 10 features
rfe = RFE(lm, n_features_to_select=10)  
rfe = rfe.fit(X, y)

# Get the selected features
selected_features = X.columns[rfe.support_]
print(selected_features)

# Create a new dataframe with the selected features
X_rfe = X[selected_features]
print(X_rfe)

In [None]:
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
import numpy as np


# Add a constant to the model (for statsmodels)
X_rfe = sm.add_constant(X_rfe)


# Convert boolean columns to numeric
X_rfe[['season_spring', 'season_summer', 'season_winter', 'weathersit_Light Snow/Rain', 'weathersit_Mist']] = \
    X_rfe[['season_spring', 'season_summer', 'season_winter', 'weathersit_Light Snow/Rain', 'weathersit_Mist']].astype(int)

# Fit the model using statsmodels
model = sm.OLS(y, X_rfe).fit()

# Get the summary of the model
print(model.summary())
print('VIF Summary')
# Calculate VIF for each feature
vif = pd.DataFrame()
vif['Features'] = X_rfe.columns
vif['VIF'] = [variance_inflation_factor(X_rfe.values, i) for i in range(X_rfe.shape[1])]
print(vif)



### Iteratively Remove Non-significant Features

In [None]:
# Define a function to remove features based on p-values and VIF
def remove_insignificant_features(X, y, threshold_pvalue=0.05, threshold_vif=5):
    while True:
        # Fit the model
        model = sm.OLS(y, X).fit()
        # Calculate VIF
        vif = pd.DataFrame()
        vif['Features'] = X.columns
        vif['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
        
        # Check p-values
        pvalues = model.pvalues
        max_pvalue = pvalues.max()
        if max_pvalue > threshold_pvalue:
            excluded_feature = pvalues.idxmax()
            print(f"Dropping {excluded_feature} with p-value {max_pvalue}")
            X = X.drop(columns=[excluded_feature])
        else:
            break

        # Check VIF
        max_vif = vif['VIF'].max()
        if max_vif > threshold_vif:
            excluded_feature = vif.loc[vif['VIF'].idxmax(), 'Features']
            print(f"Dropping {excluded_feature} with VIF {max_vif}")
            X = X.drop(columns=[excluded_feature])
        else:
            break
    return X

# Remove insignificant features
X_significant = remove_insignificant_features(X_rfe, y)

X_significant


### Build and Train the Model

In [21]:
from sklearn.linear_model import LinearRegression

# Create the model
linear_model = LinearRegression()

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_significant, y, test_size=0.3, random_state=42)

# Train the model
linear_model.fit(X_train, y_train)


### Prediction and Evaluation of the Model

In [22]:
from sklearn.metrics import r2_score

# Make predictions
y_train_pred = linear_model.predict(X_train)
y_test_pred = linear_model.predict(X_test)

# Calculate R-squared for train and test sets
print("Train R-squared: ", r2_score(y_train, y_train_pred))
print("Test R-squared: ", r2_score(y_test, y_test_pred))


Train R-squared:  0.8132161720563722
Test R-squared:  0.8150263920911406
