### Problem
A company that rents out bikes wants to use a model to predict the number of bikes that will be rented out on a given day. The model will take into account various factors such as the date (month, day, whether it is a holiday or weekend), as well as weather data (temperature, precipitation, etc.) to make accurate predictions about bike rental demand. This will help the company to better plan for staffing and bike inventory, and optimize revenue.

In [42]:
# Import Libraries
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

#### Data Exploration

In [43]:
# Specify the path to your CSV file
file_path = '/Users/gaa43/Documents/Personal/BikeTrips_Prediction/daily-bike-share.csv'

# Read the CSV file into a pandas DataFrame
df = pd.read_csv(file_path)

# Display few rows of the DataFrame to verify it's loaded correctly
df


Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,rentals
0,1,1/1/2011,1,0,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,331
1,2,1/2/2011,1,0,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539,131
2,3,1/3/2011,1,0,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,120
3,4,1/4/2011,1,0,1,0,2,1,1,0.200000,0.212122,0.590435,0.160296,108
4,5,1/5/2011,1,0,1,0,3,1,1,0.226957,0.229270,0.436957,0.186900,82
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
726,727,12/27/2012,1,1,12,0,4,1,2,0.254167,0.226642,0.652917,0.350133,247
727,728,12/28/2012,1,1,12,0,5,1,2,0.253333,0.255046,0.590000,0.155471,644
728,729,12/29/2012,1,1,12,0,6,0,2,0.253333,0.242400,0.752917,0.124383,159
729,730,12/30/2012,1,1,12,0,0,0,1,0.255833,0.231700,0.483333,0.350754,364


In [44]:
# Assuming df is your DataFrame
X = df.drop(['instant', 'dteday', 'rentals'], axis=1)  # Exclude identifiers and target
y = df['rentals']

In [45]:
df.columns

Index(['instant', 'dteday', 'season', 'yr', 'mnth', 'holiday', 'weekday',
       'workingday', 'weathersit', 'temp', 'atemp', 'hum', 'windspeed',
       'rentals'],
      dtype='object')

In [46]:
# Assuming df is your DataFrame
columns_to_convert = ['season', 'yr', 'mnth', 'holiday', 'weekday', 'workingday', 'weathersit']

# Convert specified columns to 'category' data type
X = df[columns_to_convert].apply(lambda x: x.astype('category'))



In [47]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 731 entries, 0 to 730
Data columns (total 14 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   instant     731 non-null    int64  
 1   dteday      731 non-null    object 
 2   season      731 non-null    int64  
 3   yr          731 non-null    int64  
 4   mnth        731 non-null    int64  
 5   holiday     731 non-null    int64  
 6   weekday     731 non-null    int64  
 7   workingday  731 non-null    int64  
 8   weathersit  731 non-null    int64  
 9   temp        731 non-null    float64
 10  atemp       731 non-null    float64
 11  hum         731 non-null    float64
 12  windspeed   731 non-null    float64
 13  rentals     731 non-null    int64  
dtypes: float64(4), int64(9), object(1)
memory usage: 80.1+ KB


In [48]:
# Create dummy variables for the columns
X = pd.get_dummies(X, columns=columns_to_convert, drop_first=False)

# Display the first few rows of the DataFrame to check the transformation
print(X.head())

   season_1  season_2  season_3  season_4  yr_0   yr_1  mnth_1  mnth_2  \
0      True     False     False     False  True  False    True   False   
1      True     False     False     False  True  False    True   False   
2      True     False     False     False  True  False    True   False   
3      True     False     False     False  True  False    True   False   
4      True     False     False     False  True  False    True   False   

   mnth_3  mnth_4  ...  weekday_2  weekday_3  weekday_4  weekday_5  weekday_6  \
0   False   False  ...      False      False      False      False       True   
1   False   False  ...      False      False      False      False      False   
2   False   False  ...      False      False      False      False      False   
3   False   False  ...       True      False      False      False      False   
4   False   False  ...      False       True      False      False      False   

   workingday_0  workingday_1  weathersit_1  weathersit_2  weathersi

#### Linear Regression model

In [41]:

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the model
model = LinearRegression()
model.fit(X_train, y_train)


# Evaluate the model using .score() on the test data
# .score() returns the R^2 score for regression models
r2_score = model.score(X_test, y_test)

print(f"R^2 score on the test data: {r2_score}")

R^2 score on the test data: 0.6616720803235854


#### Models Training
SVR, GradientBoostingRegressor, DecisionTreeRegressor, LinearRegression, RandomForestRegressor.

In [52]:
def evaluate_and_compare_models(df):
    # Split the dataset
    X = df.drop('rentals', axis=1)
    y = df['rentals']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Define models
    models = {
        'Linear Regression': LinearRegression(),
        'Decision Tree': DecisionTreeRegressor(random_state=42),
        'Random Forest': RandomForestRegressor(random_state=42),
        'Gradient Boosting': GradientBoostingRegressor(random_state=42),
        'SVR': SVR()
    }

    # Apply feature scaling for SVR
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Prepare table
    results = []

    # Train, predict, and evaluate models
    for name, model in models.items():
        if name == 'SVR':
            model.fit(X_train_scaled, y_train)
            predictions = model.predict(X_test_scaled)
        else:
            model.fit(X_train, y_train)
            predictions = model.predict(X_test)
        
        mse = mean_squared_error(y_test, predictions)
        r2 = r2_score(y_test, predictions)
        mae = mean_absolute_error(y_test, predictions)
        results.append({'Model': name, 'MSE': mse, 'R^2': r2, 'MAE': mae})

    # Convert results to DataFrame for nicer formatting
    results_df = pd.DataFrame(results)
    results_df = results_df.sort_values(by='R^2', ascending=False).reset_index(drop=True)
    print(results_df)

# Generating a synthetic dataset
np.random.seed(42)
n_samples = 100
X = np.random.rand(n_samples, 4)
y = X @ np.array([1.5, -2., 1., 0.5]) + np.random.randn(n_samples) * 0.5 + 3
df = pd.DataFrame(X, columns=['feature1', 'feature2', 'feature3', 'feature4'])
df['rentals'] = y

# Evaluate and compare the models with the synthetic dataset
evaluate_and_compare_models(df)


               Model       MSE       R^2       MAE
0  Linear Regression  0.209870  0.714053  0.379749
1      Random Forest  0.402733  0.451278  0.516807
2                SVR  0.413527  0.436571  0.550959
3  Gradient Boosting  0.421325  0.425946  0.557564
4      Decision Tree  0.660907  0.099515  0.696421


##### Fine Tuning Model parameters

In [55]:
# Assuming `df` is your DataFrame with features and 'rentals' as the target
# Split the dataset
X = df.drop('rentals', axis=1)
y = df['rentals']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features for models sensitive to scale (e.g., SVR)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Prepare models with initial parameter grids for tuning
models = {
    'Random Forest': {
        'model': RandomForestRegressor(random_state=42),
        'params': {
            'n_estimators': [100, 200],
            'max_depth': [None, 10, 20],
            'min_samples_split': [2, 5],
            'min_samples_leaf': [1, 2],
        }
    },
    'Gradient Boosting': {
        'model': GradientBoostingRegressor(random_state=42),
        'params': {
            'n_estimators': [100, 200],
            'learning_rate': [0.01, 0.1, 0.2],
            'max_depth': [3, 5, 7],
        }
    },
    'SVR': {
        'model': SVR(),
        'params': {
            'C': [0.1, 1, 10],
            'kernel': ['rbf', 'linear'],
            'gamma': ['scale', 'auto'],
        }
    }
}

results = []

# Perform grid search for each model
for name, setup in models.items():
    grid_search = GridSearchCV(setup['model'], setup['params'], cv=5, scoring='r2', n_jobs=-1)
    if name == 'SVR':
        grid_search.fit(X_train_scaled, y_train)
        best_model = grid_search.best_estimator_
        predictions = best_model.predict(X_test_scaled)
    else:
        grid_search.fit(X_train, y_train)
        best_model = grid_search.best_estimator_
        predictions = best_model.predict(X_test)
    
    mse = mean_squared_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)
    results.append({
        'Model': name,
        'Best Parameters': grid_search.best_params_,
        'MSE': mse,
        'R^2': r2
    })

# Manual tuning for simpler models
# Linear Regression (no hyperparameters to tune in this case)
lr = LinearRegression().fit(X_train, y_train)
lr_predictions = lr.predict(X_test)
results.append({
    'Model': 'Linear Regression',
    'Best Parameters': 'N/A',
    'MSE': mean_squared_error(y_test, lr_predictions),
    'R^2': r2_score(y_test, lr_predictions)
})

# Decision Tree (demonstrating manual parameter adjustment)
dt = DecisionTreeRegressor(max_depth=5, min_samples_split=4, random_state=42).fit(X_train, y_train)
dt_predictions = dt.predict(X_test)
results.append({
    'Model': 'Decision Tree',
    'Best Parameters': {'max_depth': 5, 'min_samples_split': 4},
    'MSE': mean_squared_error(y_test, dt_predictions),
    'R^2': r2_score(y_test, dt_predictions)
})

# Convert results to DataFrame for display
results_df = pd.DataFrame(results).sort_values(by='R^2', ascending=False)
print(results_df)
#print(best_model)


               Model                                    Best Parameters  \
2                SVR   {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}   
3  Linear Regression                                                N/A   
0      Random Forest  {'max_depth': None, 'min_samples_leaf': 2, 'mi...   
1  Gradient Boosting  {'learning_rate': 0.2, 'max_depth': 3, 'n_esti...   
4      Decision Tree           {'max_depth': 5, 'min_samples_split': 4}   

        MSE       R^2  
2  0.203691  0.722471  
3  0.209870  0.714053  
0  0.383042  0.478106  
1  0.452623  0.383302  
4  0.606485  0.173665  


##### Fine Tuning parameters of best performing models

In [56]:

def fine_tune_svr(X, y):
    """
    Perform grid search to fine-tune SVR model parameters.
    
    Parameters:
    - X: Feature matrix
    - y: Target variable
    
    Returns:
    - best_model: The model with the best parameters found.
    - results: DataFrame with the grid search results.
    """
    pipeline = Pipeline([
        ('scaler', StandardScaler()), 
        ('svr', SVR())
    ])

    param_grid = {
        'svr__C': [0.1, 1, 10, 100],
        'svr__kernel': ['rbf', 'linear', 'poly'],
        'svr__gamma': ['scale', 'auto', 0.1, 1, 10]
    }

    grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='r2', verbose=1, n_jobs=-1)
    grid_search.fit(X, y)

    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_

    print(f"Best Parameters: {best_params}")
    print(f"Best R^2 Score: {best_score}\n")

    results = pd.DataFrame(grid_search.cv_results_).sort_values(by='rank_test_score')
    print("Top 5 Grid Search Results:")
    print(results.head(5)[['params', 'mean_test_score', 'rank_test_score']].to_string(index=False))

    return best_model, results

# Generate a synthetic dataset for demonstration
X, y = make_regression(n_samples=100, n_features=4, noise=0.1, random_state=42)

# Run the fine-tuning function and display results
best_svr_model, svr_results = fine_tune_svr(X, y)


Fitting 5 folds for each of 60 candidates, totalling 300 fits
Best Parameters: {'svr__C': 100, 'svr__gamma': 'scale', 'svr__kernel': 'linear'}
Best R^2 Score: 0.9999985679126995

Top 5 Grid Search Results:
                                                         params  mean_test_score  rank_test_score
     {'svr__C': 100, 'svr__gamma': 10, 'svr__kernel': 'linear'}         0.999999                1
      {'svr__C': 100, 'svr__gamma': 1, 'svr__kernel': 'linear'}         0.999999                1
    {'svr__C': 100, 'svr__gamma': 0.1, 'svr__kernel': 'linear'}         0.999999                1
 {'svr__C': 100, 'svr__gamma': 'auto', 'svr__kernel': 'linear'}         0.999999                1
{'svr__C': 100, 'svr__gamma': 'scale', 'svr__kernel': 'linear'}         0.999999                1
