# Model Comparisons

1. Linear Regression

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np

df = pd.read_csv('daily_combined.csv')
df['TIME'] = pd.to_datetime(df['TIME'])
df.set_index('TIME', inplace=True)

pandemic_start_date = pd.to_datetime('2020-03-17')

# Create features
df['day_of_week'] = df.index.dayofweek
df['month'] = df.index.month

# Train/Test Split
train = df[df.index < pandemic_start_date]
test = df[df.index >= pandemic_start_date]

# Linear Regression
linear_model = LinearRegression()
linear_model.fit(train[['day_of_week', 'month', 'BIKE_USAGE']], train['BIKE_USAGE'])
linear_predictions = linear_model.predict(test[['day_of_week', 'month', 'BIKE_USAGE']])
linear_rmse = np.sqrt(mean_squared_error(test['BIKE_USAGE'], linear_predictions))
linear_mae = mean_absolute_error(test['BIKE_USAGE'], linear_predictions)


print(f'Linear Regression RMSE: {linear_rmse}')
print(f'Linear Regression MAE: {linear_mae}')


Linear Regression RMSE: 5.838194234536177e-13
Linear Regression MAE: 4.526561302055158e-13


2. Ridge Regression

In [32]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV

df = pd.read_csv('daily_combined.csv')
df['TIME'] = pd.to_datetime(df['TIME'])
df.set_index('TIME', inplace=True)

pandemic_start_date = pd.to_datetime('2020-03-17')

# Create features
df['day_of_week'] = df.index.dayofweek
df['month'] = df.index.month

# Train/Test Split
train = df[df.index < pandemic_start_date]
test = df[df.index >= pandemic_start_date]

# Define a range of alpha values
alphas = np.arange(0.0001, 0.001)  

# Perform grid search
param_grid = {'alpha': alphas}
ridge_model = Ridge()
grid_search = GridSearchCV(ridge_model, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(train[['day_of_week', 'month', 'BIKE_USAGE']], train['BIKE_USAGE'])

# Get the best alpha value
best_alpha = grid_search.best_params_['alpha']


# Train Ridge Regression with the best alpha
ridge_model = Ridge(alpha=best_alpha)
ridge_model.fit(train[['day_of_week', 'month', 'BIKE_USAGE']], train['BIKE_USAGE'])
ridge_predictions = ridge_model.predict(test[['day_of_week', 'month', 'BIKE_USAGE']])
ridge_rmse = np.sqrt(mean_squared_error(test['BIKE_USAGE'], ridge_predictions))
ridge_mae = mean_absolute_error(test['BIKE_USAGE'], ridge_predictions)


print(f'Best Ridge Regression Alpha: {best_alpha}')
print(f'Ridge Regression RMSE: {ridge_rmse}')
print(f'Ridge Regression MAE: {ridge_mae}')


Best Ridge Regression Alpha: 0.0001
Ridge Regression RMSE: 4.0663149200424093e-10
Ridge Regression MAE: 3.4676939343964684e-10


3. KNN

In [31]:
import pandas as pd
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV
import numpy as np

df = pd.read_csv('daily_combined.csv')
df['TIME'] = pd.to_datetime(df['TIME'])
df.set_index('TIME', inplace=True)

pandemic_start_date = pd.to_datetime('2020-03-17')

# Create features
df['day_of_week'] = df.index.dayofweek
df['month'] = df.index.month

# Train/Test Split
train = df[df.index < pandemic_start_date]
test = df[df.index >= pandemic_start_date]

# Define a range of n_neighbors values
neighbors_values = np.arange(1, 20)  # Adjust the range as needed

# Perform grid search
param_grid = {'n_neighbors': neighbors_values}
knn_model = KNeighborsRegressor()
grid_search = GridSearchCV(knn_model, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(train[['day_of_week', 'month', 'BIKE_USAGE']], train['BIKE_USAGE'])

# Get the best n_neighbors value
best_n_neighbors = grid_search.best_params_['n_neighbors']

# Train KNN with the best n_neighbors
knn_model = KNeighborsRegressor(n_neighbors=best_n_neighbors)
knn_model.fit(train[['day_of_week', 'month', 'BIKE_USAGE']], train['BIKE_USAGE'])
knn_predictions = knn_model.predict(test[['day_of_week', 'month', 'BIKE_USAGE']])
knn_rmse = np.sqrt(mean_squared_error(test['BIKE_USAGE'], knn_predictions))
knn_mae = mean_absolute_error(test['BIKE_USAGE'], knn_predictions)


print(f'Best n_neighbors for KNN: {best_n_neighbors}')
print(f'KNN RMSE: {knn_rmse}')
print(f'KNN MAE: {knn_mae}')

Best n_neighbors for KNN: 1
KNN RMSE: 23.822832233552678
KNN MAE: 14.227149659793664


4. LSTM

In [37]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense


df = pd.read_csv('daily_combined.csv')
df['TIME'] = pd.to_datetime(df['TIME'])
df.set_index('TIME', inplace=True)


pandemic_start_date = pd.to_datetime('2020-03-17')

# Create features
df['day_of_week'] = df.index.dayofweek
df['month'] = df.index.month

# Train/Test Split
train = df[df.index < pandemic_start_date]
test = df[df.index >= pandemic_start_date]

# Normalize the data
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(train[['BIKE_USAGE']].values.reshape(-1, 1))

# Prepare sequences for LSTM
def create_sequences(data, sequence_length):
    X, y = [], []
    for i in range(len(data) - sequence_length):
        X.append(data[i:i+sequence_length, 0])
        y.append(data[i+sequence_length, 0])
    return np.array(X), np.array(y)

sequence_length = 10  # adjust based on your data
X_train, y_train = create_sequences(scaled_data, sequence_length)

# Reshape data for LSTM
X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))

# Build LSTM model
model = Sequential()
model.add(LSTM(units=50, return_sequences=True, input_shape=(X_train.shape[1], 1)))
model.add(LSTM(units=50))
model.add(Dense(units=1))

# Compile and fit the model
model.compile(optimizer='adam', loss='mean_squared_error')
model.fit(X_train, y_train, epochs=50, batch_size=32)

# Make predictions on the test set
scaled_test_data = scaler.transform(test[['BIKE_USAGE']].values.reshape(-1, 1))
X_test, y_test = create_sequences(scaled_test_data, sequence_length)
X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))

lstm_predictions = model.predict(X_test)
lstm_predictions = scaler.inverse_transform(lstm_predictions)

# Calculate RMSE and MAE
lstm_rmse = np.sqrt(mean_squared_error(test['BIKE_USAGE'][sequence_length:], lstm_predictions))
lstm_mae = mean_absolute_error(test['BIKE_USAGE'][sequence_length:], lstm_predictions)


print(f'LSTM RMSE: {lstm_rmse}')
print(f'LSTM MAE: {lstm_mae}')


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
LSTM RMSE: 507.6216205378193
LSTM MAE: 405.22885766788596


5. SVM

In [35]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np

df = pd.read_csv('daily_combined.csv')
df['TIME'] = pd.to_datetime(df['TIME'])
df.set_index('TIME', inplace=True)

pandemic_start_date = pd.to_datetime('2020-03-17')

# Create features 
df['day_of_week'] = df.index.dayofweek
df['month'] = df.index.month

# Train/Test Split
train = df[df.index < pandemic_start_date]
test = df[df.index >= pandemic_start_date]

# Feature scaling
scaler = StandardScaler()
X_train_svm = scaler.fit_transform(train[['day_of_week', 'month', 'BIKE_USAGE']])
y_train_svm = train['BIKE_USAGE']

X_test_svm = scaler.transform(test[['day_of_week', 'month', 'BIKE_USAGE']])
y_test_svm = test['BIKE_USAGE']

# SVM Model
svm_model = SVR(kernel='linear')
svm_model.fit(X_train_svm, y_train_svm)

# Predictions
svm_predictions = svm_model.predict(X_test_svm)

# Calculate RMSE and MAE for SVM
svm_rmse = np.sqrt(mean_squared_error(y_test_svm, svm_predictions))
svm_mae = mean_absolute_error(y_test_svm, svm_predictions)


print(f'SVM RMSE: {svm_rmse}')
print(f'SVM MAE: {svm_mae}')


SVM RMSE: 613.268495963982
SVM MAE: 554.2841482806782


6. CNN

In [36]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np


df = pd.read_csv('daily_combined.csv')
df['TIME'] = pd.to_datetime(df['TIME'])
df.set_index('TIME', inplace=True)


pandemic_start_date = pd.to_datetime('2020-03-17')

# Create features 
df['day_of_week'] = df.index.dayofweek
df['month'] = df.index.month

# Train/Test Split
train = df[df.index < pandemic_start_date]
test = df[df.index >= pandemic_start_date]

# Feature scaling
scaler = StandardScaler()
X_train_cnn = scaler.fit_transform(train[['day_of_week', 'month', 'BIKE_USAGE']]).reshape((train.shape[0], 3, 1))
y_train_cnn = train['BIKE_USAGE']

X_test_cnn = scaler.transform(test[['day_of_week', 'month', 'BIKE_USAGE']]).reshape((test.shape[0], 3, 1))
y_test_cnn = test['BIKE_USAGE']

# CNN Model
cnn_model = Sequential()
cnn_model.add(Conv1D(filters=64, kernel_size=2, activation='relu', input_shape=(3, 1)))
cnn_model.add(MaxPooling1D(pool_size=2))
cnn_model.add(Flatten())
cnn_model.add(Dense(50, activation='relu'))
cnn_model.add(Dense(1))

cnn_model.compile(optimizer='adam', loss='mean_squared_error')
cnn_model.fit(X_train_cnn, y_train_cnn, epochs=50, batch_size=32)

# Predictions
cnn_predictions = cnn_model.predict(X_test_cnn)

# Calculate RMSE and MAE for CNN
cnn_rmse = np.sqrt(mean_squared_error(y_test_cnn, cnn_predictions))
cnn_mae = mean_absolute_error(y_test_cnn, cnn_predictions)


print(f'CNN RMSE: {cnn_rmse}')
print(f'CNN MAE: {cnn_mae}')


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
CNN RMSE: 814.5341578858752
CNN MAE: 613.1867349287772
