<a href="https://colab.research.google.com/github/Jzakai/linear-regression/blob/main/Linear_Regression(Bicycles).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Linear Regression Model for Predicting the Number of Bikers on a Given Day

This notebook uses a dataset about Seattle's Fremont Bridge, stored as a CSV file.
Each row describes one day: the day of the week, weather, temperature and other factors (see the dataframe preview below for more details). Each row also records how many bikers were observed crossing the bridge that day.

The notebook trains a linear regression model that takes a day's features as input and predicts the number of bikers for that day.

In [None]:
from IPython.display import clear_output

In [None]:
# Incase you run this notebook outside colab (where the libraries aren't already pre-installed)

%pip install gdown
%pip install pandas
%pip install numpy

clear_output()

In [None]:
# Download the CSV file from Google Drive; the argument to gdown is the Drive file ID.
!gdown 1_eJU8Y-31_l0oq1sSJT6pROJyo-ufuvD

Downloading...
From: https://drive.google.com/uc?id=1_eJU8Y-31_l0oq1sSJT6pROJyo-ufuvD
To: /content/bikers_data.csv
  0% 0.00/213k [00:00<?, ?B/s]100% 213k/213k [00:00<00:00, 99.0MB/s]


In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the downloaded CSV into a DataFrame.
data_df = pd.read_csv(filepath_or_buffer='bikers_data.csv')

In [None]:
# Preview the first rows of the raw data, including the target column.
data_df.head()

Unnamed: 0,Date,Number of bikers,Mon,Tue,Wed,Thu,Fri,Sat,Sun,holiday,daylight_hrs,Rainfall (in),Temp (F),dry day
0,2012-10-03,14084.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,11.277359,0.0,56.0,1
1,2012-10-04,13900.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,11.219142,0.0,56.5,1
2,2012-10-05,12592.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,11.161038,0.0,59.5,1
3,2012-10-06,8024.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,11.103056,0.0,60.5,1
4,2012-10-07,8568.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,11.045208,0.0,60.5,1


In [None]:
# Separate the regression target from the input features.
data_x = data_df.drop(columns=['Number of bikers'])  # input features: every column except the target
data_y = data_df['Number of bikers']  # target

In [None]:
# Preview the input features (the target column has been removed).
data_x.head()

Unnamed: 0,Date,Mon,Tue,Wed,Thu,Fri,Sat,Sun,holiday,daylight_hrs,Rainfall (in),Temp (F),dry day
0,2012-10-03,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,11.277359,0.0,56.0,1
1,2012-10-04,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,11.219142,0.0,56.5,1
2,2012-10-05,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,11.161038,0.0,59.5,1
3,2012-10-06,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,11.103056,0.0,60.5,1
4,2012-10-07,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,11.045208,0.0,60.5,1


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric feature.
data_x.describe()

Unnamed: 0,Mon,Tue,Wed,Thu,Fri,Sat,Sun,holiday,daylight_hrs,Rainfall (in),Temp (F),dry day
count,2646.0,2646.0,2646.0,2646.0,2646.0,2646.0,2646.0,2646.0,2646.0,2646.0,2646.0,2646.0
mean,0.142857,0.142857,0.142857,0.142857,0.142857,0.142857,0.142857,0.027967,11.907412,0.117305,54.285714,0.568405
std,0.349993,0.349993,0.349993,0.349993,0.349993,0.349993,0.349993,0.164909,2.615865,0.264038,10.875798,0.495392
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.218894,0.0,25.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.360658,0.0,46.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.812303,0.0,53.5,1.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14.463207,0.11,63.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,15.781095,3.25,82.0,1.0


Feature engineering

In [None]:
# Build the model's feature table in a single pass from `data_df` instead of
# mutating `data_x` step by step, so re-running this cell always gives the same result.
data_x = (
    data_df
    # Remove the target and the columns that are not used as model inputs.
    .drop(columns=['Number of bikers', 'Date', 'holiday', 'dry day'])
    .assign(
        # 'new': interaction term, daylight hours multiplied by temperature.
        new=lambda df: df['daylight_hrs'] * df['Temp (F)'],
        # 'new2': rainfall divided by temperature.
        new2=lambda df: df['Rainfall (in)'] / df['Temp (F)'],
    )
)

In [None]:
# Preview the feature table after feature engineering.
data_x.head()

Unnamed: 0,Mon,Tue,Wed,Thu,Fri,Sat,Sun,daylight_hrs,Rainfall (in),Temp (F),new,new2
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,11.277359,0.0,56.0,631.532117,0.0
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,11.219142,0.0,56.5,633.8815,0.0
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,11.161038,0.0,59.5,664.081732,0.0
3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,11.103056,0.0,60.5,671.734917,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,1.0,11.045208,0.0,60.5,668.235096,0.0


In [None]:
data_x.shape[0]  # number of data samples (rows)

2646

In [None]:
data_x.shape[1]  # number of features (columns)

12

In [None]:
data_x.size  # total number of elements (rows * columns)

31752

In [None]:
# NOTE(review): the 11 here does not match data_x.shape[1] (12), so this product
# differs from data_x.size — presumably a leftover manual check; confirm or remove.
2646*11

29106

In [None]:
len(data_x)  # number of rows (same value as data_x.shape[0])

2646

In [None]:

# Optional: a ridge (L2-regularised) regression fitted with scikit-learn,
# where the regularisation is meant to counter overfitting.
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import numpy as np

alpha = 0.2  # regularization strength (lambda)

# Standardise the features, then fit the ridge model on the full feature table.
ridge_model = make_pipeline(StandardScaler(), Ridge(alpha=alpha)).fit(data_x, data_y)

# Print one fitted coefficient per input feature.
print('Coefficients:', ridge_model.named_steps['ridge'].coef_)


Coefficients: [  628.65287746   968.22863545   926.62039724   693.39014874
   248.61823165 -1662.72932629 -1802.78096425  1810.84025884
 -2110.29116872  2749.30679372 -1079.70622607   908.6842696 ]


In [None]:
# Shuffle features and targets together, then split them 80/20 into train and test sets.
from sklearn import datasets, utils

shuffled_features, shuffled_targets = utils.shuffle(
    data_x.to_numpy(),                 # DataFrame -> 2-D NumPy array
    data_y.to_numpy().reshape(-1, 1),  # Series -> column vector; -1 lets NumPy infer the row count
    random_state=42,                   # fixed seed so the shuffle is reproducible
)

# The first 80% of the shuffled rows form the training set; the rest is the test set.
train_size = int(len(data_x) * 0.8)

X_train, y_train = shuffled_features[:train_size], shuffled_targets[:train_size]
X_test, y_test = shuffled_features[train_size:], shuffled_targets[train_size:]




In [None]:

# Standardise the features using statistics taken from the training split only.
mus = np.mean(X_train, axis=0)    # per-feature mean (one value per column)
sigmas = np.std(X_train, axis=0)  # per-feature standard deviation

# The training-set mus/sigmas are applied to both splits to avoid leakage
# from the test set.
scaled_train_features, scaled_test_features = (
    (split - mus) / sigmas for split in (X_train, X_test)
)


In [None]:
# Number of training samples.
m = len(y_train)

# Prepend a column of ones (the bias/intercept term) to the standardised features.
# The design matrices are rebuilt from X_train / X_test with the training mus / sigmas,
# so re-running this cell cannot stack a second ones column onto an earlier result.
scaled_train_features = np.hstack([np.ones(shape=(len(X_train), 1)), (X_train - mus) / sigmas])
scaled_test_features = np.hstack([np.ones(shape=(len(X_test), 1)), (X_test - mus) / sigmas])

# Number of model parameters: one per design-matrix column (bias included).
n = scaled_train_features.shape[1]


In [None]:
def model_fn(X, params=None):
    """Linear model: return the predictions ``X @ params``.

    Args:
        X: design matrix with one row per sample; its column count must match
            the number of rows of the parameter array.
        params: parameter array to multiply by. When omitted, the notebook-level
            ``learned_parameters`` is used (looked up at call time, so in-place
            updates made during training are picked up).

    Returns:
        The matrix product of ``X`` and the parameters.
    """
    if params is None:
        params = learned_parameters
    return X @ params

def grad_fn(X, y, y_hat):
    """Gradient of the halved mean-squared error with respect to the parameters.

    The gradient is averaged over the rows actually passed in (``len(y)``), so
    it matches ``0.5 * mean((y - y_hat) ** 2)`` for whatever batch it is given
    rather than depending on a notebook-level sample count.

    Args:
        X: design matrix for the batch, one row per sample.
        y: targets for the batch, one row per sample.
        y_hat: predictions for the batch, same shape as ``y``.

    Returns:
        Array with one row per column of ``X``.
    """
    batch_rows = len(y)
    return -(1 / batch_rows) * (X.T @ (y - y_hat))

def loss_fn(y, y_hat):
    """Return half of the mean squared difference between ``y`` and ``y_hat``."""
    residuals = y - y_hat
    return 0.5 * np.mean(residuals ** 2)

In [None]:
# Random starting point for the parameters, drawn from a seeded normal distribution
# (mean 0, standard deviation 1).
prng = np.random.RandomState(seed=42)
learned_parameters = prng.normal(0, 1, (n, 1))  # one row per design-matrix column
# Alternative: start from all zeros instead.
# learned_parameters = np.zeros((n, 1))

In [None]:
# Shape of the parameter array; it was created with size=(n, 1).
learned_parameters.shape

(13, 1)

In [None]:
# Shape of the test design matrix as (rows, columns).
scaled_test_features.shape

(530, 13)

In [None]:
learning_rate = 0.03
batch_size = 32  # number of samples per mini-batch
epochs = 10000
log_epochs = 100  # print the loss every `log_epochs` epochs

for epoch in range(epochs):

    # Running sum of per-sample losses for this epoch. Each batch loss is a mean
    # over its batch, so it is weighted by the batch length before being added;
    # dividing by len(y_train) below then gives the average loss per sample.
    total_loss = 0.0
    for batch_ixs in utils.gen_batches(len(y_train), batch_size):
        features = scaled_train_features[batch_ixs]
        target = y_train[batch_ixs]

        # forward pass
        predictions = model_fn(features)
        loss = loss_fn(target, predictions)
        total_loss += loss * len(target)

        # backward pass: step against the gradient, updating the array in place
        grad = grad_fn(features, target, predictions)
        learned_parameters -= grad * learning_rate

    if epoch % log_epochs == 0:
        print(f'Epoch {epoch}  Loss {total_loss / len(y_train):.4f}')

Epoch 0  Loss 2304236.0939
Epoch 100  Loss 105573.2038
Epoch 200  Loss 100799.6307
Epoch 300  Loss 100714.1226
Epoch 400  Loss 100679.1791
Epoch 500  Loss 100657.9347
Epoch 600  Loss 100641.2264
Epoch 700  Loss 100626.2885
Epoch 800  Loss 100612.2574
Epoch 900  Loss 100598.8364
Epoch 1000  Loss 100585.9073
Epoch 1100  Loss 100573.4134
Epoch 1200  Loss 100561.3219
Epoch 1300  Loss 100549.6103
Epoch 1400  Loss 100538.2611
Epoch 1500  Loss 100527.2596
Epoch 1600  Loss 100516.5926
Epoch 1700  Loss 100506.2481
Epoch 1800  Loss 100496.2147
Epoch 1900  Loss 100486.4818
Epoch 2000  Loss 100477.0389
Epoch 2100  Loss 100467.8764
Epoch 2200  Loss 100458.9847
Epoch 2300  Loss 100450.3549
Epoch 2400  Loss 100441.9782
Epoch 2500  Loss 100433.8463
Epoch 2600  Loss 100425.9510
Epoch 2700  Loss 100418.2848
Epoch 2800  Loss 100410.8400
Epoch 2900  Loss 100403.6096
Epoch 3000  Loss 100396.5866
Epoch 3100  Loss 100389.7644
Epoch 3200  Loss 100383.1365
Epoch 3300  Loss 100376.6968
Epoch 3400  Loss 100370.4

In [None]:
# Evaluate the trained parameters on the full training set.
predictions = model_fn(scaled_train_features)

# loss_fn is declared as loss_fn(y, y_hat): targets first, predictions second.
training_loss = loss_fn(y_train, predictions)
print(f"Training loss: {training_loss } ")




Training loss: 3168723.6826591124 


In [None]:
# Evaluate the trained parameters on the held-out test set.
predictions = model_fn(scaled_test_features)
# loss_fn is declared as loss_fn(y, y_hat): targets first, predictions second.
test_loss = loss_fn(y_test, predictions)
print(f"Test loss: {test_loss } ")

Test loss: 2949208.588867945 
