# Key Metrics

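Key metrics for a multiple linear regression fitted to daily bridge cyclist counts (April 2017 data file): MSE, RMSE, MAE and R-squared.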

## Imports

In [2]:
import numpy as np
import pandas as pd

## Load the data

In [3]:
# CSV with the daily bike counts; thousands=',' tells pandas to read
# comma-grouped numbers (e.g. "1,234") as plain numeric values.
FILE_NAME = '04_2017_bike_data.csv'
bike = pd.read_csv(
    FILE_NAME,
    index_col=None,
    thousands=',',
)
# Summarise column names, non-null counts and dtypes of the loaded frame.
bike.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   date                 30 non-null     object 
 1   day                  30 non-null     object 
 2   temp_high            30 non-null     float64
 3   temp_low             30 non-null     float64
 4   rain_amt             30 non-null     object 
 5   brooklyn_bridge      30 non-null     int64  
 6   manhattan_bridge     30 non-null     int64  
 7   williamsburg_bridge  30 non-null     int64  
 8   queensboro_bridge    30 non-null     int64  
 9   total_cyclists       30 non-null     int64  
 10  Unnamed: 10          1 non-null      object 
dtypes: float64(2), int64(5), object(4)
memory usage: 2.7+ KB


## Convert string to numerical data

In [4]:
# rain_amt is loaded as text. '0' is mapped to 0.0 and 'T' (a trace amount
# of rain) to a token 0.001, then any commas inside remaining string values
# are stripped so the whole column can be cast to float.
bike['rain_amt'] = (
    bike['rain_amt']
    .replace('0', 0.0)
    .replace('T', 0.001)
    .replace(',', '', regex=True)
    .astype(float)
)

# Summary statistics of the numeric columns, now including rain_amt.
bike.describe()

Unnamed: 0,temp_high,temp_low,rain_amt,brooklyn_bridge,manhattan_bridge,williamsburg_bridge,queensboro_bridge,total_cyclists
count,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0
mean,63.776667,50.686667,0.128167,2250.1,4353.9,4942.266667,3482.8,15029.066667
std,10.646796,7.243391,0.293974,980.301198,1692.501767,1733.685449,1146.582971,5517.675272
min,46.0,37.0,0.0,461.0,1324.0,1739.0,1372.0,4896.0
25%,55.9,46.0,0.0,1461.25,3147.25,3776.0,2709.5,11088.0
50%,64.0,50.0,0.0005,2422.5,4608.5,5141.5,3560.5,15783.5
75%,67.5,54.75,0.0525,2954.75,5545.25,6040.25,4263.0,18927.5
max,84.9,64.0,1.18,3887.0,7247.0,8079.0,5501.0,24714.0


## Convert the date column to datetime

In [5]:
from datetime import datetime  # not needed by this cell any more; kept so the name stays available elsewhere

# The data file name carries the '04_2017' prefix, so the year is pinned to
# 2017 rather than taken from the clock. Using the current year made the
# parsed dates change from one calendar year to the next and (presumably)
# stop matching the weekday recorded in the 'day' column.
DATA_YEAR = 2017

# The raw 'date' values have no year, so one is appended before parsing.
# Convert only while the column still holds the raw text: this keeps the cell
# safe to re-run once 'date' is already datetime64 (text concatenation on a
# datetime column would raise).
if not pd.api.types.is_datetime64_any_dtype(bike['date']):
    bike['date'] = pd.to_datetime(bike['date'] + ' ' + str(DATA_YEAR))

## Standardization - centering and scaling

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Target column: a copy of total_cyclists stored under the name used as y below.
bike['total_cyclists_pred'] = bike['total_cyclists']

numerical_features = ['temp_high', 'temp_low', 'rain_amt']

# Prepare the features (X - input variables) and the target (y) variable.
X = bike[numerical_features]
y = bike['total_cyclists_pred']

# 90-10 split into training and testing sets; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=123)

# Learn the centering/scaling parameters from the training rows only, so no
# information from the held-out rows leaks into the transformation.
scaler = StandardScaler()
scaler.fit(X_train[numerical_features])

# Build fresh frames instead of assigning into X_train with .loc: X_train is
# derived from a slice of `bike`, and writing into it in place triggers
# pandas' SettingWithCopy / copy-on-write pitfalls.
X_train = pd.DataFrame(
    scaler.transform(X_train[numerical_features]),
    index=X_train.index,
    columns=numerical_features,
)

# Apply the same (train-fitted) scaling to the test features so that any
# prediction made on X_test sees inputs on the scale the model was trained on.
X_test = pd.DataFrame(
    scaler.transform(X_test[numerical_features]),
    index=X_test.index,
    columns=numerical_features,
)

X_train[numerical_features].head()

Unnamed: 0,temp_high,temp_low,rain_amt
26,0.322301,1.110996,-0.413021
8,0.132091,-0.914843,-0.413021
27,1.739371,0.951823,-0.413021
12,-0.05812,-0.350502,-0.413021
21,-0.828473,0.098076,-0.019226


## Multiple Linear Regression (MLR)

In [7]:
from sklearn.linear_model import LinearRegression

# Fit ordinary least squares on the training features; fit() returns the
# estimator itself, so construction and fitting are chained.
ml_reg = LinearRegression().fit(X_train, y_train)

# In-sample predictions (on the training rows) used by the metric cells below.
y_pred_ml_reg = ml_reg.predict(X_train)

# Coefficients labelled by feature name, largest first, rounded to 2 decimals.
(
    pd.Series(ml_reg.coef_, index=X_train.columns)
    .sort_values(ascending=False)
    .round(2)
)

temp_high    3013.27
temp_low      326.02
rain_amt    -1990.62
dtype: float64

## Mean squared error (MSE)

In [14]:
from sklearn.metrics import mean_squared_error

# Mean squared error of the regression on the training set
# (arguments are passed positionally as y_true, y_pred).
mse_ml_reg = mean_squared_error(y_train, y_pred_ml_reg)

# Reported in millions to keep the number readable.
print('MSE: {:0.2f}M'.format(mse_ml_reg / 1e6))

MSE: 8.59M


In [15]:
# Baseline: a null model that predicts the training mean for every row.
# The regression's MSE should be lower than this value.
y_pred_null_model = np.repeat(y_train.mean(), len(y_train))
mse_null_model = mean_squared_error(y_true=y_train, y_pred=y_pred_null_model)
print('MSE Null Model: {:0.2f}M'.format(mse_null_model / 1e6))

MSE Null Model: 28.13M


## Root Mean Squared Error (RMSE)

In [16]:
# Square root of the training MSE, which brings the error back to the
# units of the target (cyclist counts).
rmse = np.sqrt(mse_ml_reg)
print('RMSE: {:0.2f}'.format(rmse))

RMSE: 2930.31


## Mean Absolute Error (MAE)

In [17]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error of the regression on the training set
# (arguments are passed positionally as y_true, y_pred).
mae = mean_absolute_error(y_train, y_pred_ml_reg)
print('MAE: {:0.2f}'.format(mae))

MAE: 2428.13


## R-Squared

In [18]:
from sklearn.metrics import r2_score

# Coefficient of determination of the regression on the training set
# (arguments are passed positionally as y_true, y_pred).
r2 = r2_score(y_train, y_pred_ml_reg)
print('R-Squared: {:0.2f}'.format(r2))

R-Squared: 0.69
