In [52]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [53]:
df=pd.read_csv("/content/bikeshare.csv")

In [54]:
## Data cleaning

In [55]:
df.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


In [56]:
df.shape

(10886, 12)

In [57]:
df.isnull().sum()

datetime      0
season        0
holiday       0
workingday    0
weather       0
temp          0
atemp         0
humidity      0
windspeed     0
casual        0
registered    0
count         0
dtype: int64

In [58]:
df.dtypes

datetime       object
season          int64
holiday         int64
workingday      int64
weather         int64
temp          float64
atemp         float64
humidity        int64
windspeed     float64
casual          int64
registered      int64
count           int64
dtype: object

In [59]:
# Some preprocessing:
# 1 - Rename "count" to "riders"
# 2 - Drop "casual" and "registered"
# 3 - Cast "datetime" to actually be a datetime
# 4 - Drop only row where weather == 4

df.rename(columns= {"count":"riders"},inplace=True)

In [60]:
df=df[df['weather'] !=4]

In [61]:
df.drop(columns=["casual","registered"],inplace=True)


In [62]:
df.drop(columns=["datetime"],inplace=True)

In [63]:
df.shape

(10885, 9)

In [64]:
df.head()

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,riders
0,1,0,0,1,9.84,14.395,81,0.0,16
1,1,0,0,1,9.02,13.635,80,0.0,40
2,1,0,0,1,9.02,13.635,80,0.0,32
3,1,0,0,1,9.84,14.395,75,0.0,13
4,1,0,0,1,9.84,14.395,75,0.0,1


In [65]:
df.dtypes

season          int64
holiday         int64
workingday      int64
weather         int64
temp          float64
atemp         float64
humidity        int64
windspeed     float64
riders          int64
dtype: object

In [66]:
df['temp'] = df['temp'].astype(int)

In [67]:
df['atemp'] = df['atemp'].astype(int)

In [68]:
df['windspeed'] = df['windspeed'].astype(int)

In [69]:
df.dtypes

season        int64
holiday       int64
workingday    int64
weather       int64
temp          int64
atemp         int64
humidity      int64
windspeed     int64
riders        int64
dtype: object

In [70]:
#####   Multiple linear regression   ###

In [71]:
from sklearn.preprocessing import PolynomialFeatures

# Create X and y.
X = df.drop('riders', axis=1)
y = df['riders']

# Instantiate our PolynomialFeatures object to create all two-way terms.
# Write the code
poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)

# Fit and transform our X data.
X_overfit=poly.fit_transform(X)

In [72]:
# Check out the dimensions of X_overfit.
X_overfit.shape

(10885, 44)

In [73]:
###Let's split our data up into training and testing sets. Why do we split our data into training and testing sets

In [74]:
# Import train_test_split.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [75]:
# Create train/test splits.
X_train, X_test, y_train, y_test = train_test_split(
    X_overfit,
    y,
    test_size=0.7,
    random_state=42
)

In [76]:
# Scale our data.
# Relabeling scaled data as "Z" is common.
# write the code

sc = StandardScaler()
Z_train= sc.fit_transform(X_train)
Z_test= sc.transform(X_test)

In [77]:
print(f'Z_train shape is: {Z_train.shape}')
print(f'y_train shape is: {y_train.shape}')
print(f'Z_test shape is: {Z_test.shape}')
print(f'y_test shape is: {y_test.shape}')

Z_train shape is: (3265, 44)
y_train shape is: (3265,)
Z_test shape is: (7620, 44)
y_test shape is: (7620,)


In [78]:
# Import the appropriate library and fit our OLS model.

from sklearn.linear_model import LinearRegression

In [79]:
ols = LinearRegression()
ols.fit(Z_train,y_train)

In [80]:
# How does the model score on the training and test data?
print(ols.score(Z_train,y_train))
print(ols.score(Z_test,y_test))

0.304591828340929
0.3003678253764477


In [81]:
ols.coef_

array([-3.70515973e+01, -1.46180938e+13, -1.14268168e+15,  1.70127451e+01,
        5.30514056e+02, -3.80353105e+02, -2.89738670e+01, -3.88935736e+01,
        1.12394991e+02,  1.08998296e+00, -4.03414557e+00,  3.30253760e+00,
       -5.78227327e+01,  5.77452405e+01, -6.40624357e+01, -9.33968380e+00,
        1.46180938e+13, -1.13213835e+13, -7.09631448e+00, -7.15823972e+01,
        6.82773337e+01,  1.14180202e+01, -3.05047698e+00,  1.14268168e+15,
       -2.06842200e+01, -9.67116726e+01,  7.37296694e+01,  5.34251356e+01,
        1.13914556e+01, -3.98201833e+01,  3.82028117e+01, -1.50272508e+01,
        4.38626658e+01, -2.53321533e+01, -1.83629395e+02, -1.73443115e+02,
       -1.90741211e+02,  8.54827881e+00,  3.09010742e+02,  1.20692871e+02,
        4.26531982e+01, -5.21899414e+00,  1.99599533e+01,  1.05419922e+00])

In [82]:
#####  implementation of ridge

In [83]:
# Ridge regressor lives here:
from sklearn.linear_model import Ridge

In [84]:
# Instantiate.
ridge_model = Ridge(alpha=10)

In [85]:
# Fit.
ridge_model.fit(Z_train,y_train)

In [86]:

# Evaluate model using R2.
print(ridge_model.score(Z_train,y_train))
print(ridge_model.score(Z_test,y_test))

0.30089381200353194
0.2989580700263198


In [87]:
from sklearn.linear_model import RidgeCV

In [88]:
# Set up a list of ridge alphas to check.
# np.logspace generates 100 values equally between 0 and 5,
# then converts them to alphas between 10^0 and 10^5.
r_alpha = np.logspace(0,5,100)

# Cross-validate over our list of ridge alphas.
ridge_cv = RidgeCV(alphas = r_alpha, scoring= 'r2', cv = 5)

# Fit model using best ridge alpha!
ridge_cv = ridge_cv.fit(Z_train,y_train)

In [89]:
# Here is the optimal value of alpha
ridge_cv

In [90]:
print(ridge_cv.score(Z_train,y_train))
print(ridge_cv.score(Z_test,y_test))

0.30347025922687076
0.30138034059408747


In [91]:
# Imports similar to Ridge
from sklearn.linear_model import Lasso, LassoCV

In [92]:
# Reminders
print(" OLS ".center(18, "="))
print(ols.score(Z_train, y_train))
print(ols.score(Z_test, y_test))
print()
print(" Ridge ".center(18, "="))
print(ridge_cv.score(Z_train, y_train))
print(ridge_cv.score(Z_test, y_test))

0.304591828340929
0.3003678253764477

0.30347025922687076
0.30138034059408747


In [93]:
# Set up a list of Lasso alphas to check.
l_alphas = np.logspace(-3,1,100)

# Cross-validate over our list of Lasso alphas.
lasso_cv = LassoCV (alphas=l_alphas, cv = 5, max_iter=500)

# Fit model using best ridge alpha!
lasso_cv.fit(Z_train,y_train)

  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descen

In [94]:
# Here is the optimal value of alpha
lasso_cv.alpha_

0.07220809018385467

In [95]:
print(" Ridge ".center(18, "="))
print(ridge_cv.score(Z_train, y_train))
print(ridge_cv.score(Z_test, y_test))
print(" Lasso ".center(18, "="))
print(lasso_cv.score(Z_train, y_train))
print(lasso_cv.score(Z_test, y_test))

0.30347025922687076
0.30138034059408747
0.30203410331086566
0.3003654940364858


In [96]:
ridge_cv.coef_

array([ -30.30741968,    0.9395517 ,   -5.62413977,   17.03318211,
        228.42043078,  -77.6643187 ,  -19.12688842,  -10.83273888,
        109.65950612,    1.65703591,   -4.71486162,    3.20916682,
        -16.97627054,   14.44463199,  -62.66080347,  -11.55172561,
          0.9395517 ,    0.        ,   -6.0137921 ,  -38.78199907,
         33.26175394,   11.14297498,   -3.52947821,   -5.62413977,
        -20.2645492 ,  -37.8081998 ,   11.97141942,   53.73174201,
         10.1516349 ,  -39.93569724,   16.21162345,    5.84455349,
         44.1388824 ,  -25.20125314, -102.97301315,  -88.7788722 ,
        -75.91791914,   64.89224898,  148.70212671,    6.51312831,
        -28.98619898,  -10.29265719,   16.48623084,   -3.93650461])

In [97]:
lasso_cv.coef_

array([-20.87265927,   1.47405708,  -0.        ,  12.32223013,
       151.82133652,  -0.        , -11.11829339,  -1.13628328,
       101.3552291 ,   1.87014905,  -4.71135079,   2.49499293,
        -2.18577472,  -0.        , -62.11014205, -12.43663727,
         0.65167368,   0.        ,  -4.73585304,  -6.52226227,
         0.        ,  11.0483082 ,  -4.10380629,  -6.22271356,
       -18.83506056, -15.6975245 , -11.3413784 ,  50.18172919,
         7.86744227, -33.38592217,  10.47803091,  10.45388273,
        41.34131062, -23.7426554 , -93.6689513 ,  -4.16174614,
       -64.04611379,  63.38883998,  53.03161929,  -3.41423447,
       -31.15567746, -15.06841649,  12.97720184,  -4.89408027])

#end