In [3]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error,r2_score

In [4]:
# Load the trip table from a CSV file; the path is relative, so it resolves
# against the notebook's current working directory.
tripdata = pd.read_csv("../../dataset/final_bike_sharing.csv")

In [5]:
# Show the column labels of the loaded frame.
tripdata.columns

Index(['tripduration', 'start_station_id', 'start_station_name', 'start_lat',
       'start_lon', 'end_station_id', 'end_station_name', 'end_lat', 'end_lon',
       'bikeid', 'usertype', 'hour', 'min', 'temp', 'feelslike', 'dew',
       'humidity', 'precip', 'precipprob', 'snow', 'snowdepth', 'windspeed',
       'winddir', 'sealevelpressure', 'cloudcover', 'visibility',
       'solarradiation', 'uvindex', 'conditions', 'date', 'dist', 'birthyear',
       'years_old', 'holiday', 'day', 'month', 'seasons', 'gender'],
      dtype='object')

In [6]:
# Show (rows, columns) of the loaded frame.
tripdata.shape

(349504, 38)

In [7]:
# Columns kept for the regression, in the order they will appear in the
# working frame.
cols = [
    'tripduration',
    'dist',
    'hour',
    'birthyear',
    'temp',
    'snowdepth',
    'usertype',
    'gender',
    'cloudcover',
    'holiday',
    'feelslike',
]

In [8]:
# Restrict the loaded frame to the columns named in `cols`.
df = tripdata[cols]

In [9]:
# Display the column-restricted frame as the cell output.
df

Unnamed: 0,tripduration,dist,hour,birthyear,temp,snowdepth,usertype,gender,cloudcover,holiday,feelslike
0,15.53,1.084267,2,1992,-11.7,0.0,Subscriber,male,0.0,holiday,-16.5
1,9.17,1.084267,12,1969,-7.3,0.0,Subscriber,female,0.2,holiday,-12.5
2,5.90,0.415696,14,1994,-5.2,0.0,Subscriber,male,0.2,holiday,-5.2
3,4.17,0.240932,17,1991,-5.8,0.0,Subscriber,male,0.2,holiday,-10.6
4,10.22,1.217917,22,1982,-7.2,0.0,Subscriber,male,0.2,holiday,-15.0
...,...,...,...,...,...,...,...,...,...,...,...
349499,18.02,1.872980,11,1993,7.1,0.0,Subscriber,male,94.9,working_day,5.2
349500,5.73,0.828647,21,1983,2.8,0.0,Subscriber,female,0.8,holiday,2.8
349501,20.55,2.657139,12,1988,11.5,0.0,Subscriber,male,29.8,working_day,11.5
349502,17.62,2.315132,15,1991,3.8,0.0,Subscriber,female,99.6,working_day,1.9


In [10]:
# Hold out 20% of the rows; the fixed random_state makes the split repeatable.
train, test = train_test_split(df, test_size=0.2, random_state=42)
# Report both partition shapes, one per line.
print(train.shape, test.shape, sep="\n")

(279603, 11)
(69901, 11)


In [11]:
# Separate the response column from the predictors of the training partition.
y_train = train['tripduration']
X_train = train.drop(columns='tripduration')

In [13]:
from dython.nominal import associations, identify_nominal_columns

# Ask dython which predictor columns it classifies as nominal; the resulting
# list is shown as the cell output and drives the one-hot encoding step.
categorical_features = identify_nominal_columns(X_train)
categorical_features

['usertype', 'gender', 'holiday']

In [14]:
# One-hot encode the nominal predictors so every feature is numeric.
# NOTE(review): every category level gets its own indicator column (no
# drop_first), so each group of indicators sums to 1 and is collinear with the
# model intercept. Predictions are unaffected, but individual indicator
# coefficients are not uniquely determined; consider drop_first=True if the
# coefficients are to be interpreted or compared with reference-level coding.
one_hot_encoded_data_train = pd.get_dummies(X_train, columns=categorical_features)
X_train = one_hot_encoded_data_train
# Preview goes last: only a cell's final expression is rendered, so the
# original mid-cell .head(3) was computed and silently discarded.
X_train.head(3)

In [15]:
# Ordinary least squares with an intercept term, fitted on the training data.
model_lr = LinearRegression(fit_intercept=True)
model_lr.fit(X_train, y_train)

# Report the fitted parameters; coefficients follow the column order of X_train.
print("Intercept is", model_lr.intercept_)
print("coeffients are ", model_lr.coef_)

Intercept is 75.28952240449448
coeffients are  [ 3.68334612e+00  6.84945829e-02 -3.19993325e-02  1.57966478e-01
  3.71470094e-01 -2.31540901e-03 -7.02164807e-02  8.20879599e+00
 -8.20879599e+00  2.91300508e-02 -8.37122503e-01  8.07992452e-01
  8.83180088e-01 -8.83180088e-01]


In [16]:
# In-sample fit quality: predictions are made on the same X_train that is
# scored against y_train here, so these are training-set metrics.
y_train_pred = model_lr.predict(X_train)

# Take the square root of the MSE explicitly. The `squared=False` keyword was
# deprecated in scikit-learn 1.4 and removed in 1.6, where it raises TypeError;
# np.sqrt gives the same value on every version.
rmse = float(np.sqrt(mean_squared_error(y_train, y_train_pred)))
print("RMSE is of linear regression is = ", rmse)
print("R2 is of linear regression is = ", r2_score(y_train, y_train_pred))

RMSE is of linear regression is =  9.799220079803547
R2 is of linear regression is =  0.1989518667349991


### Python version 
1. Split 80% and 20%: (279603, 11) (69901, 11)
2. Intercept is 75.28952240449448
3. Coefficients are  [ 3.68334612e+00  6.84945829e-02 -3.19993325e-02  1.57966478e-01
  3.71470094e-01 -2.31540901e-03 -7.02164807e-02  8.20879599e+00
 -8.20879599e+00  2.91300508e-02 -8.37122503e-01  8.07992452e-01
  8.83180088e-01 -8.83180088e-01]
4. RMSE of linear regression =  9.799220079803547
5. R2 of linear regression =  0.1989518667349991

### R Version
1. Split 80% and 20%, showing (282691, 11) (70076, 11)

Call:
lm(formula = tripduration ~ dist + hour + birthyear + temp + 
    snowdepth + factor(usertype) + factor(gender) + cloudcover + 
    factor(holiday) + feelslike, data = train)

Residuals:
    Min      1Q  Median      3Q     Max 
-27.698  -3.168  -1.846   0.165 175.174 

Coefficients:
                             Estimate Std. Error  t value Pr(>|t|)    
(Intercept)                 81.747024   4.152722   19.685  < 2e-16 ***
dist                         3.524912   0.036018   97.865  < 2e-16 ***
hour                         0.070350   0.003829   18.371  < 2e-16 ***
birthyear                   -0.029945   0.002092  -14.315  < 2e-16 ***
temp                         0.169994   0.020395    8.335  < 2e-16 ***
snowdepth                    0.370567   0.068815    5.385 7.25e-08 ***
factor(usertype)Subscriber -17.576692   0.097099 -181.019  < 2e-16 ***
factor(gender)male          -0.899655   0.048298  -18.627  < 2e-16 ***
factor(gender)unknown        0.944763   0.107577    8.782  < 2e-16 ***
cloudcover                  -0.002193   0.000445   -4.928 8.30e-07 ***
factor(holiday)working_day  -1.932395   0.167329  -11.548  < 2e-16 ***
feelslike                   -0.078067   0.017495   -4.462 8.11e-06 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 10.3 on 280054 degrees of freedom
Multiple R-squared:  0.2012,	Adjusted R-squared:  0.2011 
F-statistic:  6411 on 11 and 280054 DF,  p-value: < 2.2e-16