In [1]:
import pandas as pd
import seaborn as sns
import statsmodels.formula.api as smf
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split
import numpy as np

# allow plots to appear directly in the notebook
%matplotlib inline

In [2]:
df = pd.read_csv('46A_1.csv')
df.head(2)

Unnamed: 0.1,Unnamed: 0,tripid,dt,dayofweek,month,day,arrive_time,rush_hour,progrnum,stop_id,...,pressure,humidity,wind_speed,wind_deg,rain_1h,clouds_all,weather_id,weather_main,weather_description,weather_icon
0,0,6243537,1517674861,5,2,3,58861,1,1,808,...,1011.0,81.0,8.75,330.0,,75.0,803.0,Clouds,broken clouds,04d
1,1,6243537,1517674904,5,2,3,58904,1,2,809,...,1011.0,81.0,8.75,330.0,,75.0,803.0,Clouds,broken clouds,04d


In [3]:
df.isna().sum()

Unnamed: 0                  0
tripid                      0
dt                          0
dayofweek                   0
month                       0
day                         0
arrive_time                 0
rush_hour                   0
progrnum                    0
stop_id                     0
cum_duration                0
dt_iso                      0
timezone                    0
temp                        0
feels_like                  0
temp_min                    0
temp_max                    0
pressure                    0
humidity                    0
wind_speed                  0
wind_deg                    0
rain_1h                166344
clouds_all                  0
weather_id                  0
weather_main                0
weather_description         0
weather_icon                0
dtype: int64

In [4]:
test = df.drop(columns=['rain_1h','Unnamed: 0','weather_icon','dt_iso','weather_description'])

df_rev1 = test.copy()
df_rev1 = pd.get_dummies(df_rev1)

In [5]:
print(df_rev1.shape)

(166344, 28)
tripid                    int64
dt                        int64
dayofweek                 int64
month                     int64
day                       int64
arrive_time               int64
rush_hour                 int64
progrnum                  int64
stop_id                   int64
cum_duration              int64
timezone                float64
temp                    float64
feels_like              float64
temp_min                float64
temp_max                float64
pressure                float64
humidity                float64
wind_speed              float64
wind_deg                float64
clouds_all              float64
weather_id              float64
weather_main_Clear        uint8
weather_main_Clouds       uint8
weather_main_Drizzle      uint8
weather_main_Fog          uint8
weather_main_Mist         uint8
weather_main_Rain         uint8
weather_main_Snow         uint8
dtype: object


In [7]:
text = 'cum_duration ~ '

for feature in df_rev1:
    text += feature
    text += '+'
    
text

'cum_duration ~ tripid+dt+dayofweek+month+day+arrive_time+rush_hour+progrnum+stop_id+cum_duration+timezone+temp+feels_like+temp_min+temp_max+pressure+humidity+wind_speed+wind_deg+clouds_all+weather_id+weather_main_Clear+weather_main_Clouds+weather_main_Drizzle+weather_main_Fog+weather_main_Mist+weather_main_Rain+weather_main_Snow+'

In [8]:
text1 = 'cum_duration ~ tripid+dt+dayofweek+month+day+arrive_time+rush_hour+progrnum+stop_id+timezone+temp+feels_like+temp_min+temp_max+pressure+humidity+wind_speed+wind_deg+clouds_all+weather_id+weather_main_Clear+weather_main_Clouds+weather_main_Drizzle+weather_main_Fog+weather_main_Mist+weather_main_Rain+weather_main_Snow'

**P-values:** In statistical hypothesis testing, the p-value or probability value is the best probability of obtaining test results at least as extreme as the results actually observed, assuming that the null hypothesis is correct. The range of P-value should be in (0,1). If the P-value of a feature is less than 0.05, we can say this feature can influence the target feature.

**R squared：**is the proportion of the variance in the dependent variable that is predictable from the independent variable. Larger R squared, better performance.


In [9]:
lm1 = smf.ols(formula=text1 , data=df_rev1).fit()

lm1.pvalues

Intercept                2.421469e-19
tripid                   5.356021e-55
dt                       1.694157e-61
dayofweek                0.000000e+00
month                    2.568603e-19
day                      1.898625e-84
arrive_time              0.000000e+00
rush_hour                0.000000e+00
progrnum                 0.000000e+00
stop_id                  1.275115e-31
timezone                 6.576778e-82
temp                     8.423479e-86
feels_like               2.918784e-94
temp_min                 3.960626e-16
temp_max                 1.825015e-16
pressure                 1.732848e-94
humidity                 9.303349e-60
wind_speed              4.443053e-101
wind_deg                 7.923072e-15
clouds_all               3.438069e-21
weather_id               2.906149e-49
weather_main_Clear       3.597387e-38
weather_main_Clouds      8.200053e-58
weather_main_Drizzle     5.789147e-43
weather_main_Fog         8.679650e-27
weather_main_Mist        1.562449e-06
weather_main

In [10]:
lm1.summary()

0,1,2,3
Dep. Variable:,cum_duration,R-squared:,0.85
Model:,OLS,Adj. R-squared:,0.85
Method:,Least Squares,F-statistic:,39230.0
Date:,"Tue, 23 Jun 2020",Prob (F-statistic):,0.0
Time:,15:03:44,Log-Likelihood:,-1281400.0
No. Observations:,166344,AIC:,2563000.0
Df Residuals:,166319,BIC:,2563000.0
Df Model:,24,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-0.6744,0.075,-8.993,0.000,-0.821,-0.527
tripid,0.0008,4.94e-05,15.625,0.000,0.001,0.001
dt,-4.593e-06,2.77e-07,-16.553,0.000,-5.14e-06,-4.05e-06
dayofweek,-34.8258,0.823,-42.295,0.000,-36.440,-33.212
month,380.4176,42.330,8.987,0.000,297.451,463.384
day,9.6902,0.497,19.483,0.000,8.715,10.665
arrive_time,-0.0095,8.4e-05,-112.572,0.000,-0.010,-0.009
rush_hour,226.2036,2.937,77.025,0.000,220.448,231.960
progrnum,75.8692,0.089,848.354,0.000,75.694,76.045

0,1,2,3
Omnibus:,10379.127,Durbin-Watson:,0.057
Prob(Omnibus):,0.0,Jarque-Bera (JB):,23898.31
Skew:,0.4,Prob(JB):,0.0
Kurtosis:,4.676,Cond. No.,1.28e+18


In [11]:
test2 = df.drop(columns=['rain_1h','Unnamed: 0','weather_icon','tripid','dt','dt_iso','timezone','weather_description'])


In [13]:
df_rev2 = test2.copy()
df_rev2 = pd.get_dummies(df_rev2)

print(df_rev2.shape)
print(df_rev2.dtypes)

(166344, 25)
dayofweek                 int64
month                     int64
day                       int64
arrive_time               int64
rush_hour                 int64
progrnum                  int64
stop_id                   int64
cum_duration              int64
temp                    float64
feels_like              float64
temp_min                float64
temp_max                float64
pressure                float64
humidity                float64
wind_speed              float64
wind_deg                float64
clouds_all              float64
weather_id              float64
weather_main_Clear        uint8
weather_main_Clouds       uint8
weather_main_Drizzle      uint8
weather_main_Fog          uint8
weather_main_Mist         uint8
weather_main_Rain         uint8
weather_main_Snow         uint8
dtype: object


In [14]:
lm2 = smf.ols(formula='cum_duration ~ dayofweek+month+day+arrive_time+rush_hour+progrnum+stop_id+temp+feels_like+temp_min+temp_max+pressure+humidity+wind_speed+wind_deg+clouds_all+weather_id+weather_main_Clear+weather_main_Clouds+weather_main_Drizzle+weather_main_Fog+weather_main_Mist+weather_main_Rain+weather_main_Snow', data=df_rev1).fit()


lm2.pvalues

Intercept               2.683769e-17
dayofweek               0.000000e+00
month                   3.375402e-40
day                     0.000000e+00
arrive_time             0.000000e+00
rush_hour               0.000000e+00
progrnum                0.000000e+00
stop_id                 1.389378e-31
temp                    4.569805e-76
feels_like              8.538995e-81
temp_min                1.804988e-10
temp_max                1.622575e-20
pressure                7.965360e-70
humidity                1.933014e-50
wind_speed              3.728509e-87
wind_deg                2.617339e-21
clouds_all              5.929298e-19
weather_id              1.664416e-46
weather_main_Clear      9.494183e-32
weather_main_Clouds     6.354286e-43
weather_main_Drizzle    2.812297e-41
weather_main_Fog        6.697545e-23
weather_main_Mist       5.005382e-13
weather_main_Rain       5.408105e-12
weather_main_Snow       6.652910e-17
dtype: float64

In [15]:
lm2.summary()

0,1,2,3
Dep. Variable:,cum_duration,R-squared:,0.85
Model:,OLS,Adj. R-squared:,0.85
Method:,Least Squares,F-statistic:,40870.0
Date:,"Tue, 23 Jun 2020",Prob (F-statistic):,0.0
Time:,15:04:32,Log-Likelihood:,-1281500.0
No. Observations:,166344,AIC:,2563000.0
Df Residuals:,166320,BIC:,2563000.0
Df Model:,23,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-2372.9763,280.477,-8.460,0.000,-2922.706,-1823.247
dayofweek,-39.0170,0.779,-50.083,0.000,-40.544,-37.490
month,543.0751,40.909,13.275,0.000,462.895,623.255
day,16.7853,0.182,92.152,0.000,16.428,17.142
arrive_time,-0.0094,8.4e-05,-112.043,0.000,-0.010,-0.009
rush_hour,225.8196,2.939,76.841,0.000,220.060,231.580
progrnum,75.8674,0.089,847.715,0.000,75.692,76.043
stop_id,-0.0119,0.001,-11.695,0.000,-0.014,-0.010
temp,505.6132,27.380,18.467,0.000,451.949,559.277

0,1,2,3
Omnibus:,10555.451,Durbin-Watson:,0.056
Prob(Omnibus):,0.0,Jarque-Bera (JB):,24457.414
Skew:,0.404,Prob(JB):,0.0
Kurtosis:,4.696,Cond. No.,3.16e+18


In [16]:
df_c = pd.read_csv('46A_1_calt.csv')
df_c.head(2)

Unnamed: 0.1,Unnamed: 0,tripid,dt,dayofweek,month,day,hour,rush_hour,progrnum,stop_id,cum_duration,temp,pressure,humidity,wind_speed,weather_main
0,0,6643312,1524522272,0,4,23,22,0,1,808,373,9.03,1012.0,81.0,8.2,Clouds
1,1,6643312,1524522645,0,4,23,22,0,2,809,408,8.99,1012.0,75.0,6.7,Clouds


In [17]:
test3 = df_c.drop(columns=['dt','Unnamed: 0'])

In [18]:
df_rev3 = test3.copy()
df_rev3 = pd.get_dummies(df_rev3)

print(df_rev3.shape)
#print(df_rev3.dtypes)

(458084, 20)


In [19]:
text = 'cum_duration ~ '

for feature in df_rev3:
    text += feature
    text += '+'
    
text

'cum_duration ~ tripid+dayofweek+month+day+hour+rush_hour+progrnum+stop_id+cum_duration+temp+pressure+humidity+wind_speed+weather_main_Clear+weather_main_Clouds+weather_main_Drizzle+weather_main_Fog+weather_main_Mist+weather_main_Rain+weather_main_Snow+'

In [20]:
text1 = 'cum_duration ~ tripid+dayofweek+month+day+hour+rush_hour+progrnum+stop_id+temp+pressure+humidity+wind_speed+weather_main_Clear+weather_main_Clouds+weather_main_Drizzle+weather_main_Fog+weather_main_Mist+weather_main_Rain+weather_main_Snow'

In [21]:
lm3 = smf.ols(formula=text1 , data=df_rev3).fit()

lm3.pvalues

Intercept                6.895768e-75
tripid                   0.000000e+00
dayofweek                0.000000e+00
month                    0.000000e+00
day                     4.517487e-116
hour                     0.000000e+00
rush_hour                0.000000e+00
progrnum                 0.000000e+00
stop_id                  1.865537e-81
temp                    1.021753e-157
pressure                5.849542e-179
humidity                 0.000000e+00
wind_speed              4.238639e-172
weather_main_Clear      1.998758e-101
weather_main_Clouds      7.004932e-50
weather_main_Drizzle     2.415602e-64
weather_main_Fog         4.719603e-48
weather_main_Mist        2.320478e-78
weather_main_Rain        1.020291e-40
weather_main_Snow        3.046553e-83
dtype: float64

In [23]:
lm3.summary()

0,1,2,3
Dep. Variable:,cum_duration,R-squared:,0.848
Model:,OLS,Adj. R-squared:,0.848
Method:,Least Squares,F-statistic:,142100.0
Date:,"Tue, 23 Jun 2020",Prob (F-statistic):,0.0
Time:,15:06:32,Log-Likelihood:,-3534500.0
No. Observations:,458084,AIC:,7069000.0
Df Residuals:,458065,BIC:,7069000.0
Df Model:,18,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-2118.5664,115.685,-18.313,0.000,-2345.305,-1891.828
tripid,0.0009,1.82e-05,50.506,0.000,0.001,0.001
dayofweek,-38.6976,0.433,-89.428,0.000,-39.546,-37.849
month,-126.5067,2.958,-42.771,0.000,-132.304,-120.710
day,-3.6499,0.159,-22.908,0.000,-3.962,-3.338
hour,-27.5965,0.177,-156.118,0.000,-27.943,-27.250
rush_hour,225.2270,1.774,126.962,0.000,221.750,228.704
progrnum,76.8664,0.055,1408.586,0.000,76.759,76.973
stop_id,-0.0119,0.001,-19.120,0.000,-0.013,-0.011

0,1,2,3
Omnibus:,19388.532,Durbin-Watson:,0.053
Prob(Omnibus):,0.0,Jarque-Bera (JB):,38548.658
Skew:,0.312,Prob(JB):,0.0
Kurtosis:,4.277,Cond. No.,5.63e+21


In [24]:
test4 = df_c.drop(columns=['dt','Unnamed: 0','tripid','stop_id'])

In [25]:
df_rev4 = test4.copy()
df_rev4 = pd.get_dummies(df_rev4)

print(df_rev4.shape)


(458084, 18)


In [26]:
text = 'cum_duration ~ '

for feature in df_rev4:
    text += feature
    text += '+'
    
text

'cum_duration ~ dayofweek+month+day+hour+rush_hour+progrnum+cum_duration+temp+pressure+humidity+wind_speed+weather_main_Clear+weather_main_Clouds+weather_main_Drizzle+weather_main_Fog+weather_main_Mist+weather_main_Rain+weather_main_Snow+'

In [27]:
text1 = 'cum_duration ~ dayofweek+month+day+hour+rush_hour+progrnum+temp+pressure+humidity+wind_speed+weather_main_Clear+weather_main_Clouds+weather_main_Drizzle+weather_main_Fog+weather_main_Mist+weather_main_Rain+weather_main_Snow'


In [28]:
lm4 = smf.ols(formula=text1 , data=df_rev4).fit()

lm4.pvalues

Intercept                0.000000e+00
dayofweek                0.000000e+00
month                    1.643668e-22
day                     4.909519e-133
hour                     0.000000e+00
rush_hour                0.000000e+00
progrnum                 0.000000e+00
temp                    3.115915e-151
pressure                2.681005e-191
humidity                 0.000000e+00
wind_speed              1.906031e-186
weather_main_Clear       1.333296e-62
weather_main_Clouds      0.000000e+00
weather_main_Drizzle    2.093448e-289
weather_main_Fog        1.995164e-281
weather_main_Mist       5.653073e-156
weather_main_Rain        0.000000e+00
weather_main_Snow       1.848544e-184
dtype: float64

In [29]:
lm4.summary()

0,1,2,3
Dep. Variable:,cum_duration,R-squared:,0.847
Model:,OLS,Adj. R-squared:,0.847
Method:,Least Squares,F-statistic:,158600.0
Date:,"Tue, 23 Jun 2020",Prob (F-statistic):,0.0
Time:,15:28:36,Log-Likelihood:,-3536000.0
No. Observations:,458084,AIC:,7072000.0
Df Residuals:,458067,BIC:,7072000.0
Df Model:,16,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,2599.9461,68.287,38.074,0.000,2466.106,2733.786
dayofweek,-39.5767,0.434,-91.243,0.000,-40.427,-38.727
month,11.2170,1.149,9.762,0.000,8.965,13.469
day,2.5196,0.103,24.553,0.000,2.318,2.721
hour,-27.4334,0.177,-154.729,0.000,-27.781,-27.086
rush_hour,225.0522,1.780,126.462,0.000,221.564,228.540
progrnum,76.3720,0.048,1583.755,0.000,76.277,76.466
temp,8.6080,0.329,26.204,0.000,7.964,9.252
pressure,-2.1641,0.073,-29.516,0.000,-2.308,-2.020

0,1,2,3
Omnibus:,19569.846,Durbin-Watson:,0.052
Prob(Omnibus):,0.0,Jarque-Bera (JB):,39306.798
Skew:,0.312,Prob(JB):,0.0
Kurtosis:,4.293,Cond. No.,3.04e+17
