### Getting all the segments for the Inbound route 39A

In [1]:
from sqlalchemy import create_engine
import pandas as pd
engine = create_engine('postgresql+psycopg2://postgres:00001234@localhost:5433/jetaDb')

In [3]:
# Getting the list of segments and routes with the segments
df_39A_in_stops = pd.read_sql_query("select stopids from main_routes where routeid like '39A_40';",engine)
_39A_in_stops = df_39A_in_stops['stopids'].values.tolist()
_39A_in_segments = []
for i in range(len(_39A_in_stops[0])):
    if i < (len(_39A_in_stops[0])-1):
        _39A_in_segments.append(str(_39A_in_stops[0][i])+"_"+str(_39A_in_stops[0][i+1]))

In [4]:
len(_39A_in_segments)

72

In [5]:
print(_39A_in_segments)

['767_768', '768_769', '769_770', '770_771', '771_772', '772_773', '773_774', '774_775', '775_776', '776_777', '777_779', '779_780', '780_781', '781_782', '782_783', '783_784', '784_785', '785_786', '786_793', '793_7576', '7576_7586', '7586_7587', '7587_7588', '7588_328', '328_1443', '1443_1444', '1444_1445', '1445_1647', '1647_1648', '1648_1649', '1649_1911', '1911_1913', '1913_1914', '1914_1805', '1805_1806', '1806_1660', '1660_1661', '1661_1662', '1662_1664', '1664_1665', '1665_1666', '1666_1807', '1807_7167', '7167_1808', '1808_7389', '7389_7025', '7025_4464', '4464_1869', '1869_1870', '1870_1871', '1871_1872', '1872_1873', '1873_1874', '1874_1875', '1875_1876', '1876_1877', '1877_1878', '1878_1879', '1879_1899', '1899_6107', '6107_6108', '6108_6109', '6109_6110', '6110_7020', '7020_7029', '7029_7038', '7038_7011', '7011_2171', '2171_7160', '7160_7047', '7047_7161', '7161_7162']


In [6]:
# We can see that there in all a total of 72 segments present
# Now we will see if we have all the segments as our data files:
import os.path
fileNotFound = 0
for files in _39A_in_segments:
    fname="SegmentedSamples2_actual\\{}.csv".format(files)
    if not os.path.isfile(fname):
        print("File Not Found: {}".format(fname))
        fileNotFound+=1
if fileNotFound==0:
    print("All segment data present!")

All segment data present!


> All files are present for the segment <br>
We will now begin to make the predictive models

## Linear Regression

#### Here we will use scikit learn to show the performance of the linear regression. We will import the segments one by one and create a new dataframe that will hold our regression coeficients for each segment.

In [198]:
# Making the dataframe that will hold the coefficients for each segment
df_ols_coef = pd.DataFrame(columns=['Segment','intercept','arrivaltime','rain','dayofweek_Friday','dayofweek_Monday','dayofweek_Saturday','dayofweek_Sunday','dayofweek_Thursday','dayofweek_Tuesday'])
# Coefficients that hold the error values
df_error_coef = pd.DataFrame(columns=['Segment','R2','Mean Absolute Error','Mean Squared Error','Median Absolute Error'])

In [193]:
# Importing all the important packages that will be needed to run the model
from sklearn.model_selection import train_test_split
from sklearn import linear_model
# Model Evaluation Metrics
from sklearn.metrics import mean_squared_error, r2_score, median_absolute_error, mean_absolute_error, explained_variance_score

# Column names for the files
colnames=['arrivaltime', 'traveltime', 'segmentid', 'dayofweek','rain','temp']
regr = linear_model.LinearRegression() # Model

In [194]:
# Lists to hold the data that will be added to our coefficients dataframe
segments = []
intercepts = []
arrivalTimes = []
rains = []
fridays = []
mondays = []
saturdays = []
sundays = []
thursdays = []
tuesdays = []
wednesdays = []
r2s = []
mar = []
mse = []
mae = []

In [195]:
# Running a loop through segments and learning simple linear regression model through each segment
for i,files in enumerate(_39A_in_segments):
    fname = "SegmentedSamples2_actual\\{}.csv".format(files)
    if os.path.isfile(fname):
        # Everything will be done here
        # Read the file in a dataframe
        df = pd.read_csv(fname,names=colnames)
        df.drop(['segmentid','temp'],axis=1,inplace=True) # We learnt earlier that temp is not a good predicting independent variable hence we dont need it
        # Converting the datatypes
        df['dayofweek'] = df['dayofweek'].astype('category')
        # Rain had missing values
        df['rain'].fillna(df['rain'].mean(),inplace=True)
        # Split the dataset into independent and dependent features
        df_X = df[['arrivaltime','dayofweek','rain']]
        df_y = df['traveltime']
        # Dummify day of week
        weekday_dummies = pd.get_dummies(df_X.dayofweek, prefix='dayofweek')
        df_X_dummify = pd.concat([df_X,weekday_dummies],axis=1)
        df_X_dummied = df_X_dummify.drop('dayofweek',axis=1)
        df_X_dummied = df_X_dummied.drop('dayofweek_Wednesday',axis=1)
        # Rescaling (Standardisation)
        #df_X_std = (df_X_dummied - df_X_dummied.mean()) / df_X_dummied.std()
        # Split the files in train and test sets
        df_X_train, df_X_test, df_y_train, df_y_test = train_test_split(df_X_dummied, df_y, test_size = 0.3, random_state = 100)
        # Creating our regressor
        regr.fit(df_X_train, df_y_train)
        df_y_pred = regr.predict(df_X_test)
        regr_predictions = pd.DataFrame({'True Travel Time':df_y_test,'Estimated Travel Time':df_y_pred})
        # Writing the coefficients in the dataframes
        segments.append(files)
        intercepts.append(regr.intercept_)
        arrivalTimes.append(regr.coef_[0])
        rains.append(regr.coef_[1])
        fridays.append(regr.coef_[2])
        mondays.append(regr.coef_[3])
        saturdays.append(regr.coef_[4])
        sundays.append(regr.coef_[5])
        thursdays.append(regr.coef_[6])
        tuesdays.append(regr.coef_[7])
        r2s.append(r2_score(df_y_test,df_y_pred))
        mar.append(mean_absolute_error(df_y_test,df_y_pred))
        mse.append(mean_squared_error(df_y_test,df_y_pred))
        mae.append(median_absolute_error(df_y_test,df_y_pred))
#         print('Variance: {}'.format(explained_variance_score(df_y_test,df_y_pred)))
    else:
        print("File: {}, not found".format(fname))
        break
df_ols_coef['Segment']=segments
df_ols_coef['intercept']=intercepts
df_ols_coef['arrivaltime']=arrivalTimes
df_ols_coef['rain']=rains
df_ols_coef['dayofweek_Friday']=fridays
df_ols_coef['dayofweek_Monday']=mondays
df_ols_coef['dayofweek_Saturday']=saturdays
df_ols_coef['dayofweek_Sunday']=sundays
df_ols_coef['dayofweek_Thursday']=thursdays
df_ols_coef['dayofweek_Tuesday']=tuesdays
df_error_coef['Segment']=segments
df_error_coef['R2']=r2s
df_error_coef['Mean Absolute Error']=mar
df_error_coef['Mean Squared Error']=mse
df_error_coef['Median Absolute Error']=mae

In [178]:
df_ols_coef

Unnamed: 0,Segment,intercept,arrivaltime,rain,dayofweek_Friday,dayofweek_Monday,dayofweek_Saturday,dayofweek_Sunday,dayofweek_Thursday,dayofweek_Tuesday,dayofweek_Wednesday
0,767_768,119.613813,-0.000179,1.266745,1.653120,4.505787,-6.424188,-6.714799,3.671679,2.077016,1.231385
1,768_769,64.658188,0.000006,0.853329,6.035300,1.505619,-6.434660,-13.304114,6.282759,2.600173,3.314924
2,769_770,66.755920,-0.000202,0.290486,2.629493,1.970217,-5.966691,-7.312122,4.112366,1.946966,2.619771
3,770_771,33.126570,-0.000044,-0.081414,0.940621,-0.289851,-1.345330,-2.270757,1.310110,0.358601,1.296606
4,771_772,58.604961,-0.000327,1.154692,2.805233,1.323962,-5.554963,-4.594139,2.137157,2.285984,1.596766
5,772_773,75.664868,-0.000510,1.256443,1.608308,0.135896,-1.003856,-2.833036,1.325977,0.133722,0.632989
6,773_774,71.182717,-0.000158,-0.033690,1.005433,-1.462299,-0.059047,1.748890,0.974383,-1.775743,-0.431617
7,774_775,72.159105,-0.000316,0.430424,3.792668,-0.946042,-1.922808,-5.262922,1.851765,0.632707,1.854631
8,775_776,75.445711,-0.000352,0.489834,3.039628,-1.655335,-0.817017,-4.278704,2.859700,-0.247054,1.098783
9,776_777,55.124569,-0.000233,0.245806,2.443955,-1.037075,-0.211346,-2.863634,1.050710,0.029771,0.587619


In [197]:
df_new_test = pd.DataFrame({'arrivaltime':[39540],'rain':[0.0],'dayofweek_Friday':[1],'dayofweek_Monday':[0],'dayofweek_Saturday':[0],'dayofweek_Sunday':[0],'dayofweek_Thursday':[0],'dayofweek_Tuesday':[0]})
cols = df_new_test.columns.tolist()
cols = cols[0:1] + cols[-1:] + cols[1:-1]
df_new_test = df_new_test[cols]
listofvalues = df_new_test.values.tolist()
arrivaltime=int(listofvalues[0][0])
totaltraveltime=0

for i, rows in df_ols_coef.iterrows():
    if i >=1:
        traveltime = rows['intercept']+(rows['arrivaltime']*arrivaltime)+(rows['rain']*listofvalues[0][1])+(rows['dayofweek_Friday']*listofvalues[0][2])+(rows['dayofweek_Monday']*listofvalues[0][3])+(rows['dayofweek_Saturday']*listofvalues[0][4])+(rows['dayofweek_Sunday']*listofvalues[0][5])+(rows['dayofweek_Thursday']*listofvalues[0][6])+(rows['dayofweek_Tuesday']*listofvalues[0][7])
        arrivaltime+=traveltime
        totaltraveltime+=traveltime
print('Arrival Time',arrivaltime)
print('Total Travel Time',totaltraveltime)

Arrival Time 44368.83676186131
Total Travel Time 4828.836761861351


In [138]:
arrivaltime

9.50505243777239e+25

In [130]:
df_ols_coef

Unnamed: 0,Segment,intercept,arrivaltime,rain,dayofweek_Friday,dayofweek_Monday,dayofweek_Saturday,dayofweek_Sunday,dayofweek_Thursday,dayofweek_Tuesday,dayofweek_Wednesday
0,767_768,111.221808,-2.840075,0.676445,3.398836e-01,1.335684e+00,-2.209294e+00,-2.061900e+00,1.068809e+00,5.242796e-01,1.946779e-01
1,767_768,111.221808,-2.840075,0.676445,3.398836e-01,1.335684e+00,-2.209294e+00,-2.061900e+00,1.068809e+00,5.242796e-01,1.946779e-01
2,768_769,66.314186,0.093659,0.369220,9.490699e+13,9.588758e+13,7.743336e+13,7.815654e+13,9.715985e+13,1.026186e+14,1.013344e+14
3,769_770,57.082442,-3.402935,0.117477,6.606144e-01,4.217487e-01,-2.008103e+00,-2.323583e+00,1.215776e+00,4.419418e-01,6.919952e-01
4,770_771,31.089765,-0.734573,-0.041628,2.544330e-01,-1.874063e-01,-4.695284e-01,-7.059809e-01,3.954064e-01,4.990170e-02,4.046241e-01
5,771_772,42.312029,-5.522805,0.482917,-1.574308e+14,-1.591226e+14,-1.304192e+14,-1.256949e+14,-1.605592e+14,-1.700936e+14,-1.668863e+14
6,772_773,49.108429,-8.680061,0.573727,4.945977e-01,-3.543625e-02,-3.642829e-01,-8.647421e-01,3.959978e-01,-3.866847e-02,1.508981e-01
7,773_774,62.678651,-2.682812,-0.012445,3.353213e+14,3.356348e+14,2.731537e+14,2.652357e+14,3.393881e+14,3.562996e+14,3.571540e+14
8,774_775,55.987245,-5.338284,0.217484,1.208003e+00,-5.031889e-01,-6.891912e-01,-1.636424e+00,5.126847e-01,7.048623e-02,5.305010e-01
9,775_776,57.197550,-5.935929,0.218277,9.849619e-01,-7.133553e-01,-3.310886e-01,-1.307159e+00,9.203319e-01,-2.181126e-01,2.959042e-01


## Regularization

Here we are going to perform Ridge or Lasso Regression and check if we can improve our results

### Ridge Regression

In [59]:
from sklearn.linear_model import Ridge, Lasso

In [35]:
df_ridge = pd.read_csv("SegmentedSamples2_actual\\767_768.csv",names=colnames)

In [72]:
df_ridge_X_dummify

Unnamed: 0,arrivaltime,dayofweek,rain,dayofweek_Friday,dayofweek_Monday,dayofweek_Saturday,dayofweek_Sunday,dayofweek_Thursday,dayofweek_Tuesday,dayofweek_Wednesday
0,32380,Thursday,0.0,0,0,0,0,1,0,0
1,53523,Tuesday,0.0,0,0,0,0,0,1,0
2,39607,Tuesday,0.0,0,0,0,0,0,1,0
3,58016,Wednesday,0.0,0,0,0,0,0,0,1
4,39612,Tuesday,0.6,0,0,0,0,0,1,0
5,28163,Friday,0.0,1,0,0,0,0,0,0
6,70849,Friday,0.0,1,0,0,0,0,0,0
7,35418,Monday,0.0,0,1,0,0,0,0,0
8,58170,Tuesday,0.0,0,0,0,0,0,1,0
9,28187,Wednesday,0.0,0,0,0,0,0,0,1


In [36]:
df_ridge.drop(['segmentid','temp'],axis=1,inplace=True) # We learnt earlier that temp is not a good predicting independent variable hence we dont need it
df_ridge['dayofweek'] = df_ridge['dayofweek'].astype('category')
df_ridge['rain'].fillna(df_ridge['rain'].mean(),inplace=True)
df_ridge_X = df_ridge[['arrivaltime','dayofweek','rain']]
df_ridge_y = df_ridge['traveltime']
weekday_dummies = pd.get_dummies(df_ridge_X.dayofweek, prefix='dayofweek')
df_ridge_X_dummify = pd.concat([df_ridge_X,weekday_dummies],axis=1)
df_ridge_X_dummied = df_ridge_X_dummify.drop('dayofweek',axis=1)
df_ridge_X_std = (df_ridge_X_dummied - df_ridge_X_dummied.mean()) / df_ridge_X_dummied.std()
df_ridge_y_std = (df_ridge_y - df_ridge_y.mean()) / df_ridge_y.std()

In [37]:
df_X_train, df_X_test, df_y_train, df_y_test = train_test_split(df_ridge_X_std, df_ridge_y, test_size = 0.3, random_state = 100)

In [65]:
ridgereg = Lasso(alpha=[1e-15],normalize=True)
ridgereg.fit(df_X_train,df_y_train)



Lasso(alpha=[1e-15], copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=True, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [66]:
y_pred = ridgereg.predict(df_X_test)

In [67]:
ridge_predictions = pd.DataFrame({'True Travel Time':df_y_test,'Estimated Travel Time':y_pred})

In [68]:
ridge_predictions_true = (ridge_predictions * ridge_predictions.std()) + ridge_predictions.mean()

In [69]:
mean_absolute_error(df_y_test,y_pred)

17.568713961273232

## Random Forest Regression

In [154]:
from sklearn.ensemble import RandomForestRegressor
rfc = RandomForestRegressor(n_estimators=100, max_features='auto', oob_score=True, random_state=1)

In [160]:
# Making the dataframe that will hold the coefficients for each segment
#df_rfc_coef = pd.DataFrame(columns=['Segment','intercept','arrivaltime','rain','dayofweek_Friday','dayofweek_Monday','dayofweek_Saturday','dayofweek_Sunday','dayofweek_Thursday','dayofweek_Tuesday','dayofweek_Wednesday'])
df_rfc_error_coef = pd.DataFrame(columns=['Segment','R2','Mean Absolute Error','Mean Squared Error','Median Absolute Error'])
segments = []
intercepts = []
arrivalTimes = []
rains = []
fridays = []
mondays = []
saturdays = []
sundays = []
thursdays = []
tuesdays = []
wednesdays = []
r2s = []
mar = []
mse = []
mae = []
# Running a loop through segments and learning simple linear regression model through each segment
for i,files in enumerate(_39A_in_segments):
    fname = "SegmentedSamples2_actual\\{}.csv".format(files)
    if os.path.isfile(fname):
        # Everything will be done here
        # Read the file in a dataframe
        df = pd.read_csv(fname,names=colnames)
        df.drop(['segmentid','temp'],axis=1,inplace=True) # We learnt earlier that temp is not a good predicting independent variable hence we dont need it
        # Converting the datatypes
        df['dayofweek'] = df['dayofweek'].astype('category')
        # Rain had missing values
        df['rain'].fillna(df['rain'].mean(),inplace=True)
        # Split the dataset into independent and dependent features
        df_X = df[['arrivaltime','dayofweek','rain']]
        df_y = df['traveltime']
        # Dummify day of week
        weekday_dummies = pd.get_dummies(df_X.dayofweek, prefix='dayofweek')
        df_X_dummify = pd.concat([df_X,weekday_dummies],axis=1)
        df_X_dummied = df_X_dummify.drop('dayofweek',axis=1)
        # Rescaling (Standardisation)
        #df_X_std = (df_X_dummied - df_X_dummied.mean()) / df_X_dummied.std()
        # Split the files in train and test sets
        df_X_train, df_X_test, df_y_train, df_y_test = train_test_split(df_X_dummied, df_y, test_size = 0.3, random_state = 100)
        rfc.fit(df_X_train, df_y_train)
        df_y_pred = rfc.predict(df_X_test)
        regr_predictions = pd.DataFrame({'True Travel Time':df_y_test,'Estimated Travel Time':df_y_pred})
        # Writing the coefficients in the dataframes
        segments.append(files)
        r2s.append(r2_score(df_y_test,df_y_pred))
        mar.append(mean_absolute_error(df_y_test,df_y_pred))
        mse.append(mean_squared_error(df_y_test,df_y_pred))
        mae.append(median_absolute_error(df_y_test,df_y_pred))
#         print('Variance: {}'.format(explained_variance_score(df_y_test,df_y_pred)))
    else:
        print("File: {}, not found".format(fname))
        break
df_rfc_error_coef['Segment']=segments
df_rfc_error_coef['R2']=r2s
df_rfc_error_coef['Mean Absolute Error']=mar
df_rfc_error_coef['Mean Squared Error']=mse
df_rfc_error_coef['Median Absolute Error']=mae

In [176]:
df_rfc_error_coef['Mean Absolute Error'].mean()

16.40921728467843

In [177]:
df_rfc_error_coef

Unnamed: 0,Segment,R2,Mean Absolute Error,Mean Squared Error,Median Absolute Error
0,767_768,-0.192216,19.334323,593.613624,16.165000
1,768_769,-0.106908,24.629917,1074.188867,18.637500
2,769_770,-0.150830,19.726434,634.642945,16.190000
3,770_771,-0.259827,9.800353,215.152568,7.030000
4,771_772,0.087423,13.478388,438.029738,8.690000
5,772_773,-0.059442,21.402030,798.491285,16.970000
6,773_774,-0.025870,31.072347,1833.009832,21.212500
7,774_775,-0.011827,15.726299,431.884391,12.570000
8,775_776,-0.041503,18.528828,555.649070,15.380000
9,776_777,-0.173532,17.144393,503.387295,13.642500
