In [50]:
library(ggmap)
library(dplyr)
library(VGAM)
citation("ggmap")


To cite ggmap in publications, please use:

  D. Kahle and H. Wickham. ggmap: Spatial Visualization with ggplot2.
  The R Journal, 5(1), 144-161. URL
  http://journal.r-project.org/archive/2013-1/kahle-wickham.pdf

A BibTeX entry for LaTeX users is

  @Article{,
    author = {David Kahle and Hadley Wickham},
    title = {ggmap: Spatial Visualization with ggplot2},
    journal = {The R Journal},
    year = {2013},
    volume = {5},
    number = {1},
    pages = {144--161},
    url = {https://journal.r-project.org/archive/2013-1/kahle-wickham.pdf},
  }


In [51]:
training <- read.csv("data/train.csv")
holdout <- read.csv("data/test.csv")
mean(training$Longitude)
head(training)
set.seed(12489)

Unnamed: 0_level_0,RowId,IntersectionId,Latitude,Longitude,EntryStreetName,ExitStreetName,EntryHeading,ExitHeading,Hour,Weekend,⋯,TimeFromFirstStop_p40,TimeFromFirstStop_p50,TimeFromFirstStop_p60,TimeFromFirstStop_p80,DistanceToFirstStop_p20,DistanceToFirstStop_p40,DistanceToFirstStop_p50,DistanceToFirstStop_p60,DistanceToFirstStop_p80,City
Unnamed: 0_level_1,<int>,<int>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<int>,<int>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
1,1921357,0,33.79166,-84.43003,Marietta Boulevard Northwest,Marietta Boulevard Northwest,NW,NW,0,0,⋯,0,0,0,0,0,0,0,0,0,Atlanta
2,1921358,0,33.79166,-84.43003,Marietta Boulevard Northwest,Marietta Boulevard Northwest,SE,SE,0,0,⋯,0,0,0,0,0,0,0,0,0,Atlanta
3,1921359,0,33.79166,-84.43003,Marietta Boulevard Northwest,Marietta Boulevard Northwest,NW,NW,1,0,⋯,0,0,0,0,0,0,0,0,0,Atlanta
4,1921360,0,33.79166,-84.43003,Marietta Boulevard Northwest,Marietta Boulevard Northwest,SE,SE,1,0,⋯,0,0,0,0,0,0,0,0,0,Atlanta
5,1921361,0,33.79166,-84.43003,Marietta Boulevard Northwest,Marietta Boulevard Northwest,NW,NW,2,0,⋯,0,0,0,0,0,0,0,0,0,Atlanta
6,1921362,0,33.79166,-84.43003,Marietta Boulevard Northwest,Marietta Boulevard Northwest,SE,SE,2,0,⋯,0,0,0,0,0,0,0,0,0,Atlanta


In [52]:
drilled_down_training <- group_by(training, IntersectionId, EntryHeading, ExitHeading)
summarise_at(drilled_down_training, vars(TimeFromFirstStop_p80),list(mean=mean,median=median))

IntersectionId,EntryHeading,ExitHeading,mean,median
<int>,<chr>,<chr>,<dbl>,<dbl>
0,E,E,5.6484375,0.0
0,E,N,20.2173913,21.0
0,E,NW,34.0000000,34.0
0,E,SW,39.0000000,42.0
0,N,E,66.2352941,58.0
0,N,N,64.7794118,62.0
0,N,W,79.1428571,69.0
0,NE,E,38.0000000,38.0
0,NE,N,52.1363636,43.0
0,NE,SE,0.0000000,0.0


In [79]:
RMSE = function(predicted, expected, log=0){
  if (log == 0){
      sqrt(mean((predicted - expected)^2))
  } else {
      temp <- which(predicted == 0)
      epred <- exp(predicted)
      epred[temp] <- 0
      sqrt(mean((epred- expected)^2)) 
  }
}

kfold <- function(nn, k, modelWrapper,data, log=0) {
  foldsize = nn/k
  folds = list()
  initial_pool = c(1:nn)
  current_sample = initial_pool
  exhausted_pool = c()
  for (i in 1:(k-1)){
    itrain=sample(current_sample, foldsize)
    folds[[i]]=sort(itrain)
    exhausted_pool <- append(exhausted_pool,itrain)
    current_sample= initial_pool[-exhausted_pool]
  }
  folds[[k]] = current_sample

  perf_measures = list()
  for (i in 1:k){
    train_indexes = unlist(folds[c(1:k)[-i]])
    holdout_indexes=unlist(folds[c(1:k)[i]])

    train = data[train_indexes,]
    holdo = data[holdout_indexes,]

    model <- modelWrapper(train)
    outPredict <- predict(model, newdata=holdo,interval = "prediction")
    if (identical(modelWrapper, tobitModelWrapper)){
        head(outPredict)
        fit <- outPredict[,1]
        lwr <- outPredict[,1] - qnorm(0.975)*exp(outPredict[,2])
        upr <- outPredict[,1] + qnorm(0.975)*exp(outPredict[,2])
        outPredict <- cbind(fit,lwr,upr)
        head(outPredict)
    }
      
    if (log == 0) {
#       perf_measures[[i]] <- RMSE(outPredict, holdo$TotalTimeStopped_p50)
        perf_measures[[i]] <- intervalScore(outPredict, holdo$TotalTimeStopped_p50, 0.2)
    } else {
#       perf_measures[[i]] <- RMSE(outPredict, holdo$TotalTimeStopped_p50, log=1)
        temp <- which(holdo$TotalTimeStopped_p50 == 0)
        logActual <- log(holdo$TotalTimeStopped_p50)
        logActual[temp] <- 0
        
        perf_measures[[i]] <- intervalScore(outPredict, logActual, 0.2)
    }
  }
  mean(unlist(perf_measures))
}

augmentRightTurn <- function(dataset) {
    isRightTurn <- function(entry, exit) {
        rightexit <- list(
            N = c("NW", "W", "SW"),
            NW = c("W", "SW", "S"),
            W = c("SW", "S", "SE"),
            SW = c("S", "SE", "E"),
            S = c("SE", "E", "NE"),
            SE = c("E", "NE", "N"),
            E = c("NE", "N", "NW"),
            NE = c("N", "NW", "W")
        )
        exit %in% rightexit[[entry]]
    }
    vIsRightTurn <- Vectorize(isRightTurn)
    temp <- dataset %>% 
        mutate(RightTurn = vIsRightTurn(EntryHeading, ExitHeading)) %>%
        group_by(IntersectionId, EntryHeading) %>%
        summarize(RightTurnAllowed = max(RightTurn)) %>%
        inner_join(dataset)
    temp %>% mutate(RightTurn = ifelse(vIsRightTurn(EntryHeading, ExitHeading), 1, 0))
}

augmentLeftTurn <- function(dataset) {
    isLeftTurn <- function(entry, exit) {
        leftexit <- list(
            N = c("NE", "E", "SE"),
            NW = c("N", "NE", "E"),
            W = c("NW", "N", "NE"),
            SW = c("W", "NW", "N"),
            S = c("SW", "W", "NW"),
            SE = c("S", "SW", "W"),
            E = c("SE", "S", "SW"),
            NE = c("E", "SE", "S")
        )
        exit %in% leftexit[[entry]]
    }
    vIsLeftTurn <- Vectorize(isLeftTurn)
    temp <- dataset %>% 
        mutate(LeftTurn = vIsLeftTurn(EntryHeading, ExitHeading)) %>%
        group_by(IntersectionId, EntryHeading) %>%
        summarize(LeftTurnAllowed = max(LeftTurn)) %>%
        inner_join(dataset)
    temp %>% mutate(LeftTurn = ifelse(vIsLeftTurn(EntryHeading, ExitHeading), 1, 0))
}

transformDataset <- function(dataset){
    augmented <- augmentRightTurn(training)
    augmented <- augmentLeftTurn(augmented)
    augmented <- augmented %>% mutate(straightThrough = ifelse(EntryHeading == ExitHeading, 1, 0))
    augmented <- augmented %>% mutate(
    JanAndMay = ifelse(Month == 1 | Month == 5, 1, 0),
    rushHour = ifelse((Hour >= 6 & Hour <= 9) | (Hour >= 15 & Hour <= 18), 1, 0),
    hasWaitTime = ifelse(DistanceToFirstStop_p50 > 0 & TimeFromFirstStop_p50 > 0, 1, 0),
    logTotalTimeStopped_p50 = ifelse(hasWaitTime == 1, log(TotalTimeStopped_p50), TotalTimeStopped_p50))
    augmented
}

#' interval score function for prediction intervals,
#' smaller value is better
#'
#' @description
#' interval score for prediction intervals
#'
#' @param predobj has 3 (or more) columns: pointprediction, predLB, predUB
#' @param actual vector of actual values (in holdout set, for example)
#' @param alpha level for prediction interval, 
#'  1-alpha is expected coverage proportion if model is valid;
#'  alpha=0.2 for 80% prediction intervals
#'
#' @return interval score 
#'
intervalScore=function(predObj,actual,alpha)
{ n=nrow(predObj)
  ilow=(actual<predObj[,2])  # underestimation
  ihigh=(actual>predObj[,3]) # overestimation
  sumlength=sum(predObj[,3]-predObj[,2]) # sum of lengths of prediction intervals
  sumlow=sum(predObj[ilow,2]-actual[ilow])*2/alpha
  sumhigh=sum(actual[ihigh]-predObj[ihigh,3])*2/alpha
  (sumlength+sumlow+sumhigh)/n # average length + average under/over penalties
}



In [54]:
augmentedTraining <- transformDataset(training)
augmentedHoldout <- transformDataset(holdout)



`summarise()` has grouped output by 'IntersectionId'. You can override using the `.groups` argument.

Joining, by = c("IntersectionId", "EntryHeading")

`summarise()` has grouped output by 'IntersectionId'. You can override using the `.groups` argument.

Joining, by = c("IntersectionId", "EntryHeading")

`summarise()` has grouped output by 'IntersectionId'. You can override using the `.groups` argument.

Joining, by = c("IntersectionId", "EntryHeading")

`summarise()` has grouped output by 'IntersectionId'. You can override using the `.groups` argument.

Joining, by = c("IntersectionId", "EntryHeading")



In [55]:
head(augmentedTraining)
names(augmentedTraining)

IntersectionId,EntryHeading,LeftTurnAllowed,RightTurnAllowed,RowId,Latitude,Longitude,EntryStreetName,ExitStreetName,ExitHeading,⋯,DistanceToFirstStop_p60,DistanceToFirstStop_p80,City,RightTurn,LeftTurn,straightThrough,JanAndMay,rushHour,hasWaitTime,logTotalTimeStopped_p50
<int>,<chr>,<int>,<int>,<int>,<dbl>,<dbl>,<chr>,<chr>,<chr>,⋯,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
0,E,1,1,2077842,42.29174,-71.07718,Talbot Avenue,Talbot Avenue,E,⋯,58.7,78.6,Boston,0,0,1,0,1,0,0.0
0,E,1,1,2077854,42.29174,-71.07718,Talbot Avenue,New England Avenue,SW,⋯,0.0,106.9,Boston,0,1,0,0,0,0,0.0
0,E,1,1,2077855,42.29174,-71.07718,Talbot Avenue,Norwell Street,NW,⋯,72.5,103.0,Boston,1,0,0,0,0,1,2.484907
0,E,1,1,2077861,42.29174,-71.07718,Talbot Avenue,Talbot Avenue,E,⋯,0.0,106.8,Boston,0,0,1,0,1,0,0.0
0,E,1,1,2077867,42.29174,-71.07718,Talbot Avenue,Talbot Avenue,E,⋯,69.2,75.6,Boston,0,0,1,0,1,1,2.079442
0,E,1,1,2077873,42.29174,-71.07718,Talbot Avenue,Talbot Avenue,E,⋯,62.8,71.3,Boston,0,0,1,0,1,0,0.0


In [80]:
basicLinearModelWrapper <- function(data){
    lm(TotalTimeStopped_p50~hasWaitTime+Hour+rushHour+straightThrough+Weekend+JanAndMay+City+LeftTurnAllowed+RightTurnAllowed + LeftTurn+RightTurn, data=data)
    #summary(basicLinearModel)
}
rushLeftModelWrapper <- function(data){
    lm(TotalTimeStopped_p50~hasWaitTime+Hour+rushHour+LeftTurn*rushHour+straightThrough+Weekend+JanAndMay+City+LeftTurnAllowed+RightTurnAllowed+ LeftTurn+RightTurn, data=data)
    #summary(rushLeftModel)
}
rushModelWrapper <- function(data){
    lm(TotalTimeStopped_p50~hasWaitTime+Hour+Hour*rushHour+rushHour+LeftTurn*rushHour+straightThrough+straightThrough*rushHour+Weekend+Weekend*rushHour+JanAndMay+City+LeftTurnAllowed+RightTurnAllowed+ LeftTurn+RightTurn, data=data)
    #summary(rushModel)
}
monthModelWrapper <- function(data){
    lm(TotalTimeStopped_p50~hasWaitTime+Hour+Month+Hour*rushHour+rushHour+LeftTurn*rushHour+straightThrough+straightThrough*rushHour+Weekend+Weekend*rushHour+JanAndMay+City+LeftTurnAllowed+RightTurnAllowed+ LeftTurn+RightTurn, data=data)
}
straightInteractionModelWrapper <- function(data){
    lm(TotalTimeStopped_p50~hasWaitTime+Hour+Hour*rushHour+rushHour+LeftTurn*rushHour+straightThrough+straightThrough*rushHour+straightThrough*LeftTurnAllowed+straightThrough*RightTurnAllowed+Weekend+Weekend*rushHour+JanAndMay+City+LeftTurnAllowed+RightTurnAllowed+ LeftTurn+RightTurn, data=data)
}

logModelWrapper <- function(data){
    lm(logTotalTimeStopped_p50~hasWaitTime+Hour+Hour*rushHour+rushHour+LeftTurn*rushHour+straightThrough+straightThrough*rushHour+straightThrough*LeftTurnAllowed+straightThrough*RightTurnAllowed+Weekend+Weekend*rushHour+JanAndMay+City+LeftTurnAllowed+RightTurnAllowed+ LeftTurn+RightTurn, data=data)    
}

tobitModelWrapper <- function(data){
    vglm(TotalTimeStopped_p50~hasWaitTime+Hour+Hour*rushHour+rushHour+LeftTurn*rushHour+straightThrough+straightThrough*rushHour+straightThrough*LeftTurnAllowed+straightThrough*RightTurnAllowed+Weekend+Weekend*rushHour+JanAndMay+City+LeftTurnAllowed+RightTurnAllowed+ LeftTurn+RightTurn, tobit(Lower = 0), data = data)
}


totalRows = nrow(augmentedTraining)
k = 5 
print(kfold(totalRows, k, basicLinearModelWrapper,data=augmentedTraining))
print(kfold(totalRows, k, rushLeftModelWrapper,data=augmentedTraining))
print(kfold(totalRows, k, rushModelWrapper,data=augmentedTraining))
print(kfold(totalRows, k, monthModelWrapper,data=augmentedTraining))
print(kfold(totalRows, k, straightInteractionModelWrapper,data=augmentedTraining))
print(kfold(totalRows, k, logModelWrapper,data=augmentedTraining), log=1)
print(kfold(totalRows, k, tobitModelWrapper, data=augmentedTraining))



[1] 48.60943
[1] 48.60831
[1] 48.51678
[1] 48.51642
[1] 48.46743
[1] 67.02132
[1] 380.7536


In [None]:
basicLinearModel <- basicLinearModelWrapper(augmentedTraining)
rushLeftModel <- rushLeftModelWrapper(augmentedTraining)
rushModel <- rushModelWrapper(augmentedTraining)
monthModel <- monthModelWrapper(augmentedTraining)
straightInteractionModel<- straightInteractionModelWrapper(augmentedTraining)
logModel<- logModelWrapper(augmentedTraining)
tobitModel <- tobitModelWrapper(augmentedTraining)

#Out Of Sample Predictions

In [63]:

outPredict1 <- predict(basicLinearModel, newdata=augmentedHoldout)
outPredict2 <- predict(rushLeftModel, newdata=augmentedHoldout)
outPredict3 <- predict(rushModel, newdata=augmentedHoldout)
outPredict4 <- predict(monthModel, newdata=augmentedHoldout)
outPredict5 <- predict(straightInteractionModel, newdata=augmentedHoldout)
outPredict6 <- predict(logModel, newdata=augmentedHoldout)
outpredict7 <- predict(tobitModel, newdata=augmentedHoldout)

RMSE1 <- RMSE(outPredict1, augmentedHoldout$TotalTimeStopped_p50)
RMSE2 <- RMSE(outPredict2, augmentedHoldout$TotalTimeStopped_p50)
RMSE3 <- RMSE(outPredict3, augmentedHoldout$TotalTimeStopped_p50)
RMSE4 <- RMSE(outPredict4, augmentedHoldout$TotalTimeStopped_p50)
RMSE5 <- RMSE(outPredict5, augmentedHoldout$TotalTimeStopped_p50)
RMSE6 <- RMSE(outPredict6, augmentedHoldout$TotalTimeStopped_p50, log=1)
RMSE7 <- RMSE(outpredict7, augmentedHoldout$TotalTimeStopped_p50)

print(RMSE1)
print(RMSE2)
print(RMSE3)
print(RMSE4)
print(RMSE5)
print(RMSE6)
print(RMSE7)





[1] 10.68748
[1] 10.68728
[1] 10.66762
[1] 10.66749
[1] 10.65503
[1] 11.05359
[1] 49.35929


In [None]:
plot(straightInteractionModel)

In [None]:
plot(tobitModel)

In [78]:
outPredict1 <- predict(basicLinearModel, newdata=augmentedHoldout, interval="prediction")
outPredict2 <- predict(rushLeftModel, newdata=augmentedHoldout, interval="prediction")
outPredict3 <- predict(rushModel, newdata=augmentedHoldout, interval="prediction")
outPredict4 <- predict(monthModel, newdata=augmentedHoldout, interval="prediction")
outPredict5 <- predict(straightInteractionModel, newdata=augmentedHoldout, interval="prediction")
outPredict6 <- predict(logModel, newdata=augmentedHoldout, interval="prediction")
outpredict7 <- predict(tobitModel, newdata=augmentedHoldout)

fit <- outPredict7[,1]
lwr <- outPredict7[,1] - qnorm(0.975)*exp(outPredict7[,2])
upr <- outPredict7[,1] + qnorm(0.975)*exp(outPredict7[,2])
outPredict7s <- cbind(fit,lwr,upr)

IS1 <- RMSE(outPredict1, augmentedHoldout$TotalTimeStopped_p50)
IS2 <- RMSE(outPredict2, augmentedHoldout$TotalTimeStopped_p50)
IS3 <- RMSE(outPredict3, augmentedHoldout$TotalTimeStopped_p50)
IS4 <- RMSE(outPredict4, augmentedHoldout$TotalTimeStopped_p50)
IS5 <- RMSE(outPredict5, augmentedHoldout$TotalTimeStopped_p50)
IS6 <- RMSE(outPredict6, augmentedHoldout$TotalTimeStopped_p50, log=1)
IS7 <- RMSE(outpredict7s, augmentedHoldout$TotalTimeStopped_p50)

print(IS1)
print(IS2)
print(IS3)
print(IS4)
print(IS5)
print(IS6)
print(IS7)

Unnamed: 0,mu,loglink(sd),mu.1,loglink(sd).1
1,-75.86179,2.921365,-75.86179,2.921365
2,-77.81885,2.921365,-77.81885,2.921365
3,30.25493,2.921365,30.25493,2.921365
4,-75.86179,2.921365,-75.86179,2.921365
5,30.48731,2.921365,30.48731,2.921365
6,-75.39885,2.921365,-75.39885,2.921365
7,-78.42395,2.921365,-78.42395,2.921365
8,24.68331,2.921365,24.68331,2.921365
9,24.68013,2.921365,24.68013,2.921365
10,-79.27115,2.921365,-79.27115,2.921365
