### Import and transformations of API data

In [1]:
# Start from a clean session: close any open plot device, drop all
# workspace objects, and clear the console.
if (!is.null(dev.list())) {
  dev.off()
}
rm(list = ls())
cat("\014") # form-feed character, used here to clear the console
# writeClipboard(as.character(x)) # copy data frame to clipboard



In [None]:
library(RCurl)
library(sqldf)
library(digest)
library(dplyr)
library(anytime)
library(geosphere)
library(lubridate)
library(chron)
require(caret)
require(rattle)
require(yardstick)

In [3]:
# set working directory
# NOTE(review): absolute, user-specific path. The relative file paths read
# later in this notebook (the closest-stations CSV and the historical CSV)
# are resolved against this directory.
setwd("C:/Users/vanethi/Documents/GitHub/DS420_Factoria")

In [4]:
# Date window for the API requests; both strings are pasted verbatim into
# the request URLs.
startDate <- "2017-12-31-0"
endDate <- "2018-06-01-0"

In [5]:
# pull data for London
#
# Each dataset is fetched as CSV text with getURL() and then parsed.
# The text is parsed with read.csv(text = ...) rather than
# read.csv(textConnection(...)): a textConnection created by the caller is
# never closed by read.csv, so the old form left one open connection per
# request behind.
#
# NOTE(review): host and peer certificate verification are disabled for both
# requests; re-enable them if the endpoint's certificate validates.
# NOTE(review): the last URL path segment looks like a hard-coded API
# token -- TODO confirm and move it out of the notebook if so.

# acquire air quality data
ld_aq_url <- paste0(
  "https://biendata.com/competition/airquality/ld/",
  startDate, "/", endDate, "/2k0d1d8"
)
ld_aq_file <- getURL(ld_aq_url, ssl.verifyhost = FALSE, ssl.verifypeer = FALSE)
ld_aq_data <- read.csv(text = ld_aq_file, header = TRUE)

# acquire API grid meteorology data
ld_gm_url <- paste0(
  "https://biendata.com/competition/meteorology/ld_grid/",
  startDate, "/", endDate, "/2k0d1d8"
)
ld_gm_file <- getURL(ld_gm_url, ssl.verifyhost = FALSE, ssl.verifypeer = FALSE)
ld_gm_data <- read.csv(text = ld_gm_file, header = TRUE)

In [6]:
# Names of the data frames currently defined in the global environment
is_df <- unlist(eapply(.GlobalEnv, is.data.frame))
df.list <- names(which(is_df))
df.list

In [7]:
# Convert the time column of both API tables to date-times with anytime()
ld_gm_data[["time"]] <- anytime(ld_gm_data[["time"]])
ld_aq_data[["time"]] <- anytime(ld_aq_data[["time"]])

In [8]:
# Print the name and structure of every data frame listed in df.list.
# Iterating over the names directly is a no-op when df.list is empty,
# whereas 1:length(df.list) would have looped over c(1, 0).
for (df_name in df.list) {
  print(df_name)
  # str() prints as a side effect and returns NULL, so it is not wrapped in
  # print(); wrapping it only added a stray "NULL" line after each table.
  str(get(df_name))
}

[1] "ld_gm_data"
'data.frame':	1035697 obs. of  9 variables:
 $ id            : int  2000096 2000097 2000098 2000099 2000100 2000101 2000102 2000103 2000104 2000105 ...
 $ station_id    : Factor w/ 861 levels "london_grid_000",..: 1 2 3 4 5 6 7 8 9 10 ...
 $ time          : POSIXct, format: "2018-03-31 07:00:00" "2018-03-31 07:00:00" ...
 $ weather       : Factor w/ 8 levels "CLEAR_DAY","CLEAR_NIGHT",..: 3 3 3 6 6 6 6 6 6 6 ...
 $ temperature   : num  6.14 5.43 4.73 4.25 3.99 3.72 3.75 3.78 3.86 3.98 ...
 $ pressure      : num  995 994 993 990 986 ...
 $ humidity      : num  85 88 91 93 95 97 96 96 96 96 ...
 $ wind_direction: num  304 303 302 303 306 ...
 $ wind_speed    : num  22 18.9 15.8 14.1 13.6 ...
NULL
[1] "ld_aq_data"
'data.frame':	23006 obs. of  9 variables:
 $ id                : int  2941506 2941507 2941508 2941509 2941510 2941511 2941512 2941513 2941514 2941515 ...
 $ station_id        : Factor w/ 19 levels "BL0","BX1","BX9",..: 4 1 10 15 12 9 11 14 8 13 ...
 $ time       

In [9]:
# Earliest and latest timestamp in each API table
print("ld_gm_data")
summarize(ld_gm_data, min_date = min(time), max_date = max(time))
print("ld_aq_data")
summarize(ld_aq_data, min_date = min(time), max_date = max(time))

[1] "ld_gm_data"


min_date,max_date
2018-03-31 07:00:00,2018-05-21


[1] "ld_aq_data"


min_date,max_date
2018-03-31 07:00:00,2018-05-20 22:00:00


In [10]:
# Drop the columns that are not used in the rest of the notebook
ld_aq_data <- select(ld_aq_data, -id)
ld_gm_data <- select(ld_gm_data, -id, -weather)

In [11]:
# Lookup table of the closest meteorology grid point for each London station
ld_closest_stations <- read.csv(file = "SL_london_closest_stations.csv")

In [13]:
# Give the three tables consistent column names so they can be joined on
# stationId, stationName and utc_time
names(ld_closest_stations) <- c("x", "stationId", "stationName", "distance")
names(ld_aq_data) <- c(
  "stationId", "utc_time", "PM2.5", "PM10", "NO2", "CO", "O3", "SO2"
)
names(ld_gm_data) <- c(
  "stationName", "utc_time", "temperature", "pressure", "humidity",
  "wind_direction", "wind_speed.kph"
)

In [14]:
# Inspect the three tables after renaming, then preview the station-to-grid
# lookup table.
str(ld_closest_stations)
str(ld_gm_data)
str(ld_aq_data)
head(ld_closest_stations)

'data.frame':	24 obs. of  4 variables:
 $ x          : int  1 2 3 4 5 6 7 8 9 10 ...
 $ stationId  : Factor w/ 24 levels "BL0","BX1","BX9",..: 3 2 1 5 4 7 8 6 10 12 ...
 $ stationName: Factor w/ 10 levels "london_grid_346",..: 10 10 6 6 4 6 6 5 8 8 ...
 $ distance   : num  3929 3929 3060 3683 5212 ...
'data.frame':	1035697 obs. of  7 variables:
 $ stationName   : Factor w/ 861 levels "london_grid_000",..: 1 2 3 4 5 6 7 8 9 10 ...
 $ utc_time      : POSIXct, format: "2018-03-31 07:00:00" "2018-03-31 07:00:00" ...
 $ temperature   : num  6.14 5.43 4.73 4.25 3.99 3.72 3.75 3.78 3.86 3.98 ...
 $ pressure      : num  995 994 993 990 986 ...
 $ humidity      : num  85 88 91 93 95 97 96 96 96 96 ...
 $ wind_direction: num  304 303 302 303 306 ...
 $ wind_speed.kph: num  22 18.9 15.8 14.1 13.6 ...
'data.frame':	23006 obs. of  8 variables:
 $ stationId: Factor w/ 19 levels "BL0","BX1","BX9",..: 4 1 10 15 12 9 11 14 8 13 ...
 $ utc_time : POSIXct, format: "2018-03-31 07:00:00" "2018-03-31 07:00:

x,stationId,stationName,distance
1,BX9,london_grid_472,3929.17
2,BX1,london_grid_472,3929.17
3,BL0,london_grid_409,3059.76
4,CD9,london_grid_409,3682.914
5,CD1,london_grid_388,5211.633
6,CT2,london_grid_409,1646.945


In [15]:
# Attach the closest grid point (stationName) to every air-quality row
ld_aq_map <- merge(x = ld_aq_data, y = ld_closest_stations, by = "stationId")

In [16]:
# Join air-quality rows to the meteorology of their grid point at the same
# timestamp
ld_aq_gm_data <- merge(
  x = ld_aq_map,
  y = ld_gm_data,
  by = c("stationName", "utc_time")
)

In [17]:
# Inspect the joined table and the time span it covers
str(ld_aq_gm_data)
head(ld_aq_gm_data)
summarize(ld_aq_gm_data, min_date = min(utc_time), max_date = max(utc_time))

'data.frame':	22739 obs. of  16 variables:
 $ stationName   : Factor w/ 10 levels "london_grid_346",..: 2 2 2 2 2 2 2 2 2 2 ...
 $ utc_time      : POSIXct, format: "2018-03-31 07:00:00" "2018-03-31 08:00:00" ...
 $ stationId     : Factor w/ 19 levels "BL0","BX1","BX9",..: 18 18 18 18 18 18 18 18 18 18 ...
 $ PM2.5         : num  8.7 6.8 4.5 8.2 11.8 10.2 11.6 10.3 8.8 8.6 ...
 $ PM10          : num  NA NA NA NA NA NA NA NA NA NA ...
 $ NO2           : num  NA NA NA NA NA NA NA NA NA NA ...
 $ CO            : logi  NA NA NA NA NA NA ...
 $ O3            : logi  NA NA NA NA NA NA ...
 $ SO2           : logi  NA NA NA NA NA NA ...
 $ x             : int  21 21 21 21 21 21 21 21 21 21 ...
 $ distance      : num  4235 4235 4235 4235 4235 ...
 $ temperature   : num  4.83 5.42 6.05 6.1 7.05 7.8 8.08 8.44 8.57 8.02 ...
 $ pressure      : num  987 987 988 989 992 ...
 $ humidity      : num  87 84 81 79 74 71 73 71 70 73 ...
 $ wind_direction: num  205 210 214 234 242 ...
 $ wind_speed.kph: num 

stationName,utc_time,stationId,PM2.5,PM10,NO2,CO,O3,SO2,x,distance,temperature,pressure,humidity,wind_direction,wind_speed.kph
london_grid_366,2018-03-31 07:00:00,TD5,8.7,,,,,,21,4234.613,4.83,986.8419,87,205.39,11.3
london_grid_366,2018-03-31 08:00:00,TD5,6.8,,,,,,21,4234.613,5.42,987.2517,84,210.33,11.95
london_grid_366,2018-03-31 09:00:00,TD5,4.5,,,,,,21,4234.613,6.05,987.7612,81,214.35,12.01
london_grid_366,2018-03-31 10:00:00,TD5,8.2,,,,,,21,4234.613,6.1,988.8336,79,233.7,10.18
london_grid_366,2018-03-31 11:00:00,TD5,11.8,,,,,,21,4234.613,7.05,991.6767,74,242.44,7.95
london_grid_366,2018-03-31 12:00:00,TD5,10.2,,,,,,21,4234.613,7.8,992.0664,71,267.43,6.76


min_date,max_date
2018-03-31 07:00:00,2018-05-20 22:00:00


In [31]:
# Keep the identifiers, the three target pollutants and the meteorology
# columns
ld_aq_gm_data <- select(
  ld_aq_gm_data,
  "stationName", "utc_time", "stationId", "PM2.5", "PM10", "O3",
  "temperature", "pressure", "humidity", "wind_direction", "wind_speed.kph"
)

In [32]:
# Calendar features derived from utc_time for use as model predictors
ld_aq_gm_data[["hour"]] <- hour(ld_aq_gm_data[["utc_time"]])
ld_aq_gm_data[["month"]] <- month(ld_aq_gm_data[["utc_time"]])
ld_aq_gm_data[["date"]] <- date(ld_aq_gm_data[["utc_time"]])
ld_aq_gm_data[["weekend"]] <- chron::is.weekend(ld_aq_gm_data[["date"]])

In [33]:
# Check column names and types of the API table after adding the features
str(ld_aq_gm_data)

'data.frame':	22739 obs. of  15 variables:
 $ stationName   : Factor w/ 10 levels "london_grid_346",..: 2 2 2 2 2 2 2 2 2 2 ...
 $ utc_time      : POSIXct, format: "2018-03-31 07:00:00" "2018-03-31 08:00:00" ...
 $ stationId     : Factor w/ 19 levels "BL0","BX1","BX9",..: 18 18 18 18 18 18 18 18 18 18 ...
 $ PM2.5         : num  8.7 6.8 4.5 8.2 11.8 10.2 11.6 10.3 8.8 8.6 ...
 $ PM10          : num  NA NA NA NA NA NA NA NA NA NA ...
 $ O3            : logi  NA NA NA NA NA NA ...
 $ temperature   : num  4.83 5.42 6.05 6.1 7.05 7.8 8.08 8.44 8.57 8.02 ...
 $ pressure      : num  987 987 988 989 992 ...
 $ humidity      : num  87 84 81 79 74 71 73 71 70 73 ...
 $ wind_direction: num  205 210 214 234 242 ...
 $ wind_speed.kph: num  11.3 11.95 12.01 10.18 7.95 ...
 $ hour          : int  7 8 9 10 11 12 13 14 15 16 ...
 $ month         : num  3 3 3 3 3 3 3 3 3 3 ...
 $ date          : Date, format: "2018-03-31" "2018-03-31" ...
 $ weekend       : logi  TRUE TRUE TRUE TRUE TRUE TRUE ...


### Import and transformations of Historical data

In [21]:
# Load the historical data file; strings are kept as character
ld_aq_gm_hist_file <- "Ready for Modeling/ld_aq_gm_hist_data.csv"
ld_aq_gm_hist_data <- read.csv(
  file = ld_aq_gm_hist_file,
  header = TRUE,
  sep = ",",
  stringsAsFactors = FALSE
)

In [23]:
# Bring the historical table in line with the API table: drop the X column,
# parse utc_time as a date-time and date as a Date
ld_aq_gm_hist_data <- select(ld_aq_gm_hist_data, -X)
ld_aq_gm_hist_data[["utc_time"]] <- anytime(ld_aq_gm_hist_data[["utc_time"]])
ld_aq_gm_hist_data[["date"]] <- as.Date(
  ld_aq_gm_hist_data[["date"]],
  format = "%Y-%m-%d"
)

In [35]:
# Use the same 15 column names, in the same order, as the API table
names(ld_aq_gm_hist_data) <- c(
  "stationName", "utc_time", "stationId", "PM2.5", "PM10", "O3",
  "temperature", "pressure", "humidity", "wind_direction", "wind_speed.kph",
  "hour", "month", "date", "weekend"
)

In [36]:
# Check column names and types of the historical table after renaming
str(ld_aq_gm_hist_data)

'data.frame':	257936 obs. of  15 variables:
 $ stationName   : chr  "london_grid_346" "london_grid_346" "london_grid_346" "london_grid_346" ...
 $ utc_time      : POSIXct, format: "2017-01-01 08:00:00" "2017-01-01 09:00:00" ...
 $ stationId     : chr  "LH0" "LH0" "LH0" "LH0" ...
 $ PM2.5         : num  18.3 16.3 13.3 9.4 6.1 6.7 2.1 0.9 1.1 1 ...
 $ PM10          : num  21.3 19.5 16.2 11.6 8.5 13.4 4.6 2 2.1 2 ...
 $ O3            : num  41.6 44.1 49.1 45.2 41.4 53.6 11.7 12.1 12 13.5 ...
 $ temperature   : num  6.05 6.05 6.04 6.04 6.25 6.46 6.67 6.91 7.15 7.39 ...
 $ pressure      : num  1019 1018 1017 1016 1015 ...
 $ humidity      : num  90 89.2 88.3 87.4 87.8 ...
 $ wind_direction: num  221 220 218 217 216 ...
 $ wind_speed.kph: num  16.4 17.1 17.7 18.4 18.8 ...
 $ hour          : int  8 9 10 11 12 13 14 15 16 17 ...
 $ month         : int  1 1 1 1 1 1 1 1 1 1 ...
 $ date          : Date, format: "2017-01-01" "2017-01-01" ...
 $ weekend       : logi  TRUE TRUE TRUE TRUE TRUE TRUE .

In [37]:
# Append API and hist data
# Stacks the API table on top of the historical table; both carry the same
# 15 column names.
# NOTE(review): stationName/stationId are factors in the API table but
# character in the historical table (see the str() outputs), so the result
# depends on rbind's coercion -- confirm the combined factor levels are as
# intended.
ld_aq_gm_combined_data <- rbind(ld_aq_gm_data, ld_aq_gm_hist_data)

In [38]:
# Inspect the combined table and the date span it covers
str(ld_aq_gm_combined_data)
summarize(ld_aq_gm_combined_data, min_date = min(date), max_date = max(date))

'data.frame':	280675 obs. of  15 variables:
 $ stationName   : Factor w/ 10 levels "london_grid_346",..: 2 2 2 2 2 2 2 2 2 2 ...
 $ utc_time      : POSIXct, format: "2018-03-31 07:00:00" "2018-03-31 08:00:00" ...
 $ stationId     : Factor w/ 24 levels "BL0","BX1","BX9",..: 18 18 18 18 18 18 18 18 18 18 ...
 $ PM2.5         : num  8.7 6.8 4.5 8.2 11.8 10.2 11.6 10.3 8.8 8.6 ...
 $ PM10          : num  NA NA NA NA NA NA NA NA NA NA ...
 $ O3            : num  NA NA NA NA NA NA NA NA NA NA ...
 $ temperature   : num  4.83 5.42 6.05 6.1 7.05 7.8 8.08 8.44 8.57 8.02 ...
 $ pressure      : num  987 987 988 989 992 ...
 $ humidity      : num  87 84 81 79 74 71 73 71 70 73 ...
 $ wind_direction: num  205 210 214 234 242 ...
 $ wind_speed.kph: num  11.3 11.95 12.01 10.18 7.95 ...
 $ hour          : int  7 8 9 10 11 12 13 14 15 16 ...
 $ month         : num  3 3 3 3 3 3 3 3 3 3 ...
 $ date          : Date, format: "2018-03-31" "2018-03-31" ...
 $ weekend       : logi  TRUE TRUE TRUE TRUE TRUE TR

min_date,max_date
2017-01-01,2018-05-20


In [39]:
# Keep only the rows that have no missing value in any column
has_all_values <- complete.cases(ld_aq_gm_combined_data)
ld_aq_gm_combined_data_final <- ld_aq_gm_combined_data[has_all_values, ]

In [43]:
# Encode the logical weekend flag as an integer for modeling
ld_aq_gm_combined_data_final[["weekend"]] <- as.integer(
  ld_aq_gm_combined_data_final[["weekend"]]
)

In [44]:
# Per-column summary of the cleaned modeling table
summary(ld_aq_gm_combined_data_final)

          stationName       utc_time                     stationId    
 london_grid_409:27337   Min.   :2017-01-01 08:00:00   GR9    :10284  
 london_grid_451:26318   1st Qu.:2017-04-05 14:00:00   BL0    :10059  
 london_grid_430:24847   Median :2017-07-15 01:00:00   CD9    : 9628  
 london_grid_472:13347   Mean   :2017-07-30 12:18:48   CD1    : 9552  
 london_grid_388: 9798   3rd Qu.:2017-11-22 17:00:00   RB7    : 9480  
 london_grid_452: 9480   Max.   :2018-03-27 12:00:00   GN3    : 9452  
 (Other)        :14478                                 (Other):67150  
     PM2.5             PM10              O3          temperature   
 Min.   :-14.00   Min.   :-11.80   Min.   : -4.70   Min.   :-4.37  
 1st Qu.:  6.50   1st Qu.: 11.40   1st Qu.: 20.50   1st Qu.: 5.54  
 Median : 10.10   Median : 16.90   Median : 35.80   Median : 9.52  
 Mean   : 13.44   Mean   : 20.41   Mean   : 41.21   Mean   :10.06  
 3rd Qu.: 16.40   3rd Qu.: 25.50   3rd Qu.: 55.70   3rd Qu.:14.55  
 Max.   :189.70   Max.  

### Model Training and validation

In [45]:
# Fix the RNG seed so the random train/test split below is reproducible
set.seed(7821)

In [46]:
# Random 80/20 split of the cleaned table into training and test rows
n_obs <- nrow(ld_aq_gm_combined_data_final)
sample_size <- floor(0.8 * n_obs)
train_index <- sample(seq_len(n_obs), size = sample_size)
train_ld <- ld_aq_gm_combined_data_final[train_index, ]
test_ld <- ld_aq_gm_combined_data_final[-train_index, ]

In [47]:
# Check the size and column types of the training and test partitions
str(train_ld)
str(test_ld)

'data.frame':	100484 obs. of  15 variables:
 $ stationName   : Factor w/ 10 levels "london_grid_346",..: 7 7 8 4 5 9 4 6 9 6 ...
 $ utc_time      : POSIXct, format: "2018-01-15 05:00:00" "2017-05-04 05:00:00" ...
 $ stationId     : Factor w/ 24 levels "BL0","BX1","BX9",..: 11 11 9 4 17 16 4 5 16 5 ...
 $ PM2.5         : num  5.2 27.9 28.8 13.7 57 11 15.8 6.7 12 5.8 ...
 $ PM10          : num  13.2 46.3 33.7 17 76 10.6 34 13.5 14.6 8.1 ...
 $ O3            : num  14.4 44.5 28.6 46.4 32.8 9.4 50.6 41.6 44.2 74.5 ...
 $ temperature   : num  4.96 8.98 11.86 6.64 3.76 ...
 $ pressure      : num  1012 1019 1007 1003 1015 ...
 $ humidity      : num  74 93.5 85 79.6 92.8 ...
 $ wind_direction: num  216.1 33.8 183.6 154.8 86.2 ...
 $ wind_speed.kph: num  12.08 17.74 6.85 15.87 26.62 ...
 $ hour          : int  5 5 2 14 9 9 12 20 19 6 ...
 $ month         : num  1 5 11 3 2 8 2 7 10 2 ...
 $ date          : Date, format: "2018-01-15" "2017-05-04" ...
 $ weekend       : int  0 0 1 0 0 1 1 0 0 0 ..

In [79]:
# Linear model for PM2.5 on stationId, hour, month and weekend: fit on the
# training rows, then predict and score on the test rows
PM2.5_ld_formula <- as.formula("PM2.5 ~ stationId + hour + month + weekend")
PM2.5_ld_model <- train(PM2.5_ld_formula, data = train_ld, method = "lm")
summary(PM2.5_ld_model)
test_ld[["PM2.5_pred"]] <- predict(PM2.5_ld_model, newdata = test_ld)
test_ld %>% metrics(truth = PM2.5, estimate = PM2.5_pred)

"prediction from a rank-deficient fit may be misleading"


Call:
lm(formula = .outcome ~ ., data = dat)

Residuals:
    Min      1Q  Median      3Q     Max 
-28.894  -6.601  -2.915   2.889 173.658 

Coefficients: (7 not defined because of singularities)
             Estimate Std. Error t value Pr(>|t|)    
(Intercept)  16.98701    0.15190 111.834  < 2e-16 ***
stationIdBX1 -3.27682    0.23631 -13.867  < 2e-16 ***
stationIdBX9       NA         NA      NA       NA    
stationIdCD1  1.36759    0.17914   7.634 2.29e-14 ***
stationIdCD9 -0.70900    0.17847  -3.973 7.12e-05 ***
stationIdCT2       NA         NA      NA       NA    
stationIdCT3 -0.82535    0.19001  -4.344 1.40e-05 ***
stationIdGN0 -2.18403    0.17995 -12.137  < 2e-16 ***
stationIdGN3 -2.33722    0.17925 -13.039  < 2e-16 ***
stationIdGR4 -1.68955    0.19041  -8.873  < 2e-16 ***
stationIdGR9 -3.13022    0.17529 -17.858  < 2e-16 ***
stationIdHV1 -3.01384    0.17955 -16.786  < 2e-16 ***
stationIdKF1       NA         NA      NA       NA    
stationIdLW2  1.66583    0.21040   7.918 2.45e-1

"prediction from a rank-deficient fit may be misleading"

rmse,rsq
11.3191,0.04936939


In [80]:
# Linear model for PM10 on PM2.5, stationId, hour, month and weekend: fit on
# the training rows, then predict and score on the test rows
PM10_ld_formula <- as.formula("PM10 ~ PM2.5 + stationId + hour + month + weekend")
PM10_ld_model <- train(PM10_ld_formula, data = train_ld, method = "lm")
summary(PM10_ld_model)
test_ld[["PM10_pred"]] <- predict(PM10_ld_model, newdata = test_ld)
test_ld %>% metrics(truth = PM10, estimate = PM10_pred)

"prediction from a rank-deficient fit may be misleading"


Call:
lm(formula = .outcome ~ ., data = dat)

Residuals:
   Min     1Q Median     3Q    Max 
-65.78  -3.62  -0.70   2.70 611.78 

Coefficients: (7 not defined because of singularities)
              Estimate Std. Error t value Pr(>|t|)    
(Intercept)   4.206195   0.102149  41.177  < 2e-16 ***
PM2.5         1.010366   0.002001 504.978  < 2e-16 ***
stationIdBX1 -3.137778   0.150006 -20.918  < 2e-16 ***
stationIdBX9        NA         NA      NA       NA    
stationIdCD1  0.234062   0.113640   2.060   0.0394 *  
stationIdCD9  2.305246   0.113194  20.366  < 2e-16 ***
stationIdCT2        NA         NA      NA       NA    
stationIdCT3  4.025365   0.120514  33.402  < 2e-16 ***
stationIdGN0  1.700674   0.114206  14.891  < 2e-16 ***
stationIdGN3  3.055402   0.113772  26.856  < 2e-16 ***
stationIdGR4  2.021493   0.120804  16.734  < 2e-16 ***
stationIdGR9  5.523282   0.111339  49.608  < 2e-16 ***
stationIdHV1  1.831769   0.114024  16.065  < 2e-16 ***
stationIdKF1        NA         NA      NA   

"prediction from a rank-deficient fit may be misleading"

rmse,rsq
6.692685,0.7684861


In [81]:
# Linear model for O3 on PM2.5, PM10, hour, month and weekend (stationId is
# not a predictor in this formula): fit on the training rows, then predict
# and score on the test rows
O3_ld_formula <- as.formula("O3 ~ PM2.5 + PM10 + hour + month + weekend")
O3_ld_model <- train(O3_ld_formula, data = train_ld, method = "lm")
summary(O3_ld_model)
test_ld[["O3_pred"]] <- predict(O3_ld_model, newdata = test_ld)
test_ld %>% metrics(truth = O3, estimate = O3_pred)


Call:
lm(formula = .outcome ~ ., data = dat)

Residuals:
    Min      1Q  Median      3Q     Max 
-399.14  -16.79   -4.32   12.55  173.34 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 19.971087   0.229706   86.94  < 2e-16 ***
PM2.5        0.346505   0.011860   29.22  < 2e-16 ***
PM10         0.565391   0.009711   58.22  < 2e-16 ***
hour         0.556025   0.010972   50.68  < 2e-16 ***
month        0.059144   0.022148    2.67  0.00758 ** 
weekend     -5.701797   0.168545  -33.83  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 24.04 on 100478 degrees of freedom
Multiple R-squared:  0.2174,	Adjusted R-squared:  0.2174 
F-statistic:  5584 on 5 and 100478 DF,  p-value: < 2.2e-16


rmse,rsq
23.95667,0.2118861


### Building the next 2 days dataset and predicting PM2.5, PM10 and O3 values

In [134]:
# Switch the session time zone to GMT so the date-times built below are
# consistent, then take tomorrow's date as the forecast start
Sys.setenv(TZ = "GMT")
tomorrow <- Sys.Date() + 1

In [135]:
# Hourly timestamps for the next 2 days: 48 steps starting from tomorrow
ld_time <- seq(from = as.POSIXct(tomorrow), by = "1 hour", length.out = 48)
ld_time <- with_tz(ld_time, tzone = "UTC")
# id is the 0-based position of each hour; it is used later to build the
# test_id of the final submission file
ld_future_data <- data.frame(
  utc_time = ld_time,
  id = seq_along(ld_time) - 1
)
# The two tables share no column name, so merge() returns every combination
# of timestamp and station
ld_future_data <- merge(
  ld_future_data,
  data.frame(stationId = unique(ld_aq_gm_combined_data$stationId))
)

In [136]:
# Same calendar features as the training data, for the forecast rows
ld_future_data[["hour"]] <- hour(ld_future_data[["utc_time"]])
ld_future_data[["month"]] <- month(ld_future_data[["utc_time"]])
ld_future_data[["date"]] <- date(ld_future_data[["utc_time"]])
ld_future_data[["weekend"]] <- chron::is.weekend(ld_future_data[["date"]])

In [137]:
# Encode the weekend flag as an integer, as in the training data
ld_future_data[["weekend"]] <- as.integer(ld_future_data[["weekend"]])

In [138]:
# Predict the three pollutants for the forecast rows. The order matters:
# the PM10 model uses the PM2.5 column, and the O3 model uses both PM2.5
# and PM10, so each prediction is stored before the next model is applied.
ld_future_data[["PM2.5"]] <- predict(PM2.5_ld_model, newdata = ld_future_data)
ld_future_data[["PM10"]] <- predict(PM10_ld_model, newdata = ld_future_data)
ld_future_data[["O3"]] <- predict(O3_ld_model, newdata = ld_future_data)

"prediction from a rank-deficient fit may be misleading"

In [139]:
# Submission key: station id and hour id joined by "#"
ld_future_data[["test_id"]] <- paste0(
  ld_future_data[["stationId"]], "#", ld_future_data[["id"]]
)

In [140]:
# Check column names and types of the forecast table
str(ld_future_data)

'data.frame':	1152 obs. of  11 variables:
 $ utc_time : POSIXct, format: "2018-05-22 00:00:00" "2018-05-22 01:00:00" ...
 $ id       : num  0 1 2 3 4 5 6 7 8 9 ...
 $ stationId: Factor w/ 24 levels "BL0","BX1","BX9",..: 18 18 18 18 18 18 18 18 18 18 ...
 $ hour     : int  0 1 2 3 4 5 6 7 8 9 ...
 $ month    : num  5 5 5 5 5 5 5 5 5 5 ...
 $ date     : Date, format: "2018-05-22" "2018-05-22" ...
 $ weekend  : int  0 0 0 0 0 0 0 0 0 0 ...
 $ PM2.5    : num  13.9 14 14.1 14.2 14.3 ...
 $ PM10     : num  18.1 18.2 18.4 18.6 18.7 ...
 $ O3       : num  35.3 36 36.7 37.3 38 ...
 $ test_id  : chr  "TD5#0" "TD5#1" "TD5#2" "TD5#3" ...


In [141]:
# Preview the first 20 forecast rows
head(ld_future_data, 20)

utc_time,id,stationId,hour,month,date,weekend,PM2.5,PM10,O3,test_id
2018-05-22 00:00:00,0,TD5,0,5,2018-05-22,0,13.93636,18.09443,35.32625,TD5#0
2018-05-22 01:00:00,1,TD5,1,5,2018-05-22,0,14.01674,18.24936,35.99772,TD5#1
2018-05-22 02:00:00,2,TD5,2,5,2018-05-22,0,14.09713,18.40429,36.66919,TD5#2
2018-05-22 03:00:00,3,TD5,3,5,2018-05-22,0,14.17751,18.55921,37.34066,TD5#3
2018-05-22 04:00:00,4,TD5,4,5,2018-05-22,0,14.25789,18.71414,38.01213,TD5#4
2018-05-22 05:00:00,5,TD5,5,5,2018-05-22,0,14.33827,18.86907,38.6836,TD5#5
2018-05-22 06:00:00,6,TD5,6,5,2018-05-22,0,14.41865,19.02399,39.35508,TD5#6
2018-05-22 07:00:00,7,TD5,7,5,2018-05-22,0,14.49903,19.17892,40.02655,TD5#7
2018-05-22 08:00:00,8,TD5,8,5,2018-05-22,0,14.57942,19.33384,40.69802,TD5#8
2018-05-22 09:00:00,9,TD5,9,5,2018-05-22,0,14.6598,19.48877,41.36949,TD5#9


In [142]:
# Write the submission file: test_id plus the three predicted pollutants.
# The file name is built with paste0(); paste() with its default sep = " "
# produced a name with embedded spaces ("ld_submission <date> .csv").
submission_file <- paste0("ld_submission_", Sys.Date(), ".csv")
write.csv(
  ld_future_data[, c("test_id", "PM2.5", "PM10", "O3")],
  file = submission_file,
  row.names = FALSE
)

In [143]:
# resetting the timezone
# Removes the TZ environment variable that this notebook set to 'GMT'.
# NOTE(review): this unsets TZ rather than restoring a previous value; if TZ
# was already set before the notebook ran, that setting is lost.
Sys.unsetenv("TZ")