### Import and transformations of API data

In [1]:
# Reset the interactive session: close the active graphics device (if one is
# open), remove every object from the workspace, and clear the console.
if (!is.null(dev.list())) {
  dev.off() # Clear Plots
}
rm(list = ls()) # Clear objects from Memory
cat("\014") # Clear Console
# writeClipboard(as.character(x)) # copy data frame to clipboard



In [None]:
library(RCurl)
library(sqldf)
library(digest)
library(dplyr)
library(anytime)
library(geosphere)
library(lubridate)
library(chron)
require(caret)
require(rattle)
require(yardstick)

In [3]:
# set working directory
# NOTE(review): absolute, machine-specific path. The relative file paths read
# later in this notebook are resolved against it.
setwd("C:/Users/vanethi/Documents/GitHub/DS420_Factoria")

In [4]:
# Date window requested from the API; both values are pasted verbatim into the
# request URLs.
# NOTE(review): the trailing "-0" looks like an hour component -- confirm.
startDate <- "2017-12-31-0"
endDate <- "2018-06-01-0"

In [6]:
# pull data for Beijing
# NOTE(review): both requests disable TLS host/peer verification and embed what
# appears to be an access key ("2k0d1d8") in the URL -- consider re-enabling
# verification and moving the key out of the source.

# acquire air quality data
bj_aq_url <- paste0(
  "https://biendata.com/competition/airquality/bj/",
  startDate, "/", endDate, "/2k0d1d8"
)
bj_aq_file <- getURL(bj_aq_url, ssl.verifyhost = FALSE, ssl.verifypeer = FALSE)
# read.csv(text = ...) opens and closes its own connection; the previous
# textConnection() wrapper was never closed.
bj_aq_data <- read.csv(text = bj_aq_file, header = TRUE)

# acquire API grid meteorology data
bj_gm_url <- paste0(
  "https://biendata.com/competition/meteorology/bj_grid/",
  startDate, "/", endDate, "/2k0d1d8"
)
bj_gm_file <- getURL(bj_gm_url, ssl.verifyhost = FALSE, ssl.verifypeer = FALSE)
bj_gm_data <- read.csv(text = bj_gm_file, header = TRUE)

In [7]:
# Names of every data frame currently bound in the global environment
df.list <- .GlobalEnv %>%
  eapply(is.data.frame) %>%
  unlist() %>%
  which() %>%
  names()
df.list

In [8]:
# Parse the `time` column of both API frames into date-times with anytime()
bj_gm_data[["time"]] <- anytime(bj_gm_data[["time"]])
bj_aq_data[["time"]] <- anytime(bj_aq_data[["time"]])

In [9]:
# Print the name and structure of every data frame listed in df.list.
# seq_along() keeps the loop from running when df.list is empty (1:length()
# would iterate over c(1, 0)). str() is called directly because it prints its
# own output and returns NULL, which print(str(...)) echoed as a stray "NULL".
for (i in seq_along(df.list)) {
  print(df.list[i])
  str(get(df.list[i]))
}

[1] "bj_gm_data"
'data.frame':	785704 obs. of  9 variables:
 $ id            : int  2000958 2000959 2000960 2000961 2000962 2000963 2000964 2000965 2000966 2000967 ...
 $ station_id    : Factor w/ 651 levels "beijing_grid_000",..: 1 2 3 4 5 6 7 8 9 10 ...
 $ time          : POSIXct, format: "2018-03-31 07:00:00" "2018-03-31 07:00:00" ...
 $ weather       : Factor w/ 9 levels "CLEAR_DAY","CLEAR_NIGHT",..: 5 5 5 5 5 5 5 5 5 5 ...
 $ temperature   : num  24 24 25 25 25 25 25 26 22.4 22.4 ...
 $ pressure      : num  973 960 946 931 914 ...
 $ humidity      : num  23 21 19 18 16 15 15 16 16 17 ...
 $ wind_direction: num  161 165 170 184 221 ...
 $ wind_speed    : num  14.28 13.3 12.42 9.84 7.39 ...
NULL
[1] "bj_aq_data"
'data.frame':	40495 obs. of  9 variables:
 $ id                : int  2941450 2941451 2941452 2941453 2941454 2941455 2941456 2941457 2941458 2941459 ...
 $ station_id        : Factor w/ 35 levels "aotizhongxin_aq",..: 7 24 11 27 1 19 26 3 35 10 ...
 $ time              : PO

In [10]:
# Report the earliest and latest timestamp found in each API data set
print("bj_gm_data")
summarize(bj_gm_data, min_date = min(time), max_date = max(time))
print("bj_aq_data")
summarize(bj_aq_data, min_date = min(time), max_date = max(time))

[1] "bj_gm_data"


min_date,max_date
2018-03-31 07:00:00,2018-05-21


[1] "bj_aq_data"


min_date,max_date
2018-03-31 07:00:00,2018-05-20 23:00:00


In [11]:
# Drop the columns that are not used downstream
bj_aq_data <- select(bj_aq_data, -id)
bj_gm_data <- select(bj_gm_data, -id, -weather)

In [12]:
# Beijing closest grids to stations: lookup table read from CSV, with its four
# columns renamed by position
bj_closest_stations <- read.csv("SL_beijing_closest_stations.csv")
names(bj_closest_stations) <- c("x", "stationId", "stationName", "distance")

In [14]:
# Rename columns by position so the API frames share key names
# (stationId / stationName / utc_time) with the closest-stations lookup
names(bj_aq_data) <- c(
  "stationId", "utc_time", "PM2.5", "PM10", "NO2", "CO", "O3", "SO2"
)
names(bj_gm_data) <- c(
  "stationName", "utc_time", "temperature", "pressure", "humidity",
  "wind_direction", "wind_speed.kph"
)

In [15]:
# Inspect the structure of the lookup table and both API frames, then preview
# the first rows of the lookup table
str(bj_closest_stations)
str(bj_gm_data)
str(bj_aq_data)
head(bj_closest_stations)

'data.frame':	35 obs. of  4 variables:
 $ x          : int  1 2 3 4 5 6 7 8 9 10 ...
 $ stationId  : Factor w/ 35 levels "aotizhongxin_aq",..: 7 24 11 27 1 19 26 3 35 10 ...
 $ stationName: Factor w/ 26 levels "beijing_grid_216",..: 16 16 13 16 17 19 14 9 8 13 ...
 $ distance   : num  3540 1669 4638 4776 2020 ...
'data.frame':	785704 obs. of  7 variables:
 $ stationName   : Factor w/ 651 levels "beijing_grid_000",..: 1 2 3 4 5 6 7 8 9 10 ...
 $ utc_time      : POSIXct, format: "2018-03-31 07:00:00" "2018-03-31 07:00:00" ...
 $ temperature   : num  24 24 25 25 25 25 25 26 22.4 22.4 ...
 $ pressure      : num  973 960 946 931 914 ...
 $ humidity      : num  23 21 19 18 16 15 15 16 16 17 ...
 $ wind_direction: num  161 165 170 184 221 ...
 $ wind_speed.kph: num  14.28 13.3 12.42 9.84 7.39 ...
'data.frame':	40495 obs. of  8 variables:
 $ stationId: Factor w/ 35 levels "aotizhongxin_aq",..: 7 24 11 27 1 19 26 3 35 10 ...
 $ utc_time : POSIXct, format: "2018-03-31 07:00:00" "2018-03-31 07:00

x,stationId,stationName,distance
1,dongsi_aq,beijing_grid_303,3539.569
2,tiantan_aq,beijing_grid_303,1669.215
3,guanyuan_aq,beijing_grid_282,4637.889
4,wanshouxigong_aq,beijing_grid_303,4775.641
5,aotizhongxin_aq,beijing_grid_304,2020.02
6,nongzhanguan_aq,beijing_grid_324,5296.386


In [16]:
# Mapping stationIds with respective grids: attach each air-quality row's
# closest grid (stationName) by joining on stationId
bj_aq_map <- merge(x = bj_aq_data, y = bj_closest_stations, by = "stationId")

In [17]:
# Merge of AirQuality and Meteorology data, matched on grid name and timestamp
bj_aq_gm_data <- merge(
  x = bj_aq_map,
  y = bj_gm_data,
  by = c("stationName", "utc_time")
)

In [18]:
# Inspect the merged frame: structure, first rows, and covered time range
str(bj_aq_gm_data)
head(bj_aq_gm_data)
bj_aq_gm_data %>% summarize(min_date = min(utc_time), max_date = max(utc_time))

'data.frame':	40177 obs. of  16 variables:
 $ stationName   : Factor w/ 26 levels "beijing_grid_216",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ utc_time      : POSIXct, format: "2018-03-31 07:00:00" "2018-03-31 08:00:00" ...
 $ stationId     : Factor w/ 35 levels "aotizhongxin_aq",..: 14 14 14 14 14 14 14 14 14 14 ...
 $ PM2.5         : num  105 120 121 137 146 270 160 180 136 149 ...
 $ PM10          : num  163 185 185 212 254 323 297 280 204 219 ...
 $ NO2           : num  45 52 55 59 68 59 85 76 61 87 ...
 $ CO            : num  0.8 0.9 0.9 1 1 1.1 1.1 1.1 1.1 1.2 ...
 $ O3            : num  137 140 141 132 106 96 46 45 50 7 ...
 $ SO2           : num  6 NA NA NA NA 20 8 NA NA NA ...
 $ x             : int  30 30 30 30 30 30 30 30 30 30 ...
 $ distance      : num  2226 2226 2226 2226 2226 ...
 $ temperature   : num  22 23 23 24 20 19 17 13 14 12.4 ...
 $ pressure      : num  1001 1000 1000 999 1000 ...
 $ humidity      : num  20 20 22 24 28 31 32 32 31 32 ...
 $ wind_direction: num  159 174 188

stationName,utc_time,stationId,PM2.5,PM10,NO2,CO,O3,SO2,x,distance,temperature,pressure,humidity,wind_direction,wind_speed.kph
beijing_grid_216,2018-03-31 07:00:00,liulihe_aq,105,163,45,0.8,137,6.0,30,2226.39,22,1000.8969,20,159.45,10.63
beijing_grid_216,2018-03-31 08:00:00,liulihe_aq,120,185,52,0.9,140,,30,2226.39,23,1000.058,20,173.73,10.96
beijing_grid_216,2018-03-31 09:00:00,liulihe_aq,121,185,55,0.9,141,,30,2226.39,23,999.5212,22,188.34,10.21
beijing_grid_216,2018-03-31 10:00:00,liulihe_aq,137,212,59,1.0,132,,30,2226.39,24,999.4476,24,204.63,9.83
beijing_grid_216,2018-03-31 11:00:00,liulihe_aq,146,254,68,1.0,106,,30,2226.39,20,999.6694,28,232.0,7.27
beijing_grid_216,2018-03-31 12:00:00,liulihe_aq,270,323,59,1.1,96,20.0,30,2226.39,19,1000.0671,31,268.92,6.59


min_date,max_date
2018-03-31 07:00:00,2018-05-20 23:00:00


In [19]:
# Keep only the identifier, pollutant and meteorology columns, in this order
bj_aq_gm_data <- select(
  bj_aq_gm_data,
  stationName, utc_time, stationId,
  PM2.5, PM10, O3,
  temperature, pressure, humidity, wind_direction, wind_speed.kph
)

In [20]:
# Calendar features derived from the timestamp, added as extra columns for
# modeling
bj_aq_gm_data[["hour"]] <- hour(bj_aq_gm_data[["utc_time"]])
bj_aq_gm_data[["month"]] <- month(bj_aq_gm_data[["utc_time"]])
bj_aq_gm_data[["date"]] <- date(bj_aq_gm_data[["utc_time"]])
bj_aq_gm_data[["weekend"]] <- chron::is.weekend(bj_aq_gm_data[["date"]])

In [21]:
# Inspect the API frame after column selection and feature creation
str(bj_aq_gm_data)

'data.frame':	40177 obs. of  15 variables:
 $ stationName   : Factor w/ 26 levels "beijing_grid_216",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ utc_time      : POSIXct, format: "2018-03-31 07:00:00" "2018-03-31 08:00:00" ...
 $ stationId     : Factor w/ 35 levels "aotizhongxin_aq",..: 14 14 14 14 14 14 14 14 14 14 ...
 $ PM2.5         : num  105 120 121 137 146 270 160 180 136 149 ...
 $ PM10          : num  163 185 185 212 254 323 297 280 204 219 ...
 $ O3            : num  137 140 141 132 106 96 46 45 50 7 ...
 $ temperature   : num  22 23 23 24 20 19 17 13 14 12.4 ...
 $ pressure      : num  1001 1000 1000 999 1000 ...
 $ humidity      : num  20 20 22 24 28 31 32 32 31 32 ...
 $ wind_direction: num  159 174 188 205 232 ...
 $ wind_speed.kph: num  10.63 10.96 10.21 9.83 7.27 ...
 $ hour          : int  7 8 9 10 11 12 13 14 15 17 ...
 $ month         : num  3 3 3 3 3 3 3 3 3 3 ...
 $ date          : Date, format: "2018-03-31" "2018-03-31" ...
 $ weekend       : logi  TRUE TRUE TRUE TRUE TRUE TRUE

### Import and transformations of Historical data

In [22]:
# Retrieving the historical file (path is relative to the working directory)
bj_aq_gm_hist_file <- "Ready for Modeling/bj_aq_gm_hist_data.csv"
bj_aq_gm_hist_data <- read.csv(
  file = bj_aq_gm_hist_file,
  header = TRUE,
  sep = ",",
  stringsAsFactors = FALSE
)

In [24]:
# Bring the historical frame in line with the API frame: drop the `X` column
# and parse the timestamp and date columns
bj_aq_gm_hist_data <- select(bj_aq_gm_hist_data, -X)
bj_aq_gm_hist_data[["utc_time"]] <- anytime(bj_aq_gm_hist_data[["utc_time"]])
bj_aq_gm_hist_data[["date"]] <- as.Date(
  bj_aq_gm_hist_data[["date"]],
  format = "%Y-%m-%d"
)

In [26]:
# Inspect the historical frame after the consistency transformations
str(bj_aq_gm_hist_data)

'data.frame':	356825 obs. of  15 variables:
 $ stationName   : chr  "beijing_grid_216" "beijing_grid_216" "beijing_grid_216" "beijing_grid_216" ...
 $ utc_time      : POSIXct, format: "2017-01-01 14:00:00" "2017-01-01 15:00:00" ...
 $ stationId     : chr  "liulihe_aq" "liulihe_aq" "liulihe_aq" "liulihe_aq" ...
 $ PM2.5         : int  376 369 361 354 356 315 287 254 231 224 ...
 $ PM10          : int  447 407 389 NA 360 NA NA NA NA NA ...
 $ O3            : int  2 2 2 2 2 2 NA 2 2 2 ...
 $ temperature   : num  -2.22 -2.37 -2.58 -2.79 -2.99 -3.3 -3.6 -3.9 -4.11 -4.33 ...
 $ pressure      : num  1015 1015 1015 1015 1015 ...
 $ humidity      : num  62.9 65.8 65.3 64.9 64.5 ...
 $ wind_direction: num  284 298 308 319 330 ...
 $ wind_speed.kph: num  4.28 4.39 4.19 4.14 4.24 4.1 3.99 3.92 2.97 2.25 ...
 $ hour          : int  14 15 16 17 18 19 20 21 22 23 ...
 $ month         : int  1 1 1 1 1 1 1 1 1 1 ...
 $ date          : Date, format: "2017-01-01" "2017-01-01" ...
 $ weekend       : logi 

In [27]:
# Append API and hist data (API rows first, historical rows after)
# NOTE(review): the str() output shows stationName/stationId as factors in the
# API frame and as character in the historical frame; the combined frame
# reports them as factors. Confirm no station is lost in that coercion.
bj_aq_gm_combined_data <- rbind(bj_aq_gm_data, bj_aq_gm_hist_data)

In [28]:
# Inspect the combined frame and the date range it covers
str(bj_aq_gm_combined_data)
bj_aq_gm_combined_data %>% summarize(min_date = min(date), max_date = max(date))

'data.frame':	397002 obs. of  15 variables:
 $ stationName   : Factor w/ 26 levels "beijing_grid_216",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ utc_time      : POSIXct, format: "2018-03-31 07:00:00" "2018-03-31 08:00:00" ...
 $ stationId     : Factor w/ 35 levels "aotizhongxin_aq",..: 14 14 14 14 14 14 14 14 14 14 ...
 $ PM2.5         : num  105 120 121 137 146 270 160 180 136 149 ...
 $ PM10          : num  163 185 185 212 254 323 297 280 204 219 ...
 $ O3            : num  137 140 141 132 106 96 46 45 50 7 ...
 $ temperature   : num  22 23 23 24 20 19 17 13 14 12.4 ...
 $ pressure      : num  1001 1000 1000 999 1000 ...
 $ humidity      : num  20 20 22 24 28 31 32 32 31 32 ...
 $ wind_direction: num  159 174 188 205 232 ...
 $ wind_speed.kph: num  10.63 10.96 10.21 9.83 7.27 ...
 $ hour          : int  7 8 9 10 11 12 13 14 15 17 ...
 $ month         : num  3 3 3 3 3 3 3 3 3 3 ...
 $ date          : Date, format: "2018-03-31" "2018-03-31" ...
 $ weekend       : logi  TRUE TRUE TRUE TRUE TRUE TRU

min_date,max_date
2017-01-01,2018-05-20


In [29]:
# Keep only the rows that have no missing value in any column
bj_aq_gm_combined_data_final <- subset(
  bj_aq_gm_combined_data,
  complete.cases(bj_aq_gm_combined_data)
)

In [30]:
# Recode the logical weekend flag as integer (TRUE -> 1, FALSE -> 0) for
# modeling
bj_aq_gm_combined_data_final[["weekend"]] <-
  as.integer(bj_aq_gm_combined_data_final[["weekend"]])

In [31]:
# Per-column summary of the cleaned modeling frame
summary(bj_aq_gm_combined_data_final)

           stationName        utc_time                  
 beijing_grid_303: 49146   Min.   :2017-01-01 14:00:00  
 beijing_grid_282: 17856   1st Qu.:2017-05-12 18:00:00  
 beijing_grid_283: 16728   Median :2017-10-03 11:00:00  
 beijing_grid_324: 16126   Mean   :2017-09-23 09:13:04  
 beijing_grid_452: 12934   3rd Qu.:2018-01-20 09:00:00  
 beijing_grid_301:  9683   Max.   :2018-05-20 20:00:00  
 (Other)         :162811                                
             stationId          PM2.5              PM10              O3       
 daxing_aq        :  9683   Min.   :   2.00   Min.   :   5.0   Min.   :  1.0  
 badaling_aq      :  9281   1st Qu.:  15.00   1st Qu.:  40.0   1st Qu.: 14.0  
 fangshan_aq      :  9233   Median :  36.00   Median :  74.0   Median : 49.0  
 mentougou_aq     :  9134   Mean   :  52.15   Mean   :  92.8   Mean   : 58.4  
 gucheng_aq       :  9001   3rd Qu.:  70.00   3rd Qu.: 120.0   3rd Qu.: 82.0  
 fengtaihuayuan_aq:  8928   Max.   :1000.00   Max.   :3000.0   Max.   

### Model Training and validation

In [32]:
# Fix the RNG seed so that the random sampling done later in this notebook is
# reproducible
set.seed(7821)

In [33]:
# Random 80/20 partition of the cleaned frame into training and test sets
sample_size <- floor(nrow(bj_aq_gm_combined_data_final) * 0.8)
train_index <- sample.int(
  nrow(bj_aq_gm_combined_data_final),
  size = sample_size
)
train_bj <- bj_aq_gm_combined_data_final[train_index, ]
test_bj <- bj_aq_gm_combined_data_final[-train_index, ]

In [61]:
# Inspect the training and test partitions
str(train_bj)
str(test_bj)

'data.frame':	228204 obs. of  15 variables:
 $ stationName   : Factor w/ 26 levels "beijing_grid_216",..: 16 13 18 1 2 21 1 7 22 5 ...
 $ utc_time      : POSIXct, format: "2017-03-03 09:00:00" "2018-02-03 16:00:00" ...
 $ stationId     : Factor w/ 35 levels "aotizhongxin_aq",..: 27 11 30 14 2 25 14 12 23 34 ...
 $ PM2.5         : num  110 3 8 33 32 45 69 16 15 68 ...
 $ PM10          : num  168 8 104 59 45 92 116 57 77 139 ...
 $ O3            : num  50 66 76 63 190 22 4 79 49 86 ...
 $ temperature   : num  12.2 -4.5 17 13.1 28.8 ...
 $ pressure      : num  1009 1029 1016 1009 933 ...
 $ humidity      : num  24.2 21.7 15.9 27 41.7 ...
 $ wind_direction: num  112 341 329 257 165 ...
 $ wind_speed.kph: num  17.43 11.98 21.33 8.48 14.83 ...
 $ hour          : int  9 16 8 18 7 3 17 1 16 5 ...
 $ month         : num  3 2 11 3 6 12 1 12 4 4 ...
 $ date          : Date, format: "2017-03-03" "2018-02-03" ...
 $ weekend       : int  0 1 0 1 0 0 0 1 0 0 ...
'data.frame':	57052 obs. of  15 variab

In [47]:
# Linear model for PM2.5 with stationId, hour, month and weekend as
# predictors; fitted on the training set and scored on the test set
PM2.5_bj_formula <- PM2.5 ~ stationId + hour + month + weekend
PM2.5_bj_model <- train(PM2.5_bj_formula, data = train_bj, method = "lm")
summary(PM2.5_bj_model)
test_bj[["PM2.5_pred"]] <- predict(PM2.5_bj_model, newdata = test_bj)
metrics(test_bj, truth = PM2.5, estimate = PM2.5_pred)


Call:
lm(formula = .outcome ~ ., data = dat)

Residuals:
   Min     1Q Median     3Q    Max 
-85.02 -34.25 -14.83  18.23 932.52 

Coefficients:
                            Estimate Std. Error t value Pr(>|t|)    
(Intercept)                 57.32279    0.73502  77.988  < 2e-16 ***
stationIdbadaling_aq        -6.02534    0.94053  -6.406 1.49e-10 ***
stationIdbeibuxinqu_aq       2.42267    0.97007   2.497 0.012511 *  
stationIddaxing_aq          10.91906    0.93409  11.690  < 2e-16 ***
stationIddingling_aq       -11.49854    0.96206 -11.952  < 2e-16 ***
stationIddonggaocun_aq     -12.76691    1.05944 -12.051  < 2e-16 ***
stationIddongsi_aq           7.80218    0.95321   8.185 2.73e-16 ***
stationIddongsihuan_aq       8.24017    0.99612   8.272  < 2e-16 ***
stationIdfangshan_aq         7.89979    0.94449   8.364  < 2e-16 ***
stationIdfengtaihuayuan_aq   8.95377    0.95010   9.424  < 2e-16 ***
stationIdguanyuan_aq         5.26709    0.95255   5.529 3.22e-08 ***
stationIdgucheng_aq        

rmse,rsq
55.65508,0.03233573


In [48]:
# Linear model for PM10 with PM2.5, stationId, hour, month and weekend as
# predictors. Training and test scoring use the PM2.5 column of train_bj and
# test_bj.
PM10_bj_formula <- PM10 ~ PM2.5 + stationId + hour + month + weekend
PM10_bj_model <- train(PM10_bj_formula, data = train_bj, method = "lm")
summary(PM10_bj_model)
test_bj[["PM10_pred"]] <- predict(PM10_bj_model, newdata = test_bj)
metrics(test_bj, truth = PM10, estimate = PM10_pred)


Call:
lm(formula = .outcome ~ ., data = dat)

Residuals:
    Min      1Q  Median      3Q     Max 
-289.29  -22.94   -9.60   10.34 2861.80 

Coefficients:
                            Estimate Std. Error t value Pr(>|t|)    
(Intercept)                26.161289   0.665040  39.338  < 2e-16 ***
PM2.5                       1.249117   0.001869 668.215  < 2e-16 ***
stationIdbadaling_aq       13.900738   0.839936  16.550  < 2e-16 ***
stationIdbeibuxinqu_aq     13.256318   0.866249  15.303  < 2e-16 ***
stationIddaxing_aq         12.073491   0.834355  14.470  < 2e-16 ***
stationIddingling_aq       -2.831597   0.859357  -3.295 0.000984 ***
stationIddonggaocun_aq     -4.925737   0.946345  -5.205 1.94e-07 ***
stationIddongsi_aq          2.048812   0.851309   2.407 0.016100 *  
stationIddongsihuan_aq     12.766482   0.889634  14.350  < 2e-16 ***
stationIdfangshan_aq       15.091879   0.843526  17.891  < 2e-16 ***
stationIdfengtaihuayuan_aq  8.195970   0.848570   9.659  < 2e-16 ***
stationIdguanyuan

rmse,rsq
49.85478,0.668695


In [49]:
# Linear model for O3 with PM2.5, PM10, stationId, hour, month and weekend as
# predictors (O3 itself is the response, not a predictor)
O3_bj_formula <- O3 ~ PM2.5 + PM10 + stationId + hour + month + weekend
O3_bj_model <- train(O3_bj_formula, data = train_bj, method = "lm")
summary(O3_bj_model)
test_bj[["O3_pred"]] <- predict(O3_bj_model, newdata = test_bj)
metrics(test_bj, truth = O3, estimate = O3_pred)


Call:
lm(formula = .outcome ~ ., data = dat)

Residuals:
    Min      1Q  Median      3Q     Max 
-229.15  -34.36   -9.92   19.40  483.59 

Coefficients:
                             Estimate Std. Error  t value Pr(>|t|)    
(Intercept)                 87.387665   0.671343  130.169  < 2e-16 ***
PM2.5                       -0.178171   0.003234  -55.095  < 2e-16 ***
PM10                         0.104188   0.002106   49.469  < 2e-16 ***
stationIdbadaling_aq        -5.859957   0.845543   -6.930 4.21e-12 ***
stationIdbeibuxinqu_aq      -8.111106   0.871956   -9.302  < 2e-16 ***
stationIddaxing_aq          -4.254343   0.839806   -5.066 4.07e-07 ***
stationIddingling_aq        13.696761   0.864595   15.842  < 2e-16 ***
stationIddonggaocun_aq      11.630832   0.952147   12.215  < 2e-16 ***
stationIddongsi_aq          -0.801638   0.856489   -0.936 0.349296    
stationIddongsihuan_aq     -24.329869   0.895439  -27.171  < 2e-16 ***
stationIdfangshan_aq         1.210980   0.849242    1.426 0.1538

rmse,rsq
51.06523,0.1067947


### Building the next 2 days dataset and predicting PM2.5, PM10 and O3 values

In [60]:
# Set the session time zone to GMT for consistent datetime usage, then take
# tomorrow's date as the start of the forecast window
Sys.setenv(TZ = "GMT")
tomorrow <- Sys.Date() + 1

In [61]:
# Hourly timestamps for the 48 hours starting at `tomorrow`, expressed in UTC
bj_time <- seq(from = as.POSIXct(tomorrow), by = "1 hour", length.out = 48)
bj_time <- with_tz(bj_time, tzone = "UTC")
bj_future_data <- data.frame(utc_time = bj_time)
# This id (0-based hour index) will be used in the creation of the final
# submission file
bj_future_data$id <- seq_len(nrow(bj_future_data)) - 1
# by = NULL requests the Cartesian product: one row per (hour, station) pair
bj_future_data <- merge(
  bj_future_data,
  data.frame(stationId = unique(bj_aq_gm_combined_data$stationId)),
  by = NULL
)

In [62]:
# Derive the same calendar features for the forecast rows as for the
# training data
bj_future_data[["hour"]] <- hour(bj_future_data[["utc_time"]])
bj_future_data[["month"]] <- month(bj_future_data[["utc_time"]])
bj_future_data[["date"]] <- date(bj_future_data[["utc_time"]])
bj_future_data[["weekend"]] <- chron::is.weekend(bj_future_data[["date"]])

In [63]:
# Recode the logical weekend flag as integer, matching the training data
bj_future_data[["weekend"]] <- as.integer(bj_future_data[["weekend"]])

In [64]:
# Predict PM2.5, PM10 and O3 for the forecast rows. Order matters: the PM10
# model reads the PM2.5 column written on the first line, and the O3 model
# reads both the PM2.5 and PM10 columns.
bj_future_data[["PM2.5"]] <- predict(PM2.5_bj_model, newdata = bj_future_data)
bj_future_data[["PM10"]] <- predict(PM10_bj_model, newdata = bj_future_data)
bj_future_data[["O3"]] <- predict(O3_bj_model, newdata = bj_future_data)

In [65]:
# Build the submission key "<stationId>#<id>" for every forecast row
bj_future_data[["test_id"]] <- paste0(
  bj_future_data[["stationId"]], "#", bj_future_data[["id"]]
)

In [66]:
# Inspect the forecast frame, including the predicted columns and test_id
str(bj_future_data)

'data.frame':	1680 obs. of  11 variables:
 $ utc_time : POSIXct, format: "2018-05-22 00:00:00" "2018-05-22 01:00:00" ...
 $ id       : num  0 1 2 3 4 5 6 7 8 9 ...
 $ stationId: Factor w/ 35 levels "aotizhongxin_aq",..: 14 14 14 14 14 14 14 14 14 14 ...
 $ hour     : int  0 1 2 3 4 5 6 7 8 9 ...
 $ month    : num  5 5 5 5 5 5 5 5 5 5 ...
 $ date     : Date, format: "2018-05-22" "2018-05-22" ...
 $ weekend  : int  0 0 0 0 0 0 0 0 0 0 ...
 $ PM2.5    : num  79 79.1 79.2 79.3 79.4 ...
 $ PM10     : num  139 139 139 140 140 ...
 $ O3       : num  67.7 65.8 63.9 61.9 60 ...
 $ test_id  : chr  "liulihe_aq#0" "liulihe_aq#1" "liulihe_aq#2" "liulihe_aq#3" ...


In [67]:
# Preview the first 20 forecast rows
head(bj_future_data, 20)

utc_time,id,stationId,hour,month,date,weekend,PM2.5,PM10,O3,test_id
2018-05-22 00:00:00,0,liulihe_aq,0,5,2018-05-22,0,78.99788,139.4386,67.71018,liulihe_aq#0
2018-05-22 01:00:00,1,liulihe_aq,1,5,2018-05-22,0,79.09542,139.4686,65.78825,liulihe_aq#1
2018-05-22 02:00:00,2,liulihe_aq,2,5,2018-05-22,0,79.19295,139.4986,63.86632,liulihe_aq#2
2018-05-22 03:00:00,3,liulihe_aq,3,5,2018-05-22,0,79.29048,139.5285,61.94439,liulihe_aq#3
2018-05-22 04:00:00,4,liulihe_aq,4,5,2018-05-22,0,79.38802,139.5585,60.02246,liulihe_aq#4
2018-05-22 05:00:00,5,liulihe_aq,5,5,2018-05-22,0,79.48555,139.5885,58.10053,liulihe_aq#5
2018-05-22 06:00:00,6,liulihe_aq,6,5,2018-05-22,0,79.58308,139.6185,56.1786,liulihe_aq#6
2018-05-22 07:00:00,7,liulihe_aq,7,5,2018-05-22,0,79.68062,139.6485,54.25667,liulihe_aq#7
2018-05-22 08:00:00,8,liulihe_aq,8,5,2018-05-22,0,79.77815,139.6785,52.33474,liulihe_aq#8
2018-05-22 09:00:00,9,liulihe_aq,9,5,2018-05-22,0,79.87568,139.7085,50.41281,liulihe_aq#9


In [68]:
# Write the submission file (test_id plus the three predicted pollutants).
# paste0() is used because paste() inserts its default " " separator, which
# produced names such as "bj_submission 2018-05-22 .csv" with a space before
# the extension. The file is now named "bj_submission_<date>.csv".
write.csv(
  bj_future_data[, c("test_id", "PM2.5", "PM10", "O3")],
  file = paste0("bj_submission_", Sys.Date(), ".csv"),
  row.names = FALSE
)

In [69]:
# resetting the timezone
# NOTE(review): this removes the TZ environment variable rather than restoring
# whatever value it held before it was set to GMT earlier in this notebook.
Sys.unsetenv("TZ")