### Import and transformations of API data

In [1]:
# Reset the interactive session before the notebook runs.
# Close the current graphics device when one is open.
if (!is.null(dev.list())) {
  dev.off()
}
# Remove every object from the workspace.
rm(list = ls())
# Emit a form-feed character to clear the console.
cat("\014")
# writeClipboard(as.character(x)) # copy data frame to clipboard



In [2]:
# Package dependencies of the notebook, attached in the original order (the
# order decides which package masks which function).
# All packages are attached with library(): it stops with an error when a
# package is not installed, whereas require() only warns and returns FALSE,
# which postpones the failure to the first call that needs the package.
library(RCurl)
library(sqldf)
library(digest)
library(dplyr)
library(anytime)
library(geosphere)
library(lubridate)
library(chron)
library(caret)
library(rattle)
library(yardstick)

"package 'RCurl' was built under R version 3.4.4"Loading required package: bitops
Loading required package: gsubfn
Loading required package: proto
Could not load tcltk.  Will use slower R code instead.
Loading required package: RSQLite
"package 'dplyr' was built under R version 3.4.3"
Attaching package: 'dplyr'

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

"package 'lubridate' was built under R version 3.4.3"
Attaching package: 'lubridate'

The following object is masked from 'package:base':

    date

"package 'chron' was built under R version 3.4.3"
Attaching package: 'chron'

The following objects are masked from 'package:lubridate':

    days, hours, minutes, seconds, years

Loading required package: caret
"package 'caret' was built under R version 3.4.4"Loading required package: lattice
"package 'lattice' was built under R version 3.4.3"Loading required packa

In [3]:
# set working directory
# NOTE(review): absolute, user-specific path. The relative file paths passed to
# read.csv() further down are resolved against this directory.
setwd("C:/Users/vanethi/Documents/GitHub/DS420_Factoria")

In [4]:
# Date window that is pasted into the API URLs below.
# The values look like "YYYY-MM-DD-H"; presumably the last component is the
# hour -- confirm against the API description.
startDate <- "2017-12-31-0"
endDate <- "2018-06-01-0"

In [5]:
# pull data for Beijing
# NOTE(review): both URLs end in a hard-coded component (presumably an access
# token), and both requests switch off certificate and host verification.
# Both are kept as they were, but deserve a second look.
# The downloaded text is parsed with read.csv(text = ...), which opens and
# closes its own connection; a textConnection() passed as the file argument is
# already open, so read.csv() never closes it.

# acquire air quality data
bj_aq_url <- paste0("https://biendata.com/competition/airquality/bj/", startDate, "/", endDate, "/2k0d1d8")
bj_aq_file <- getURL(bj_aq_url, ssl.verifyhost = FALSE, ssl.verifypeer = FALSE)
bj_aq_data <- read.csv(text = bj_aq_file, header = TRUE)

# acquire API grid meteorology data
bj_gm_url <- paste0("https://biendata.com/competition/meteorology/bj_grid/", startDate, "/", endDate, "/2k0d1d8")
bj_gm_file <- getURL(bj_gm_url, ssl.verifyhost = FALSE, ssl.verifypeer = FALSE)
bj_gm_data <- read.csv(text = bj_gm_file, header = TRUE)

In [6]:
# Names of all data frames currently defined in the global environment.
# First flag every global object, then keep the names flagged TRUE.
df.list <- eapply(.GlobalEnv, is.data.frame)
df.list <- names(df.list)[unlist(df.list)]
df.list

In [7]:
# Parse the `time` column of both API data sets into date-times.
# NOTE(review): anytime() is called without an explicit time zone here --
# confirm the result is interpreted as UTC as intended.
bj_gm_data[["time"]] <- anytime(bj_gm_data[["time"]])
bj_aq_data[["time"]] <- anytime(bj_aq_data[["time"]])

In [8]:
# Print the name and structure of every data frame listed in df.list.
# Iterating over the names directly is safe when df.list is empty
# (1:length(x) would run for indices 1 and 0). str() prints by itself and
# returns NULL, so it is not wrapped in print(), which only added a stray
# "NULL" line after each structure.
for (df_name in df.list) {
  print(df_name)
  str(get(df_name))
}

[1] "bj_gm_data"
'data.frame':	945828 obs. of  9 variables:
 $ id            : int  2000958 2000959 2000960 2000961 2000962 2000963 2000964 2000965 2000966 2000967 ...
 $ station_id    : Factor w/ 651 levels "beijing_grid_000",..: 1 2 3 4 5 6 7 8 9 10 ...
 $ time          : POSIXct, format: "2018-03-31 07:00:00" "2018-03-31 07:00:00" ...
 $ weather       : Factor w/ 9 levels "CLEAR_DAY","CLEAR_NIGHT",..: 5 5 5 5 5 5 5 5 5 5 ...
 $ temperature   : num  24 24 25 25 25 25 25 26 22.4 22.4 ...
 $ pressure      : num  973 960 946 931 914 ...
 $ humidity      : num  23 21 19 18 16 15 15 16 16 17 ...
 $ wind_direction: num  161 165 170 184 221 ...
 $ wind_speed    : num  14.28 13.3 12.42 9.84 7.39 ...
NULL
[1] "bj_aq_data"
'data.frame':	48615 obs. of  9 variables:
 $ id                : int  2941450 2941451 2941452 2941453 2941454 2941455 2941456 2941457 2941458 2941459 ...
 $ station_id        : Factor w/ 35 levels "aotizhongxin_aq",..: 7 24 11 27 1 19 26 3 35 10 ...
 $ time              : PO

In [9]:
# Show the earliest and latest timestamp of each API data set.
print("bj_gm_data")
summarise(bj_gm_data, min_date = min(time), max_date = max(time))
print("bj_aq_data")
summarise(bj_aq_data, min_date = min(time), max_date = max(time))

[1] "bj_gm_data"


min_date,max_date
2018-03-31 07:00:00,2018-06-01


[1] "bj_aq_data"


min_date,max_date
2018-03-31 07:00:00,2018-06-01


In [10]:
# Drop the columns that are not used downstream: the record id from both data
# sets and the categorical weather column from the meteorology data.
bj_aq_data <- select(bj_aq_data, -id)
bj_gm_data <- select(bj_gm_data, -id, -weather)

In [11]:
# Closest meteorology grid point for each Beijing air-quality station.
# Columns are renamed by position, so the file is expected to hold exactly
# these four columns in this order.
bj_closest_stations <- setNames(
  read.csv("SL_beijing_closest_stations.csv"),
  c("x", "stationId", "stationName", "distance")
)

In [12]:
# Rename the API columns (by position) so both data sets share the names used
# by the merges below: stationId / stationName as keys, utc_time as timestamp.
names(bj_aq_data) <- c(
  "stationId", "utc_time", "PM2.5", "PM10", "NO2", "CO", "O3", "SO2"
)
names(bj_gm_data) <- c(
  "stationName", "utc_time", "temperature", "pressure", "humidity",
  "wind_direction", "wind_speed.kph"
)

In [13]:
# Inspect the three inputs of the upcoming merges (column names and types),
# then preview the station-to-grid lookup table.
str(bj_closest_stations)
str(bj_gm_data)
str(bj_aq_data)
head(bj_closest_stations)

'data.frame':	35 obs. of  4 variables:
 $ x          : int  1 2 3 4 5 6 7 8 9 10 ...
 $ stationId  : Factor w/ 35 levels "aotizhongxin_aq",..: 7 24 11 27 1 19 26 3 35 10 ...
 $ stationName: Factor w/ 26 levels "beijing_grid_216",..: 16 16 13 16 17 19 14 9 8 13 ...
 $ distance   : num  3540 1669 4638 4776 2020 ...
'data.frame':	945828 obs. of  7 variables:
 $ stationName   : Factor w/ 651 levels "beijing_grid_000",..: 1 2 3 4 5 6 7 8 9 10 ...
 $ utc_time      : POSIXct, format: "2018-03-31 07:00:00" "2018-03-31 07:00:00" ...
 $ temperature   : num  24 24 25 25 25 25 25 26 22.4 22.4 ...
 $ pressure      : num  973 960 946 931 914 ...
 $ humidity      : num  23 21 19 18 16 15 15 16 16 17 ...
 $ wind_direction: num  161 165 170 184 221 ...
 $ wind_speed.kph: num  14.28 13.3 12.42 9.84 7.39 ...
'data.frame':	48615 obs. of  8 variables:
 $ stationId: Factor w/ 35 levels "aotizhongxin_aq",..: 7 24 11 27 1 19 26 3 35 10 ...
 $ utc_time : POSIXct, format: "2018-03-31 07:00:00" "2018-03-31 07:00

x,stationId,stationName,distance
1,dongsi_aq,beijing_grid_303,3539.569
2,tiantan_aq,beijing_grid_303,1669.215
3,guanyuan_aq,beijing_grid_282,4637.889
4,wanshouxigong_aq,beijing_grid_303,4775.641
5,aotizhongxin_aq,beijing_grid_304,2020.02
6,nongzhanguan_aq,beijing_grid_324,5296.386


In [14]:
# Attach the closest grid point (stationName) to every air-quality record.
# merge() keeps only stations present in both tables (inner join).
bj_aq_map <- merge(x = bj_aq_data, y = bj_closest_stations, by = "stationId")

In [15]:
# Join air-quality records with the meteorology of their grid point at the
# same timestamp. This is an inner join: air-quality rows without a matching
# meteorology row are dropped.
bj_aq_gm_data <- merge(
  x = bj_aq_map,
  y = bj_gm_data,
  by = c("stationName", "utc_time")
)

In [16]:
# Inspect the merged API data: structure, first rows, and covered time span.
str(bj_aq_gm_data)
head(bj_aq_gm_data)
bj_aq_gm_data %>% summarize(min_date = min(utc_time), max_date = max(utc_time))

'data.frame':	47737 obs. of  16 variables:
 $ stationName   : Factor w/ 26 levels "beijing_grid_216",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ utc_time      : POSIXct, format: "2018-03-31 07:00:00" "2018-03-31 08:00:00" ...
 $ stationId     : Factor w/ 35 levels "aotizhongxin_aq",..: 14 14 14 14 14 14 14 14 14 14 ...
 $ PM2.5         : num  105 120 121 137 146 270 160 180 136 149 ...
 $ PM10          : num  163 185 185 212 254 323 297 280 204 219 ...
 $ NO2           : num  45 52 55 59 68 59 85 76 61 87 ...
 $ CO            : num  0.8 0.9 0.9 1 1 1.1 1.1 1.1 1.1 1.2 ...
 $ O3            : num  137 140 141 132 106 96 46 45 50 7 ...
 $ SO2           : num  6 NA NA NA NA 20 8 NA NA NA ...
 $ x             : int  30 30 30 30 30 30 30 30 30 30 ...
 $ distance      : num  2226 2226 2226 2226 2226 ...
 $ temperature   : num  22 23 23 24 20 19 17 13 14 12.4 ...
 $ pressure      : num  1001 1000 1000 999 1000 ...
 $ humidity      : num  20 20 22 24 28 31 32 32 31 32 ...
 $ wind_direction: num  159 174 188

stationName,utc_time,stationId,PM2.5,PM10,NO2,CO,O3,SO2,x,distance,temperature,pressure,humidity,wind_direction,wind_speed.kph
beijing_grid_216,2018-03-31 07:00:00,liulihe_aq,105,163,45,0.8,137,6.0,30,2226.39,22,1000.8969,20,159.45,10.63
beijing_grid_216,2018-03-31 08:00:00,liulihe_aq,120,185,52,0.9,140,,30,2226.39,23,1000.058,20,173.73,10.96
beijing_grid_216,2018-03-31 09:00:00,liulihe_aq,121,185,55,0.9,141,,30,2226.39,23,999.5212,22,188.34,10.21
beijing_grid_216,2018-03-31 10:00:00,liulihe_aq,137,212,59,1.0,132,,30,2226.39,24,999.4476,24,204.63,9.83
beijing_grid_216,2018-03-31 11:00:00,liulihe_aq,146,254,68,1.0,106,,30,2226.39,20,999.6694,28,232.0,7.27
beijing_grid_216,2018-03-31 12:00:00,liulihe_aq,270,323,59,1.1,96,20.0,30,2226.39,19,1000.0671,31,268.92,6.59


min_date,max_date
2018-03-31 07:00:00,2018-06-01


In [17]:
# Free the intermediate API data frames; only the merged frame is used below.
rm(bj_gm_data, bj_aq_data, bj_aq_map)

In [18]:
# Keep the join keys, the three target pollutants and the weather predictors;
# NO2, CO, SO2 and the lookup helper columns (x, distance) are dropped.
bj_aq_gm_data <- select(
  bj_aq_gm_data,
  "stationName", "utc_time", "stationId",
  "PM2.5", "PM10", "O3",
  "temperature", "pressure", "humidity", "wind_direction", "wind_speed.kph"
)

In [19]:
# Derive calendar features from the timestamp for modeling: hour of day,
# month number, calendar date, and a logical weekend flag.
bj_aq_gm_data[["hour"]] <- hour(bj_aq_gm_data[["utc_time"]])
bj_aq_gm_data[["month"]] <- month(bj_aq_gm_data[["utc_time"]])
bj_aq_gm_data[["date"]] <- date(bj_aq_gm_data[["utc_time"]])
bj_aq_gm_data[["weekend"]] <- chron::is.weekend(bj_aq_gm_data[["date"]])

In [20]:
# Check column types after adding the calendar features.
str(bj_aq_gm_data)

'data.frame':	47737 obs. of  15 variables:
 $ stationName   : Factor w/ 26 levels "beijing_grid_216",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ utc_time      : POSIXct, format: "2018-03-31 07:00:00" "2018-03-31 08:00:00" ...
 $ stationId     : Factor w/ 35 levels "aotizhongxin_aq",..: 14 14 14 14 14 14 14 14 14 14 ...
 $ PM2.5         : num  105 120 121 137 146 270 160 180 136 149 ...
 $ PM10          : num  163 185 185 212 254 323 297 280 204 219 ...
 $ O3            : num  137 140 141 132 106 96 46 45 50 7 ...
 $ temperature   : num  22 23 23 24 20 19 17 13 14 12.4 ...
 $ pressure      : num  1001 1000 1000 999 1000 ...
 $ humidity      : num  20 20 22 24 28 31 32 32 31 32 ...
 $ wind_direction: num  159 174 188 205 232 ...
 $ wind_speed.kph: num  10.63 10.96 10.21 9.83 7.27 ...
 $ hour          : int  7 8 9 10 11 12 13 14 15 17 ...
 $ month         : num  3 3 3 3 3 3 3 3 3 3 ...
 $ date          : Date, format: "2018-03-31" "2018-03-31" ...
 $ weekend       : logi  TRUE TRUE TRUE TRUE TRUE TRUE

### Import and transformations of Historical data

In [21]:
# Retrieve the pre-processed historical data set (path is relative to the
# working directory). Strings are kept as character, not factor.
bj_aq_gm_hist_file <- "Ready for Modeling/bj_aq_gm_hist_data.csv"
bj_aq_gm_hist_data <- read.csv(
  file = bj_aq_gm_hist_file,
  stringsAsFactors = FALSE
)

In [22]:
# Bring the historical data in line with the API data: drop the column X
# (presumably the row index written when the CSV was saved), and parse the
# timestamp and date columns.
bj_aq_gm_hist_data <- select(bj_aq_gm_hist_data, -X)
bj_aq_gm_hist_data[["utc_time"]] <- anytime(bj_aq_gm_hist_data[["utc_time"]])
bj_aq_gm_hist_data[["date"]] <- as.Date(
  bj_aq_gm_hist_data[["date"]],
  format = "%Y-%m-%d"
)

In [23]:
# Check that the historical columns now match the API data in name and type.
str(bj_aq_gm_hist_data)

'data.frame':	356825 obs. of  15 variables:
 $ stationName   : chr  "beijing_grid_216" "beijing_grid_216" "beijing_grid_216" "beijing_grid_216" ...
 $ utc_time      : POSIXct, format: "2017-01-01 14:00:00" "2017-01-01 15:00:00" ...
 $ stationId     : chr  "liulihe_aq" "liulihe_aq" "liulihe_aq" "liulihe_aq" ...
 $ PM2.5         : int  376 369 361 354 356 315 287 254 231 224 ...
 $ PM10          : int  447 407 389 NA 360 NA NA NA NA NA ...
 $ O3            : int  2 2 2 2 2 2 NA 2 2 2 ...
 $ temperature   : num  -2.22 -2.37 -2.58 -2.79 -2.99 -3.3 -3.6 -3.9 -4.11 -4.33 ...
 $ pressure      : num  1015 1015 1015 1015 1015 ...
 $ humidity      : num  62.9 65.8 65.3 64.9 64.5 ...
 $ wind_direction: num  284 298 308 319 330 ...
 $ wind_speed.kph: num  4.28 4.39 4.19 4.14 4.24 4.1 3.99 3.92 2.97 2.25 ...
 $ hour          : int  14 15 16 17 18 19 20 21 22 23 ...
 $ month         : int  1 1 1 1 1 1 1 1 1 1 ...
 $ date          : Date, format: "2017-01-01" "2017-01-01" ...
 $ weekend       : logi 

In [24]:
# Append API and hist data
# rbind() on data frames matches columns by name. The API frame comes first,
# so its factor columns (stationName, stationId) set the result type and the
# character values of the historical frame are folded into those factors.
bj_aq_gm_combined_data <- rbind(bj_aq_gm_data, bj_aq_gm_hist_data)

In [25]:
# Inspect the combined data: structure, covered date range, and per-column
# summary statistics (including NA counts).
str(bj_aq_gm_combined_data)
bj_aq_gm_combined_data %>% summarize(min_date = min(date), max_date = max(date))
summary(bj_aq_gm_combined_data)

'data.frame':	404562 obs. of  15 variables:
 $ stationName   : Factor w/ 26 levels "beijing_grid_216",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ utc_time      : POSIXct, format: "2018-03-31 07:00:00" "2018-03-31 08:00:00" ...
 $ stationId     : Factor w/ 35 levels "aotizhongxin_aq",..: 14 14 14 14 14 14 14 14 14 14 ...
 $ PM2.5         : num  105 120 121 137 146 270 160 180 136 149 ...
 $ PM10          : num  163 185 185 212 254 323 297 280 204 219 ...
 $ O3            : num  137 140 141 132 106 96 46 45 50 7 ...
 $ temperature   : num  22 23 23 24 20 19 17 13 14 12.4 ...
 $ pressure      : num  1001 1000 1000 999 1000 ...
 $ humidity      : num  20 20 22 24 28 31 32 32 31 32 ...
 $ wind_direction: num  159 174 188 205 232 ...
 $ wind_speed.kph: num  10.63 10.96 10.21 9.83 7.27 ...
 $ hour          : int  7 8 9 10 11 12 13 14 15 17 ...
 $ month         : num  3 3 3 3 3 3 3 3 3 3 ...
 $ date          : Date, format: "2018-03-31" "2018-03-31" ...
 $ weekend       : logi  TRUE TRUE TRUE TRUE TRUE TRU

min_date,max_date
2017-01-01,2018-06-01


           stationName        utc_time                           stationId     
 beijing_grid_303: 69354   Min.   :2017-01-01 14:00:00   badaling_aq  : 11559  
 beijing_grid_282: 23118   1st Qu.:2017-05-10 10:00:00   beibuxinqu_aq: 11559  
 beijing_grid_283: 23118   Median :2017-09-25 14:00:00   daxing_aq    : 11559  
 beijing_grid_324: 23118   Mean   :2017-09-20 00:29:17   dingling_aq  : 11559  
 beijing_grid_452: 23118   3rd Qu.:2018-01-22 14:00:00   donggaocun_aq: 11559  
 beijing_grid_216: 11559   Max.   :2018-06-01 00:00:00   dongsi_aq    : 11559  
 (Other)         :231177                                 (Other)      :335208  
     PM2.5              PM10               O3          temperature    
 Min.   :   2.00   Min.   :   5.00   Min.   :  1.00   Min.   :-18.37  
 1st Qu.:  17.00   1st Qu.:  40.00   1st Qu.: 15.00   1st Qu.:  0.58  
 Median :  41.00   Median :  75.00   Median : 49.00   Median : 11.80  
 Mean   :  60.65   Mean   :  93.77   Mean   : 58.59   Mean   : 11.46  
 3rd 

In [26]:
# Dates flagged as holidays in the holiday feature, as "YYYY-MM-DD" strings.
# One line per block of consecutive dates; presumably these are the public
# holidays observed in Beijing in 2017 and early 2018 -- verify against the
# official calendar before extending the list.
beijing_holidays <- c(
  # 2017
  "2017-01-01", "2017-01-02",
  "2017-01-27", "2017-01-28", "2017-01-29", "2017-01-30", "2017-01-31",
  "2017-02-01", "2017-02-02",
  "2017-04-02", "2017-04-03", "2017-04-04",
  "2017-05-01",
  "2017-05-28", "2017-05-29", "2017-05-30",
  "2017-10-01", "2017-10-02", "2017-10-03", "2017-10-04",
  "2017-10-05", "2017-10-06", "2017-10-07", "2017-10-08",
  # 2018
  "2018-01-01",
  "2018-02-15", "2018-02-16", "2018-02-17", "2018-02-18",
  "2018-02-19", "2018-02-20", "2018-02-21",
  "2018-04-05", "2018-04-06", "2018-04-07",
  "2018-04-30", "2018-05-01"
)

In [27]:
# Numeric holiday flag: 1 when the record's date is in beijing_holidays, else 0.
bj_aq_gm_combined_data$holiday <- as.numeric(
  bj_aq_gm_combined_data$date %in% as.Date(beijing_holidays)
)

In [28]:
# The two source frames are no longer needed once they have been combined.
rm(bj_aq_gm_data, bj_aq_gm_hist_data)

In [29]:
# Replace pollutant outliers with NA.
# A value is an outlier when boxplot.stats() (default settings) lists it in
# `$out`; each column is judged on its own distribution.
# The NA is assigned through the column vector. The earlier form,
# `df[mask, ]$col <- NA`, stops with "replacement has 1 row, data has 0" when
# a column has no outliers, and copies every matching row on the way.
bj_aq_gm_combined_data[c("PM2.5", "PM10", "O3")] <- lapply(
  bj_aq_gm_combined_data[c("PM2.5", "PM10", "O3")],
  function(values) {
    values[values %in% boxplot.stats(values)$out] <- NA
    values
  }
)

In [30]:
# Replace outliers in the weather measurements with NA.
# A value is an outlier when boxplot.stats() (default settings) lists it in
# `$out`; each column is judged on its own distribution.
# The NA is assigned through the column vector. The earlier form,
# `df[mask, ]$col <- NA`, stops with "replacement has 1 row, data has 0" when
# a column has no outliers, and copies every matching row on the way.
bj_aq_gm_combined_data[c("pressure", "humidity", "wind_speed.kph")] <- lapply(
  bj_aq_gm_combined_data[c("pressure", "humidity", "wind_speed.kph")],
  function(values) {
    values[values %in% boxplot.stats(values)$out] <- NA
    values
  }
)

In [31]:
# Fill missing weather values with group means.
# Pattern used for every variable: compute the mean per group, join it back as
# `<var>.y`, take the original value where present and the group mean
# otherwise, then drop the two helper columns. The rebuilt column is appended
# at the end of the data frame.

# pressure, pass 1: mean per station, hour of day and month
bj_aq_gm_combined_data_pressure <- summarise(
  group_by(bj_aq_gm_combined_data, stationId, hour, month),
  pressure = mean(pressure, na.rm = TRUE)
)
bj_aq_gm_combined_data <- bj_aq_gm_combined_data %>%
  left_join(bj_aq_gm_combined_data_pressure, by = c("stationId", "hour", "month")) %>%
  mutate(pressure = coalesce(pressure.x, pressure.y)) %>%
  select(-pressure.x, -pressure.y)

# pressure, pass 2: fall back to the mean per hour of day and month across all
# stations for values that are still missing
bj_aq_gm_combined_data_pressure <- summarise(
  group_by(bj_aq_gm_combined_data, hour, month),
  pressure = mean(pressure, na.rm = TRUE)
)
bj_aq_gm_combined_data <- bj_aq_gm_combined_data %>%
  left_join(bj_aq_gm_combined_data_pressure, by = c("hour", "month")) %>%
  mutate(pressure = coalesce(pressure.x, pressure.y)) %>%
  select(-pressure.x, -pressure.y)

# humidity: mean per station, hour of day and month
# NOTE(review): unlike pressure, humidity and wind speed have no second
# fallback pass; a group with no observed value would stay missing.
bj_aq_gm_combined_data_humidity <- summarise(
  group_by(bj_aq_gm_combined_data, stationId, hour, month),
  humidity = mean(humidity, na.rm = TRUE)
)
bj_aq_gm_combined_data <- bj_aq_gm_combined_data %>%
  left_join(bj_aq_gm_combined_data_humidity, by = c("stationId", "hour", "month")) %>%
  mutate(humidity = coalesce(humidity.x, humidity.y)) %>%
  select(-humidity.x, -humidity.y)


# wind speed: mean per station, hour of day and month
bj_aq_gm_combined_data_wind_speed.kph <- summarise(
  group_by(bj_aq_gm_combined_data, stationId, hour, month),
  wind_speed.kph = mean(wind_speed.kph, na.rm = TRUE)
)
bj_aq_gm_combined_data <- bj_aq_gm_combined_data %>%
  left_join(bj_aq_gm_combined_data_wind_speed.kph, by = c("stationId", "hour", "month")) %>%
  mutate(wind_speed.kph = coalesce(wind_speed.kph.x, wind_speed.kph.y)) %>%
  select(-wind_speed.kph.x, -wind_speed.kph.y)

"package 'bindrcpp' was built under R version 3.4.3"

In [32]:
# Fill missing PM2.5 values with group means, in two passes.
# Pass 1: mean per station, hour of day, month, weekend flag and holiday flag.
bj_aq_gm_combined_data_PM2.5 <- summarise(
  group_by(bj_aq_gm_combined_data, stationId, hour, month, weekend, holiday),
  PM2.5 = mean(PM2.5, na.rm = TRUE)
)
bj_aq_gm_combined_data <- bj_aq_gm_combined_data %>%
  left_join(
    bj_aq_gm_combined_data_PM2.5,
    by = c("stationId", "hour", "month", "weekend", "holiday")
  ) %>%
  mutate(PM2.5 = coalesce(PM2.5.x, PM2.5.y)) %>%
  select(-PM2.5.x, -PM2.5.y)
# Pass 2: the same without month, for values that pass 1 could not fill.
bj_aq_gm_combined_data_PM2.5 <- summarise(
  group_by(bj_aq_gm_combined_data, stationId, hour, weekend, holiday),
  PM2.5 = mean(PM2.5, na.rm = TRUE)
)
bj_aq_gm_combined_data <- bj_aq_gm_combined_data %>%
  left_join(
    bj_aq_gm_combined_data_PM2.5,
    by = c("stationId", "hour", "weekend", "holiday")
  ) %>%
  mutate(PM2.5 = coalesce(PM2.5.x, PM2.5.y)) %>%
  select(-PM2.5.x, -PM2.5.y)

In [33]:
# Fill missing PM10 values with group means, in two passes.
# Pass 1: mean per station, hour of day, month, weekend flag and holiday flag.
bj_aq_gm_combined_data_PM10 <- summarise(
  group_by(bj_aq_gm_combined_data, stationId, hour, month, weekend, holiday),
  PM10 = mean(PM10, na.rm = TRUE)
)
bj_aq_gm_combined_data <- bj_aq_gm_combined_data %>%
  left_join(
    bj_aq_gm_combined_data_PM10,
    by = c("stationId", "hour", "month", "weekend", "holiday")
  ) %>%
  mutate(PM10 = coalesce(PM10.x, PM10.y)) %>%
  select(-PM10.x, -PM10.y)
# Pass 2: the same without month, for values that pass 1 could not fill.
bj_aq_gm_combined_data_PM10 <- summarise(
  group_by(bj_aq_gm_combined_data, stationId, hour, weekend, holiday),
  PM10 = mean(PM10, na.rm = TRUE)
)
bj_aq_gm_combined_data <- bj_aq_gm_combined_data %>%
  left_join(
    bj_aq_gm_combined_data_PM10,
    by = c("stationId", "hour", "weekend", "holiday")
  ) %>%
  mutate(PM10 = coalesce(PM10.x, PM10.y)) %>%
  select(-PM10.x, -PM10.y)

In [34]:
# Fill missing O3 values with group means, in two passes.
# Pass 1: mean per station, hour of day, month, weekend flag and holiday flag.
bj_aq_gm_combined_data_O3 <- summarise(
  group_by(bj_aq_gm_combined_data, stationId, hour, month, weekend, holiday),
  O3 = mean(O3, na.rm = TRUE)
)
bj_aq_gm_combined_data <- bj_aq_gm_combined_data %>%
  left_join(
    bj_aq_gm_combined_data_O3,
    by = c("stationId", "hour", "month", "weekend", "holiday")
  ) %>%
  mutate(O3 = coalesce(O3.x, O3.y)) %>%
  select(-O3.x, -O3.y)
# Pass 2: the same without month, for values that pass 1 could not fill.
bj_aq_gm_combined_data_O3 <- summarise(
  group_by(bj_aq_gm_combined_data, stationId, hour, weekend, holiday),
  O3 = mean(O3, na.rm = TRUE)
)
bj_aq_gm_combined_data <- bj_aq_gm_combined_data %>%
  left_join(
    bj_aq_gm_combined_data_O3,
    by = c("stationId", "hour", "weekend", "holiday")
  ) %>%
  mutate(O3 = coalesce(O3.x, O3.y)) %>%
  select(-O3.x, -O3.y)

In [35]:
# Remove the group-mean lookup tables created during imputation.
rm(list = c(
  "bj_aq_gm_combined_data_PM2.5",
  "bj_aq_gm_combined_data_PM10",
  "bj_aq_gm_combined_data_O3",
  "bj_aq_gm_combined_data_pressure",
  "bj_aq_gm_combined_data_humidity",
  "bj_aq_gm_combined_data_wind_speed.kph"
))

In [36]:
# Inspect the data after outlier masking and imputation. The imputed columns
# were rebuilt by the joins and now sit at the end of the data frame.
str(bj_aq_gm_combined_data)
summary(bj_aq_gm_combined_data)

'data.frame':	404562 obs. of  16 variables:
 $ stationName   : Factor w/ 26 levels "beijing_grid_216",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ utc_time      : POSIXct, format: "2018-03-31 07:00:00" "2018-03-31 08:00:00" ...
 $ stationId     : Factor w/ 35 levels "aotizhongxin_aq",..: 14 14 14 14 14 14 14 14 14 14 ...
 $ temperature   : num  22 23 23 24 20 19 17 13 14 12.4 ...
 $ wind_direction: num  159 174 188 205 232 ...
 $ hour          : int  7 8 9 10 11 12 13 14 15 17 ...
 $ month         : num  3 3 3 3 3 3 3 3 3 3 ...
 $ date          : Date, format: "2018-03-31" "2018-03-31" ...
 $ weekend       : logi  TRUE TRUE TRUE TRUE TRUE TRUE ...
 $ holiday       : num  0 0 0 0 0 0 0 0 0 0 ...
 $ pressure      : num  1001 1000 1000 999 1000 ...
 $ humidity      : num  20 20 22 24 28 31 32 32 31 32 ...
 $ wind_speed.kph: num  10.63 10.96 10.21 9.83 7.27 ...
 $ PM2.5         : num  105 120 121 137 146 ...
 $ PM10          : num  163 185 185 212 116 ...
 $ O3            : num  137 140 141 132 106 96 4

           stationName        utc_time                           stationId     
 beijing_grid_303: 69354   Min.   :2017-01-01 14:00:00   badaling_aq  : 11559  
 beijing_grid_282: 23118   1st Qu.:2017-05-10 10:00:00   beibuxinqu_aq: 11559  
 beijing_grid_283: 23118   Median :2017-09-25 14:00:00   daxing_aq    : 11559  
 beijing_grid_324: 23118   Mean   :2017-09-20 00:29:17   dingling_aq  : 11559  
 beijing_grid_452: 23118   3rd Qu.:2018-01-22 14:00:00   donggaocun_aq: 11559  
 beijing_grid_216: 11559   Max.   :2018-06-01 00:00:00   dongsi_aq    : 11559  
 (Other)         :231177                                 (Other)      :335208  
  temperature     wind_direction       hour           month       
 Min.   :-18.37   Min.   :  0.0   Min.   : 0.00   Min.   : 1.000  
 1st Qu.:  0.58   1st Qu.:106.5   1st Qu.: 5.00   1st Qu.: 3.000  
 Median : 11.80   Median :189.1   Median :11.00   Median : 5.000  
 Mean   : 11.46   Mean   :190.9   Mean   :11.45   Mean   : 5.607  
 3rd Qu.: 22.00   3rd Qu.

In [37]:
# Validate that no NAs remain: list every row with at least one missing value.
# An empty result (header only) means imputation filled everything.
bj_aq_gm_combined_data[!complete.cases(bj_aq_gm_combined_data),]

stationName,utc_time,stationId,temperature,wind_direction,hour,month,date,weekend,holiday,pressure,humidity,wind_speed.kph,PM2.5,PM10,O3


In [38]:
# Recode the logical weekend flag as integer (TRUE -> 1, FALSE -> 0) for
# modeling.
bj_aq_gm_combined_data[["weekend"]] <- as.integer(
  bj_aq_gm_combined_data[["weekend"]]
)

In [39]:
# Restore a fixed column order: keys, targets, weather predictors, calendar
# features. All 16 columns are kept; only their order changes.
bj_aq_gm_combined_data <- select(
  bj_aq_gm_combined_data,
  "stationName", "utc_time", "stationId",
  "PM2.5", "PM10", "O3",
  "temperature", "pressure", "humidity", "wind_direction", "wind_speed.kph",
  "hour", "month", "date", "weekend", "holiday"
)

In [40]:
# Retrieve the coordinates of the air-quality stations.
# Columns are renamed by position -- this assumes the file lists station id,
# longitude, latitude in that order; verify against the CSV header.
bj_lat_long_file <- "Datasets/Beijing_AirQuality_Stations.csv"
bj_lat_long_data <- read.csv(file = bj_lat_long_file, stringsAsFactors = FALSE)
names(bj_lat_long_data) <- c("stationId", "longitude", "latitude")

In [41]:
# Add longitude and latitude to every record (inner join on stationId).
bj_aq_gm_combined_data <- merge(
  x = bj_aq_gm_combined_data,
  y = bj_lat_long_data,
  by = "stationId"
)

In [42]:
# Final check of the modeling data set: structure, covered time span, and
# per-column summary statistics.
str(bj_aq_gm_combined_data)
bj_aq_gm_combined_data %>% summarize(min_date = min(utc_time), max_date = max(utc_time))
summary(bj_aq_gm_combined_data)

'data.frame':	404562 obs. of  18 variables:
 $ stationId     : Factor w/ 35 levels "aotizhongxin_aq",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ stationName   : Factor w/ 26 levels "beijing_grid_216",..: 17 17 17 17 17 17 17 17 17 17 ...
 $ utc_time      : POSIXct, format: "2017-01-01 20:00:00" "2017-01-01 21:00:00" ...
 $ PM2.5         : num  5.5 116 51 38 7 10 8 9 7 10 ...
 $ PM10          : num  29.5 28 63 102 16 20 19 16 14 17 ...
 $ O3            : num  12.4 4 26 28 3 ...
 $ temperature   : num  -2.09 -2.4 -2.65 -2.9 -2.47 -2.27 -2.01 -1.75 -1.49 -1.79 ...
 $ pressure      : num  1020 1020 1020 1021 1018 ...
 $ humidity      : num  44.7 45.2 44.9 44.6 54.8 ...
 $ wind_direction: num  338 339 339 339 322 ...
 $ wind_speed.kph: num  13.84 13.68 12.85 12.03 4.55 ...
 $ hour          : int  20 21 22 23 14 15 16 17 18 19 ...
 $ month         : num  1 1 1 1 1 1 1 1 1 1 ...
 $ date          : Date, format: "2017-01-01" "2017-01-01" ...
 $ weekend       : int  1 1 1 1 1 1 1 1 1 1 ...
 $ holiday       

min_date,max_date
2017-01-01 14:00:00,2018-06-01


         stationId                stationName        utc_time                  
 badaling_aq  : 11559   beijing_grid_303: 69354   Min.   :2017-01-01 14:00:00  
 beibuxinqu_aq: 11559   beijing_grid_282: 23118   1st Qu.:2017-05-10 10:00:00  
 daxing_aq    : 11559   beijing_grid_283: 23118   Median :2017-09-25 14:00:00  
 dingling_aq  : 11559   beijing_grid_324: 23118   Mean   :2017-09-20 00:29:17  
 donggaocun_aq: 11559   beijing_grid_452: 23118   3rd Qu.:2018-01-22 14:00:00  
 dongsi_aq    : 11559   beijing_grid_216: 11559   Max.   :2018-06-01 00:00:00  
 (Other)      :335208   (Other)         :231177                                
     PM2.5             PM10              O3          temperature    
 Min.   :  2.00   Min.   :  5.00   Min.   :  1.00   Min.   :-18.37  
 1st Qu.: 18.00   1st Qu.: 45.00   1st Qu.: 16.00   1st Qu.:  0.58  
 Median : 41.00   Median : 73.00   Median : 49.00   Median : 11.80  
 Mean   : 49.64   Mean   : 80.57   Mean   : 54.92   Mean   : 11.46  
 3rd Qu.: 69.00

### Model Training and validation

In [43]:
# Fix the random seed so the train/test split and the cross-validation folds
# below are reproducible.
set.seed(2306)

In [44]:
# Random 80/20 split into training and test sets.
# NOTE(review): rows are drawn at random across the whole period, so training
# and test records are interleaved in time.
sample_size <- floor(nrow(bj_aq_gm_combined_data) * 0.8)
train_index <- sample.int(nrow(bj_aq_gm_combined_data), size = sample_size)
train_bj <- bj_aq_gm_combined_data[train_index, ]
test_bj <- bj_aq_gm_combined_data[-train_index, ]

In [45]:
# Confirm the row counts and column types of the two partitions.
str(train_bj)
str(test_bj)

'data.frame':	323649 obs. of  18 variables:
 $ stationId     : Factor w/ 35 levels "aotizhongxin_aq",..: 11 22 31 21 14 4 3 14 29 11 ...
 $ stationName   : Factor w/ 26 levels "beijing_grid_216",..: 13 16 16 26 1 15 9 1 3 13 ...
 $ utc_time      : POSIXct, format: "2017-12-31 18:00:00" "2017-01-20 17:00:00" ...
 $ PM2.5         : num  37 99 69 14 26 54 11 68 44 10 ...
 $ PM10          : num  73 52.5 90 22 93 ...
 $ O3            : num  2 2 89 105 15 2 105 38 7 39 ...
 $ temperature   : num  -0.73 -9.14 25.22 20.48 12 ...
 $ pressure      : num  1019 1022 1006 1002 1010 ...
 $ humidity      : num  19.7 23.3 56.5 18.2 31 ...
 $ wind_direction: num  14.4 195.1 174.1 229.2 238.8 ...
 $ wind_speed.kph: num  5.27 5.63 5.64 11.52 3.45 ...
 $ hour          : int  18 17 12 9 22 13 6 3 17 19 ...
 $ month         : num  12 1 8 3 5 1 2 1 12 3 ...
 $ date          : Date, format: "2017-12-31" "2017-01-20" ...
 $ weekend       : int  1 0 0 1 0 1 0 0 0 0 ...
 $ holiday       : num  0 0 0 0 0 0 0 1 0 

In [83]:
# Candidate predictor-subset sizes passed as `sizes` to every rfe() call below.
subsets <- c(1, 3, 5, 7, 10)

In [64]:
# PM2.5 model: recursive feature elimination over linear models, evaluated
# with 10-fold cross-validation repeated 5 times.
# Predictors: station coordinates, calendar features and weather (pressure is
# not included).
PM2.5_x <- train_bj[, c(
  "latitude", "longitude", "hour", "month", "weekend", "holiday",
  "temperature", "humidity", "wind_direction", "wind_speed.kph"
)]
PM2.5_y <- train_bj[["PM2.5"]]
PM2.5_ctrl <- rfeControl(
  functions = lmFuncs,
  method = "repeatedcv",
  number = 10,
  repeats = 5
)
PM2.5_lmProfile <- rfe(
  x = PM2.5_x,
  y = PM2.5_y,
  sizes = subsets,
  rfeControl = PM2.5_ctrl
)
# Resampling results, then the coefficients of the selected linear model.
PM2.5_lmProfile
summary(PM2.5_lmProfile$fit)


Recursive feature selection

Outer resampling method: Cross-Validated (10 fold, repeated 5 times) 

Resampling performance over subset size:

 Variables  RMSE Rsquared   MAE RMSESD RsquaredSD   MAESD Selected
         1 38.96 0.008666 30.64 0.1040   0.001152 0.07018         
         3 37.85 0.064373 29.66 0.1025   0.002336 0.07209         
         5 37.75 0.069181 29.60 0.1030   0.002369 0.07653         
         7 36.89 0.111269 28.92 0.1060   0.002960 0.07622         
        10 36.40 0.134915 28.29 0.1103   0.003323 0.07963        *

The top 5 variables (out of 10):
   latitude, holiday, wind_speed.kph, month, weekend



Call:
lm(formula = y ~ ., data = tmp)

Residuals:
    Min      1Q  Median      3Q     Max 
-83.326 -25.607  -8.846  17.613 152.729 

Coefficients:
                 Estimate Std. Error t value Pr(>|t|)    
(Intercept)     8.235e+02  2.711e+01  30.377  < 2e-16 ***
latitude       -1.734e+01  2.869e-01 -60.444  < 2e-16 ***
holiday         5.044e+00  2.528e-01  19.954  < 2e-16 ***
wind_speed.kph -1.470e+00  1.550e-02 -94.807  < 2e-16 ***
month          -1.337e+00  1.944e-02 -68.770  < 2e-16 ***
weekend         9.167e-01  1.415e-01   6.478 9.28e-11 ***
longitude      -5.877e-01  2.326e-01  -2.527   0.0115 *  
humidity        3.829e-01  3.681e-03 104.028  < 2e-16 ***
hour           -3.750e-01  9.907e-03 -37.854  < 2e-16 ***
temperature     2.895e-01  6.079e-03  47.618  < 2e-16 ***
wind_direction -3.536e-02  6.526e-04 -54.185  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 36.4 on 323638 degrees of freedom
Multiple R-squared:  0.1349,	

In [65]:
# Score the held-out test set with the selected PM2.5 model and report the
# regression metrics.
test_bj[["PM2.5_pred"]] <- predict(PM2.5_lmProfile$fit, newdata = test_bj)
test_bj %>% metrics(truth = PM2.5, estimate = PM2.5_pred)

rmse,rsq
36.43249,0.1377196


In [86]:
# PM10 model: same procedure as for PM2.5, with the observed PM2.5 value added
# as a predictor.
PM10_x <- train_bj[, c(
  "PM2.5",
  "latitude", "longitude", "hour", "month", "weekend", "holiday",
  "temperature", "humidity", "wind_direction", "wind_speed.kph"
)]
PM10_y <- train_bj[["PM10"]]
PM10_ctrl <- rfeControl(
  functions = lmFuncs,
  method = "repeatedcv",
  number = 10,
  repeats = 5
)
PM10_lmProfile <- rfe(
  x = PM10_x,
  y = PM10_y,
  sizes = subsets,
  rfeControl = PM10_ctrl
)
# Resampling results, then the coefficients of the selected linear model.
PM10_lmProfile
summary(PM10_lmProfile$fit)


Recursive feature selection

Outer resampling method: Cross-Validated (10 fold, repeated 5 times) 

Resampling performance over subset size:

 Variables  RMSE Rsquared   MAE RMSESD RsquaredSD   MAESD Selected
         1 47.22  0.01355 37.13 0.1638   0.001225 0.09537         
         3 46.72  0.03422 36.55 0.1756   0.001855 0.11278         
         5 46.57  0.04049 36.41 0.1730   0.001888 0.10985         
         7 37.02  0.39382 27.11 0.2267   0.004812 0.13473         
        10 36.61  0.40723 26.80 0.2274   0.004946 0.13482         
        11 36.60  0.40746 26.80 0.2273   0.004949 0.13474        *

The top 5 variables (out of 11):
   longitude, latitude, holiday, weekend, month



Call:
lm(formula = y ~ ., data = tmp)

Residuals:
     Min       1Q   Median       3Q      Max 
-153.374  -22.305   -3.924   17.578  210.130 

Coefficients:
                 Estimate Std. Error t value Pr(>|t|)    
(Intercept)     2.352e+03  2.730e+01  86.155   <2e-16 ***
longitude      -1.521e+01  2.339e-01 -65.010   <2e-16 ***
latitude       -1.331e+01  2.901e-01 -45.879   <2e-16 ***
holiday        -5.278e+00  2.543e-01 -20.753   <2e-16 ***
weekend         3.900e+00  1.423e-01  27.405   <2e-16 ***
month          -1.102e+00  1.970e-02 -55.955   <2e-16 ***
temperature     7.628e-01  6.134e-03 124.347   <2e-16 ***
PM2.5           7.162e-01  1.768e-03 405.189   <2e-16 ***
hour            4.067e-01  9.985e-03  40.730   <2e-16 ***
humidity       -3.074e-01  3.763e-03 -81.711   <2e-16 ***
wind_speed.kph  3.954e-02  1.580e-02   2.502   0.0124 *  
wind_direction -7.577e-03  6.593e-04 -11.493   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard er

In [87]:
# Score the held-out test set with the selected PM10 model and report the
# regression metrics.
test_bj[["PM10_pred"]] <- predict(PM10_lmProfile$fit, newdata = test_bj)
test_bj %>% metrics(truth = PM10, estimate = PM10_pred)

rmse,rsq
36.7546,0.4008434


In [84]:
# O3 model: same procedure as for PM2.5, with the observed PM2.5 and PM10
# values added as predictors.
O3_x <- train_bj[, c(
  "PM2.5", "PM10",
  "latitude", "longitude", "hour", "month", "weekend", "holiday",
  "temperature", "humidity", "wind_direction", "wind_speed.kph"
)]
O3_y <- train_bj[["O3"]]
O3_ctrl <- rfeControl(
  functions = lmFuncs,
  method = "repeatedcv",
  number = 10,
  repeats = 5
)
O3_lmProfile <- rfe(
  x = O3_x,
  y = O3_y,
  sizes = subsets,
  rfeControl = O3_ctrl
)
# Resampling results, then the coefficients of the selected linear model.
O3_lmProfile
summary(O3_lmProfile$fit)


Recursive feature selection

Outer resampling method: Cross-Validated (10 fold, repeated 5 times) 

Resampling performance over subset size:

 Variables  RMSE Rsquared   MAE RMSESD RsquaredSD   MAESD Selected
         1 43.50  0.01063 35.65 0.1120  0.0009913 0.07561         
         3 43.38  0.01628 35.44 0.1113  0.0012120 0.07159         
         5 32.58  0.44487 25.97 0.1011  0.0027814 0.08797         
         7 31.83  0.47011 25.17 0.3034  0.0103427 0.37337         
        10 31.39  0.48491 24.60 0.1092  0.0030048 0.08629         
        12 31.39  0.48493 24.59 0.1093  0.0030013 0.08622        *

The top 5 variables (out of 12):
   latitude, longitude, month, wind_speed.kph, temperature



Call:
lm(formula = y ~ ., data = tmp)

Residuals:
     Min       1Q   Median       3Q      Max 
-131.984  -21.169   -1.622   19.127  177.003 

Coefficients:
                 Estimate Std. Error  t value Pr(>|t|)    
(Intercept)    -1.475e+03  2.368e+01  -62.269  < 2e-16 ***
latitude        2.952e+01  2.496e-01  118.259  < 2e-16 ***
longitude       2.968e+00  2.019e-01   14.699  < 2e-16 ***
month          -2.611e+00  1.697e-02 -153.874  < 2e-16 ***
wind_speed.kph  2.282e+00  1.355e-02  168.365  < 2e-16 ***
temperature     2.180e+00  5.385e-03  404.780  < 2e-16 ***
hour           -5.420e-01  8.585e-03  -63.131  < 2e-16 ***
holiday        -3.316e-01  2.183e-01   -1.519 0.128736    
humidity       -3.374e-01  3.260e-03 -103.508  < 2e-16 ***
weekend        -1.319e-01  1.222e-01   -1.079 0.280386    
PM10           -7.371e-02  1.507e-03  -48.900  < 2e-16 ***
PM2.5          -6.882e-03  1.861e-03   -3.698 0.000217 ***
wind_direction -5.023e-05  5.655e-04   -0.089 0.929221    
---
Signif. code

In [85]:
# Score the held-out test set with the selected O3 model and report the
# regression metrics.
test_bj[["O3_pred"]] <- predict(O3_lmProfile$fit, newdata = test_bj)
test_bj %>% metrics(truth = O3, estimate = O3_pred)

rmse,rsq
31.37673,0.4879548


In [49]:
# The partitions are no longer needed once the models are fitted and scored.
rm(train_bj, test_bj)

### Building the next 2 days dataset and predicting PM2.5, PM10 and O3 values

In [68]:
# Pin the session timezone to GMT so the dates and times computed below do
# not depend on the machine's local timezone.
Sys.setenv(TZ = "GMT")
# First forecast day: the calendar day after the current date.
tomorrow <- 1 + Sys.Date()

In [69]:
# 48 hourly timestamps starting at the beginning of `tomorrow`, in UTC.
bj_time <- seq(as.POSIXct(tomorrow), length.out = 48, by = "1 hour") %>%
  with_tz(tzone = "UTC")
bj_future_data <- data.frame(bj_time)
# Zero-based hour index (0-47); it becomes the suffix of the submission
# test_id built later.
bj_future_data[["id"]] <- seq_len(nrow(bj_future_data)) - 1
# The two frames share no column name, so merge() returns every
# timestamp/station combination.
bj_future_data <- merge(
  x = bj_future_data,
  y = data.frame(unique(bj_aq_gm_combined_data$stationId))
)
colnames(bj_future_data) <- c("utc_time", "id", "stationId")

In [70]:
# Calendar features derived from each row's forecast timestamp.
bj_future_data[["hour"]] <- hour(bj_future_data[["utc_time"]])
bj_future_data[["month"]] <- month(bj_future_data[["utc_time"]])
bj_future_data[["date"]] <- date(bj_future_data[["utc_time"]])
# Logical weekend flag from chron; it is converted to an integer afterwards.
bj_future_data[["weekend"]] <- chron::is.weekend(bj_future_data[["date"]])

In [71]:
# Store the weekend flag as 0/1 integers.
bj_future_data[["weekend"]] <- as.integer(bj_future_data[["weekend"]])
# 1 when the row's date is listed in beijing_holidays, 0 otherwise
# (%in% never yields NA, so the conversion only produces 0 or 1).
bj_future_data[["holiday"]] <- as.numeric(
  bj_future_data[["date"]] %in% as.Date(beijing_holidays)
)

In [72]:
# Attach the closest-grid lookup and then the latitude/longitude table;
# both merges match rows on stationId.
bj_future_data <- bj_future_data %>%
  merge(bj_closest_stations, by = "stationId") %>%
  merge(bj_lat_long_data, by = "stationId")

In [73]:
# Hour component of the forecast identifier requested from the API.
# Note: this character variable shares its name with the hour() function
# used elsewhere in the notebook.
hour <- "11"
# The forecast is requested for the day before `tomorrow`.
bj_future_weather_url <- paste0(
  "http://kdd.caiyunapp.com/competition/forecast/bj/",
  tomorrow - 1, "-", hour,
  "/2k0d1d8"
)
# Display the URL for a visual check.
bj_future_weather_url

In [74]:
# Download the Beijing grid weather forecast as CSV text.
# NOTE(review): host/peer certificate verification is switched off here;
# the forecast URL is plain http, so these flags look unnecessary. Confirm
# before removing them.
bj_future_weather_file <- getURL(
  bj_future_weather_url,
  ssl.verifyhost = FALSE,
  ssl.verifypeer = FALSE
)
# Parse through `text =` so read.csv() creates and closes its own
# connection; a hand-made textConnection() would be left open after the read.
bj_future_weather_data <- read.csv(text = bj_future_weather_file, header = TRUE)

In [75]:
# Parse the forecast timestamps with anytime().
bj_future_weather_data[["forecast_time"]] <-
  anytime(bj_future_weather_data[["forecast_time"]])
# Drop the `id` and `weather` columns.
bj_future_weather_data <- select(bj_future_weather_data, -id, -weather)
# Rename the remaining seven columns by position so they line up with the
# join keys and predictor names used later in the notebook.
names(bj_future_weather_data) <- c(
  "stationName", "utc_time", "temperature", "pressure",
  "humidity", "wind_speed.kph", "wind_direction"
)

In [76]:
# Inspect the forecast table: structure, per-column summary, and the first
# and last timestamp it covers.
str(bj_future_weather_data)
summary(bj_future_weather_data)
summarise(
  bj_future_weather_data,
  min_date = min(utc_time),
  max_date = max(utc_time)
)

'data.frame':	31248 obs. of  7 variables:
 $ stationName   : Factor w/ 651 levels "beijing_grid_000",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ utc_time      : POSIXct, format: "2018-06-03 12:00:00" "2018-06-03 13:00:00" ...
 $ temperature   : num  25 24 23 22 22 21 21 20 20 19 ...
 $ pressure      : num  969 970 971 971 971 ...
 $ humidity      : num  26 26 24 22 21 21 21 21 22 22 ...
 $ wind_speed.kph: num  5.33 9.04 10.77 11.14 10.64 ...
 $ wind_direction: num  335 339 336 331 327 ...


           stationName       utc_time                    temperature   
 beijing_grid_000:   48   Min.   :2018-06-03 12:00:00   Min.   : 9.00  
 beijing_grid_001:   48   1st Qu.:2018-06-03 23:45:00   1st Qu.:21.50  
 beijing_grid_002:   48   Median :2018-06-04 11:30:00   Median :26.84  
 beijing_grid_003:   48   Mean   :2018-06-04 11:30:00   Mean   :26.63  
 beijing_grid_004:   48   3rd Qu.:2018-06-04 23:15:00   3rd Qu.:32.36  
 beijing_grid_005:   48   Max.   :2018-06-05 11:00:00   Max.   :39.74  
 (Other)         :30960                                                
    pressure         humidity     wind_speed.kph  wind_direction  
 Min.   : 835.3   Min.   : 3.00   Min.   : 0.07   Min.   :  0.02  
 1st Qu.: 923.3   1st Qu.:10.00   1st Qu.: 6.97   1st Qu.:193.75  
 Median : 968.0   Median :18.00   Median :11.91   Median :234.79  
 Mean   : 957.3   Mean   :22.05   Mean   :13.77   Mean   :219.16  
 3rd Qu.: 999.2   3rd Qu.:31.00   3rd Qu.:19.51   3rd Qu.:267.74  
 Max.   :1008.3   Max.

min_date,max_date
2018-06-03 12:00:00,2018-06-05 11:00:00


In [77]:
# Attach the forecast to every station/hour row, matching on the grid point
# name (stationName) and the timestamp; rows with no matching forecast get
# NA in the weather columns.
bj_future_data <- left_join(
  bj_future_data,
  bj_future_weather_data,
  by = c("stationName", "utc_time")
)

"Column `stationName` joining factors with different levels, coercing to character vector"

In [78]:
# Fill gaps in the forecast weather columns: every NA is replaced with the
# mean of the available values for the same station and hour of day.
# The same summarise / join / coalesce sequence was written out five times;
# it is now expressed through two small helpers. The per-variable lookup
# tables keep their original names because the cleanup cell at the end of
# the notebook removes them by name.
#
# NOTE(review): when a station/hour group has no observed value at all, the
# group mean is NaN, so that gap is still missing after the fill.
# NOTE(review): wind_direction is averaged arithmetically; if it is an angle
# in degrees a circular mean may be more appropriate - confirm.

# Mean of `column` (a column name given as a string) for each
# stationId/hour group of `data`, ignoring NAs.
bj_station_hour_mean <- function(data, column) {
  data %>%
    group_by(stationId, hour) %>%
    summarise_at(column, mean, na.rm = TRUE)
}

# Return `data` with the NAs in `column` replaced by the matching
# stationId/hour value from `means`. As in the original pipeline, the filled
# column ends up as the last column.
bj_fill_from_means <- function(data, means, column) {
  observed <- paste0(column, ".x")
  fallback <- paste0(column, ".y")
  joined <- left_join(data, means, by = c("stationId", "hour"))
  joined[[column]] <- coalesce(joined[[observed]], joined[[fallback]])
  joined[[observed]] <- NULL
  joined[[fallback]] <- NULL
  joined
}

# pressure
bj_future_data_pressure <- bj_station_hour_mean(bj_future_data, "pressure")
bj_future_data <- bj_fill_from_means(
  bj_future_data, bj_future_data_pressure, "pressure"
)

# temperature
bj_future_data_temperature <- bj_station_hour_mean(bj_future_data, "temperature")
bj_future_data <- bj_fill_from_means(
  bj_future_data, bj_future_data_temperature, "temperature"
)

# humidity
bj_future_data_humidity <- bj_station_hour_mean(bj_future_data, "humidity")
bj_future_data <- bj_fill_from_means(
  bj_future_data, bj_future_data_humidity, "humidity"
)

# wind speed
bj_future_data_wind_speed.kph <- bj_station_hour_mean(bj_future_data, "wind_speed.kph")
bj_future_data <- bj_fill_from_means(
  bj_future_data, bj_future_data_wind_speed.kph, "wind_speed.kph"
)

# wind direction
bj_future_data_wind_direction <- bj_station_hour_mean(bj_future_data, "wind_direction")
bj_future_data <- bj_fill_from_means(
  bj_future_data, bj_future_data_wind_direction, "wind_direction"
)

In [80]:
# Predict the three pollutants for every station/hour row.
# Order matters: the O3 model's coefficient table lists PM10 and PM2.5 among
# its predictors, so both columns must exist before O3 is predicted.
bj_future_data[["PM2.5"]] <- predict(PM2.5_lmProfile[["fit"]], bj_future_data)
bj_future_data[["PM10"]] <- predict(PM10_lmProfile[["fit"]], bj_future_data)
bj_future_data[["O3"]] <- predict(O3_lmProfile[["fit"]], bj_future_data)

In [81]:
# Submission key: "<stationId>#<hour index>", e.g. "aotizhongxin_aq#0".
bj_future_data[["test_id"]] <- paste0(
  bj_future_data[["stationId"]], "#", bj_future_data[["id"]]
)

In [82]:
# Inspect the finished prediction table: structure, per-column summary, and
# the first and last timestamp it covers.
str(bj_future_data)
summary(bj_future_data)
summarise(
  bj_future_data,
  min_date = min(utc_time),
  max_date = max(utc_time)
)

'data.frame':	1680 obs. of  22 variables:
 $ stationId     : Factor w/ 35 levels "aotizhongxin_aq",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ utc_time      : POSIXct, format: "2018-06-04 00:00:00" "2018-06-04 01:00:00" ...
 $ id            : num  0 1 2 3 4 5 6 7 8 9 ...
 $ hour          : int  0 1 2 3 4 5 6 7 8 9 ...
 $ month         : num  6 6 6 6 6 6 6 6 6 6 ...
 $ date          : Date, format: "2018-06-04" "2018-06-04" ...
 $ weekend       : int  0 0 0 0 0 0 0 0 0 0 ...
 $ holiday       : num  0 0 0 0 0 0 0 0 0 0 ...
 $ x             : int  5 5 5 5 5 5 5 5 5 5 ...
 $ stationName   : chr  "beijing_grid_304" "beijing_grid_304" "beijing_grid_304" "beijing_grid_304" ...
 $ distance      : num  2020 2020 2020 2020 2020 ...
 $ longitude     : num  116 116 116 116 116 ...
 $ latitude      : num  40 40 40 40 40 ...
 $ pressure      : num  1002 1002 1002 1002 1002 ...
 $ temperature   : num  20.7 20.6 20 19.6 19.4 20 20.3 20 20.7 22.4 ...
 $ humidity      : num  27 28 28 29 28 28 26 23 19 15 ...
 $ wind

           stationId       utc_time                         id       
 aotizhongxin_aq:  48   Min.   :2018-06-04 00:00:00   Min.   : 0.00  
 badaling_aq    :  48   1st Qu.:2018-06-04 11:45:00   1st Qu.:11.75  
 beibuxinqu_aq  :  48   Median :2018-06-04 23:30:00   Median :23.50  
 daxing_aq      :  48   Mean   :2018-06-04 23:30:00   Mean   :23.50  
 dingling_aq    :  48   3rd Qu.:2018-06-05 11:15:00   3rd Qu.:35.25  
 donggaocun_aq  :  48   Max.   :2018-06-05 23:00:00   Max.   :47.00  
 (Other)        :1392                                                
      hour           month        date               weekend     holiday 
 Min.   : 0.00   Min.   :6   Min.   :2018-06-04   Min.   :0   Min.   :0  
 1st Qu.: 5.75   1st Qu.:6   1st Qu.:2018-06-04   1st Qu.:0   1st Qu.:0  
 Median :11.50   Median :6   Median :2018-06-04   Median :0   Median :0  
 Mean   :11.50   Mean   :6   Mean   :2018-06-04   Mean   :0   Mean   :0  
 3rd Qu.:17.25   3rd Qu.:6   3rd Qu.:2018-06-05   3rd Qu.:0   3rd Qu.:

min_date,max_date
2018-06-04,2018-06-05 23:00:00


In [64]:
# Preview the first 20 rows of the prediction table.
head(bj_future_data, n = 20)

stationId,utc_time,id,hour,month,date,weekend,holiday,x,stationName,...,latitude,pressure,temperature,humidity,wind_speed.kph,wind_direction,PM2.5,PM10,O3,test_id
aotizhongxin_aq,2018-06-01 00:00:00,0,0,6,2018-06-01,0,0,5,beijing_grid_304,...,39.982,1007.804,22.4,19,272.74,1.84,48.28576,79.89863,74.98524,aotizhongxin_aq#0
aotizhongxin_aq,2018-06-01 01:00:00,1,1,6,2018-06-01,0,0,5,beijing_grid_304,...,39.982,1007.692,21.0,20,322.07,2.15,48.3269,79.87083,73.24972,aotizhongxin_aq#1
aotizhongxin_aq,2018-06-01 02:00:00,2,2,6,2018-06-01,0,0,5,beijing_grid_304,...,39.982,1007.685,20.5,22,339.79,3.31,48.36803,79.84303,71.5142,aotizhongxin_aq#2
aotizhongxin_aq,2018-06-01 03:00:00,3,3,6,2018-06-01,0,0,5,beijing_grid_304,...,39.982,1007.792,20.6,23,346.03,4.34,48.40916,79.81522,69.77868,aotizhongxin_aq#3
aotizhongxin_aq,2018-06-01 04:00:00,4,4,6,2018-06-01,0,0,5,beijing_grid_304,...,39.982,1007.99,22.1,23,352.29,4.96,48.4503,79.78742,68.04316,aotizhongxin_aq#4
aotizhongxin_aq,2018-06-01 05:00:00,5,5,6,2018-06-01,0,0,5,beijing_grid_304,...,39.982,1008.244,24.0,22,2.27,5.14,48.49143,79.75962,66.30764,aotizhongxin_aq#5
aotizhongxin_aq,2018-06-01 06:00:00,6,6,6,2018-06-01,0,0,5,beijing_grid_304,...,39.982,1008.475,24.9,21,13.73,4.75,48.53256,79.73182,64.57212,aotizhongxin_aq#6
aotizhongxin_aq,2018-06-01 07:00:00,7,7,6,2018-06-01,0,0,5,beijing_grid_304,...,39.982,1008.591,21.0,19,26.07,3.44,48.57369,79.70402,62.83661,aotizhongxin_aq#7
aotizhongxin_aq,2018-06-01 08:00:00,8,8,6,2018-06-01,0,0,5,beijing_grid_304,...,39.982,1008.518,26.1,17,64.42,1.05,48.61483,79.67621,61.10109,aotizhongxin_aq#8
aotizhongxin_aq,2018-06-01 09:00:00,9,9,6,2018-06-01,0,0,5,beijing_grid_304,...,39.982,1008.241,26.4,15,184.05,2.71,48.65596,79.64841,59.36557,aotizhongxin_aq#9


In [65]:
# Write the submission file: test_id plus the three predicted pollutants.
# paste() separates its arguments with spaces, which produced a file name of
# the form "bj_submission <date> .csv"; paste0() with an explicit underscore
# gives "bj_submission_<date>.csv".
write.csv(
  bj_future_data[, c("test_id", "PM2.5", "PM10", "O3")],
  file = paste0("bj_submission_", Sys.Date(), ".csv"),
  row.names = FALSE
)

In [66]:
# Remove the TZ override that was set before building the forecast data.
# This unsets the variable; it does not restore any value TZ had earlier.
Sys.unsetenv(x = "TZ")

In [69]:
# Free the Beijing forecast objects. Notebook cells can be run out of order,
# so some of these may already be gone (the recorded run warned that
# 'bj_aq_gm_combined_data' was not found). Only objects that still exist are
# removed, which keeps the cell free of warnings when it is re-run.
bj_cleanup_targets <- c(
  "bj_future_data",
  "bj_lat_long_data",
  "bj_aq_gm_combined_data",
  "bj_future_data_humidity",
  "bj_future_data_temperature",
  "bj_future_data_pressure",
  "bj_future_data_wind_direction",
  "bj_future_data_wind_speed.kph",
  "bj_future_weather_data",
  "bj_closest_stations"
)
# The helper vector is removed in the same call so nothing new is left behind.
rm(list = c(intersect(bj_cleanup_targets, ls()), "bj_cleanup_targets"))

"object 'bj_aq_gm_combined_data' not found"