### Import and transformations of API data

In [1]:
# Reset the interactive session before the notebook runs.
if (!is.null(dev.list())) {
  dev.off() # close the current graphics device, if one is open
}
rm(list = ls()) # drop every object in the current environment
cat("\014") # form feed: clears the console
# writeClipboard(as.character(x)) # copy data frame to clipboard



In [2]:
library(RCurl)
library(sqldf)
library(digest)
library(dplyr)
library(anytime)
library(geosphere)
library(lubridate)
library(chron)
require(caret)
require(rattle)
require(yardstick)

"package 'RCurl' was built under R version 3.4.4"Loading required package: bitops
Loading required package: gsubfn
Loading required package: proto
Could not load tcltk.  Will use slower R code instead.
Loading required package: RSQLite
"package 'dplyr' was built under R version 3.4.3"
Attaching package: 'dplyr'

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

"package 'lubridate' was built under R version 3.4.3"
Attaching package: 'lubridate'

The following object is masked from 'package:base':

    date

"package 'chron' was built under R version 3.4.3"
Attaching package: 'chron'

The following objects are masked from 'package:lubridate':

    days, hours, minutes, seconds, years

Loading required package: caret
"package 'caret' was built under R version 3.4.4"Loading required package: lattice
"package 'lattice' was built under R version 3.4.3"Loading required packa

In [3]:
# set working directory
# NOTE(review): absolute, machine-specific path; the relative file paths passed
# to later read.csv() calls resolve against this directory.
setwd("C:/Users/vanethi/Documents/GitHub/DS420_Factoria")

In [4]:
# Query window for the API requests; both values are pasted verbatim into the
# request URLs.
startDate <- "2017-12-31-0"
endDate <- "2018-06-01-0"

In [5]:
# pull data for Beijing
# Downloads the air-quality and grid-meteorology CSV feeds for the
# startDate..endDate window and parses each response body into a data frame.
# NOTE(review): ssl.verifyhost / ssl.verifypeer are disabled below, so the
# server certificate is not validated -- confirm this is still required.

# acquire air quality data
bj_aq_url <- paste0("https://biendata.com/competition/airquality/bj/", startDate, "/", endDate, "/2k0d1d8")
bj_aq_file <- getURL(bj_aq_url, ssl.verifyhost = FALSE, ssl.verifypeer = FALSE)
# Parse through `text =` so read.csv() opens and closes its own connection; a
# textConnection() passed as `file` is already open and is never closed.
bj_aq_data <- read.csv(text = bj_aq_file, header = TRUE)

# acquire API grid meteorology data
bj_gm_url <- paste0("https://biendata.com/competition/meteorology/bj_grid/", startDate, "/", endDate, "/2k0d1d8")
bj_gm_file <- getURL(bj_gm_url, ssl.verifyhost = FALSE, ssl.verifypeer = FALSE)
bj_gm_data <- read.csv(text = bj_gm_file, header = TRUE)

In [6]:
# Names of every object in the global environment that is a data frame.
df.list <- names(Filter(isTRUE, eapply(.GlobalEnv, is.data.frame)))
df.list

In [7]:
# Parse the `time` column of both API frames into date-times with anytime().
bj_gm_data[["time"]] <- anytime(bj_gm_data[["time"]])
bj_aq_data[["time"]] <- anytime(bj_aq_data[["time"]])

In [8]:
# printing structure of all the datasets
# Iterate over the names directly: 1:length(df.list) yields c(1, 0) when the
# list is empty and then fails inside get(). str() prints on its own and
# returns NULL, so it is not wrapped in print() (that only added a stray
# "NULL" line after each structure dump).
for (df_name in df.list) {
  print(df_name)
  str(get(df_name))
}

[1] "bj_gm_data"
'data.frame':	898970 obs. of  9 variables:
 $ id            : int  2000958 2000959 2000960 2000961 2000962 2000963 2000964 2000965 2000966 2000967 ...
 $ station_id    : Factor w/ 651 levels "beijing_grid_000",..: 1 2 3 4 5 6 7 8 9 10 ...
 $ time          : POSIXct, format: "2018-03-31 07:00:00" "2018-03-31 07:00:00" ...
 $ weather       : Factor w/ 9 levels "CLEAR_DAY","CLEAR_NIGHT",..: 5 5 5 5 5 5 5 5 5 5 ...
 $ temperature   : num  24 24 25 25 25 25 25 26 22.4 22.4 ...
 $ pressure      : num  973 960 946 931 914 ...
 $ humidity      : num  23 21 19 18 16 15 15 16 16 17 ...
 $ wind_direction: num  161 165 170 184 221 ...
 $ wind_speed    : num  14.28 13.3 12.42 9.84 7.39 ...
NULL
[1] "bj_aq_data"
'data.frame':	46900 obs. of  9 variables:
 $ id                : int  2941450 2941451 2941452 2941453 2941454 2941455 2941456 2941457 2941458 2941459 ...
 $ station_id        : Factor w/ 35 levels "aotizhongxin_aq",..: 7 24 11 27 1 19 26 3 35 10 ...
 $ time              : PO

In [9]:
# Show the earliest and latest timestamp of each API data set.
print("bj_gm_data")
summarise(bj_gm_data, min_date = min(time), max_date = max(time))
print("bj_aq_data")
summarise(bj_aq_data, min_date = min(time), max_date = max(time))

[1] "bj_gm_data"


min_date,max_date
2018-03-31 07:00:00,2018-05-28 22:00:00


[1] "bj_aq_data"


min_date,max_date
2018-03-31 07:00:00,2018-05-28 21:00:00


In [10]:
# Drop the columns that are not needed downstream: the row id from both
# frames, plus the categorical weather label from the meteorology frame.
bj_aq_data <- select(bj_aq_data, -id)
bj_gm_data <- select(bj_gm_data, -c(id, weather))

In [11]:
# Lookup table read from CSV and given fixed column names; it is joined to the
# air-quality data on stationId further down.
bj_closest_stations <- read.csv("SL_beijing_closest_stations.csv")
names(bj_closest_stations) <- c("x", "stationId", "stationName", "distance")

In [12]:
# Rename columns positionally so that the API frames use the same names as the
# lookup table (stationId / stationName).
names(bj_aq_data) <- c(
  "stationId", "utc_time", "PM2.5", "PM10", "NO2", "CO", "O3", "SO2"
)
names(bj_gm_data) <- c(
  "stationName", "utc_time", "temperature", "pressure", "humidity",
  "wind_direction", "wind_speed.kph"
)

In [13]:
# Inspect the structure of the three frames used by the upcoming merges and
# preview the station-to-grid lookup table.
str(bj_closest_stations)
str(bj_gm_data)
str(bj_aq_data)
head(bj_closest_stations)

'data.frame':	35 obs. of  4 variables:
 $ x          : int  1 2 3 4 5 6 7 8 9 10 ...
 $ stationId  : Factor w/ 35 levels "aotizhongxin_aq",..: 7 24 11 27 1 19 26 3 35 10 ...
 $ stationName: Factor w/ 26 levels "beijing_grid_216",..: 16 16 13 16 17 19 14 9 8 13 ...
 $ distance   : num  3540 1669 4638 4776 2020 ...
'data.frame':	898970 obs. of  7 variables:
 $ stationName   : Factor w/ 651 levels "beijing_grid_000",..: 1 2 3 4 5 6 7 8 9 10 ...
 $ utc_time      : POSIXct, format: "2018-03-31 07:00:00" "2018-03-31 07:00:00" ...
 $ temperature   : num  24 24 25 25 25 25 25 26 22.4 22.4 ...
 $ pressure      : num  973 960 946 931 914 ...
 $ humidity      : num  23 21 19 18 16 15 15 16 16 17 ...
 $ wind_direction: num  161 165 170 184 221 ...
 $ wind_speed.kph: num  14.28 13.3 12.42 9.84 7.39 ...
'data.frame':	46900 obs. of  8 variables:
 $ stationId: Factor w/ 35 levels "aotizhongxin_aq",..: 7 24 11 27 1 19 26 3 35 10 ...
 $ utc_time : POSIXct, format: "2018-03-31 07:00:00" "2018-03-31 07:00

x,stationId,stationName,distance
1,dongsi_aq,beijing_grid_303,3539.569
2,tiantan_aq,beijing_grid_303,1669.215
3,guanyuan_aq,beijing_grid_282,4637.889
4,wanshouxigong_aq,beijing_grid_303,4775.641
5,aotizhongxin_aq,beijing_grid_304,2020.02
6,nongzhanguan_aq,beijing_grid_324,5296.386


In [14]:
# Attach the lookup-table columns (grid name, distance) to every air-quality
# row; rows whose stationId is missing from either frame are dropped.
bj_aq_map <- merge(x = bj_aq_data, y = bj_closest_stations, by = "stationId")

In [15]:
# Join air-quality readings to the meteorology of their mapped grid cell,
# matching on grid name and timestamp.
bj_aq_gm_data <- merge(
  x = bj_aq_map,
  y = bj_gm_data,
  by = c("stationName", "utc_time")
)

In [16]:
# Inspect the merged frame: structure, first rows, and covered time range.
str(bj_aq_gm_data)
head(bj_aq_gm_data)
bj_aq_gm_data %>% summarize(min_date = min(utc_time), max_date = max(utc_time))

'data.frame':	46092 obs. of  16 variables:
 $ stationName   : Factor w/ 26 levels "beijing_grid_216",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ utc_time      : POSIXct, format: "2018-03-31 07:00:00" "2018-03-31 08:00:00" ...
 $ stationId     : Factor w/ 35 levels "aotizhongxin_aq",..: 14 14 14 14 14 14 14 14 14 14 ...
 $ PM2.5         : num  105 120 121 137 146 270 160 180 136 149 ...
 $ PM10          : num  163 185 185 212 254 323 297 280 204 219 ...
 $ NO2           : num  45 52 55 59 68 59 85 76 61 87 ...
 $ CO            : num  0.8 0.9 0.9 1 1 1.1 1.1 1.1 1.1 1.2 ...
 $ O3            : num  137 140 141 132 106 96 46 45 50 7 ...
 $ SO2           : num  6 NA NA NA NA 20 8 NA NA NA ...
 $ x             : int  30 30 30 30 30 30 30 30 30 30 ...
 $ distance      : num  2226 2226 2226 2226 2226 ...
 $ temperature   : num  22 23 23 24 20 19 17 13 14 12.4 ...
 $ pressure      : num  1001 1000 1000 999 1000 ...
 $ humidity      : num  20 20 22 24 28 31 32 32 31 32 ...
 $ wind_direction: num  159 174 188

stationName,utc_time,stationId,PM2.5,PM10,NO2,CO,O3,SO2,x,distance,temperature,pressure,humidity,wind_direction,wind_speed.kph
beijing_grid_216,2018-03-31 07:00:00,liulihe_aq,105,163,45,0.8,137,6.0,30,2226.39,22,1000.8969,20,159.45,10.63
beijing_grid_216,2018-03-31 08:00:00,liulihe_aq,120,185,52,0.9,140,,30,2226.39,23,1000.058,20,173.73,10.96
beijing_grid_216,2018-03-31 09:00:00,liulihe_aq,121,185,55,0.9,141,,30,2226.39,23,999.5212,22,188.34,10.21
beijing_grid_216,2018-03-31 10:00:00,liulihe_aq,137,212,59,1.0,132,,30,2226.39,24,999.4476,24,204.63,9.83
beijing_grid_216,2018-03-31 11:00:00,liulihe_aq,146,254,68,1.0,106,,30,2226.39,20,999.6694,28,232.0,7.27
beijing_grid_216,2018-03-31 12:00:00,liulihe_aq,270,323,59,1.1,96,20.0,30,2226.39,19,1000.0671,31,268.92,6.59


min_date,max_date
2018-03-31 07:00:00,2018-05-28 21:00:00


In [17]:
# Free the intermediate frames now that the merged frame exists.
rm(list = c("bj_gm_data", "bj_aq_data", "bj_aq_map", "bj_closest_stations"))

In [18]:
# Keep the identifiers, the three target pollutants and the meteorology
# columns, in this order.
bj_aq_gm_data <- select(
  bj_aq_gm_data,
  "stationName", "utc_time", "stationId", "PM2.5", "PM10", "O3",
  "temperature", "pressure", "humidity", "wind_direction", "wind_speed.kph"
)

In [19]:
# Derive calendar features from utc_time: hour of day, month number, calendar
# date, and a logical weekend flag computed from that date.
bj_aq_gm_data[["hour"]] <- hour(bj_aq_gm_data[["utc_time"]])
bj_aq_gm_data[["month"]] <- month(bj_aq_gm_data[["utc_time"]])
bj_aq_gm_data[["date"]] <- date(bj_aq_gm_data[["utc_time"]])
bj_aq_gm_data[["weekend"]] <- chron::is.weekend(bj_aq_gm_data[["date"]])

In [20]:
# Confirm the column set and types after feature creation.
str(bj_aq_gm_data)

'data.frame':	46092 obs. of  15 variables:
 $ stationName   : Factor w/ 26 levels "beijing_grid_216",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ utc_time      : POSIXct, format: "2018-03-31 07:00:00" "2018-03-31 08:00:00" ...
 $ stationId     : Factor w/ 35 levels "aotizhongxin_aq",..: 14 14 14 14 14 14 14 14 14 14 ...
 $ PM2.5         : num  105 120 121 137 146 270 160 180 136 149 ...
 $ PM10          : num  163 185 185 212 254 323 297 280 204 219 ...
 $ O3            : num  137 140 141 132 106 96 46 45 50 7 ...
 $ temperature   : num  22 23 23 24 20 19 17 13 14 12.4 ...
 $ pressure      : num  1001 1000 1000 999 1000 ...
 $ humidity      : num  20 20 22 24 28 31 32 32 31 32 ...
 $ wind_direction: num  159 174 188 205 232 ...
 $ wind_speed.kph: num  10.63 10.96 10.21 9.83 7.27 ...
 $ hour          : int  7 8 9 10 11 12 13 14 15 17 ...
 $ month         : num  3 3 3 3 3 3 3 3 3 3 ...
 $ date          : Date, format: "2018-03-31" "2018-03-31" ...
 $ weekend       : logi  TRUE TRUE TRUE TRUE TRUE TRUE

### Import and transformations of Historical data

In [21]:
# Retrieve the historical data set; strings are kept as character vectors
# instead of being converted to factors.
bj_aq_gm_hist_file <- "Ready for Modeling/bj_aq_gm_hist_data.csv"
bj_aq_gm_hist_data <- read.csv(
  bj_aq_gm_hist_file,
  header = TRUE,
  sep = ",",
  stringsAsFactors = FALSE
)

In [22]:
# Bring the historical frame in line with the API frame: drop the X column,
# parse utc_time into date-times, and parse date as a Date.
bj_aq_gm_hist_data <- select(bj_aq_gm_hist_data, -X)
bj_aq_gm_hist_data[["utc_time"]] <- anytime(bj_aq_gm_hist_data[["utc_time"]])
bj_aq_gm_hist_data[["date"]] <- as.Date(
  bj_aq_gm_hist_data[["date"]],
  format = "%Y-%m-%d"
)

In [23]:
# Confirm the historical frame's columns and types before it is appended.
str(bj_aq_gm_hist_data)

'data.frame':	356825 obs. of  15 variables:
 $ stationName   : chr  "beijing_grid_216" "beijing_grid_216" "beijing_grid_216" "beijing_grid_216" ...
 $ utc_time      : POSIXct, format: "2017-01-01 14:00:00" "2017-01-01 15:00:00" ...
 $ stationId     : chr  "liulihe_aq" "liulihe_aq" "liulihe_aq" "liulihe_aq" ...
 $ PM2.5         : int  376 369 361 354 356 315 287 254 231 224 ...
 $ PM10          : int  447 407 389 NA 360 NA NA NA NA NA ...
 $ O3            : int  2 2 2 2 2 2 NA 2 2 2 ...
 $ temperature   : num  -2.22 -2.37 -2.58 -2.79 -2.99 -3.3 -3.6 -3.9 -4.11 -4.33 ...
 $ pressure      : num  1015 1015 1015 1015 1015 ...
 $ humidity      : num  62.9 65.8 65.3 64.9 64.5 ...
 $ wind_direction: num  284 298 308 319 330 ...
 $ wind_speed.kph: num  4.28 4.39 4.19 4.14 4.24 4.1 3.99 3.92 2.97 2.25 ...
 $ hour          : int  14 15 16 17 18 19 20 21 22 23 ...
 $ month         : int  1 1 1 1 1 1 1 1 1 1 ...
 $ date          : Date, format: "2017-01-01" "2017-01-01" ...
 $ weekend       : logi 

In [24]:
# Append API and hist data
# Stacks the API-derived rows on top of the historical rows; rbind() on data
# frames matches columns by name, so both frames must expose the same columns.
bj_aq_gm_combined_data <- rbind(bj_aq_gm_data, bj_aq_gm_hist_data)

In [25]:
# Inspect the combined frame: structure, covered date range, and per-column
# summary statistics (before outlier handling).
str(bj_aq_gm_combined_data)
bj_aq_gm_combined_data %>% summarize(min_date = min(date), max_date = max(date))
summary(bj_aq_gm_combined_data)

'data.frame':	402917 obs. of  15 variables:
 $ stationName   : Factor w/ 26 levels "beijing_grid_216",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ utc_time      : POSIXct, format: "2018-03-31 07:00:00" "2018-03-31 08:00:00" ...
 $ stationId     : Factor w/ 35 levels "aotizhongxin_aq",..: 14 14 14 14 14 14 14 14 14 14 ...
 $ PM2.5         : num  105 120 121 137 146 270 160 180 136 149 ...
 $ PM10          : num  163 185 185 212 254 323 297 280 204 219 ...
 $ O3            : num  137 140 141 132 106 96 46 45 50 7 ...
 $ temperature   : num  22 23 23 24 20 19 17 13 14 12.4 ...
 $ pressure      : num  1001 1000 1000 999 1000 ...
 $ humidity      : num  20 20 22 24 28 31 32 32 31 32 ...
 $ wind_direction: num  159 174 188 205 232 ...
 $ wind_speed.kph: num  10.63 10.96 10.21 9.83 7.27 ...
 $ hour          : int  7 8 9 10 11 12 13 14 15 17 ...
 $ month         : num  3 3 3 3 3 3 3 3 3 3 ...
 $ date          : Date, format: "2018-03-31" "2018-03-31" ...
 $ weekend       : logi  TRUE TRUE TRUE TRUE TRUE TRU

min_date,max_date
2017-01-01,2018-05-28


           stationName        utc_time                           stationId     
 beijing_grid_303: 69072   Min.   :2017-01-01 14:00:00   badaling_aq  : 11512  
 beijing_grid_282: 23024   1st Qu.:2017-05-09 22:00:00   beibuxinqu_aq: 11512  
 beijing_grid_283: 23024   Median :2017-09-24 14:00:00   daxing_aq    : 11512  
 beijing_grid_324: 23024   Mean   :2017-09-18 23:43:01   dingling_aq  : 11512  
 beijing_grid_452: 23024   3rd Qu.:2018-01-21 02:00:00   donggaocun_aq: 11512  
 beijing_grid_216: 11512   Max.   :2018-05-28 21:00:00   dongsi_aq    : 11512  
 (Other)         :230237                                 (Other)      :333845  
     PM2.5             PM10              O3          temperature    
 Min.   :   2.0   Min.   :   5.0   Min.   :  1.00   Min.   :-18.37  
 1st Qu.:  17.0   1st Qu.:  40.0   1st Qu.: 15.00   1st Qu.:  0.55  
 Median :  41.0   Median :  75.0   Median : 49.00   Median : 11.71  
 Mean   :  60.8   Mean   :  93.9   Mean   : 58.38   Mean   : 11.40  
 3rd Qu.:  81.0

In [26]:
# The two source frames are no longer needed once they have been combined.
rm(list = c("bj_aq_gm_data", "bj_aq_gm_hist_data"))

In [30]:
# Replace outliers with NA in each pollutant column.
# A value is an outlier when boxplot.stats() reports it in `$out` for that
# column. The assignment targets the column vector directly:
# `df[mask, ]$col <- NA` stops with "replacement has 1 row, data has 0" when a
# column has no outliers, and it also copies every selected row.
for (pollutant in c("PM2.5", "PM10", "O3")) {
  outlier_values <- boxplot.stats(bj_aq_gm_combined_data[[pollutant]])$out
  is_outlier <- bj_aq_gm_combined_data[[pollutant]] %in% outlier_values
  bj_aq_gm_combined_data[[pollutant]][is_outlier] <- NA
}
# keep the global environment free of loop temporaries
rm(pollutant, outlier_values, is_outlier)

In [40]:
# Re-inspect structure and summary statistics after outliers were set to NA.
str(bj_aq_gm_combined_data)
summary(bj_aq_gm_combined_data)

'data.frame':	402917 obs. of  15 variables:
 $ stationName   : Factor w/ 26 levels "beijing_grid_216",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ utc_time      : POSIXct, format: "2018-03-31 07:00:00" "2018-03-31 08:00:00" ...
 $ stationId     : Factor w/ 35 levels "aotizhongxin_aq",..: 14 14 14 14 14 14 14 14 14 14 ...
 $ PM2.5         : num  105 120 121 137 146 NA 160 NA 136 149 ...
 $ PM10          : num  163 185 185 212 NA NA NA NA 204 219 ...
 $ O3            : num  137 140 141 132 106 96 46 45 50 7 ...
 $ temperature   : num  22 23 23 24 20 19 17 13 14 12.4 ...
 $ pressure      : num  1001 1000 1000 999 1000 ...
 $ humidity      : num  20 20 22 24 28 31 32 32 31 32 ...
 $ wind_direction: num  159 174 188 205 232 ...
 $ wind_speed.kph: num  10.63 10.96 10.21 9.83 7.27 ...
 $ hour          : int  7 8 9 10 11 12 13 14 15 17 ...
 $ month         : num  3 3 3 3 3 3 3 3 3 3 ...
 $ date          : Date, format: "2018-03-31" "2018-03-31" ...
 $ weekend       : logi  TRUE TRUE TRUE TRUE TRUE TRUE ...


           stationName        utc_time                           stationId     
 beijing_grid_303: 69072   Min.   :2017-01-01 14:00:00   badaling_aq  : 11512  
 beijing_grid_282: 23024   1st Qu.:2017-05-09 22:00:00   beibuxinqu_aq: 11512  
 beijing_grid_283: 23024   Median :2017-09-24 14:00:00   daxing_aq    : 11512  
 beijing_grid_324: 23024   Mean   :2017-09-18 23:43:01   dingling_aq  : 11512  
 beijing_grid_452: 23024   3rd Qu.:2018-01-21 02:00:00   donggaocun_aq: 11512  
 beijing_grid_216: 11512   Max.   :2018-05-28 21:00:00   dongsi_aq    : 11512  
 (Other)         :230237                                 (Other)      :333845  
     PM2.5             PM10              O3          temperature    
 Min.   :  2.00   Min.   :  5.00   Min.   :  1.00   Min.   :-18.37  
 1st Qu.: 16.00   1st Qu.: 39.00   1st Qu.: 14.00   1st Qu.:  0.55  
 Median : 38.00   Median : 72.00   Median : 47.00   Median : 11.71  
 Mean   : 49.69   Mean   : 82.42   Mean   : 52.55   Mean   : 11.40  
 3rd Qu.: 72.00

In [41]:
# Fill missing PM2.5 values with group means, in two passes.
# Pass 1: mean per stationId / hour / month / weekend.
bj_aq_gm_combined_data_PM2.5 <- summarise(
  group_by(bj_aq_gm_combined_data, stationId, hour, month, weekend),
  PM2.5 = mean(PM2.5, na.rm = TRUE)
)
bj_aq_gm_combined_data <- left_join(
  bj_aq_gm_combined_data,
  bj_aq_gm_combined_data_PM2.5,
  c("stationId", "hour", "month", "weekend")
) %>%
  mutate(PM2.5 = coalesce(PM2.5.x, PM2.5.y)) %>%
  select(-PM2.5.x, -PM2.5.y)

# Pass 2: coarser mean per stationId / hour / weekend, for values that are
# still missing after pass 1.
bj_aq_gm_combined_data_PM2.5 <- summarise(
  group_by(bj_aq_gm_combined_data, stationId, hour, weekend),
  PM2.5 = mean(PM2.5, na.rm = TRUE)
)
bj_aq_gm_combined_data <- left_join(
  bj_aq_gm_combined_data,
  bj_aq_gm_combined_data_PM2.5,
  c("stationId", "hour", "weekend")
) %>%
  mutate(PM2.5 = coalesce(PM2.5.x, PM2.5.y)) %>%
  select(-PM2.5.x, -PM2.5.y)

"package 'bindrcpp' was built under R version 3.4.3"

In [42]:
# Fill missing PM10 values with group means, in two passes.
# Pass 1: mean per stationId / hour / month / weekend.
bj_aq_gm_combined_data_PM10 <- summarise(
  group_by(bj_aq_gm_combined_data, stationId, hour, month, weekend),
  PM10 = mean(PM10, na.rm = TRUE)
)
bj_aq_gm_combined_data <- left_join(
  bj_aq_gm_combined_data,
  bj_aq_gm_combined_data_PM10,
  c("stationId", "hour", "month", "weekend")
) %>%
  mutate(PM10 = coalesce(PM10.x, PM10.y)) %>%
  select(-PM10.x, -PM10.y)

# Pass 2: coarser mean per stationId / hour / weekend, for values that are
# still missing after pass 1.
bj_aq_gm_combined_data_PM10 <- summarise(
  group_by(bj_aq_gm_combined_data, stationId, hour, weekend),
  PM10 = mean(PM10, na.rm = TRUE)
)
bj_aq_gm_combined_data <- left_join(
  bj_aq_gm_combined_data,
  bj_aq_gm_combined_data_PM10,
  c("stationId", "hour", "weekend")
) %>%
  mutate(PM10 = coalesce(PM10.x, PM10.y)) %>%
  select(-PM10.x, -PM10.y)

In [43]:
# Fill missing O3 values with group means, in two passes.
# Pass 1: mean per stationId / hour / month / weekend.
bj_aq_gm_combined_data_O3 <- summarise(
  group_by(bj_aq_gm_combined_data, stationId, hour, month, weekend),
  O3 = mean(O3, na.rm = TRUE)
)
bj_aq_gm_combined_data <- left_join(
  bj_aq_gm_combined_data,
  bj_aq_gm_combined_data_O3,
  c("stationId", "hour", "month", "weekend")
) %>%
  mutate(O3 = coalesce(O3.x, O3.y)) %>%
  select(-O3.x, -O3.y)

# Pass 2: coarser mean per stationId / hour / weekend, for values that are
# still missing after pass 1.
bj_aq_gm_combined_data_O3 <- summarise(
  group_by(bj_aq_gm_combined_data, stationId, hour, weekend),
  O3 = mean(O3, na.rm = TRUE)
)
bj_aq_gm_combined_data <- left_join(
  bj_aq_gm_combined_data,
  bj_aq_gm_combined_data_O3,
  c("stationId", "hour", "weekend")
) %>%
  mutate(O3 = coalesce(O3.x, O3.y)) %>%
  select(-O3.x, -O3.y)

In [44]:
# Drop the per-group mean tables used for imputation.
rm(list = c(
  "bj_aq_gm_combined_data_PM2.5",
  "bj_aq_gm_combined_data_PM10",
  "bj_aq_gm_combined_data_O3"
))

In [45]:
# Re-inspect structure and summary statistics after imputation; the imputed
# pollutant columns now sit at the end of the frame.
str(bj_aq_gm_combined_data)
summary(bj_aq_gm_combined_data)

'data.frame':	402917 obs. of  15 variables:
 $ stationName   : Factor w/ 26 levels "beijing_grid_216",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ utc_time      : POSIXct, format: "2018-03-31 07:00:00" "2018-03-31 08:00:00" ...
 $ stationId     : Factor w/ 35 levels "aotizhongxin_aq",..: 14 14 14 14 14 14 14 14 14 14 ...
 $ temperature   : num  22 23 23 24 20 19 17 13 14 12.4 ...
 $ pressure      : num  1001 1000 1000 999 1000 ...
 $ humidity      : num  20 20 22 24 28 31 32 32 31 32 ...
 $ wind_direction: num  159 174 188 205 232 ...
 $ wind_speed.kph: num  10.63 10.96 10.21 9.83 7.27 ...
 $ hour          : int  7 8 9 10 11 12 13 14 15 17 ...
 $ month         : num  3 3 3 3 3 3 3 3 3 3 ...
 $ date          : Date, format: "2018-03-31" "2018-03-31" ...
 $ weekend       : logi  TRUE TRUE TRUE TRUE TRUE TRUE ...
 $ PM2.5         : num  105 120 121 137 146 ...
 $ PM10          : num  163 185 185 212 116 ...
 $ O3            : num  137 140 141 132 106 96 46 45 50 7 ...


           stationName        utc_time                           stationId     
 beijing_grid_303: 69072   Min.   :2017-01-01 14:00:00   badaling_aq  : 11512  
 beijing_grid_282: 23024   1st Qu.:2017-05-09 22:00:00   beibuxinqu_aq: 11512  
 beijing_grid_283: 23024   Median :2017-09-24 14:00:00   daxing_aq    : 11512  
 beijing_grid_324: 23024   Mean   :2017-09-18 23:43:01   dingling_aq  : 11512  
 beijing_grid_452: 23024   3rd Qu.:2018-01-21 02:00:00   donggaocun_aq: 11512  
 beijing_grid_216: 11512   Max.   :2018-05-28 21:00:00   dongsi_aq    : 11512  
 (Other)         :230237                                 (Other)      :333845  
  temperature        pressure         humidity      wind_direction 
 Min.   :-18.37   Min.   : 917.6   Min.   :  4.59   Min.   :  0.0  
 1st Qu.:  0.55   1st Qu.: 993.2   1st Qu.: 21.66   1st Qu.:106.6  
 Median : 11.71   Median :1004.4   Median : 33.03   Median :189.0  
 Mean   : 11.40   Mean   :1001.2   Mean   : 37.94   Mean   :191.0  
 3rd Qu.: 22.00   3r

In [46]:
# List every row that still contains a missing value; an empty result means
# the imputation covered everything.
bj_aq_gm_combined_data[!complete.cases(bj_aq_gm_combined_data), , drop = FALSE]

stationName,utc_time,stationId,temperature,pressure,humidity,wind_direction,wind_speed.kph,hour,month,date,weekend,PM2.5,PM10,O3


In [47]:
# Recode the logical weekend flag as integer (TRUE -> 1L, FALSE -> 0L) for
# modeling.
bj_aq_gm_combined_data[["weekend"]] <- as.integer(
  bj_aq_gm_combined_data[["weekend"]]
)

In [48]:
# Restore a fixed column order (identifiers, pollutants, meteorology, calendar
# features) after imputation moved the pollutant columns to the end.
bj_aq_gm_combined_data <- select(
  bj_aq_gm_combined_data,
  "stationName", "utc_time", "stationId", "PM2.5", "PM10", "O3",
  "temperature", "pressure", "humidity", "wind_direction", "wind_speed.kph",
  "hour", "month", "date", "weekend"
)

In [49]:
# Retrieve station coordinates and give the columns fixed names.
bj_lat_long_file <- "Datasets/Beijing_AirQuality_Stations.csv"
bj_lat_long_data <- read.csv(
  bj_lat_long_file,
  header = TRUE,
  sep = ",",
  stringsAsFactors = FALSE
)
names(bj_lat_long_data) <- c("stationId", "longitude", "latitude")

In [None]:
# Attach longitude/latitude to every observation via its stationId.
bj_aq_gm_combined_data <- merge(
  x = bj_aq_gm_combined_data,
  y = bj_lat_long_data,
  by = "stationId"
)

In [52]:
# Inspect the final modeling frame: structure, covered time range, and summary
# statistics.
str(bj_aq_gm_combined_data)
bj_aq_gm_combined_data %>% summarize(min_date = min(utc_time), max_date = max(utc_time))
summary(bj_aq_gm_combined_data)

'data.frame':	402917 obs. of  17 variables:
 $ stationId     : Factor w/ 35 levels "aotizhongxin_aq",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ stationName   : Factor w/ 26 levels "beijing_grid_216",..: 17 17 17 17 17 17 17 17 17 17 ...
 $ utc_time      : POSIXct, format: "2017-01-06 19:00:00" "2017-01-06 20:00:00" ...
 $ PM2.5         : num  46.1 43.8 48.7 173 166 ...
 $ PM10          : num  60.4 56.3 63.7 50.6 59.2 ...
 $ O3            : num  2 33.1 2 2 2 ...
 $ temperature   : num  -3.9 -3.94 -3.86 -3.63 -3.46 -3.98 -3.81 -3.67 -3.85 -4.2 ...
 $ pressure      : num  1018 1018 1018 1018 1018 ...
 $ humidity      : num  87.5 87.5 87.4 86.6 86.1 ...
 $ wind_direction: num  30.1 15.6 47.8 352.5 347.4 ...
 $ wind_speed.kph: num  1.83 2.09 1.71 2.8 3.01 ...
 $ hour          : int  19 20 18 23 0 21 22 17 18 19 ...
 $ month         : num  1 1 1 1 1 1 1 1 1 1 ...
 $ date          : Date, format: "2017-01-06" "2017-01-06" ...
 $ weekend       : int  0 0 0 0 1 0 0 0 0 0 ...
 $ longitude     : num  116 116

min_date,max_date
2017-01-01 14:00:00,2018-05-28 21:00:00


         stationId                stationName        utc_time                  
 badaling_aq  : 11512   beijing_grid_303: 69072   Min.   :2017-01-01 14:00:00  
 beibuxinqu_aq: 11512   beijing_grid_282: 23024   1st Qu.:2017-05-09 22:00:00  
 daxing_aq    : 11512   beijing_grid_283: 23024   Median :2017-09-24 14:00:00  
 dingling_aq  : 11512   beijing_grid_324: 23024   Mean   :2017-09-18 23:43:01  
 donggaocun_aq: 11512   beijing_grid_452: 23024   3rd Qu.:2018-01-21 02:00:00  
 dongsi_aq    : 11512   beijing_grid_216: 11512   Max.   :2018-05-28 21:00:00  
 (Other)      :333845   (Other)         :230237                                
     PM2.5             PM10              O3          temperature    
 Min.   :  2.00   Min.   :  5.00   Min.   :  1.00   Min.   :-18.37  
 1st Qu.: 18.00   1st Qu.: 45.00   1st Qu.: 16.00   1st Qu.:  0.55  
 Median : 41.49   Median : 73.21   Median : 49.00   Median : 11.71  
 Mean   : 49.79   Mean   : 80.88   Mean   : 54.73   Mean   : 11.40  
 3rd Qu.: 69.00

### Model Training and validation

In [53]:
# Fix the RNG seed so the random train/test split below is reproducible.
set.seed(2306)

In [54]:
# Random 80/20 split: draw 80% of the row indices (without replacement) for
# training; the remaining rows form the test set.
sample_size <- floor(nrow(bj_aq_gm_combined_data) * 0.8)
train_index <- sample.int(nrow(bj_aq_gm_combined_data), size = sample_size)
test_bj <- bj_aq_gm_combined_data[-train_index, ]
train_bj <- bj_aq_gm_combined_data[train_index, ]

In [55]:
# Check the sizes and column types of the two partitions.
str(train_bj)
str(test_bj)

'data.frame':	322333 obs. of  17 variables:
 $ stationId     : Factor w/ 35 levels "aotizhongxin_aq",..: 11 22 31 21 14 4 3 14 29 11 ...
 $ stationName   : Factor w/ 26 levels "beijing_grid_216",..: 13 16 16 26 1 15 9 1 3 13 ...
 $ utc_time      : POSIXct, format: "2017-05-10 13:00:00" "2017-08-16 02:00:00" ...
 $ PM2.5         : num  51 99 28 104 45 4 34 36 166 18 ...
 $ PM10          : num  132 100 47 34.6 114 ...
 $ O3            : num  112 26 19 13 42 50 2 2 81 2 ...
 $ temperature   : num  20.44 28.83 4.53 -6.37 16 ...
 $ pressure      : num  998 1000 1017 1005 1010 ...
 $ humidity      : num  21.2 56.6 15.8 96.3 27 ...
 $ wind_direction: num  285 180 248 295 258 ...
 $ wind_speed.kph: num  9.08 6.32 12.98 8.16 6.64 ...
 $ hour          : int  13 2 5 23 17 23 21 14 15 22 ...
 $ month         : num  5 8 12 2 5 2 4 4 9 12 ...
 $ date          : Date, format: "2017-05-10" "2017-08-16" ...
 $ weekend       : int  0 0 0 0 0 1 0 0 1 1 ...
 $ longitude     : num  116 116 116 117 116 ...


In [56]:
# Predicting PM2.5 from latitude, longitude, hour, month and weekend
# (stationId itself is not in the formula; stations enter via coordinates).
PM2.5_bj_formula <- as.formula("PM2.5 ~ latitude + longitude + hour + month + weekend")
# Fit a linear regression through caret's train() on the training partition.
PM2.5_bj_model <- train(PM2.5_bj_formula, data = train_bj, method = "lm" )
summary(PM2.5_bj_model)
# Score the held-out partition and report yardstick metrics against the truth.
test_bj$PM2.5_pred <- predict(PM2.5_bj_model, test_bj)
metrics(test_bj, truth = PM2.5, estimate = PM2.5_pred)


Call:
lm(formula = .outcome ~ ., data = dat)

Residuals:
    Min      1Q  Median      3Q     Max 
-57.673 -30.660  -8.329  18.566 138.595 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) 588.582604  28.888297  20.374  < 2e-16 ***
latitude    -16.066214   0.303024 -53.020  < 2e-16 ***
longitude     0.910126   0.247749   3.674 0.000239 ***
hour          0.046803   0.009896   4.730 2.25e-06 ***
month        -0.578321   0.019512 -29.639  < 2e-16 ***
weekend       1.290028   0.150985   8.544  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 38.83 on 322327 degrees of freedom
Multiple R-squared:  0.01175,	Adjusted R-squared:  0.01174 
F-statistic: 766.7 on 5 and 322327 DF,  p-value: < 2.2e-16


rmse,rsq
38.81946,0.0119749


In [57]:
# Predicting PM10 from PM2.5, latitude, longitude, hour, month and weekend.
# During training and validation the PM2.5 predictor is the PM2.5 column of
# train_bj / test_bj, not the PM2.5_pred column produced by the PM2.5 model.
PM10_bj_formula <- as.formula("PM10 ~ PM2.5 + latitude + longitude + hour + month + weekend")
# Fit a linear regression through caret's train() on the training partition.
PM10_bj_model <- train(PM10_bj_formula, data = train_bj, method = "lm" )
summary(PM10_bj_model)
# Score the held-out partition and report yardstick metrics against the truth.
test_bj$PM10_pred <- predict(PM10_bj_model, test_bj)
metrics(test_bj, truth = PM10, estimate = PM10_pred)


Call:
lm(formula = .outcome ~ ., data = dat)

Residuals:
     Min       1Q   Median       3Q      Max 
-146.426  -23.290   -4.945   17.768  197.472 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept)  2.572e+03  2.820e+01  91.207  < 2e-16 ***
PM2.5        6.994e-01  1.718e-03 407.008  < 2e-16 ***
latitude    -1.908e+01  2.969e-01 -64.258  < 2e-16 ***
longitude   -1.512e+01  2.417e-01 -62.569  < 2e-16 ***
hour        -6.896e-02  9.655e-03  -7.142 9.18e-13 ***
month       -5.227e-01  1.906e-02 -27.420  < 2e-16 ***
weekend      3.313e+00  1.473e-01  22.489  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 37.89 on 322326 degrees of freedom
Multiple R-squared:  0.3661,	Adjusted R-squared:  0.3661 
F-statistic: 3.103e+04 on 6 and 322326 DF,  p-value: < 2.2e-16


rmse,rsq
37.90572,0.3680417


In [58]:
# Predicting O3 from PM2.5, PM10, latitude, longitude, hour, month and weekend.
# During training and validation the PM2.5 and PM10 predictors are the columns
# of train_bj / test_bj, not the *_pred columns produced by the other models.
O3_bj_formula <- as.formula("O3 ~ PM2.5 + PM10 + latitude + longitude + hour + month + weekend")
# Fit a linear regression through caret's train() on the training partition.
O3_bj_model <- train(O3_bj_formula, data = train_bj, method = "lm" )
summary(O3_bj_model)
# Score the held-out partition and report yardstick metrics against the truth.
test_bj$O3_pred <- predict(O3_bj_model, test_bj)
metrics(test_bj, truth = O3, estimate = O3_pred)


Call:
lm(formula = .outcome ~ ., data = dat)

Residuals:
    Min      1Q  Median      3Q     Max 
-94.797 -31.646  -6.674  24.149 150.122 

Coefficients:
              Estimate Std. Error  t value Pr(>|t|)    
(Intercept) -1.550e+03  3.107e+01  -49.878  < 2e-16 ***
PM2.5       -1.660e-01  2.300e-03  -72.155  < 2e-16 ***
PM10         9.195e-02  1.916e-03   47.992  < 2e-16 ***
latitude     1.797e+01  3.251e-01   55.289  < 2e-16 ***
longitude    7.826e+00  2.645e-01   29.585  < 2e-16 ***
hour        -1.717e+00  1.050e-02 -163.439  < 2e-16 ***
month       -8.041e-01  2.076e-02  -38.731  < 2e-16 ***
weekend     -4.769e-01  1.604e-01   -2.974  0.00294 ** 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 41.21 on 322325 degrees of freedom
Multiple R-squared:  0.106,	Adjusted R-squared:  0.106 
F-statistic:  5461 on 7 and 322325 DF,  p-value: < 2.2e-16


rmse,rsq
41.20984,0.107647


In [59]:
# The partitions are no longer needed once the models are fitted and scored.
rm(list = c("train_bj", "test_bj"))

### Building the next 2 days dataset and predicting PM2.5, PM10 and O3 values

In [115]:
# Run the date-time code below under GMT so results do not depend on the
# machine's local timezone, then take tomorrow's calendar date.
Sys.setenv(TZ = "GMT")
tomorrow <- Sys.Date() + 1

In [116]:
# Build the prediction grid: 48 hourly timestamps starting at tomorrow's
# midnight, crossed with every stationId.
bj_time <- seq(from = as.POSIXct(tomorrow), by = "1 hour", length.out = 48)
bj_time <- with_tz(bj_time, tzone = "UTC")
bj_future_data <- data.frame(bj_time)
# Zero-based hour offset (0..47); used in the creation of the final submission
# file.
bj_future_data$id <- seq_len(nrow(bj_future_data)) - 1
# merge() without shared column names yields the cross product of hours and
# stations.
bj_future_data <- merge(
  bj_future_data,
  data.frame(unique(bj_aq_gm_combined_data$stationId))
)
names(bj_future_data) <- c("utc_time", "id", "stationId")

In [117]:
# Derive the calendar features for the future grid the same way as for the
# training data: hour, month, date, and a logical weekend flag.
bj_future_data[["hour"]] <- hour(bj_future_data[["utc_time"]])
bj_future_data[["month"]] <- month(bj_future_data[["utc_time"]])
bj_future_data[["date"]] <- date(bj_future_data[["utc_time"]])
bj_future_data[["weekend"]] <- chron::is.weekend(bj_future_data[["date"]])

In [118]:
# Recode the logical weekend flag as integer, matching the training data.
bj_future_data[["weekend"]] <- as.integer(bj_future_data[["weekend"]])

In [119]:
# Attach station coordinates to the future grid via stationId.
bj_future_data <- merge(
  x = bj_future_data,
  y = bj_lat_long_data,
  by = "stationId"
)

In [120]:
# predicting PM2.5, PM10 and O3 values
# Order matters: the PM10 formula includes PM2.5 and the O3 formula includes
# PM2.5 and PM10, so each prediction is written into the frame before the next
# model reads it. Here the pollutant predictors are model outputs, not
# observations.
bj_future_data$PM2.5 <- predict(PM2.5_bj_model, bj_future_data)
bj_future_data$PM10 <- predict(PM10_bj_model, bj_future_data)
bj_future_data$O3 <- predict(O3_bj_model, bj_future_data)

In [121]:
# Submission key: "<stationId>#<hour offset>", e.g. "aotizhongxin_aq#0".
bj_future_data$test_id <- paste0(
  bj_future_data$stationId, "#", bj_future_data$id
)

In [122]:
# Inspect the prediction frame: structure, summary statistics, and covered
# time range.
str(bj_future_data)
summary(bj_future_data)
bj_future_data %>% summarize(min_date = min(utc_time ), max_date = max(utc_time ))

'data.frame':	1680 obs. of  13 variables:
 $ stationId: Factor w/ 35 levels "aotizhongxin_aq",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ utc_time : POSIXct, format: "2018-05-29 00:00:00" "2018-05-29 01:00:00" ...
 $ id       : num  0 1 2 3 4 5 6 7 8 9 ...
 $ hour     : int  0 1 2 3 4 5 6 7 8 9 ...
 $ month    : num  5 5 5 5 5 5 5 5 5 5 ...
 $ date     : Date, format: "2018-05-29" "2018-05-29" ...
 $ weekend  : int  0 0 0 0 0 0 0 0 0 0 ...
 $ longitude: num  116 116 116 116 116 ...
 $ latitude : num  40 40 40 40 40 ...
 $ PM2.5    : num  60.6 60.8 60.9 61.1 61.3 ...
 $ PM10     : num  91.5 91.5 91.5 91.6 91.6 ...
 $ O3       : num  81.6 79.6 77.7 75.7 73.7 ...
 $ test_id  : chr  "aotizhongxin_aq#0" "aotizhongxin_aq#1" "aotizhongxin_aq#2" "aotizhongxin_aq#3" ...


           stationId       utc_time                         id       
 aotizhongxin_aq:  48   Min.   :2018-05-29 00:00:00   Min.   : 0.00  
 badaling_aq    :  48   1st Qu.:2018-05-29 11:45:00   1st Qu.:11.75  
 beibuxinqu_aq  :  48   Median :2018-05-29 23:30:00   Median :23.50  
 daxing_aq      :  48   Mean   :2018-05-29 23:30:00   Mean   :23.50  
 dingling_aq    :  48   3rd Qu.:2018-05-30 11:15:00   3rd Qu.:35.25  
 donggaocun_aq  :  48   Max.   :2018-05-30 23:00:00   Max.   :47.00  
 (Other)        :1392                                                
      hour           month        date               weekend    longitude    
 Min.   : 0.00   Min.   :5   Min.   :2018-05-29   Min.   :0   Min.   :116.0  
 1st Qu.: 5.75   1st Qu.:5   1st Qu.:2018-05-29   1st Qu.:0   1st Qu.:116.2  
 Median :11.50   Median :5   Median :2018-05-29   Median :0   Median :116.4  
 Mean   :11.50   Mean   :5   Mean   :2018-05-29   Mean   :0   Mean   :116.4  
 3rd Qu.:17.25   3rd Qu.:5   3rd Qu.:2018-05-30   

min_date,max_date
2018-05-29,2018-05-30 23:00:00


In [123]:
# Preview the first 20 prediction rows.
head(bj_future_data, 20)

stationId,utc_time,id,hour,month,date,weekend,longitude,latitude,PM2.5,PM10,O3,test_id
aotizhongxin_aq,2018-05-29 00:00:00,0,0,5,2018-05-29,0,116.397,39.982,60.61774,91.46826,81.59289,aotizhongxin_aq#0
aotizhongxin_aq,2018-05-29 01:00:00,1,1,5,2018-05-29,0,116.397,39.982,60.77679,91.49735,79.62857,aotizhongxin_aq#1
aotizhongxin_aq,2018-05-29 02:00:00,2,2,5,2018-05-29,0,116.397,39.982,60.93584,91.52645,77.66424,aotizhongxin_aq#2
aotizhongxin_aq,2018-05-29 03:00:00,3,3,5,2018-05-29,0,116.397,39.982,61.09489,91.55554,75.69992,aotizhongxin_aq#3
aotizhongxin_aq,2018-05-29 04:00:00,4,4,5,2018-05-29,0,116.397,39.982,61.25394,91.58464,73.73559,aotizhongxin_aq#4
aotizhongxin_aq,2018-05-29 05:00:00,5,5,5,2018-05-29,0,116.397,39.982,61.41299,91.61373,71.77127,aotizhongxin_aq#5
aotizhongxin_aq,2018-05-29 06:00:00,6,6,5,2018-05-29,0,116.397,39.982,61.57204,91.64283,69.80694,aotizhongxin_aq#6
aotizhongxin_aq,2018-05-29 07:00:00,7,7,5,2018-05-29,0,116.397,39.982,61.73109,91.67192,67.84261,aotizhongxin_aq#7
aotizhongxin_aq,2018-05-29 08:00:00,8,8,5,2018-05-29,0,116.397,39.982,61.89014,91.70102,65.87829,aotizhongxin_aq#8
aotizhongxin_aq,2018-05-29 09:00:00,9,9,5,2018-05-29,0,116.397,39.982,62.04919,91.73011,63.91396,aotizhongxin_aq#9


In [124]:
# Write the submission file (test_id plus the three predicted pollutants).
# paste0() with an explicit underscore gives "bj_submission_<date>.csv"; the
# previous paste() call used the default sep = " " and produced a name with
# embedded spaces, including one before ".csv".
write.csv(
  bj_future_data[, c("test_id", "PM2.5", "PM10", "O3")],
  file = paste0("bj_submission_", Sys.Date(), ".csv"),
  row.names = FALSE
)

In [125]:
# resetting the timezone
# Sys.unsetenv() removes the TZ variable entirely; a TZ value that was set
# before this notebook overrode it is not restored.
Sys.unsetenv("TZ")

In [126]:
# Final cleanup of the large frames created by this notebook.
rm(list = c("bj_future_data", "bj_lat_long_data", "bj_aq_gm_combined_data"))