### Import and transformations of API data

In [37]:
# Reset the interactive session before the notebook runs.
# NOTE(review): rm(list = ls()) wipes the caller's workspace; a fresh R
# session is the safer way to get a clean state.

# Close the current graphics device when at least one is open.
if (length(dev.list()) > 0) {
  dev.off()
}
# Remove every object from the current environment.
rm(list = ls())
# Emit a form feed character (intended to clear the console).
cat("\014")
# writeClipboard(as.character(x)) # copy data frame to clipboard



In [38]:
library(RCurl)
library(sqldf)
library(digest)
library(dplyr)
library(anytime)
library(geosphere)
library(lubridate)
library(chron)
require(caret)
require(rattle)
require(yardstick)

In [39]:
# set working directory
# NOTE(review): absolute, user-specific path. The relative CSV paths read
# later in this file resolve against it.
setwd("C:/Users/vanethi/Documents/GitHub/DS420_Factoria")

In [40]:
# Date window for the API requests; both values are pasted verbatim into the
# request URLs (format looks like YYYY-MM-DD-H — confirm against the API).
startDate <- "2017-12-31-0"
endDate <- "2018-06-01-0"

In [41]:
# pull data for London
# NOTE(review): host and peer certificate verification are switched off for
# both requests, and the trailing URL segment looks like an access key that is
# hard-coded here; confirm whether either is still needed.

# acquire air quality data
ld_aq_url <- paste0("https://biendata.com/competition/airquality/ld/",startDate,"/",endDate,"/2k0d1d8")
ld_aq_file <- getURL(ld_aq_url, ssl.verifyhost=FALSE, ssl.verifypeer=FALSE)
# read.csv(text = ...) opens and closes its own connection, so no
# textConnection is left open after parsing.
ld_aq_data <- read.csv(text = ld_aq_file, header=TRUE)

# acquire API grid meteorology data
ld_gm_url <- paste0("https://biendata.com/competition/meteorology/ld_grid/",startDate,"/",endDate,"/2k0d1d8")
ld_gm_file <- getURL(ld_gm_url, ssl.verifyhost=FALSE, ssl.verifypeer=FALSE)
ld_gm_data <- read.csv(text = ld_gm_file, header=TRUE)

In [42]:
# Names of every object in the global environment that is a data frame.
df.list <- .GlobalEnv %>%
  eapply(is.data.frame) %>%
  unlist() %>%
  which() %>%
  names()
df.list

In [43]:
# Parse the API time columns into date-times.
# NOTE(review): anytime() is called without a tz argument, so it presumably
# parses in the session time zone — confirm that suits these timestamps.
ld_gm_data[["time"]] <- anytime(ld_gm_data[["time"]])
ld_aq_data[["time"]] <- anytime(ld_aq_data[["time"]])

In [44]:
# Print the name and structure of every data frame listed in df.list.
# Iterating over the names themselves keeps the loop a no-op when df.list is
# empty; 1:length(df.list) would instead iterate over c(1, 0).
for (df_name in df.list) {
  print(df_name)
  # str() prints the structure and returns NULL, which print() then echoes.
  print(str(get(df_name)))
}

[1] "ld_gm_data"
'data.frame':	1202720 obs. of  9 variables:
 $ id            : int  2000096 2000097 2000098 2000099 2000100 2000101 2000102 2000103 2000104 2000105 ...
 $ station_id    : Factor w/ 861 levels "london_grid_000",..: 1 2 3 4 5 6 7 8 9 10 ...
 $ time          : POSIXct, format: "2018-03-31 07:00:00" "2018-03-31 07:00:00" ...
 $ weather       : Factor w/ 8 levels "CLEAR_DAY","CLEAR_NIGHT",..: 3 3 3 6 6 6 6 6 6 6 ...
 $ temperature   : num  6.14 5.43 4.73 4.25 3.99 3.72 3.75 3.78 3.86 3.98 ...
 $ pressure      : num  995 994 993 990 986 ...
 $ humidity      : num  85 88 91 93 95 97 96 96 96 96 ...
 $ wind_direction: num  304 303 302 303 306 ...
 $ wind_speed    : num  22 18.9 15.8 14.1 13.6 ...
NULL
[1] "ld_aq_data"
'data.frame':	26996 obs. of  9 variables:
 $ id                : int  2941506 2941507 2941508 2941509 2941510 2941511 2941512 2941513 2941514 2941515 ...
 $ station_id        : Factor w/ 19 levels "BL0","BX1","BX9",..: 4 1 10 15 12 9 11 14 8 13 ...
 $ time       

In [45]:
# Earliest and latest timestamp in each API dataset.
print("ld_gm_data")
summarise(ld_gm_data, min_date = min(time), max_date = max(time))
print("ld_aq_data")
summarise(ld_aq_data, min_date = min(time), max_date = max(time))

[1] "ld_gm_data"


min_date,max_date
2018-03-31 07:00:00,2018-05-29 17:00:00


[1] "ld_aq_data"


min_date,max_date
2018-03-31 07:00:00,2018-05-29 16:00:00


In [46]:
# Keep only the required columns: drop the row id from both tables and the
# weather label from the meteorology table.
ld_aq_data <- select(ld_aq_data, -id)
ld_gm_data <- select(ld_gm_data, -id, -weather)

In [47]:
# Lookup of London air-quality stations and their closest meteorology grids.
ld_closest_stations <- read.csv("SL_london_closest_stations.csv")

In [48]:
# Rename columns positionally so the three tables share key names
# (stationId, stationName, utc_time) for the merges that follow.
names(ld_closest_stations) <- c("x", "stationId", "stationName", "distance")
names(ld_aq_data) <- c(
  "stationId", "utc_time", "PM2.5", "PM10", "NO2", "CO", "O3", "SO2"
)
names(ld_gm_data) <- c(
  "stationName", "utc_time", "temperature", "pressure", "humidity",
  "wind_direction", "wind_speed.kph"
)

In [49]:
# Inspect the three renamed tables and preview the station/grid lookup.
str(ld_closest_stations)
str(ld_gm_data)
str(ld_aq_data)
head(ld_closest_stations)

'data.frame':	24 obs. of  4 variables:
 $ x          : int  1 2 3 4 5 6 7 8 9 10 ...
 $ stationId  : Factor w/ 24 levels "BL0","BX1","BX9",..: 3 2 1 5 4 7 8 6 10 12 ...
 $ stationName: Factor w/ 10 levels "london_grid_346",..: 10 10 6 6 4 6 6 5 8 8 ...
 $ distance   : num  3929 3929 3060 3683 5212 ...
'data.frame':	1202720 obs. of  7 variables:
 $ stationName   : Factor w/ 861 levels "london_grid_000",..: 1 2 3 4 5 6 7 8 9 10 ...
 $ utc_time      : POSIXct, format: "2018-03-31 07:00:00" "2018-03-31 07:00:00" ...
 $ temperature   : num  6.14 5.43 4.73 4.25 3.99 3.72 3.75 3.78 3.86 3.98 ...
 $ pressure      : num  995 994 993 990 986 ...
 $ humidity      : num  85 88 91 93 95 97 96 96 96 96 ...
 $ wind_direction: num  304 303 302 303 306 ...
 $ wind_speed.kph: num  22 18.9 15.8 14.1 13.6 ...
'data.frame':	26996 obs. of  8 variables:
 $ stationId: Factor w/ 19 levels "BL0","BX1","BX9",..: 4 1 10 15 12 9 11 14 8 13 ...
 $ utc_time : POSIXct, format: "2018-03-31 07:00:00" "2018-03-31 07:00:

x,stationId,stationName,distance
1,BX9,london_grid_472,3929.17
2,BX1,london_grid_472,3929.17
3,BL0,london_grid_409,3059.76
4,CD9,london_grid_409,3682.914
5,CD1,london_grid_388,5211.633
6,CT2,london_grid_409,1646.945


In [50]:
# Attach each station's grid name (and distance) to its air-quality rows.
ld_aq_map <- merge(x = ld_aq_data, y = ld_closest_stations, by = "stationId")

In [51]:
# Join meteorology readings onto the air-quality rows by grid and timestamp.
ld_aq_gm_data <- merge(
  x = ld_aq_map,
  y = ld_gm_data,
  by = c("stationName", "utc_time")
)

In [52]:
# Inspect the merged table and its covered time range.
str(ld_aq_gm_data)
head(ld_aq_gm_data)
ld_aq_gm_data %>% summarize(min_date = min(utc_time), max_date = max(utc_time))

'data.frame':	26444 obs. of  16 variables:
 $ stationName   : Factor w/ 10 levels "london_grid_346",..: 2 2 2 2 2 2 2 2 2 2 ...
 $ utc_time      : POSIXct, format: "2018-03-31 07:00:00" "2018-03-31 08:00:00" ...
 $ stationId     : Factor w/ 19 levels "BL0","BX1","BX9",..: 18 18 18 18 18 18 18 18 18 18 ...
 $ PM2.5         : num  8.7 6.8 4.5 8.2 11.8 10.2 11.6 10.3 8.8 8.6 ...
 $ PM10          : num  NA NA NA NA NA NA NA NA NA NA ...
 $ NO2           : num  NA NA NA NA NA NA NA NA NA NA ...
 $ CO            : logi  NA NA NA NA NA NA ...
 $ O3            : logi  NA NA NA NA NA NA ...
 $ SO2           : logi  NA NA NA NA NA NA ...
 $ x             : int  21 21 21 21 21 21 21 21 21 21 ...
 $ distance      : num  4235 4235 4235 4235 4235 ...
 $ temperature   : num  4.83 5.42 6.05 6.1 7.05 7.8 8.08 8.44 8.57 8.02 ...
 $ pressure      : num  987 987 988 989 992 ...
 $ humidity      : num  87 84 81 79 74 71 73 71 70 73 ...
 $ wind_direction: num  205 210 214 234 242 ...
 $ wind_speed.kph: num 

stationName,utc_time,stationId,PM2.5,PM10,NO2,CO,O3,SO2,x,distance,temperature,pressure,humidity,wind_direction,wind_speed.kph
london_grid_366,2018-03-31 07:00:00,TD5,8.7,,,,,,21,4234.613,4.83,986.8419,87,205.39,11.3
london_grid_366,2018-03-31 08:00:00,TD5,6.8,,,,,,21,4234.613,5.42,987.2517,84,210.33,11.95
london_grid_366,2018-03-31 09:00:00,TD5,4.5,,,,,,21,4234.613,6.05,987.7612,81,214.35,12.01
london_grid_366,2018-03-31 10:00:00,TD5,8.2,,,,,,21,4234.613,6.1,988.8336,79,233.7,10.18
london_grid_366,2018-03-31 11:00:00,TD5,11.8,,,,,,21,4234.613,7.05,991.6767,74,242.44,7.95
london_grid_366,2018-03-31 12:00:00,TD5,10.2,,,,,,21,4234.613,7.8,992.0664,71,267.43,6.76


min_date,max_date
2018-03-31 07:00:00,2018-05-29 16:00:00


In [53]:
# Free the intermediate tables; only the merged ld_aq_gm_data is kept.
rm(ld_gm_data, ld_aq_data, ld_aq_map, ld_closest_stations)

In [54]:
# Keep the identifying columns, the three target pollutants and the weather
# measurements.
ld_aq_gm_data <- select(
  ld_aq_gm_data,
  stationName, utc_time, stationId, PM2.5, PM10, O3,
  temperature, pressure, humidity, wind_direction, wind_speed.kph
)

In [55]:
# Derive calendar features from utc_time for modeling: hour of day, month,
# calendar date, and a weekend flag computed from that date.
ld_aq_gm_data <- mutate(
  ld_aq_gm_data,
  hour = lubridate::hour(utc_time),
  month = lubridate::month(utc_time),
  date = lubridate::date(utc_time),
  weekend = chron::is.weekend(date)
)

In [56]:
# Check the column types after adding the calendar features.
str(ld_aq_gm_data)

'data.frame':	26444 obs. of  15 variables:
 $ stationName   : Factor w/ 10 levels "london_grid_346",..: 2 2 2 2 2 2 2 2 2 2 ...
 $ utc_time      : POSIXct, format: "2018-03-31 07:00:00" "2018-03-31 08:00:00" ...
 $ stationId     : Factor w/ 19 levels "BL0","BX1","BX9",..: 18 18 18 18 18 18 18 18 18 18 ...
 $ PM2.5         : num  8.7 6.8 4.5 8.2 11.8 10.2 11.6 10.3 8.8 8.6 ...
 $ PM10          : num  NA NA NA NA NA NA NA NA NA NA ...
 $ O3            : logi  NA NA NA NA NA NA ...
 $ temperature   : num  4.83 5.42 6.05 6.1 7.05 7.8 8.08 8.44 8.57 8.02 ...
 $ pressure      : num  987 987 988 989 992 ...
 $ humidity      : num  87 84 81 79 74 71 73 71 70 73 ...
 $ wind_direction: num  205 210 214 234 242 ...
 $ wind_speed.kph: num  11.3 11.95 12.01 10.18 7.95 ...
 $ hour          : int  7 8 9 10 11 12 13 14 15 16 ...
 $ month         : num  3 3 3 3 3 3 3 3 3 3 ...
 $ date          : Date, format: "2018-03-31" "2018-03-31" ...
 $ weekend       : logi  TRUE TRUE TRUE TRUE TRUE TRUE ...


### Import and transformations of Historical data

In [57]:
# Retrieve the historical dataset; strings are kept as character.
ld_aq_gm_hist_file <- "Ready for Modeling/ld_aq_gm_hist_data.csv"
ld_aq_gm_hist_data <- read.csv(ld_aq_gm_hist_file, stringsAsFactors = FALSE)

In [58]:
# Bring the historical data in line with the API data: drop the X column,
# then parse utc_time as a date-time and date as a Date.
ld_aq_gm_hist_data <- select(ld_aq_gm_hist_data, -X)
ld_aq_gm_hist_data[["utc_time"]] <- anytime(ld_aq_gm_hist_data[["utc_time"]])
ld_aq_gm_hist_data[["date"]] <- as.Date(
  ld_aq_gm_hist_data[["date"]],
  format = "%Y-%m-%d"
)

In [59]:
# Apply the same 15 column names, in the same order, as the API table so the
# two can be stacked with rbind().
names(ld_aq_gm_hist_data) <- c(
  "stationName", "utc_time", "stationId", "PM2.5", "PM10", "O3",
  "temperature", "pressure", "humidity", "wind_direction", "wind_speed.kph",
  "hour", "month", "date", "weekend"
)

In [60]:
# Check the historical table's column types before combining.
str(ld_aq_gm_hist_data)

'data.frame':	257936 obs. of  15 variables:
 $ stationName   : chr  "london_grid_346" "london_grid_346" "london_grid_346" "london_grid_346" ...
 $ utc_time      : POSIXct, format: "2017-01-01 08:00:00" "2017-01-01 09:00:00" ...
 $ stationId     : chr  "LH0" "LH0" "LH0" "LH0" ...
 $ PM2.5         : num  18.3 16.3 13.3 9.4 6.1 6.7 2.1 0.9 1.1 1 ...
 $ PM10          : num  21.3 19.5 16.2 11.6 8.5 13.4 4.6 2 2.1 2 ...
 $ O3            : num  41.6 44.1 49.1 45.2 41.4 53.6 11.7 12.1 12 13.5 ...
 $ temperature   : num  6.05 6.05 6.04 6.04 6.25 6.46 6.67 6.91 7.15 7.39 ...
 $ pressure      : num  1019 1018 1017 1016 1015 ...
 $ humidity      : num  90 89.2 88.3 87.4 87.8 ...
 $ wind_direction: num  221 220 218 217 216 ...
 $ wind_speed.kph: num  16.4 17.1 17.7 18.4 18.8 ...
 $ hour          : int  8 9 10 11 12 13 14 15 16 17 ...
 $ month         : int  1 1 1 1 1 1 1 1 1 1 ...
 $ date          : Date, format: "2017-01-01" "2017-01-01" ...
 $ weekend       : logi  TRUE TRUE TRUE TRUE TRUE TRUE .

In [61]:
# Append API and hist data
# Rows are stacked with the API rows first; both tables carry the same 15
# column names at this point.
ld_aq_gm_combined_data <- rbind(ld_aq_gm_data, ld_aq_gm_hist_data)

In [62]:
# Inspect the combined table: types, covered time range and value summaries.
str(ld_aq_gm_combined_data)
ld_aq_gm_combined_data %>% summarize(min_date = min(utc_time), max_date = max(utc_time))
summary(ld_aq_gm_combined_data)

'data.frame':	284380 obs. of  15 variables:
 $ stationName   : Factor w/ 10 levels "london_grid_346",..: 2 2 2 2 2 2 2 2 2 2 ...
 $ utc_time      : POSIXct, format: "2018-03-31 07:00:00" "2018-03-31 08:00:00" ...
 $ stationId     : Factor w/ 24 levels "BL0","BX1","BX9",..: 18 18 18 18 18 18 18 18 18 18 ...
 $ PM2.5         : num  8.7 6.8 4.5 8.2 11.8 10.2 11.6 10.3 8.8 8.6 ...
 $ PM10          : num  NA NA NA NA NA NA NA NA NA NA ...
 $ O3            : num  NA NA NA NA NA NA NA NA NA NA ...
 $ temperature   : num  4.83 5.42 6.05 6.1 7.05 7.8 8.08 8.44 8.57 8.02 ...
 $ pressure      : num  987 987 988 989 992 ...
 $ humidity      : num  87 84 81 79 74 71 73 71 70 73 ...
 $ wind_direction: num  205 210 214 234 242 ...
 $ wind_speed.kph: num  11.3 11.95 12.01 10.18 7.95 ...
 $ hour          : int  7 8 9 10 11 12 13 14 15 16 ...
 $ month         : num  3 3 3 3 3 3 3 3 3 3 ...
 $ date          : Date, format: "2018-03-31" "2018-03-31" ...
 $ weekend       : logi  TRUE TRUE TRUE TRUE TRUE TR

min_date,max_date
2017-01-01 08:00:00,2018-05-29 16:00:00


          stationName       utc_time                     stationId     
 london_grid_451:47400   Min.   :2017-01-01 08:00:00   CD9    : 12206  
 london_grid_388:47396   1st Qu.:2017-05-07 03:00:00   BL0    : 12198  
 london_grid_409:47390   Median :2017-09-07 14:00:00   GN0    : 12198  
 london_grid_430:36593   Mean   :2017-09-08 10:16:03   GN3    : 12198  
 london_grid_472:36592   3rd Qu.:2018-01-09 01:00:00   GR4    : 12198  
 london_grid_408:23003   Max.   :2018-05-29 16:00:00   GR9    : 12198  
 (Other)        :46006                                 (Other):211184  
     PM2.5             PM10              O3          temperature   
 Min.   :-14.00   Min.   :-11.80   Min.   : -8.30   Min.   :-4.57  
 1st Qu.:  6.50   1st Qu.: 11.40   1st Qu.: 19.50   1st Qu.: 5.98  
 Median : 10.20   Median : 17.20   Median : 34.20   Median :10.18  
 Mean   : 13.44   Mean   : 20.59   Mean   : 39.72   Mean   :10.59  
 3rd Qu.: 16.60   3rd Qu.: 26.00   3rd Qu.: 53.80   3rd Qu.:15.05  
 Max.   :313.00 

In [63]:
# Both source tables now live in ld_aq_gm_combined_data; release them.
rm(ld_aq_gm_data, ld_aq_gm_hist_data)

In [64]:
# Ids of the air-quality stations flagged as needing predictions, exposed as
# a single stationId column.
ld_pred_stations <- read.csv(
  "Datasets/London_AirQuality_Stations.csv",
  stringsAsFactors = FALSE
) %>%
  filter(need_prediction == "TRUE") %>%
  select(stationId = station_id)

In [65]:
# Keep only rows whose station appears in ld_pred_stations.
ld_aq_gm_combined_data <- merge(
  x = ld_aq_gm_combined_data,
  y = ld_pred_stations,
  by = "stationId"
)

In [66]:
# The station filter has been applied; the lookup is no longer needed.
rm(ld_pred_stations)

In [67]:
# Inspect the table after restricting it to the prediction stations.
str(ld_aq_gm_combined_data)
summary(ld_aq_gm_combined_data)

'data.frame':	158576 obs. of  15 variables:
 $ stationId     : Factor w/ 24 levels "BL0","BX1","BX9",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ stationName   : Factor w/ 10 levels "london_grid_346",..: 6 6 6 6 6 6 6 6 6 6 ...
 $ utc_time      : POSIXct, format: "2017-10-22 03:00:00" "2017-04-21 14:00:00" ...
 $ PM2.5         : num  4 12.8 8.1 5.5 10.4 14.3 8.6 8.2 26.7 17.4 ...
 $ PM10          : num  3.3 24.3 12.1 6.1 20.2 22.2 10 5.9 43 22.8 ...
 $ O3            : num  10.5 36.1 82.4 35.6 35.4 27 17 18.6 NA 60.1 ...
 $ temperature   : num  13.04 8.03 20.28 2.24 12.56 ...
 $ pressure      : num  999 1029 1014 994 1011 ...
 $ humidity      : num  70.2 91.4 40.3 69 73.5 ...
 $ wind_direction: num  239 297 268 301 260 ...
 $ wind_speed.kph: num  28.67 9.93 22.26 19.77 16.89 ...
 $ hour          : int  3 14 20 5 9 22 21 2 12 12 ...
 $ month         : num  10 4 8 12 10 5 7 3 4 1 ...
 $ date          : Date, format: "2017-10-22" "2017-04-21" ...
 $ weekend       : logi  TRUE FALSE TRUE FALSE FALSE FALS

   stationId              stationName       utc_time                  
 CD9    :12206   london_grid_451:36594   Min.   :2017-01-01 08:00:00  
 BL0    :12198   london_grid_430:36593   1st Qu.:2017-05-08 08:00:00  
 GN0    :12198   london_grid_388:36590   Median :2017-09-12 10:00:00  
 GN3    :12198   london_grid_409:24404   Mean   :2017-09-12 22:05:54  
 GR4    :12198   london_grid_472:12198   3rd Qu.:2018-01-17 12:00:00  
 GR9    :12198   london_grid_408:12197   Max.   :2018-05-29 16:00:00  
 (Other):85380   (Other)        :    0                                
     PM2.5            PM10              O3          temperature   
 Min.   :-10.5   Min.   :-11.80   Min.   : -4.70   Min.   :-4.37  
 1st Qu.:  6.6   1st Qu.: 12.00   1st Qu.: 21.60   1st Qu.: 6.09  
 Median : 10.4   Median : 18.00   Median : 37.80   Median :10.30  
 Mean   : 13.6   Mean   : 21.41   Mean   : 43.24   Mean   :10.70  
 3rd Qu.: 16.9   3rd Qu.: 27.00   3rd Qu.: 58.40   3rd Qu.:15.15  
 Max.   :189.7   Max.   :667.1

In [68]:
# Dates flagged as London holidays (2017 through May 2018), kept as
# "YYYY-MM-DD" strings and converted with as.Date() where they are used.
london_holidays <- c(
  "2017-01-02", "2017-04-14", "2017-04-17", "2017-05-01", "2017-05-29",
  "2017-08-28", "2017-12-25", "2017-12-26",
  "2018-01-01", "2018-03-30", "2018-04-02", "2018-05-07", "2018-05-28"
)

In [69]:
# Numeric holiday flag: 1 when the row's date is in london_holidays, else 0.
ld_aq_gm_combined_data$holiday <- as.numeric(
  ld_aq_gm_combined_data$date %in% as.Date(london_holidays)
)

In [70]:
# Inspect the table after adding the holiday flag.
str(ld_aq_gm_combined_data)
summary(ld_aq_gm_combined_data)

'data.frame':	158576 obs. of  16 variables:
 $ stationId     : Factor w/ 24 levels "BL0","BX1","BX9",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ stationName   : Factor w/ 10 levels "london_grid_346",..: 6 6 6 6 6 6 6 6 6 6 ...
 $ utc_time      : POSIXct, format: "2017-10-22 03:00:00" "2017-04-21 14:00:00" ...
 $ PM2.5         : num  4 12.8 8.1 5.5 10.4 14.3 8.6 8.2 26.7 17.4 ...
 $ PM10          : num  3.3 24.3 12.1 6.1 20.2 22.2 10 5.9 43 22.8 ...
 $ O3            : num  10.5 36.1 82.4 35.6 35.4 27 17 18.6 NA 60.1 ...
 $ temperature   : num  13.04 8.03 20.28 2.24 12.56 ...
 $ pressure      : num  999 1029 1014 994 1011 ...
 $ humidity      : num  70.2 91.4 40.3 69 73.5 ...
 $ wind_direction: num  239 297 268 301 260 ...
 $ wind_speed.kph: num  28.67 9.93 22.26 19.77 16.89 ...
 $ hour          : int  3 14 20 5 9 22 21 2 12 12 ...
 $ month         : num  10 4 8 12 10 5 7 3 4 1 ...
 $ date          : Date, format: "2017-10-22" "2017-04-21" ...
 $ weekend       : logi  TRUE FALSE TRUE FALSE FALSE FALS

   stationId              stationName       utc_time                  
 CD9    :12206   london_grid_451:36594   Min.   :2017-01-01 08:00:00  
 BL0    :12198   london_grid_430:36593   1st Qu.:2017-05-08 08:00:00  
 GN0    :12198   london_grid_388:36590   Median :2017-09-12 10:00:00  
 GN3    :12198   london_grid_409:24404   Mean   :2017-09-12 22:05:54  
 GR4    :12198   london_grid_472:12198   3rd Qu.:2018-01-17 12:00:00  
 GR9    :12198   london_grid_408:12197   Max.   :2018-05-29 16:00:00  
 (Other):85380   (Other)        :    0                                
     PM2.5            PM10              O3          temperature   
 Min.   :-10.5   Min.   :-11.80   Min.   : -4.70   Min.   :-4.37  
 1st Qu.:  6.6   1st Qu.: 12.00   1st Qu.: 21.60   1st Qu.: 6.09  
 Median : 10.4   Median : 18.00   Median : 37.80   Median :10.30  
 Mean   : 13.6   Mean   : 21.41   Mean   : 43.24   Mean   :10.70  
 3rd Qu.: 16.9   3rd Qu.: 27.00   3rd Qu.: 58.40   3rd Qu.:15.15  
 Max.   :189.7   Max.   :667.1

In [71]:
# replacing outliers with NA
# Outliers are the values boxplot.stats() reports in $out for each pollutant.
# The NA is assigned into the column vector rather than into a row subset of
# the data frame: the row-subset form (df[mask, ]$col <- NA) stops with a
# "replacement has 1 row, data has 0" error when a pollutant has no outliers.
ld_aq_gm_combined_data$PM2.5[
  ld_aq_gm_combined_data$PM2.5 %in%
    boxplot.stats(ld_aq_gm_combined_data$PM2.5)$out
] <- NA
ld_aq_gm_combined_data$PM10[
  ld_aq_gm_combined_data$PM10 %in%
    boxplot.stats(ld_aq_gm_combined_data$PM10)$out
] <- NA
ld_aq_gm_combined_data$O3[
  ld_aq_gm_combined_data$O3 %in%
    boxplot.stats(ld_aq_gm_combined_data$O3)$out
] <- NA

In [78]:
# Inspect the table (types and value summaries) at this point.
str(ld_aq_gm_combined_data)
summary(ld_aq_gm_combined_data)

'data.frame':	158576 obs. of  16 variables:
 $ stationId     : Factor w/ 24 levels "BL0","BX1","BX9",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ stationName   : Factor w/ 10 levels "london_grid_346",..: 6 6 6 6 6 6 6 6 6 6 ...
 $ utc_time      : POSIXct, format: "2017-10-22 03:00:00" "2017-04-21 14:00:00" ...
 $ temperature   : num  13.04 8.03 20.28 2.24 12.56 ...
 $ pressure      : num  999 1029 1014 994 1011 ...
 $ humidity      : num  70.2 91.4 40.3 69 73.5 ...
 $ wind_direction: num  239 297 268 301 260 ...
 $ wind_speed.kph: num  28.67 9.93 22.26 19.77 16.89 ...
 $ hour          : int  3 14 20 5 9 22 21 2 12 12 ...
 $ month         : num  10 4 8 12 10 5 7 3 4 1 ...
 $ date          : Date, format: "2017-10-22" "2017-04-21" ...
 $ weekend       : logi  TRUE FALSE TRUE FALSE FALSE FALSE ...
 $ holiday       : num  0 0 0 0 0 0 0 0 0 0 ...
 $ PM2.5         : num  4 12.8 8.1 5.5 10.4 14.3 8.6 8.2 26.7 17.4 ...
 $ PM10          : num  3.3 24.3 12.1 6.1 20.2 22.2 10 5.9 43 22.8 ...
 $ O3            :

   stationId              stationName       utc_time                  
 CD9    :12206   london_grid_451:36594   Min.   :2017-01-01 08:00:00  
 BL0    :12198   london_grid_430:36593   1st Qu.:2017-05-08 08:00:00  
 GN0    :12198   london_grid_388:36590   Median :2017-09-12 10:00:00  
 GN3    :12198   london_grid_409:24404   Mean   :2017-09-12 22:05:54  
 GR4    :12198   london_grid_472:12198   3rd Qu.:2018-01-17 12:00:00  
 GR9    :12198   london_grid_408:12197   Max.   :2018-05-29 16:00:00  
 (Other):85380   (Other)        :    0                                
  temperature       pressure         humidity     wind_direction 
 Min.   :-4.37   Min.   : 964.4   Min.   : 7.00   Min.   :  0.0  
 1st Qu.: 6.09   1st Qu.:1005.0   1st Qu.:66.70   1st Qu.:148.4  
 Median :10.30   Median :1012.2   Median :77.46   Median :228.2  
 Mean   :10.70   Mean   :1010.9   Mean   :75.92   Mean   :207.4  
 3rd Qu.:15.15   3rd Qu.:1018.2   3rd Qu.:87.27   3rd Qu.:272.1  
 Max.   :30.28   Max.   :1035.8   Ma

In [73]:
# Fill missing PM2.5 values with group means: first the most specific
# grouping, then a coarser one (without month) for what is still missing.
# Each pass joins the means back, keeps the original value where present,
# and leaves PM2.5 as the last column.

# Pass 1: mean per station, hour, month, weekend flag and holiday flag.
ld_aq_gm_combined_data_PM2.5 <- summarise(
  group_by(ld_aq_gm_combined_data, stationId, hour, month, weekend, holiday),
  PM2.5 = mean(PM2.5, na.rm = TRUE)
)
ld_aq_gm_combined_data <- ld_aq_gm_combined_data %>%
  left_join(
    ld_aq_gm_combined_data_PM2.5,
    by = c("stationId", "hour", "month", "weekend", "holiday")
  ) %>%
  # After the join, .x is the original reading and .y is the group mean.
  mutate(PM2.5 = coalesce(PM2.5.x, PM2.5.y)) %>%
  select(-PM2.5.x, -PM2.5.y)

# Pass 2: mean per station, hour, weekend flag and holiday flag.
ld_aq_gm_combined_data_PM2.5 <- summarise(
  group_by(ld_aq_gm_combined_data, stationId, hour, weekend, holiday),
  PM2.5 = mean(PM2.5, na.rm = TRUE)
)
ld_aq_gm_combined_data <- ld_aq_gm_combined_data %>%
  left_join(
    ld_aq_gm_combined_data_PM2.5,
    by = c("stationId", "hour", "weekend", "holiday")
  ) %>%
  mutate(PM2.5 = coalesce(PM2.5.x, PM2.5.y)) %>%
  select(-PM2.5.x, -PM2.5.y)

In [75]:
# Fill missing PM10 values with group means: first the most specific
# grouping, then a coarser one (without month) for what is still missing.
# Each pass joins the means back, keeps the original value where present,
# and leaves PM10 as the last column.

# Pass 1: mean per station, hour, month, weekend flag and holiday flag.
ld_aq_gm_combined_data_PM10 <- summarise(
  group_by(ld_aq_gm_combined_data, stationId, hour, month, weekend, holiday),
  PM10 = mean(PM10, na.rm = TRUE)
)
ld_aq_gm_combined_data <- ld_aq_gm_combined_data %>%
  left_join(
    ld_aq_gm_combined_data_PM10,
    by = c("stationId", "hour", "month", "weekend", "holiday")
  ) %>%
  # After the join, .x is the original reading and .y is the group mean.
  mutate(PM10 = coalesce(PM10.x, PM10.y)) %>%
  select(-PM10.x, -PM10.y)

# Pass 2: mean per station, hour, weekend flag and holiday flag.
ld_aq_gm_combined_data_PM10 <- summarise(
  group_by(ld_aq_gm_combined_data, stationId, hour, weekend, holiday),
  PM10 = mean(PM10, na.rm = TRUE)
)
ld_aq_gm_combined_data <- ld_aq_gm_combined_data %>%
  left_join(
    ld_aq_gm_combined_data_PM10,
    by = c("stationId", "hour", "weekend", "holiday")
  ) %>%
  mutate(PM10 = coalesce(PM10.x, PM10.y)) %>%
  select(-PM10.x, -PM10.y)

In [77]:
# Fill missing O3 values with group means in three passes, from the most
# specific grouping to the least: per station with month, per station without
# month, then across all stations with month. Each pass joins the means back,
# keeps the existing value where present, and leaves O3 as the last column.

# Pass 1: mean per station, hour, month, weekend flag and holiday flag.
ld_aq_gm_combined_data_O3 <- summarise(
  group_by(ld_aq_gm_combined_data, stationId, hour, month, weekend, holiday),
  O3 = mean(O3, na.rm = TRUE)
)
ld_aq_gm_combined_data <- ld_aq_gm_combined_data %>%
  left_join(
    ld_aq_gm_combined_data_O3,
    by = c("stationId", "hour", "month", "weekend", "holiday")
  ) %>%
  # After the join, .x is the original reading and .y is the group mean.
  mutate(O3 = coalesce(O3.x, O3.y)) %>%
  select(-O3.x, -O3.y)

# Pass 2: mean per station, hour, weekend flag and holiday flag.
ld_aq_gm_combined_data_O3 <- summarise(
  group_by(ld_aq_gm_combined_data, stationId, hour, weekend, holiday),
  O3 = mean(O3, na.rm = TRUE)
)
ld_aq_gm_combined_data <- ld_aq_gm_combined_data %>%
  left_join(
    ld_aq_gm_combined_data_O3,
    by = c("stationId", "hour", "weekend", "holiday")
  ) %>%
  mutate(O3 = coalesce(O3.x, O3.y)) %>%
  select(-O3.x, -O3.y)

# Pass 3: mean per hour, month, weekend flag and holiday flag (all stations).
ld_aq_gm_combined_data_O3 <- summarise(
  group_by(ld_aq_gm_combined_data, hour, month, weekend, holiday),
  O3 = mean(O3, na.rm = TRUE)
)
ld_aq_gm_combined_data <- ld_aq_gm_combined_data %>%
  left_join(
    ld_aq_gm_combined_data_O3,
    by = c("hour", "month", "weekend", "holiday")
  ) %>%
  mutate(O3 = coalesce(O3.x, O3.y)) %>%
  select(-O3.x, -O3.y)

In [79]:
# Drop the temporary group-mean tables used for imputation.
rm(
  ld_aq_gm_combined_data_PM2.5,
  ld_aq_gm_combined_data_PM10,
  ld_aq_gm_combined_data_O3
)

In [80]:
# Inspect the table after the group-mean imputation.
str(ld_aq_gm_combined_data)
summary(ld_aq_gm_combined_data)

'data.frame':	158576 obs. of  16 variables:
 $ stationId     : Factor w/ 24 levels "BL0","BX1","BX9",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ stationName   : Factor w/ 10 levels "london_grid_346",..: 6 6 6 6 6 6 6 6 6 6 ...
 $ utc_time      : POSIXct, format: "2017-10-22 03:00:00" "2017-04-21 14:00:00" ...
 $ temperature   : num  13.04 8.03 20.28 2.24 12.56 ...
 $ pressure      : num  999 1029 1014 994 1011 ...
 $ humidity      : num  70.2 91.4 40.3 69 73.5 ...
 $ wind_direction: num  239 297 268 301 260 ...
 $ wind_speed.kph: num  28.67 9.93 22.26 19.77 16.89 ...
 $ hour          : int  3 14 20 5 9 22 21 2 12 12 ...
 $ month         : num  10 4 8 12 10 5 7 3 4 1 ...
 $ date          : Date, format: "2017-10-22" "2017-04-21" ...
 $ weekend       : logi  TRUE FALSE TRUE FALSE FALSE FALSE ...
 $ holiday       : num  0 0 0 0 0 0 0 0 0 0 ...
 $ PM2.5         : num  4 12.8 8.1 5.5 10.4 14.3 8.6 8.2 26.7 17.4 ...
 $ PM10          : num  3.3 24.3 12.1 6.1 20.2 22.2 10 5.9 43 22.8 ...
 $ O3            :

   stationId              stationName       utc_time                  
 CD9    :12206   london_grid_451:36594   Min.   :2017-01-01 08:00:00  
 BL0    :12198   london_grid_430:36593   1st Qu.:2017-05-08 08:00:00  
 GN0    :12198   london_grid_388:36590   Median :2017-09-12 10:00:00  
 GN3    :12198   london_grid_409:24404   Mean   :2017-09-12 22:05:54  
 GR4    :12198   london_grid_472:12198   3rd Qu.:2018-01-17 12:00:00  
 GR9    :12198   london_grid_408:12197   Max.   :2018-05-29 16:00:00  
 (Other):85380   (Other)        :    0                                
  temperature       pressure         humidity     wind_direction 
 Min.   :-4.37   Min.   : 964.4   Min.   : 7.00   Min.   :  0.0  
 1st Qu.: 6.09   1st Qu.:1005.0   1st Qu.:66.70   1st Qu.:148.4  
 Median :10.30   Median :1012.2   Median :77.46   Median :228.2  
 Mean   :10.70   Mean   :1010.9   Mean   :75.92   Mean   :207.4  
 3rd Qu.:15.15   3rd Qu.:1018.2   3rd Qu.:87.27   3rd Qu.:272.1  
 Max.   :30.28   Max.   :1035.8   Ma

In [81]:
# Validate that no NAs remain: lists every row with at least one missing
# value, so an empty result means the table is complete.
ld_aq_gm_combined_data[!complete.cases(ld_aq_gm_combined_data),]

stationId,stationName,utc_time,temperature,pressure,humidity,wind_direction,wind_speed.kph,hour,month,date,weekend,holiday,PM2.5,PM10,O3


In [82]:
# Turn the logical weekend flag into 0/1 integers for modeling.
ld_aq_gm_combined_data[["weekend"]] <- as.integer(
  ld_aq_gm_combined_data[["weekend"]]
)

In [83]:
# Restore a fixed column order: identifiers, pollutants, weather measurements,
# then the calendar features.
ld_aq_gm_combined_data <- select(
  ld_aq_gm_combined_data,
  stationName, utc_time, stationId, PM2.5, PM10, O3,
  temperature, pressure, humidity, wind_direction, wind_speed.kph,
  hour, month, date, weekend, holiday
)

In [84]:
# Retrieve station coordinates, renamed to stationId / latitude / longitude.
ld_lat_long_file <- "Datasets/London_AirQuality_Stations.csv"
ld_lat_long_data <- read.csv(ld_lat_long_file, stringsAsFactors = FALSE) %>%
  select(stationId = station_id, latitude = Latitude, longitude = Longitude)

In [85]:
# Add each station's latitude and longitude to its rows.
ld_aq_gm_combined_data <- merge(
  x = ld_aq_gm_combined_data,
  y = ld_lat_long_data,
  by = "stationId"
)

In [86]:
# Final inspection of the modeling table: types, time range and summaries.
str(ld_aq_gm_combined_data)
ld_aq_gm_combined_data %>% summarize(min_date = min(utc_time), max_date = max(utc_time))
summary(ld_aq_gm_combined_data)

'data.frame':	158576 obs. of  18 variables:
 $ stationId     : Factor w/ 24 levels "BL0","BX1","BX9",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ stationName   : Factor w/ 10 levels "london_grid_346",..: 6 6 6 6 6 6 6 6 6 6 ...
 $ utc_time      : POSIXct, format: "2017-10-22 03:00:00" "2017-04-21 14:00:00" ...
 $ PM2.5         : num  4 12.8 8.1 5.5 10.4 14.3 8.6 8.2 26.7 17.4 ...
 $ PM10          : num  3.3 24.3 12.1 6.1 20.2 22.2 10 5.9 43 22.8 ...
 $ O3            : num  10.5 36.1 82.4 35.6 35.4 ...
 $ temperature   : num  13.04 8.03 20.28 2.24 12.56 ...
 $ pressure      : num  999 1029 1014 994 1011 ...
 $ humidity      : num  70.2 91.4 40.3 69 73.5 ...
 $ wind_direction: num  239 297 268 301 260 ...
 $ wind_speed.kph: num  28.67 9.93 22.26 19.77 16.89 ...
 $ hour          : int  3 14 20 5 9 22 21 2 12 12 ...
 $ month         : num  10 4 8 12 10 5 7 3 4 1 ...
 $ date          : Date, format: "2017-10-22" "2017-04-21" ...
 $ weekend       : int  1 0 1 0 0 0 0 0 1 0 ...
 $ holiday       : num  0 0 

min_date,max_date
2017-01-01 08:00:00,2018-05-29 16:00:00


   stationId              stationName       utc_time                  
 CD9    :12206   london_grid_451:36594   Min.   :2017-01-01 08:00:00  
 BL0    :12198   london_grid_430:36593   1st Qu.:2017-05-08 08:00:00  
 GN0    :12198   london_grid_388:36590   Median :2017-09-12 10:00:00  
 GN3    :12198   london_grid_409:24404   Mean   :2017-09-12 22:05:54  
 GR4    :12198   london_grid_472:12198   3rd Qu.:2018-01-17 12:00:00  
 GR9    :12198   london_grid_408:12197   Max.   :2018-05-29 16:00:00  
 (Other):85380   (Other)        :    0                                
     PM2.5            PM10             O3          temperature   
 Min.   :-8.60   Min.   :-8.90   Min.   : -4.70   Min.   :-4.37  
 1st Qu.: 6.90   1st Qu.:12.25   1st Qu.: 24.80   1st Qu.: 6.09  
 Median :10.39   Median :17.60   Median : 39.14   Median :10.30  
 Mean   :11.36   Mean   :19.17   Mean   : 41.59   Mean   :10.70  
 3rd Qu.:14.60   3rd Qu.:24.40   3rd Qu.: 54.65   3rd Qu.:15.15  
 Max.   :32.30   Max.   :49.50   Max

### Model Training and validation

In [87]:
# Fix the random seed so the sampling below is reproducible.
set.seed(2306)

In [88]:
# Random 80/20 partition of the rows into training and test sets.
sample_size <- floor(0.8 * nrow(ld_aq_gm_combined_data))
# Draw the training row numbers without replacement.
train_index <- sample.int(nrow(ld_aq_gm_combined_data), size = sample_size)
train_ld <- ld_aq_gm_combined_data[train_index, ]
# Every row not drawn for training goes to the test set.
test_ld <- ld_aq_gm_combined_data[-train_index, ]

In [89]:
# Check the size and column types of both partitions.
str(train_ld)
str(test_ld)

'data.frame':	126860 obs. of  18 variables:
 $ stationId     : Factor w/ 24 levels "BL0","BX1","BX9",..: 8 12 17 12 9 4 1 10 15 8 ...
 $ stationName   : Factor w/ 10 levels "london_grid_346",..: 8 10 5 10 8 4 6 8 4 8 ...
 $ utc_time      : POSIXct, format: "2018-01-13 17:00:00" "2018-03-18 23:00:00" ...
 $ PM2.5         : num  22.4 15.8 10 1.25 16.2 13.3 8.8 15.2 17 8.5 ...
 $ PM10          : num  24.5 20 17 5.75 25.5 20.9 11.3 20.1 27 13.5 ...
 $ O3            : num  23.6 9.3 37.4 28.2 62.5 ...
 $ temperature   : num  6.02 0.11 5.59 14.86 5.93 ...
 $ pressure      : num  1017 1006 1002 1013 998 ...
 $ humidity      : num  82 73.5 83.4 95.3 81.1 ...
 $ wind_direction: num  146.5 65.4 209.3 133.7 313.8 ...
 $ wind_speed.kph: num  14.57 22.61 8.99 10.02 17.38 ...
 $ hour          : int  17 23 6 9 11 14 13 21 18 23 ...
 $ month         : num  1 3 3 7 3 7 8 4 4 6 ...
 $ date          : Date, format: "2018-01-13" "2018-03-18" ...
 $ weekend       : int  1 1 0 1 0 0 1 1 0 1 ...
 $ holiday   

In [91]:
# Linear model for PM2.5 from station coordinates (longitude, latitude) and
# calendar features (hour, month, weekend, holiday), then scored on the
# held-out test rows.
PM2.5_ld_formula <- PM2.5 ~ longitude + latitude + hour + month + weekend + holiday
PM2.5_ld_model <- train(PM2.5_ld_formula, data = train_ld, method = "lm")
summary(PM2.5_ld_model)
test_ld[["PM2.5_pred"]] <- predict(PM2.5_ld_model, newdata = test_ld)
metrics(test_ld, truth = PM2.5, estimate = PM2.5_pred)


Call:
lm(formula = .outcome ~ ., data = dat)

Residuals:
     Min       1Q   Median       3Q      Max 
-19.9336  -4.3853  -0.9233   3.0040  24.8500 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 34.961295  22.395260   1.561    0.119    
longitude   -5.789988   0.143772 -40.272   <2e-16 ***
latitude    -0.459373   0.434925  -1.056    0.291    
hour         0.090926   0.002526  35.996   <2e-16 ***
month       -0.228780   0.005125 -44.643   <2e-16 ***
weekend      0.339976   0.038901   8.739   <2e-16 ***
holiday     -1.911035   0.115872 -16.493   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 6.234 on 126853 degrees of freedom
Multiple R-squared:  0.0403,	Adjusted R-squared:  0.04025 
F-statistic: 887.8 on 6 and 126853 DF,  p-value: < 2.2e-16


rmse,rsq
6.259695,0.04227238


In [93]:
# Linear model for PM10 from PM2.5, station coordinates and calendar features
# (hour, month, weekend, holiday), then scored on the held-out test rows.
# Training and scoring here use the PM2.5 column of train_ld / test_ld.
PM10_ld_formula <- PM10 ~ PM2.5 + longitude + latitude + hour + month + weekend + holiday
PM10_ld_model <- train(PM10_ld_formula, data = train_ld, method = "lm")
summary(PM10_ld_model)
test_ld[["PM10_pred"]] <- predict(PM10_ld_model, newdata = test_ld)
metrics(test_ld, truth = PM10, estimate = PM10_pred)


Call:
lm(formula = .outcome ~ ., data = dat)

Residuals:
    Min      1Q  Median      3Q     Max 
-32.905  -3.932  -1.080   2.621  38.099 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept)  1.484e+03  2.476e+01  59.951  < 2e-16 ***
PM2.5        1.017e+00  3.104e-03 327.767  < 2e-16 ***
longitude    1.278e-01  1.600e-01   0.799    0.424    
latitude    -2.867e+01  4.809e-01 -59.621  < 2e-16 ***
hour         4.235e-02  2.807e-03  15.086  < 2e-16 ***
month       -9.968e-02  5.710e-03 -17.456  < 2e-16 ***
weekend     -1.431e+00  4.302e-02 -33.267  < 2e-16 ***
holiday     -1.051e+00  1.283e-01  -8.194 2.55e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 6.893 on 126852 degrees of freedom
Multiple R-squared:  0.4833,	Adjusted R-squared:  0.4833 
F-statistic: 1.695e+04 on 7 and 126852 DF,  p-value: < 2.2e-16


rmse,rsq
6.933037,0.4777391


In [95]:
# Linear model for O3 from PM2.5, PM10, station coordinates and calendar
# features (hour, month, weekend, holiday), then scored on the held-out test
# rows. Training and scoring here use the PM2.5 and PM10 columns of
# train_ld / test_ld.
O3_ld_formula <- O3 ~ PM2.5 + PM10 + longitude + latitude + hour + month + weekend + holiday
O3_ld_model <- train(O3_ld_formula, data = train_ld, method = "lm")
summary(O3_ld_model)
test_ld[["O3_pred"]] <- predict(O3_ld_model, newdata = test_ld)
metrics(test_ld, truth = O3, estimate = O3_pred)


Call:
lm(formula = .outcome ~ ., data = dat)

Residuals:
    Min      1Q  Median      3Q     Max 
-62.208 -13.206  -2.959  10.944  85.546 

Coefficients:
              Estimate Std. Error  t value Pr(>|t|)    
(Intercept) -8.512e+03  6.816e+01 -124.881  < 2e-16 ***
PM2.5        3.985e-01  1.145e-02   34.800  < 2e-16 ***
PM10         6.088e-01  7.621e-03   79.885  < 2e-16 ***
longitude   -2.826e+01  4.342e-01  -65.082  < 2e-16 ***
latitude     1.657e+02  1.323e+00  125.197  < 2e-16 ***
hour         5.334e-01  7.626e-03   69.937  < 2e-16 ***
month       -8.006e-02  1.552e-02   -5.159 2.49e-07 ***
weekend     -5.748e+00  1.173e-01  -49.007  < 2e-16 ***
holiday     -9.921e+00  3.482e-01  -28.491  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 18.71 on 126851 degrees of freedom
Multiple R-squared:  0.3048,	Adjusted R-squared:  0.3048 
F-statistic:  6953 on 8 and 126851 DF,  p-value: < 2.2e-16


rmse,rsq
18.80985,0.2918649


In [96]:
# The fitted models are kept; the train/test partitions are released.
rm(train_ld, test_ld)

### Building the next 2 days dataset and predicting PM2.5, PM10 and O3 values

In [97]:
# Set the session time zone to GMT so the date-time calls below behave the
# same regardless of the machine's local time zone
Sys.setenv(TZ='GMT')
# First forecast day: the calendar day after today
tomorrow <- Sys.Date() + 1

In [98]:
# Hourly timestamps for the 48 hours starting at the beginning of tomorrow,
# labelled with the UTC time zone
ld_time <- with_tz(
  seq(from = as.POSIXct(tomorrow), by = "1 hour", length.out = 48),
  tzone = "UTC"
)
# id is the 0-based position of each hour (0..47); it is used later when
# creating the final submission file
ld_future_data <- data.frame(ld_time, id = seq_along(ld_time) - 1)
# Cross join: the two frames share no column names, so merge() pairs every
# timestamp with every distinct stationId; the columns are then renamed
ld_future_data <- setNames(
  merge(
    ld_future_data,
    data.frame(stationId = unique(ld_aq_gm_combined_data$stationId))
  ),
  c("utc_time", "id", "stationId")
)

In [99]:
# Calendar features derived from each forecast timestamp: hour, month,
# calendar date, and whether that date falls on a weekend
ld_future_data <- ld_future_data %>%
  mutate(
    hour = hour(utc_time),
    month = month(utc_time),
    date = date(utc_time),
    weekend = chron::is.weekend(date)
  )

In [100]:
# Store the weekend flag as an integer
ld_future_data[["weekend"]] <- as.integer(ld_future_data[["weekend"]])
# holiday is 1 when the date appears in london_holidays, otherwise 0
ld_future_data[["holiday"]] <- as.numeric(
  ld_future_data[["date"]] %in% as.Date(london_holidays)
)

In [101]:
# Attach the columns of ld_lat_long_data (station latitude/longitude) to each
# forecast row, matching on stationId
ld_future_data <- merge(
  x = ld_future_data,
  y = ld_lat_long_data,
  by = "stationId"
)

In [102]:
# Forecast the three pollutants. PM2.5 and PM10 must be filled in before O3
# because the O3 model uses both of them as predictors.
ld_future_data[["PM2.5"]] <- predict(PM2.5_ld_model, newdata = ld_future_data)
ld_future_data[["PM10"]] <- predict(PM10_ld_model, newdata = ld_future_data)
ld_future_data[["O3"]] <- predict(O3_ld_model, newdata = ld_future_data)

In [103]:
# test_id for the final submission file: stationId and the hourly id joined
# by "#", e.g. "BL0#0"
ld_future_data[["test_id"]] <- with(ld_future_data, paste0(stationId, "#", id))

In [104]:
# Sanity checks on the forecast frame: column structure, per-column summary,
# and the first/last timestamp covered
str(ld_future_data)
summary(ld_future_data)
summarize(ld_future_data, min_date = min(utc_time), max_date = max(utc_time))

'data.frame':	624 obs. of  14 variables:
 $ stationId: Factor w/ 24 levels "BL0","BX1","BX9",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ utc_time : POSIXct, format: "2018-05-30 00:00:00" "2018-05-30 01:00:00" ...
 $ id       : num  0 1 2 3 4 5 6 7 8 9 ...
 $ hour     : int  0 1 2 3 4 5 6 7 8 9 ...
 $ month    : num  5 5 5 5 5 5 5 5 5 5 ...
 $ date     : Date, format: "2018-05-30" "2018-05-30" ...
 $ weekend  : int  0 0 0 0 0 0 0 0 0 0 ...
 $ holiday  : num  0 0 0 0 0 0 0 0 0 0 ...
 $ latitude : num  51.5 51.5 51.5 51.5 51.5 ...
 $ longitude: num  -0.126 -0.126 -0.126 -0.126 -0.126 ...
 $ PM2.5    : num  10.9 11 11.1 11.2 11.2 ...
 $ PM10     : num  17.9 18 18.1 18.3 18.4 ...
 $ O3       : num  43.5 44.2 44.8 45.5 46.1 ...
 $ test_id  : chr  "BL0#0" "BL0#1" "BL0#2" "BL0#3" ...


   stationId      utc_time                         id             hour      
 BL0    : 48   Min.   :2018-05-30 00:00:00   Min.   : 0.00   Min.   : 0.00  
 CD1    : 48   1st Qu.:2018-05-30 11:45:00   1st Qu.:11.75   1st Qu.: 5.75  
 CD9    : 48   Median :2018-05-30 23:30:00   Median :23.50   Median :11.50  
 GN0    : 48   Mean   :2018-05-30 23:30:00   Mean   :23.50   Mean   :11.50  
 GN3    : 48   3rd Qu.:2018-05-31 11:15:00   3rd Qu.:35.25   3rd Qu.:17.25  
 GR4    : 48   Max.   :2018-05-31 23:00:00   Max.   :47.00   Max.   :23.00  
 (Other):336                                                                
     month        date               weekend     holiday     latitude    
 Min.   :5   Min.   :2018-05-30   Min.   :0   Min.   :0   Min.   :51.39  
 1st Qu.:5   1st Qu.:2018-05-30   1st Qu.:0   1st Qu.:0   1st Qu.:51.47  
 Median :5   Median :2018-05-30   Median :0   Median :0   Median :51.52  
 Mean   :5   Mean   :2018-05-30   Mean   :0   Mean   :0   Mean   :51.49  
 3rd Qu.:5   3

min_date,max_date
2018-05-30,2018-05-31 23:00:00


In [105]:
# Preview the first 20 rows of the forecast frame
head(ld_future_data, 20)

stationId,utc_time,id,hour,month,date,weekend,holiday,latitude,longitude,PM2.5,PM10,O3,test_id
BL0,2018-05-30 00:00:00,0,0,5,2018-05-30,0,0,51.52229,-0.125848,10.87809,17.87551,43.5395,BL0#0
BL0,2018-05-30 01:00:00,1,1,5,2018-05-30,0,0,51.52229,-0.125848,10.96902,18.01037,44.19121,BL0#1
BL0,2018-05-30 02:00:00,2,2,5,2018-05-30,0,0,51.52229,-0.125848,11.05995,18.14523,44.84292,BL0#2
BL0,2018-05-30 03:00:00,3,3,5,2018-05-30,0,0,51.52229,-0.125848,11.15087,18.2801,45.49463,BL0#3
BL0,2018-05-30 04:00:00,4,4,5,2018-05-30,0,0,51.52229,-0.125848,11.2418,18.41496,46.14634,BL0#4
BL0,2018-05-30 05:00:00,5,5,5,2018-05-30,0,0,51.52229,-0.125848,11.33272,18.54982,46.79805,BL0#5
BL0,2018-05-30 06:00:00,6,6,5,2018-05-30,0,0,51.52229,-0.125848,11.42365,18.68468,47.44976,BL0#6
BL0,2018-05-30 07:00:00,7,7,5,2018-05-30,0,0,51.52229,-0.125848,11.51457,18.81954,48.10147,BL0#7
BL0,2018-05-30 08:00:00,8,8,5,2018-05-30,0,0,51.52229,-0.125848,11.6055,18.95441,48.75318,BL0#8
BL0,2018-05-30 09:00:00,9,9,5,2018-05-30,0,0,51.52229,-0.125848,11.69643,19.08927,49.40489,BL0#9


In [None]:
# Write the submission file: test_id plus the three predicted pollutant
# columns, without row names. The file name is built with paste0() so it
# contains no spaces; paste() would insert its default " " separator on both
# sides of the date, giving a name like "ld_submission 2018-05-29 .csv".
write.csv(
  ld_future_data[, c("test_id", "PM2.5", "PM10", "O3")],
  file = paste0("ld_submission_", Sys.Date(), ".csv"),
  row.names = FALSE
)

In [None]:
# Remove the TZ environment variable that was set to GMT before building the
# forecast data, so the session no longer uses that override
Sys.unsetenv("TZ")

In [None]:
# Free the London forecast frame and the lookup/combined frames it was built from
rm(ld_future_data, ld_lat_long_data, ld_aq_gm_combined_data)