### Import and transformations of API data

In [1]:
# Start from a clean session: close the active graphics device (if one is
# open), remove the objects listed by ls(), and clear the console.
if (length(dev.list()) > 0) {
  dev.off() # Clear Plots
}
rm(list = ls()) # Clear objects from Memory
cat("\014") # Clear Console
# writeClipboard(as.character(x)) # copy data frame to clipboard



In [2]:
library(RCurl)
library(sqldf)
library(digest)
library(dplyr)
library(anytime)
library(geosphere)
library(lubridate)
library(chron)
require(caret)
require(rattle)
require(yardstick)

"package 'RCurl' was built under R version 3.4.4"Loading required package: bitops
Loading required package: gsubfn
Loading required package: proto
Could not load tcltk.  Will use slower R code instead.
Loading required package: RSQLite
"package 'dplyr' was built under R version 3.4.3"
Attaching package: 'dplyr'

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

"package 'lubridate' was built under R version 3.4.3"
Attaching package: 'lubridate'

The following object is masked from 'package:base':

    date

"package 'chron' was built under R version 3.4.3"
Attaching package: 'chron'

The following objects are masked from 'package:lubridate':

    days, hours, minutes, seconds, years

Loading required package: caret
"package 'caret' was built under R version 3.4.4"Loading required package: lattice
"package 'lattice' was built under R version 3.4.3"Loading required packa

In [3]:
# set working directory
# NOTE(review): absolute, user-specific path. The relative paths passed to the
# later read.csv() calls resolve against it, so it must be adjusted per machine.
setwd("C:/Users/vanethi/Documents/GitHub/DS420_Factoria")

In [4]:
# Date window for the API requests; both values are pasted into the request
# URLs unchanged.
# NOTE(review): the format looks like YYYY-MM-DD-H (trailing hour) - confirm.
startDate <- "2017-12-31-0"
endDate <- "2018-06-01-0"

In [5]:
# pull data for London
#
# Downloads two CSV payloads for the startDate..endDate window and parses each
# into a data frame: air-quality observations (ld_aq_data) and grid
# meteorology (ld_gm_data).
#
# NOTE(review): the trailing URL segment looks like an access token and is
# hard-coded; TLS host/peer verification is also switched off. Both are kept
# unchanged here so the download behaves as before - consider reading the
# token from an environment variable and re-enabling verification.

# acquire air quality data
ld_aq_url <- paste0("https://biendata.com/competition/airquality/ld/",
                    startDate, "/", endDate, "/2k0d1d8")
ld_aq_file <- getURL(ld_aq_url, ssl.verifyhost = FALSE, ssl.verifypeer = FALSE)
# read.csv(text = ...) opens and closes its own text connection; passing
# textConnection() directly left an open connection behind after parsing.
ld_aq_data <- read.csv(text = ld_aq_file, header = TRUE)

# acquire API grid meteorology data
ld_gm_url <- paste0("https://biendata.com/competition/meteorology/ld_grid/",
                    startDate, "/", endDate, "/2k0d1d8")
ld_gm_file <- getURL(ld_gm_url, ssl.verifyhost = FALSE, ssl.verifypeer = FALSE)
ld_gm_data <- read.csv(text = ld_gm_file, header = TRUE)

In [6]:
# Names of every data frame currently bound in the global environment.
# eapply() tests each object; Filter() keeps only the ones that are data frames.
df.list <- names(Filter(isTRUE, eapply(.GlobalEnv, is.data.frame)))
df.list

In [7]:
# Converting character to datetime
# Replaces the `time` column of both API tables with the value returned by
# anytime().
# NOTE(review): anytime() is called without a tz argument, so parsing
# presumably uses the session's local time zone - confirm this is intended for
# the column that is later renamed utc_time.
ld_gm_data$time <- anytime(ld_gm_data$time)
ld_aq_data$time <- anytime(ld_aq_data$time)

In [8]:
# printing structure of all the datasets
# Walks the data-frame names collected in df.list and prints each name
# followed by the structure of the object it refers to.
# - seq_along() yields an empty sequence when df.list is empty, whereas
#   1:length(df.list) would iterate over c(1, 0).
# - str() prints by itself and returns NULL, so it is called directly rather
#   than wrapped in print(), which echoed a stray "NULL" after every table.
for (i in seq_along(df.list)) {
  print(df.list[i])
  str(get(df.list[i]))
}

[1] "ld_gm_data"
'data.frame':	1246627 obs. of  9 variables:
 $ id            : int  2000096 2000097 2000098 2000099 2000100 2000101 2000102 2000103 2000104 2000105 ...
 $ station_id    : Factor w/ 861 levels "london_grid_000",..: 1 2 3 4 5 6 7 8 9 10 ...
 $ time          : POSIXct, format: "2018-03-31 07:00:00" "2018-03-31 07:00:00" ...
 $ weather       : Factor w/ 8 levels "CLEAR_DAY","CLEAR_NIGHT",..: 3 3 3 6 6 6 6 6 6 6 ...
 $ temperature   : num  6.14 5.43 4.73 4.25 3.99 3.72 3.75 3.78 3.86 3.98 ...
 $ pressure      : num  995 994 993 990 986 ...
 $ humidity      : num  85 88 91 93 95 97 96 96 96 96 ...
 $ wind_direction: num  304 303 302 303 306 ...
 $ wind_speed    : num  22 18.9 15.8 14.1 13.6 ...
NULL
[1] "ld_aq_data"
'data.frame':	28060 obs. of  9 variables:
 $ id                : int  2941506 2941507 2941508 2941509 2941510 2941511 2941512 2941513 2941514 2941515 ...
 $ station_id        : Factor w/ 19 levels "BL0","BX1","BX9",..: 4 1 10 15 12 9 11 14 8 13 ...
 $ time       

In [9]:
# Report the earliest and latest timestamp found in each API table.
print("ld_gm_data")
summarize(ld_gm_data, min_date = min(time), max_date = max(time))
print("ld_aq_data")
summarize(ld_aq_data, min_date = min(time), max_date = max(time))

[1] "ld_gm_data"


min_date,max_date
2018-03-31 07:00:00,2018-06-01


[1] "ld_aq_data"


min_date,max_date
2018-03-31 07:00:00,2018-06-01


In [10]:
# Drop columns that are not carried forward: the row id from both tables and
# the categorical weather label from the meteorology grid.
ld_aq_data <- select(ld_aq_data, -id)
ld_gm_data <- select(ld_gm_data, -id, -weather)

In [11]:
# Lookup table pairing each London air-quality station with a meteorology grid
# cell; read relative to the working directory.
ld_closest_stations <- read.csv(file = "SL_london_closest_stations.csv")

In [12]:
# Give the three tables a shared naming scheme so they can be joined on
# stationId / stationName / utc_time.
# NOTE(review): names are assigned by position, so this relies on the column
# order of each source staying fixed.
names(ld_closest_stations) <- c("x", "stationId", "stationName", "distance")
names(ld_aq_data) <- c("stationId", "utc_time", "PM2.5", "PM10",
                       "NO2", "CO", "O3", "SO2")
names(ld_gm_data) <- c("stationName", "utc_time", "temperature", "pressure",
                       "humidity", "wind_direction", "wind_speed.kph")

In [13]:
# Inspect the three renamed tables and preview the station-to-grid lookup.
str(ld_closest_stations)
str(ld_gm_data)
str(ld_aq_data)
head(ld_closest_stations)

'data.frame':	24 obs. of  4 variables:
 $ x          : int  1 2 3 4 5 6 7 8 9 10 ...
 $ stationId  : Factor w/ 24 levels "BL0","BX1","BX9",..: 3 2 1 5 4 7 8 6 10 12 ...
 $ stationName: Factor w/ 10 levels "london_grid_346",..: 10 10 6 6 4 6 6 5 8 8 ...
 $ distance   : num  3929 3929 3060 3683 5212 ...
'data.frame':	1246627 obs. of  7 variables:
 $ stationName   : Factor w/ 861 levels "london_grid_000",..: 1 2 3 4 5 6 7 8 9 10 ...
 $ utc_time      : POSIXct, format: "2018-03-31 07:00:00" "2018-03-31 07:00:00" ...
 $ temperature   : num  6.14 5.43 4.73 4.25 3.99 3.72 3.75 3.78 3.86 3.98 ...
 $ pressure      : num  995 994 993 990 986 ...
 $ humidity      : num  85 88 91 93 95 97 96 96 96 96 ...
 $ wind_direction: num  304 303 302 303 306 ...
 $ wind_speed.kph: num  22 18.9 15.8 14.1 13.6 ...
'data.frame':	28060 obs. of  8 variables:
 $ stationId: Factor w/ 19 levels "BL0","BX1","BX9",..: 4 1 10 15 12 9 11 14 8 13 ...
 $ utc_time : POSIXct, format: "2018-03-31 07:00:00" "2018-03-31 07:00:

x,stationId,stationName,distance
1,BX9,london_grid_472,3929.17
2,BX1,london_grid_472,3929.17
3,BL0,london_grid_409,3059.76
4,CD9,london_grid_409,3682.914
5,CD1,london_grid_388,5211.633
6,CT2,london_grid_409,1646.945


In [14]:
# Attach the mapped grid cell (stationName) to every air-quality row.
# merge() performs an inner join by default, so rows whose stationId is absent
# from the lookup are dropped.
ld_aq_map <- merge(x = ld_aq_data, y = ld_closest_stations, by = "stationId")

In [15]:
# Join air-quality rows to the meteorology reading of their grid cell at the
# same timestamp (inner join on both keys).
ld_aq_gm_data <- merge(x = ld_aq_map, y = ld_gm_data,
                       by = c("stationName", "utc_time"))

In [16]:
# Inspect the merged API table: structure, first rows, and covered time span.
str(ld_aq_gm_data)
head(ld_aq_gm_data)
summarize(ld_aq_gm_data, min_date = min(utc_time), max_date = max(utc_time))

'data.frame':	27432 obs. of  16 variables:
 $ stationName   : Factor w/ 10 levels "london_grid_346",..: 2 2 2 2 2 2 2 2 2 2 ...
 $ utc_time      : POSIXct, format: "2018-03-31 07:00:00" "2018-03-31 08:00:00" ...
 $ stationId     : Factor w/ 19 levels "BL0","BX1","BX9",..: 18 18 18 18 18 18 18 18 18 18 ...
 $ PM2.5         : num  8.7 6.8 4.5 8.2 11.8 10.2 11.6 10.3 8.8 8.6 ...
 $ PM10          : num  NA NA NA NA NA NA NA NA NA NA ...
 $ NO2           : num  NA NA NA NA NA NA NA NA NA NA ...
 $ CO            : logi  NA NA NA NA NA NA ...
 $ O3            : logi  NA NA NA NA NA NA ...
 $ SO2           : logi  NA NA NA NA NA NA ...
 $ x             : int  21 21 21 21 21 21 21 21 21 21 ...
 $ distance      : num  4235 4235 4235 4235 4235 ...
 $ temperature   : num  4.83 5.42 6.05 6.1 7.05 7.8 8.08 8.44 8.57 8.02 ...
 $ pressure      : num  987 987 988 989 992 ...
 $ humidity      : num  87 84 81 79 74 71 73 71 70 73 ...
 $ wind_direction: num  205 210 214 234 242 ...
 $ wind_speed.kph: num 

stationName,utc_time,stationId,PM2.5,PM10,NO2,CO,O3,SO2,x,distance,temperature,pressure,humidity,wind_direction,wind_speed.kph
london_grid_366,2018-03-31 07:00:00,TD5,8.7,,,,,,21,4234.613,4.83,986.8419,87,205.39,11.3
london_grid_366,2018-03-31 08:00:00,TD5,6.8,,,,,,21,4234.613,5.42,987.2517,84,210.33,11.95
london_grid_366,2018-03-31 09:00:00,TD5,4.5,,,,,,21,4234.613,6.05,987.7612,81,214.35,12.01
london_grid_366,2018-03-31 10:00:00,TD5,8.2,,,,,,21,4234.613,6.1,988.8336,79,233.7,10.18
london_grid_366,2018-03-31 11:00:00,TD5,11.8,,,,,,21,4234.613,7.05,991.6767,74,242.44,7.95
london_grid_366,2018-03-31 12:00:00,TD5,10.2,,,,,,21,4234.613,7.8,992.0664,71,267.43,6.76


min_date,max_date
2018-03-31 07:00:00,2018-06-01


In [17]:
# The merged table supersedes its inputs; remove them from the workspace.
rm(list = c("ld_gm_data", "ld_aq_data", "ld_aq_map"))

In [18]:
# Keep the identifiers, the three pollutants of interest and the weather
# measurements; NO2, CO, SO2, x and distance are left out.
ld_aq_gm_data <- select(
  ld_aq_gm_data,
  stationName, utc_time, stationId,
  PM2.5, PM10, O3,
  temperature, pressure, humidity, wind_direction, wind_speed.kph
)

In [19]:
# Derive calendar features from the timestamp for modeling.
# The lubridate helpers are namespace-qualified because several packages
# loaded by this notebook mask similarly named functions.
ld_aq_gm_data$hour <- lubridate::hour(ld_aq_gm_data$utc_time)
ld_aq_gm_data$month <- lubridate::month(ld_aq_gm_data$utc_time)
ld_aq_gm_data$date <- lubridate::date(ld_aq_gm_data$utc_time)
# Logical flag computed from the calendar date by chron::is.weekend().
ld_aq_gm_data$weekend <- chron::is.weekend(ld_aq_gm_data$date)

In [20]:
# Check the structure after adding the calendar features.
str(ld_aq_gm_data)

'data.frame':	27432 obs. of  15 variables:
 $ stationName   : Factor w/ 10 levels "london_grid_346",..: 2 2 2 2 2 2 2 2 2 2 ...
 $ utc_time      : POSIXct, format: "2018-03-31 07:00:00" "2018-03-31 08:00:00" ...
 $ stationId     : Factor w/ 19 levels "BL0","BX1","BX9",..: 18 18 18 18 18 18 18 18 18 18 ...
 $ PM2.5         : num  8.7 6.8 4.5 8.2 11.8 10.2 11.6 10.3 8.8 8.6 ...
 $ PM10          : num  NA NA NA NA NA NA NA NA NA NA ...
 $ O3            : logi  NA NA NA NA NA NA ...
 $ temperature   : num  4.83 5.42 6.05 6.1 7.05 7.8 8.08 8.44 8.57 8.02 ...
 $ pressure      : num  987 987 988 989 992 ...
 $ humidity      : num  87 84 81 79 74 71 73 71 70 73 ...
 $ wind_direction: num  205 210 214 234 242 ...
 $ wind_speed.kph: num  11.3 11.95 12.01 10.18 7.95 ...
 $ hour          : int  7 8 9 10 11 12 13 14 15 16 ...
 $ month         : num  3 3 3 3 3 3 3 3 3 3 ...
 $ date          : Date, format: "2018-03-31" "2018-03-31" ...
 $ weekend       : logi  TRUE TRUE TRUE TRUE TRUE TRUE ...


### Import and transformations of Historical data

In [21]:
# Retrieve the historical file, read relative to the working directory.
# Strings are kept as character rather than converted to factors.
ld_aq_gm_hist_file <- "Ready for Modeling/ld_aq_gm_hist_data.csv"
ld_aq_gm_hist_data <- read.csv(
  file = ld_aq_gm_hist_file,
  header = TRUE,
  sep = ",",
  stringsAsFactors = FALSE
)

In [22]:
# Bring the historical table in line with the API table: drop the X column,
# parse the timestamp with anytime() and convert the date string to Date.
ld_aq_gm_hist_data <- select(ld_aq_gm_hist_data, -X)
ld_aq_gm_hist_data$utc_time <- anytime(ld_aq_gm_hist_data$utc_time)
ld_aq_gm_hist_data$date <- as.Date(ld_aq_gm_hist_data$date, format = "%Y-%m-%d")

In [23]:
# Assign the same 15 column names the API table uses, so the two tables can be
# stacked.
# NOTE(review): assignment is positional and assumes the file's column order.
names(ld_aq_gm_hist_data) <- c(
  "stationName", "utc_time", "stationId",
  "PM2.5", "PM10", "O3",
  "temperature", "pressure", "humidity", "wind_direction", "wind_speed.kph",
  "hour", "month", "date", "weekend"
)

In [24]:
# Check the structure of the historical table after renaming.
str(ld_aq_gm_hist_data)

'data.frame':	257936 obs. of  15 variables:
 $ stationName   : chr  "london_grid_346" "london_grid_346" "london_grid_346" "london_grid_346" ...
 $ utc_time      : POSIXct, format: "2017-01-01 08:00:00" "2017-01-01 09:00:00" ...
 $ stationId     : chr  "LH0" "LH0" "LH0" "LH0" ...
 $ PM2.5         : num  18.3 16.3 13.3 9.4 6.1 6.7 2.1 0.9 1.1 1 ...
 $ PM10          : num  21.3 19.5 16.2 11.6 8.5 13.4 4.6 2 2.1 2 ...
 $ O3            : num  41.6 44.1 49.1 45.2 41.4 53.6 11.7 12.1 12 13.5 ...
 $ temperature   : num  6.05 6.05 6.04 6.04 6.25 6.46 6.67 6.91 7.15 7.39 ...
 $ pressure      : num  1019 1018 1017 1016 1015 ...
 $ humidity      : num  90 89.2 88.3 87.4 87.8 ...
 $ wind_direction: num  221 220 218 217 216 ...
 $ wind_speed.kph: num  16.4 17.1 17.7 18.4 18.8 ...
 $ hour          : int  8 9 10 11 12 13 14 15 16 17 ...
 $ month         : int  1 1 1 1 1 1 1 1 1 1 ...
 $ date          : Date, format: "2017-01-01" "2017-01-01" ...
 $ weekend       : logi  TRUE TRUE TRUE TRUE TRUE TRUE .

In [25]:
# Append API and hist data
# Stacks the API rows first, then the historical rows. Both tables carry the
# same 15 column names; the API table holds stationName/stationId as factors
# while the historical table holds them as character, and rbind() reconciles
# the two (the combined columns come out as factors).
ld_aq_gm_combined_data <- rbind(ld_aq_gm_data, ld_aq_gm_hist_data)

In [26]:
# Inspect the stacked table: structure, covered time span, per-column summary.
str(ld_aq_gm_combined_data)
summarize(ld_aq_gm_combined_data,
          min_date = min(utc_time), max_date = max(utc_time))
summary(ld_aq_gm_combined_data)

'data.frame':	285368 obs. of  15 variables:
 $ stationName   : Factor w/ 10 levels "london_grid_346",..: 2 2 2 2 2 2 2 2 2 2 ...
 $ utc_time      : POSIXct, format: "2018-03-31 07:00:00" "2018-03-31 08:00:00" ...
 $ stationId     : Factor w/ 24 levels "BL0","BX1","BX9",..: 18 18 18 18 18 18 18 18 18 18 ...
 $ PM2.5         : num  8.7 6.8 4.5 8.2 11.8 10.2 11.6 10.3 8.8 8.6 ...
 $ PM10          : num  NA NA NA NA NA NA NA NA NA NA ...
 $ O3            : num  NA NA NA NA NA NA NA NA NA NA ...
 $ temperature   : num  4.83 5.42 6.05 6.1 7.05 7.8 8.08 8.44 8.57 8.02 ...
 $ pressure      : num  987 987 988 989 992 ...
 $ humidity      : num  87 84 81 79 74 71 73 71 70 73 ...
 $ wind_direction: num  205 210 214 234 242 ...
 $ wind_speed.kph: num  11.3 11.95 12.01 10.18 7.95 ...
 $ hour          : int  7 8 9 10 11 12 13 14 15 16 ...
 $ month         : num  3 3 3 3 3 3 3 3 3 3 ...
 $ date          : Date, format: "2018-03-31" "2018-03-31" ...
 $ weekend       : logi  TRUE TRUE TRUE TRUE TRUE TR

min_date,max_date
2017-01-01 08:00:00,2018-06-01


          stationName       utc_time                     stationId     
 london_grid_409:47598   Min.   :2017-01-01 08:00:00   CD9    : 12258  
 london_grid_451:47556   1st Qu.:2017-05-07 14:00:00   BL0    : 12250  
 london_grid_388:47552   Median :2017-09-08 10:00:00   GN0    : 12250  
 london_grid_430:36749   Mean   :2017-09-09 08:14:20   GN3    : 12250  
 london_grid_472:36748   3rd Qu.:2018-01-10 08:00:00   GR4    : 12250  
 london_grid_408:23055   Max.   :2018-06-01 00:00:00   GR9    : 12250  
 (Other)        :46110                                 (Other):211860  
     PM2.5             PM10              O3          temperature   
 Min.   :-14.00   Min.   :-12.00   Min.   : -8.30   Min.   :-4.57  
 1st Qu.:  6.50   1st Qu.: 11.40   1st Qu.: 19.50   1st Qu.: 5.99  
 Median : 10.30   Median : 17.20   Median : 34.20   Median :10.21  
 Mean   : 13.47   Mean   : 20.62   Mean   : 39.72   Mean   :10.61  
 3rd Qu.: 16.70   3rd Qu.: 26.00   3rd Qu.: 53.80   3rd Qu.:15.08  
 Max.   :313.00 

In [27]:
# Both inputs now live in ld_aq_gm_combined_data; remove them.
rm(list = c("ld_aq_gm_data", "ld_aq_gm_hist_data"))

In [28]:
# Build a one-column table (stationId) of the air-quality stations whose
# need_prediction flag equals "TRUE" in the station metadata file.
ld_pred_stations <- read.csv(
  file = "Datasets/London_AirQuality_Stations.csv",
  header = TRUE,
  sep = ",",
  stringsAsFactors = FALSE
)
ld_pred_stations <- filter(ld_pred_stations, need_prediction == "TRUE")
# select() with a name on the left keeps the column and renames it in one step.
ld_pred_stations <- select(ld_pred_stations, stationId = station_id)

"package 'bindrcpp' was built under R version 3.4.3"

In [29]:
# Inner join on stationId: only rows for stations that need predictions remain.
ld_aq_gm_combined_data <- merge(
  x = ld_aq_gm_combined_data,
  y = ld_pred_stations,
  by = "stationId"
)

In [30]:
# The station filter has been applied; remove the lookup.
rm(list = "ld_pred_stations")

In [31]:
# Inspect the table after restricting it to the prediction stations.
str(ld_aq_gm_combined_data)
summary(ld_aq_gm_combined_data)

'data.frame':	159252 obs. of  15 variables:
 $ stationId     : Factor w/ 24 levels "BL0","BX1","BX9",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ stationName   : Factor w/ 10 levels "london_grid_346",..: 6 6 6 6 6 6 6 6 6 6 ...
 $ utc_time      : POSIXct, format: "2017-03-18 01:00:00" "2018-05-10 06:00:00" ...
 $ PM2.5         : num  3.9 1.5 9.9 10.5 8.1 11.9 5.7 5.8 8.1 19.7 ...
 $ PM10          : num  5.4 6.4 14.1 18.3 8.9 13.8 5.7 12.9 NA 28.4 ...
 $ O3            : num  11.9 NA 20.7 44.9 48.8 29.5 18 NA 28.4 NA ...
 $ temperature   : num  8.35 10.86 17.35 18.75 13.75 ...
 $ pressure      : num  1010 1008 1005 1014 1008 ...
 $ humidity      : num  79.6 68 65.8 61.8 64.7 ...
 $ wind_direction: num  238.6 301.7 206.2 28.6 337 ...
 $ wind_speed.kph: num  28.39 18.93 36.19 19.7 9.06 ...
 $ hour          : int  1 6 19 20 17 10 0 23 11 23 ...
 $ month         : num  3 5 6 7 9 9 2 5 6 5 ...
 $ date          : Date, format: "2017-03-18" "2018-05-10" ...
 $ weekend       : logi  TRUE FALSE FALSE FALSE TRU

   stationId              stationName       utc_time                  
 CD9    :12258   london_grid_451:36750   Min.   :2017-01-01 08:00:00  
 BL0    :12250   london_grid_430:36749   1st Qu.:2017-05-08 21:00:00  
 GN0    :12250   london_grid_388:36746   Median :2017-09-13 12:00:00  
 GN3    :12250   london_grid_409:24508   Mean   :2017-09-14 00:34:44  
 GR4    :12250   london_grid_472:12250   3rd Qu.:2018-01-19 03:00:00  
 GR9    :12250   london_grid_408:12249   Max.   :2018-06-01 00:00:00  
 (Other):85744   (Other)        :    0                                
     PM2.5             PM10              O3          temperature   
 Min.   :-10.50   Min.   :-11.80   Min.   : -4.70   Min.   :-4.37  
 1st Qu.:  6.60   1st Qu.: 12.00   1st Qu.: 21.60   1st Qu.: 6.11  
 Median : 10.40   Median : 18.00   Median : 37.80   Median :10.34  
 Mean   : 13.63   Mean   : 21.44   Mean   : 43.24   Mean   :10.74  
 3rd Qu.: 17.00   3rd Qu.: 27.00   3rd Qu.: 58.40   3rd Qu.:15.20  
 Max.   :189.70   Max.  

In [32]:
# Dates flagged as holidays, written as ISO yyyy-mm-dd strings.
# NOTE(review): presumably the bank holidays that apply to London for
# 2017 through May 2018 - confirm against the official calendar.
london_holidays <- c(
  "2017-01-02", "2017-04-14", "2017-04-17", "2017-05-01", "2017-05-29",
  "2017-08-28", "2017-12-25", "2017-12-26",
  "2018-01-01", "2018-03-30", "2018-04-02", "2018-05-07", "2018-05-28"
)

In [33]:
# Numeric 1/0 flag: 1 when the row's date is one of london_holidays.
ld_aq_gm_combined_data$holiday <- as.numeric(
  ld_aq_gm_combined_data$date %in% as.Date(london_holidays)
)

In [34]:
# Inspect the table after adding the holiday flag.
str(ld_aq_gm_combined_data)
summary(ld_aq_gm_combined_data)

'data.frame':	159252 obs. of  16 variables:
 $ stationId     : Factor w/ 24 levels "BL0","BX1","BX9",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ stationName   : Factor w/ 10 levels "london_grid_346",..: 6 6 6 6 6 6 6 6 6 6 ...
 $ utc_time      : POSIXct, format: "2017-03-18 01:00:00" "2018-05-10 06:00:00" ...
 $ PM2.5         : num  3.9 1.5 9.9 10.5 8.1 11.9 5.7 5.8 8.1 19.7 ...
 $ PM10          : num  5.4 6.4 14.1 18.3 8.9 13.8 5.7 12.9 NA 28.4 ...
 $ O3            : num  11.9 NA 20.7 44.9 48.8 29.5 18 NA 28.4 NA ...
 $ temperature   : num  8.35 10.86 17.35 18.75 13.75 ...
 $ pressure      : num  1010 1008 1005 1014 1008 ...
 $ humidity      : num  79.6 68 65.8 61.8 64.7 ...
 $ wind_direction: num  238.6 301.7 206.2 28.6 337 ...
 $ wind_speed.kph: num  28.39 18.93 36.19 19.7 9.06 ...
 $ hour          : int  1 6 19 20 17 10 0 23 11 23 ...
 $ month         : num  3 5 6 7 9 9 2 5 6 5 ...
 $ date          : Date, format: "2017-03-18" "2018-05-10" ...
 $ weekend       : logi  TRUE FALSE FALSE FALSE TRU

   stationId              stationName       utc_time                  
 CD9    :12258   london_grid_451:36750   Min.   :2017-01-01 08:00:00  
 BL0    :12250   london_grid_430:36749   1st Qu.:2017-05-08 21:00:00  
 GN0    :12250   london_grid_388:36746   Median :2017-09-13 12:00:00  
 GN3    :12250   london_grid_409:24508   Mean   :2017-09-14 00:34:44  
 GR4    :12250   london_grid_472:12250   3rd Qu.:2018-01-19 03:00:00  
 GR9    :12250   london_grid_408:12249   Max.   :2018-06-01 00:00:00  
 (Other):85744   (Other)        :    0                                
     PM2.5             PM10              O3          temperature   
 Min.   :-10.50   Min.   :-11.80   Min.   : -4.70   Min.   :-4.37  
 1st Qu.:  6.60   1st Qu.: 12.00   1st Qu.: 21.60   1st Qu.: 6.11  
 Median : 10.40   Median : 18.00   Median : 37.80   Median :10.34  
 Mean   : 13.63   Mean   : 21.44   Mean   : 43.24   Mean   :10.74  
 3rd Qu.: 17.00   3rd Qu.: 27.00   3rd Qu.: 58.40   3rd Qu.:15.20  
 Max.   :189.70   Max.  

In [35]:
# replacing meteorological outliers with NA
# For each listed column, every value that boxplot.stats() reports in $out
# (with its default coef, points more than 1.5 * IQR beyond the hinges) is
# set to NA so it can be imputed later.
#
# The replacement is done on the column vector itself. The earlier form
# `df[rows, ]$col <- NA` stops with "replacement has 1 row, data has 0"
# whenever a column contains no outliers, and copied whole rows to change a
# single cell.
met_outlier_cols <- c("temperature", "pressure", "humidity", "wind_speed.kph")
ld_aq_gm_combined_data[met_outlier_cols] <- lapply(
  ld_aq_gm_combined_data[met_outlier_cols],
  function(values) {
    values[values %in% boxplot.stats(values)$out] <- NA
    values
  }
)
rm(met_outlier_cols) # helper only; keep the workspace unchanged

In [36]:
# replacing outliers with NA
# For each pollutant column, every value that boxplot.stats() reports in $out
# (with its default coef, points more than 1.5 * IQR beyond the hinges) is
# set to NA so it can be imputed later.
#
# The replacement is done on the column vector itself. The earlier form
# `df[rows, ]$col <- NA` stops with "replacement has 1 row, data has 0"
# whenever a column contains no outliers, and copied whole rows to change a
# single cell.
pollutant_outlier_cols <- c("PM2.5", "PM10", "O3")
ld_aq_gm_combined_data[pollutant_outlier_cols] <- lapply(
  ld_aq_gm_combined_data[pollutant_outlier_cols],
  function(values) {
    values[values %in% boxplot.stats(values)$out] <- NA
    values
  }
)
rm(pollutant_outlier_cols) # helper only; keep the workspace unchanged

In [37]:
# Inspect the table after outliers were replaced with NA.
str(ld_aq_gm_combined_data)
summary(ld_aq_gm_combined_data)

'data.frame':	159252 obs. of  16 variables:
 $ stationId     : Factor w/ 24 levels "BL0","BX1","BX9",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ stationName   : Factor w/ 10 levels "london_grid_346",..: 6 6 6 6 6 6 6 6 6 6 ...
 $ utc_time      : POSIXct, format: "2017-03-18 01:00:00" "2018-05-10 06:00:00" ...
 $ PM2.5         : num  3.9 1.5 9.9 10.5 8.1 11.9 5.7 5.8 8.1 19.7 ...
 $ PM10          : num  5.4 6.4 14.1 18.3 8.9 13.8 5.7 12.9 NA 28.4 ...
 $ O3            : num  11.9 NA 20.7 44.9 48.8 29.5 18 NA 28.4 NA ...
 $ temperature   : num  8.35 10.86 17.35 18.75 13.75 ...
 $ pressure      : num  1010 1008 1005 1014 1008 ...
 $ humidity      : num  79.6 68 65.8 61.8 64.7 ...
 $ wind_direction: num  238.6 301.7 206.2 28.6 337 ...
 $ wind_speed.kph: num  28.39 18.93 NA 19.7 9.06 ...
 $ hour          : int  1 6 19 20 17 10 0 23 11 23 ...
 $ month         : num  3 5 6 7 9 9 2 5 6 5 ...
 $ date          : Date, format: "2017-03-18" "2018-05-10" ...
 $ weekend       : logi  TRUE FALSE FALSE FALSE TRUE T

   stationId              stationName       utc_time                  
 CD9    :12258   london_grid_451:36750   Min.   :2017-01-01 08:00:00  
 BL0    :12250   london_grid_430:36749   1st Qu.:2017-05-08 21:00:00  
 GN0    :12250   london_grid_388:36746   Median :2017-09-13 12:00:00  
 GN3    :12250   london_grid_409:24508   Mean   :2017-09-14 00:34:44  
 GR4    :12250   london_grid_472:12250   3rd Qu.:2018-01-19 03:00:00  
 GR9    :12250   london_grid_408:12249   Max.   :2018-06-01 00:00:00  
 (Other):85744   (Other)        :    0                                
     PM2.5            PM10             O3          temperature   
 Min.   :-8.60   Min.   :-8.90   Min.   : -4.70   Min.   :-4.37  
 1st Qu.: 6.30   1st Qu.:11.70   1st Qu.: 21.20   1st Qu.: 6.10  
 Median : 9.90   Median :17.30   Median : 36.90   Median :10.33  
 Mean   :11.39   Mean   :19.27   Mean   : 40.78   Mean   :10.73  
 3rd Qu.:15.20   3rd Qu.:25.30   3rd Qu.: 56.40   3rd Qu.:15.20  
 Max.   :32.60   Max.   :49.50   Max

In [38]:
# Fill missing meteorological readings with the mean of the same station,
# hour and month. Every pass follows the same recipe:
#   1. build a lookup of group means (NAs ignored),
#   2. left-join it back on the grouping keys (yields <col>.x and <col>.y),
#   3. keep the observed value where present, otherwise the group mean,
#   4. drop the two helper columns (the rebuilt column lands at the end).

# temperature
ld_aq_gm_combined_data_temperature <- summarize(
  group_by(ld_aq_gm_combined_data, stationId, hour, month),
  temperature = mean(temperature, na.rm = TRUE)
)
ld_aq_gm_combined_data <- ld_aq_gm_combined_data %>%
  left_join(ld_aq_gm_combined_data_temperature,
            by = c("stationId", "hour", "month")) %>%
  mutate(temperature = coalesce(temperature.x, temperature.y)) %>%
  select(-temperature.x, -temperature.y)

# pressure
ld_aq_gm_combined_data_pressure <- summarize(
  group_by(ld_aq_gm_combined_data, stationId, hour, month),
  pressure = mean(pressure, na.rm = TRUE)
)
ld_aq_gm_combined_data <- ld_aq_gm_combined_data %>%
  left_join(ld_aq_gm_combined_data_pressure,
            by = c("stationId", "hour", "month")) %>%
  mutate(pressure = coalesce(pressure.x, pressure.y)) %>%
  select(-pressure.x, -pressure.y)

# humidity
ld_aq_gm_combined_data_humidity <- summarize(
  group_by(ld_aq_gm_combined_data, stationId, hour, month),
  humidity = mean(humidity, na.rm = TRUE)
)
ld_aq_gm_combined_data <- ld_aq_gm_combined_data %>%
  left_join(ld_aq_gm_combined_data_humidity,
            by = c("stationId", "hour", "month")) %>%
  mutate(humidity = coalesce(humidity.x, humidity.y)) %>%
  select(-humidity.x, -humidity.y)

# wind speed
ld_aq_gm_combined_data_wind_speed.kph <- summarize(
  group_by(ld_aq_gm_combined_data, stationId, hour, month),
  wind_speed.kph = mean(wind_speed.kph, na.rm = TRUE)
)
ld_aq_gm_combined_data <- ld_aq_gm_combined_data %>%
  left_join(ld_aq_gm_combined_data_wind_speed.kph,
            by = c("stationId", "hour", "month")) %>%
  mutate(wind_speed.kph = coalesce(wind_speed.kph.x, wind_speed.kph.y)) %>%
  select(-wind_speed.kph.x, -wind_speed.kph.y)

In [39]:
# Fill missing PM2.5 values with group means, in two passes.

# Pass 1: mean per station, hour, month, weekend flag and holiday flag.
ld_aq_gm_combined_data_PM2.5 <- summarize(
  group_by(ld_aq_gm_combined_data, stationId, hour, month, weekend, holiday),
  PM2.5 = mean(PM2.5, na.rm = TRUE)
)
ld_aq_gm_combined_data <- ld_aq_gm_combined_data %>%
  left_join(ld_aq_gm_combined_data_PM2.5,
            by = c("stationId", "hour", "month", "weekend", "holiday")) %>%
  mutate(PM2.5 = coalesce(PM2.5.x, PM2.5.y)) %>%
  select(-PM2.5.x, -PM2.5.y)

# Pass 2: values still missing fall back to the coarser grouping without
# month. These means are computed on the table as filled by pass 1.
ld_aq_gm_combined_data_PM2.5 <- summarize(
  group_by(ld_aq_gm_combined_data, stationId, hour, weekend, holiday),
  PM2.5 = mean(PM2.5, na.rm = TRUE)
)
ld_aq_gm_combined_data <- ld_aq_gm_combined_data %>%
  left_join(ld_aq_gm_combined_data_PM2.5,
            by = c("stationId", "hour", "weekend", "holiday")) %>%
  mutate(PM2.5 = coalesce(PM2.5.x, PM2.5.y)) %>%
  select(-PM2.5.x, -PM2.5.y)

In [40]:
# Fill missing PM10 values with group means, in two passes.

# Pass 1: mean per station, hour, month, weekend flag and holiday flag.
ld_aq_gm_combined_data_PM10 <- summarize(
  group_by(ld_aq_gm_combined_data, stationId, hour, month, weekend, holiday),
  PM10 = mean(PM10, na.rm = TRUE)
)
ld_aq_gm_combined_data <- ld_aq_gm_combined_data %>%
  left_join(ld_aq_gm_combined_data_PM10,
            by = c("stationId", "hour", "month", "weekend", "holiday")) %>%
  mutate(PM10 = coalesce(PM10.x, PM10.y)) %>%
  select(-PM10.x, -PM10.y)

# Pass 2: values still missing fall back to the coarser grouping without
# month. These means are computed on the table as filled by pass 1.
ld_aq_gm_combined_data_PM10 <- summarize(
  group_by(ld_aq_gm_combined_data, stationId, hour, weekend, holiday),
  PM10 = mean(PM10, na.rm = TRUE)
)
ld_aq_gm_combined_data <- ld_aq_gm_combined_data %>%
  left_join(ld_aq_gm_combined_data_PM10,
            by = c("stationId", "hour", "weekend", "holiday")) %>%
  mutate(PM10 = coalesce(PM10.x, PM10.y)) %>%
  select(-PM10.x, -PM10.y)

In [41]:
# Fill missing O3 values with group means, in three passes. Each pass works
# on the table as filled by the previous one.

# Pass 1: mean per station, hour, month, weekend flag and holiday flag.
ld_aq_gm_combined_data_O3 <- summarize(
  group_by(ld_aq_gm_combined_data, stationId, hour, month, weekend, holiday),
  O3 = mean(O3, na.rm = TRUE)
)
ld_aq_gm_combined_data <- ld_aq_gm_combined_data %>%
  left_join(ld_aq_gm_combined_data_O3,
            by = c("stationId", "hour", "month", "weekend", "holiday")) %>%
  mutate(O3 = coalesce(O3.x, O3.y)) %>%
  select(-O3.x, -O3.y)

# Pass 2: same station, month ignored.
ld_aq_gm_combined_data_O3 <- summarize(
  group_by(ld_aq_gm_combined_data, stationId, hour, weekend, holiday),
  O3 = mean(O3, na.rm = TRUE)
)
ld_aq_gm_combined_data <- ld_aq_gm_combined_data %>%
  left_join(ld_aq_gm_combined_data_O3,
            by = c("stationId", "hour", "weekend", "holiday")) %>%
  mutate(O3 = coalesce(O3.x, O3.y)) %>%
  select(-O3.x, -O3.y)

# Pass 3: all stations pooled, per hour, month, weekend and holiday flag.
ld_aq_gm_combined_data_O3 <- summarize(
  group_by(ld_aq_gm_combined_data, hour, month, weekend, holiday),
  O3 = mean(O3, na.rm = TRUE)
)
ld_aq_gm_combined_data <- ld_aq_gm_combined_data %>%
  left_join(ld_aq_gm_combined_data_O3,
            by = c("hour", "month", "weekend", "holiday")) %>%
  mutate(O3 = coalesce(O3.x, O3.y)) %>%
  select(-O3.x, -O3.y)

In [42]:
# Remove the group-mean lookup tables created during imputation.
rm(list = c(
  "ld_aq_gm_combined_data_PM2.5",
  "ld_aq_gm_combined_data_PM10",
  "ld_aq_gm_combined_data_O3",
  "ld_aq_gm_combined_data_temperature",
  "ld_aq_gm_combined_data_pressure",
  "ld_aq_gm_combined_data_humidity",
  "ld_aq_gm_combined_data_wind_speed.kph"
))

In [43]:
# Inspect the table after imputation (imputed columns now sit at the end).
str(ld_aq_gm_combined_data)
summary(ld_aq_gm_combined_data)

'data.frame':	159252 obs. of  16 variables:
 $ stationId     : Factor w/ 24 levels "BL0","BX1","BX9",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ stationName   : Factor w/ 10 levels "london_grid_346",..: 6 6 6 6 6 6 6 6 6 6 ...
 $ utc_time      : POSIXct, format: "2017-03-18 01:00:00" "2018-05-10 06:00:00" ...
 $ wind_direction: num  238.6 301.7 206.2 28.6 337 ...
 $ hour          : int  1 6 19 20 17 10 0 23 11 23 ...
 $ month         : num  3 5 6 7 9 9 2 5 6 5 ...
 $ date          : Date, format: "2017-03-18" "2018-05-10" ...
 $ weekend       : logi  TRUE FALSE FALSE FALSE TRUE TRUE ...
 $ holiday       : num  0 0 0 0 0 0 0 0 0 0 ...
 $ temperature   : num  8.35 10.86 17.35 18.75 13.75 ...
 $ pressure      : num  1010 1008 1005 1014 1008 ...
 $ humidity      : num  79.6 68 65.8 61.8 64.7 ...
 $ wind_speed.kph: num  28.39 18.93 18.53 19.7 9.06 ...
 $ PM2.5         : num  3.9 1.5 9.9 10.5 8.1 11.9 5.7 5.8 8.1 19.7 ...
 $ PM10          : num  5.4 6.4 14.1 18.3 8.9 ...
 $ O3            : num  11.9 45 2

   stationId              stationName       utc_time                  
 CD9    :12258   london_grid_451:36750   Min.   :2017-01-01 08:00:00  
 BL0    :12250   london_grid_430:36749   1st Qu.:2017-05-08 21:00:00  
 GN0    :12250   london_grid_388:36746   Median :2017-09-13 12:00:00  
 GN3    :12250   london_grid_409:24508   Mean   :2017-09-14 00:34:44  
 GR4    :12250   london_grid_472:12250   3rd Qu.:2018-01-19 03:00:00  
 GR9    :12250   london_grid_408:12249   Max.   :2018-06-01 00:00:00  
 (Other):85744   (Other)        :    0                                
 wind_direction       hour           month             date           
 Min.   :  0.0   Min.   : 0.00   Min.   : 1.000   Min.   :2017-01-01  
 1st Qu.:147.1   1st Qu.: 5.00   1st Qu.: 3.000   1st Qu.:2017-05-08  
 Median :227.9   Median :11.00   Median : 5.000   Median :2017-09-13  
 Mean   :206.9   Mean   :11.49   Mean   : 5.521   Mean   :2017-09-13  
 3rd Qu.:271.9   3rd Qu.:17.00   3rd Qu.: 8.000   3rd Qu.:2018-01-19  
 Max. 

In [44]:
# Sanity check: list every row that still has a missing value in any column.
# An empty result means the combined data set has no NAs.
subset(ld_aq_gm_combined_data, !complete.cases(ld_aq_gm_combined_data))

stationId,stationName,utc_time,wind_direction,hour,month,date,weekend,holiday,temperature,pressure,humidity,wind_speed.kph,PM2.5,PM10,O3


In [45]:
# Recode the logical weekend flag as integer 0/1 so it can be used in modelling.
ld_aq_gm_combined_data[["weekend"]] <-
  as.integer(ld_aq_gm_combined_data[["weekend"]])

In [46]:
# Keep only the columns used downstream, in a fixed order: identifiers,
# pollutant readings, weather readings, then calendar features.
ld_aq_gm_combined_data <- select(
  ld_aq_gm_combined_data,
  stationName, utc_time, stationId,
  PM2.5, PM10, O3,
  temperature, pressure, humidity, wind_direction, wind_speed.kph,
  hour, month, date, weekend, holiday
)

In [47]:
# Retrieve the station coordinates and keep just the id, latitude and
# longitude, renamed to the column naming used in the combined data set.
ld_lat_long_file <- "Datasets/London_AirQuality_Stations.csv"
ld_lat_long_data <- read.csv(ld_lat_long_file, stringsAsFactors = FALSE) %>%
  select(stationId = station_id, latitude = Latitude, longitude = Longitude)

In [48]:
# Attach station coordinates; merge() keeps only rows whose stationId is
# present in both tables.
ld_aq_gm_combined_data <- merge(
  x = ld_aq_gm_combined_data,
  y = ld_lat_long_data,
  by = "stationId"
)

In [49]:
# Inspect the merged data: structure, covered time span, per-column summary.
str(ld_aq_gm_combined_data)
summarise(
  ld_aq_gm_combined_data,
  min_date = min(utc_time),
  max_date = max(utc_time)
)
summary(ld_aq_gm_combined_data)

'data.frame':	159252 obs. of  18 variables:
 $ stationId     : Factor w/ 24 levels "BL0","BX1","BX9",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ stationName   : Factor w/ 10 levels "london_grid_346",..: 6 6 6 6 6 6 6 6 6 6 ...
 $ utc_time      : POSIXct, format: "2017-03-18 01:00:00" "2018-05-10 06:00:00" ...
 $ PM2.5         : num  3.9 1.5 9.9 10.5 8.1 11.9 5.7 5.8 8.1 19.7 ...
 $ PM10          : num  5.4 6.4 14.1 18.3 8.9 ...
 $ O3            : num  11.9 45 20.7 44.9 48.8 ...
 $ temperature   : num  8.35 10.86 17.35 18.75 13.75 ...
 $ pressure      : num  1010 1008 1005 1014 1008 ...
 $ humidity      : num  79.6 68 65.8 61.8 64.7 ...
 $ wind_direction: num  238.6 301.7 206.2 28.6 337 ...
 $ wind_speed.kph: num  28.39 18.93 18.53 19.7 9.06 ...
 $ hour          : int  1 6 19 20 17 10 0 23 11 23 ...
 $ month         : num  3 5 6 7 9 9 2 5 6 5 ...
 $ date          : Date, format: "2017-03-18" "2018-05-10" ...
 $ weekend       : int  1 0 0 0 1 1 0 0 0 1 ...
 $ holiday       : num  0 0 0 0 0 0 0 0 0 0 

min_date,max_date
2017-01-01 08:00:00,2018-06-01


   stationId              stationName       utc_time                  
 CD9    :12258   london_grid_451:36750   Min.   :2017-01-01 08:00:00  
 BL0    :12250   london_grid_430:36749   1st Qu.:2017-05-08 21:00:00  
 GN0    :12250   london_grid_388:36746   Median :2017-09-13 12:00:00  
 GN3    :12250   london_grid_409:24508   Mean   :2017-09-14 00:34:44  
 GR4    :12250   london_grid_472:12250   3rd Qu.:2018-01-19 03:00:00  
 GR9    :12250   london_grid_408:12249   Max.   :2018-06-01 00:00:00  
 (Other):85744   (Other)        :    0                                
     PM2.5            PM10             O3          temperature   
 Min.   :-8.60   Min.   :-8.90   Min.   : -4.70   Min.   :-4.37  
 1st Qu.: 6.90   1st Qu.:12.30   1st Qu.: 24.88   1st Qu.: 6.11  
 Median :10.40   Median :17.60   Median : 39.16   Median :10.34  
 Mean   :11.42   Mean   :19.21   Mean   : 41.60   Mean   :10.73  
 3rd Qu.:14.69   3rd Qu.:24.50   3rd Qu.: 54.61   3rd Qu.:15.20  
 Max.   :32.60   Max.   :49.50   Max

### Model Training and validation

In [50]:
# Fix the RNG seed so the random split and resampling below are reproducible.
set.seed(seed = 2306)

In [51]:
# Random 80/20 partition of the rows into training and test sets.
sample_size <- floor(nrow(ld_aq_gm_combined_data) * 0.8)
# sample.int(n, size) draws the same indices as sample(seq_len(n), size).
train_index <- sample.int(nrow(ld_aq_gm_combined_data), size = sample_size)
train_ld <- ld_aq_gm_combined_data[train_index, ]
test_ld <- ld_aq_gm_combined_data[-train_index, ]

In [52]:
# Print the structure of the training partition, then the test partition.
invisible(lapply(list(train_ld, test_ld), str))

'data.frame':	127401 obs. of  18 variables:
 $ stationId     : Factor w/ 24 levels "BL0","BX1","BX9",..: 8 12 17 12 9 4 1 10 15 8 ...
 $ stationName   : Factor w/ 10 levels "london_grid_346",..: 8 10 5 10 8 4 6 8 4 8 ...
 $ utc_time      : POSIXct, format: "2018-05-01 08:00:00" "2018-03-18 22:00:00" ...
 $ PM2.5         : num  8.7 13.8 27 4.4 13.4 ...
 $ PM10          : num  13.4 18.9 23.8 11.1 23.8 ...
 $ O3            : num  56.3 11.3 73.6 31.9 45.4 ...
 $ temperature   : num  7.99 0.18 -0.19 19.49 13.6 ...
 $ pressure      : num  1006 1006 1027 1008 1011 ...
 $ humidity      : num  53 72.3 93 78.1 75 ...
 $ wind_direction: num  259.1 63.7 131 163.6 240.7 ...
 $ wind_speed.kph: num  10.96 23.9 7.33 9.66 7.17 ...
 $ hour          : int  8 22 15 14 5 19 20 20 15 22 ...
 $ month         : num  5 3 1 8 8 7 8 3 6 3 ...
 $ date          : Date, format: "2018-05-01" "2018-03-18" ...
 $ weekend       : int  0 1 0 0 0 1 0 0 0 0 ...
 $ holiday       : num  0 0 0 0 0 0 0 0 0 0 ...
 $ latitude  

In [55]:
# Candidate predictor-subset sizes passed to rfe() below: 1, 3, 5, 7 and 10.
subsets <- c(seq(1, 7, by = 2), 10)

In [56]:
# Recursive feature elimination (RFE) for PM2.5 with linear regression.

# Predictors (location, calendar and weather features) and the response.
PM2.5_x <- train_ld[c(
  "latitude", "longitude", "hour", "month", "weekend", "holiday",
  "temperature", "humidity", "wind_direction", "wind_speed.kph"
)]
PM2.5_y <- train_ld[["PM2.5"]]

# Linear-model RFE scored by 10-fold cross-validation repeated 5 times.
PM2.5_ctrl <- rfeControl(
  functions = lmFuncs,
  method = "repeatedcv",
  number = 10,
  repeats = 5
)

# Evaluate every candidate subset size listed in `subsets`.
PM2.5_lmProfile <- rfe(
  x = PM2.5_x,
  y = PM2.5_y,
  sizes = subsets,
  rfeControl = PM2.5_ctrl
)

# Resampling profile, then the coefficients of the final fitted model.
PM2.5_lmProfile
summary(PM2.5_lmProfile$fit)


Recursive feature selection

Outer resampling method: Cross-Validated (10 fold, repeated 5 times) 

Resampling performance over subset size:

 Variables  RMSE Rsquared   MAE  RMSESD RsquaredSD   MAESD Selected
         1 6.375  0.01305 4.922 0.03887   0.002151 0.02531         
         3 6.152  0.08106 4.737 0.06972   0.017007 0.05408         
         5 6.040  0.11410 4.636 0.04837   0.009238 0.03940         
         7 6.012  0.12224 4.605 0.03904   0.004698 0.02500         
        10 5.896  0.15579 4.532 0.03797   0.004372 0.02561        *

The top 5 variables (out of 10):
   longitude, holiday, wind_speed.kph, month, latitude



Call:
lm(formula = y ~ ., data = tmp)

Residuals:
     Min       1Q   Median       3Q      Max 
-20.6989  -3.9880  -0.9017   2.9854  25.8090 

Coefficients:
                 Estimate Std. Error  t value Pr(>|t|)    
(Intercept)    17.8137054 21.1801235    0.841    0.400    
longitude      -5.9756767  0.1356865  -44.040  < 2e-16 ***
holiday        -2.7324087  0.1107615  -24.669  < 2e-16 ***
wind_speed.kph -0.2641125  0.0024558 -107.548  < 2e-16 ***
month          -0.1541984  0.0054097  -28.504  < 2e-16 ***
latitude       -0.0095138  0.4112156   -0.023    0.982    
hour            0.1235363  0.0024732   49.950  < 2e-16 ***
temperature    -0.0399943  0.0031924  -12.528  < 2e-16 ***
weekend        -0.0153582  0.0367894   -0.417    0.676    
wind_direction -0.0133276  0.0001875  -71.086  < 2e-16 ***
humidity        0.0065284  0.0013495    4.838 1.32e-06 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 5.896 on 127390 degrees of freedom
Multip

In [57]:
# Score the final PM2.5 model on the held-out rows and report its metrics.
test_ld$PM2.5_pred <- predict(PM2.5_lmProfile$fit, newdata = test_ld)
test_ld %>% metrics(truth = PM2.5, estimate = PM2.5_pred)

rmse,rsq
5.891424,0.1550984


In [59]:
# Recursive feature elimination for PM10 with linear regression. The PM2.5
# reading is included as a predictor alongside the location, calendar and
# weather features.
PM10_x <- train_ld[c(
  "PM2.5", "latitude", "longitude", "hour", "month", "weekend", "holiday",
  "temperature", "humidity", "wind_direction", "wind_speed.kph"
)]
PM10_y <- train_ld[["PM10"]]

# Linear-model RFE scored by 10-fold cross-validation repeated 5 times.
PM10_ctrl <- rfeControl(
  functions = lmFuncs,
  method = "repeatedcv",
  number = 10,
  repeats = 5
)
PM10_lmProfile <- rfe(
  x = PM10_x,
  y = PM10_y,
  sizes = subsets,
  rfeControl = PM10_ctrl
)

# Resampling profile, then the coefficients of the final fitted model.
PM10_lmProfile
summary(PM10_lmProfile$fit)


Recursive feature selection

Outer resampling method: Cross-Validated (10 fold, repeated 5 times) 

Resampling performance over subset size:

 Variables  RMSE Rsquared   MAE  RMSESD RsquaredSD   MAESD Selected
         1 9.538  0.01182 7.518 0.05174   0.001981 0.03579         
         3 9.512  0.01713 7.500 0.05223   0.002379 0.03716         
         5 6.904  0.48224 4.845 0.05849   0.006102 0.03687         
         7 6.890  0.48429 4.849 0.05892   0.006189 0.03750         
        10 6.832  0.49297 4.829 0.05681   0.006082 0.03831        *
        11 6.832  0.49296 4.829 0.05684   0.006090 0.03833         

The top 5 variables (out of 10):
   latitude, weekend, holiday, PM2.5, longitude



Call:
lm(formula = y ~ ., data = tmp)

Residuals:
    Min      1Q  Median      3Q     Max 
-32.892  -3.944  -1.046   2.680  38.493 

Coefficients:
                 Estimate Std. Error t value Pr(>|t|)    
(Intercept)     1.468e+03  2.451e+01  59.890  < 2e-16 ***
latitude       -2.828e+01  4.760e-01 -59.421  < 2e-16 ***
weekend        -1.579e+00  4.262e-02 -37.045  < 2e-16 ***
holiday        -1.555e+00  1.286e-01 -12.091  < 2e-16 ***
PM2.5           9.653e-01  3.246e-03 297.349  < 2e-16 ***
longitude      -1.962e-01  1.584e-01  -1.239    0.215    
wind_speed.kph -6.297e-02  2.963e-03 -21.250  < 2e-16 ***
hour            5.346e-02  2.837e-03  18.846  < 2e-16 ***
month          -3.762e-02  6.267e-03  -6.003 1.94e-09 ***
temperature    -2.805e-02  3.487e-03  -8.045 8.72e-16 ***
wind_direction -9.367e-03  2.215e-04 -42.294  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 6.832 on 127390 degrees of freedom
Multiple R-squared:  0.493,	

In [60]:
# Score the final PM10 model on the held-out rows and report its metrics.
test_ld$PM10_pred <- predict(PM10_lmProfile$fit, newdata = test_ld)
test_ld %>% metrics(truth = PM10, estimate = PM10_pred)

rmse,rsq
6.889495,0.4911601


In [62]:
# Recursive feature elimination for O3 with linear regression. Both PM2.5 and
# PM10 readings are included as predictors alongside the location, calendar
# and weather features.
O3_x <- train_ld[c(
  "PM2.5", "PM10", "latitude", "longitude", "hour", "month", "weekend",
  "holiday", "temperature", "humidity", "wind_direction", "wind_speed.kph"
)]
O3_y <- train_ld[["O3"]]

# Linear-model RFE scored by 10-fold cross-validation repeated 5 times.
O3_ctrl <- rfeControl(
  functions = lmFuncs,
  method = "repeatedcv",
  number = 10,
  repeats = 5
)
O3_lmProfile <- rfe(
  x = O3_x,
  y = O3_y,
  sizes = subsets,
  rfeControl = O3_ctrl
)

# Resampling profile, then the coefficients of the final fitted model.
O3_lmProfile
summary(O3_lmProfile$fit)


Recursive feature selection

Outer resampling method: Cross-Validated (10 fold, repeated 5 times) 

Resampling performance over subset size:

 Variables  RMSE Rsquared   MAE RMSESD RsquaredSD   MAESD Selected
         1 21.35  0.09228 17.01 0.1106   0.004581 0.08556         
         3 20.88  0.13196 16.67 0.1221   0.005222 0.09488         
         5 20.35  0.17512 16.12 0.1208   0.005655 0.10099         
         7 18.40  0.32558 14.46 0.1161   0.005943 0.09315         
        10 18.08  0.34925 14.15 0.1142   0.005771 0.09518         
        12 18.00  0.35465 14.08 0.1145   0.005643 0.09194        *

The top 5 variables (out of 12):
   latitude, longitude, holiday, weekend, temperature



Call:
lm(formula = y ~ ., data = tmp)

Residuals:
    Min      1Q  Median      3Q     Max 
-63.877 -12.419  -2.779  10.167  84.177 

Coefficients:
                 Estimate Std. Error  t value Pr(>|t|)    
(Intercept)    -8.550e+03  6.557e+01 -130.402   <2e-16 ***
latitude        1.667e+02  1.273e+00  130.985   <2e-16 ***
longitude      -2.882e+01  4.174e-01  -69.045   <2e-16 ***
holiday        -8.209e+00  3.392e-01  -24.203   <2e-16 ***
weekend        -5.881e+00  1.129e-01  -52.083   <2e-16 ***
temperature    -7.613e-01  9.755e-03  -78.039   <2e-16 ***
hour            6.447e-01  7.634e-03   84.444   <2e-16 ***
PM10            5.973e-01  7.382e-03   80.918   <2e-16 ***
wind_speed.kph -4.317e-01  7.844e-03  -55.031   <2e-16 ***
PM2.5           2.914e-01  1.113e-02   26.178   <2e-16 ***
month           1.432e-01  1.657e-02    8.639   <2e-16 ***
humidity       -5.378e-02  4.121e-03  -13.053   <2e-16 ***
wind_direction  1.777e-02  5.877e-04   30.236   <2e-16 ***
---
Signif. codes:  0 '***

In [63]:
# Score the final O3 model on the held-out rows and report its metrics.
test_ld$O3_pred <- predict(O3_lmProfile$fit, newdata = test_ld)
test_ld %>% metrics(truth = O3, estimate = O3_pred)

rmse,rsq
17.98576,0.3549936


In [65]:
# Drop the train/test partitions; they are not needed past this point.
rm(train_ld, test_ld)

### Building the next 2 days dataset and predicting PM2.5, PM10 and O3 values

In [66]:
# Run the forecasting cells with the session time zone set to GMT so that
# date-time handling is consistent.
Sys.setenv(TZ = "GMT")
# First day of the forecast window.
tomorrow <- Sys.Date() + 1

In [67]:
# 48 hourly timestamps starting at `tomorrow`, expressed in UTC.
ld_time <- with_tz(
  seq(from = as.POSIXct(tomorrow), by = "1 hour", length.out = 48),
  tzone = "UTC"
)
# `id` is the zero-based hour offset; it is used later to build the
# submission key.
ld_future_data <- data.frame(utc_time = ld_time, id = seq_along(ld_time) - 1)
# The two frames share no column, so merge() returns their Cartesian product:
# every station gets all 48 hours.
ld_future_data <- merge(
  ld_future_data,
  data.frame(stationId = unique(ld_aq_gm_combined_data$stationId))
)

In [68]:
# Calendar features for the forecast rows, derived from the UTC timestamp.
ld_future_data[["hour"]] <- lubridate::hour(ld_future_data[["utc_time"]])
ld_future_data[["month"]] <- lubridate::month(ld_future_data[["utc_time"]])
ld_future_data[["date"]] <- lubridate::date(ld_future_data[["utc_time"]])
ld_future_data[["weekend"]] <- chron::is.weekend(ld_future_data[["date"]])

In [69]:
# Numeric flags for modelling: weekend as integer 0/1, and holiday as 1 when
# the date appears in `london_holidays`, otherwise 0.
ld_future_data[["weekend"]] <- as.integer(ld_future_data[["weekend"]])
ld_future_data[["holiday"]] <-
  as.numeric(ld_future_data[["date"]] %in% as.Date(london_holidays))

In [74]:
# Attach, per station, first the closest weather grid and then the station
# coordinates.
ld_future_data <- ld_future_data %>%
  merge(ld_closest_stations, by = "stationId") %>%
  merge(ld_lat_long_data, by = "stationId")

In [77]:
# Build the forecast endpoint URL: <base>/<today's date>-<hour>/<key>.
# NOTE(review): "12" is presumably the hour the forecast was issued - confirm
# against the API documentation.
hour <- "12"
ld_future_weather_url <- paste0(
  "http://kdd.caiyunapp.com/competition/forecast/ld/",
  format(tomorrow - 1), "-", hour,
  "/2k0d1d8"
)
ld_future_weather_url

In [78]:
# Download the forecast as CSV text.
ld_future_weather_file <- getURL(ld_future_weather_url, ssl.verifyhost=FALSE, ssl.verifypeer=FALSE)
# Parse the payload with `text =` so read.csv() creates and closes its own
# connection. Passing textConnection(...) hands over an already-open
# connection that read.csv() does not close, leaking one per run.
ld_future_weather_data <- read.csv(text = ld_future_weather_file, header = TRUE)

In [80]:
# Parse the forecast timestamps, drop the unused `id` and `weather` columns,
# and rename the remaining columns (by position) to the names used in the
# combined data set.
ld_future_weather_data[["forecast_time"]] <-
  anytime(ld_future_weather_data[["forecast_time"]])
ld_future_weather_data <- select(ld_future_weather_data, -id, -weather)
names(ld_future_weather_data) <- c(
  "stationName", "utc_time", "temperature", "pressure", "humidity",
  "wind_speed.kph", "wind_direction"
)

In [81]:
# Inspect the forecast data: structure, per-column summary, covered time span.
str(ld_future_weather_data)
summary(ld_future_weather_data)
summarise(
  ld_future_weather_data,
  min_date = min(utc_time),
  max_date = max(utc_time)
)

'data.frame':	41328 obs. of  7 variables:
 $ stationName   : Factor w/ 861 levels "london_grid_000",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ utc_time      : POSIXct, format: "2018-05-31 13:00:00" "2018-05-31 14:00:00" ...
 $ temperature   : num  14.7 14.9 15 15.1 15.1 ...
 $ pressure      : num  1014 1014 1014 1014 1014 ...
 $ humidity      : num  95 94 93 93 92 92 92 92 92 91 ...
 $ wind_speed.kph: num  10.33 10.74 10.98 9.84 7.67 ...
 $ wind_direction: num  127 142 153 159 165 ...


          stationName       utc_time                    temperature   
 london_grid_000:   48   Min.   :2018-05-31 13:00:00   Min.   :11.34  
 london_grid_001:   48   1st Qu.:2018-06-01 00:45:00   1st Qu.:14.32  
 london_grid_002:   48   Median :2018-06-01 12:30:00   Median :15.76  
 london_grid_003:   48   Mean   :2018-06-01 12:30:00   Mean   :16.53  
 london_grid_004:   48   3rd Qu.:2018-06-02 00:15:00   3rd Qu.:18.69  
 london_grid_005:   48   Max.   :2018-06-02 12:00:00   Max.   :25.77  
 (Other)        :41040                                                
    pressure       humidity      wind_speed.kph  wind_direction  
 Min.   : 997   Min.   : 46.00   Min.   : 0.05   Min.   :  0.01  
 1st Qu.:1007   1st Qu.: 86.00   1st Qu.: 6.21   1st Qu.:122.96  
 Median :1011   Median : 91.00   Median :12.23   Median :225.96  
 Mean   :1011   Mean   : 89.34   Mean   :12.01   Mean   :191.55  
 3rd Qu.:1016   3rd Qu.: 95.00   3rd Qu.:16.09   3rd Qu.:248.62  
 Max.   :1021   Max.   :100.00   Max

min_date,max_date
2018-05-31 13:00:00,2018-06-02 12:00:00


In [82]:
# Attach the forecast weather to each station/hour row via the station's
# closest grid name and the timestamp. Rows without a matching forecast keep
# NA weather values (left join).
# stationName is converted to character on both sides first: the two tables
# carry different factor level sets, and relying on the join to coerce them
# implicitly raises a warning.
ld_future_data <- ld_future_data %>%
  mutate(stationName = as.character(stationName)) %>%
  left_join(
    ld_future_weather_data %>%
      mutate(stationName = as.character(stationName)),
    by = c("stationName", "utc_time")
  )

"Column `stationName` joining factors with different levels, coercing to character vector"

In [84]:
# Fill gaps in the forecast weather. For each variable in turn:
#   1. compute its mean per (stationId, hour), ignoring NAs;
#   2. join that mean back onto the rows;
#   3. keep the original value where present, otherwise the group mean.
# Each filled column ends up at the right-hand side of the data frame.

# pressure
ld_future_data_pressure <- ld_future_data %>%
  group_by(stationId, hour) %>%
  summarise(pressure = mean(pressure, na.rm = TRUE))
ld_future_data <- ld_future_data %>%
  left_join(ld_future_data_pressure, by = c("stationId", "hour")) %>%
  mutate(pressure = coalesce(pressure.x, pressure.y)) %>%
  select(-c(pressure.x, pressure.y))

# temperature
ld_future_data_temperature <- ld_future_data %>%
  group_by(stationId, hour) %>%
  summarise(temperature = mean(temperature, na.rm = TRUE))
ld_future_data <- ld_future_data %>%
  left_join(ld_future_data_temperature, by = c("stationId", "hour")) %>%
  mutate(temperature = coalesce(temperature.x, temperature.y)) %>%
  select(-c(temperature.x, temperature.y))

# humidity
ld_future_data_humidity <- ld_future_data %>%
  group_by(stationId, hour) %>%
  summarise(humidity = mean(humidity, na.rm = TRUE))
ld_future_data <- ld_future_data %>%
  left_join(ld_future_data_humidity, by = c("stationId", "hour")) %>%
  mutate(humidity = coalesce(humidity.x, humidity.y)) %>%
  select(-c(humidity.x, humidity.y))

# wind speed
ld_future_data_wind_speed.kph <- ld_future_data %>%
  group_by(stationId, hour) %>%
  summarise(wind_speed.kph = mean(wind_speed.kph, na.rm = TRUE))
ld_future_data <- ld_future_data %>%
  left_join(ld_future_data_wind_speed.kph, by = c("stationId", "hour")) %>%
  mutate(wind_speed.kph = coalesce(wind_speed.kph.x, wind_speed.kph.y)) %>%
  select(-c(wind_speed.kph.x, wind_speed.kph.y))

# wind direction
# NOTE(review): this is a plain arithmetic mean of a value that looks like
# compass degrees, so averages across the 0/360 wrap may be misleading -
# confirm this is acceptable.
ld_future_data_wind_direction <- ld_future_data %>%
  group_by(stationId, hour) %>%
  summarise(wind_direction = mean(wind_direction, na.rm = TRUE))
ld_future_data <- ld_future_data %>%
  left_join(ld_future_data_wind_direction, by = c("stationId", "hour")) %>%
  mutate(wind_direction = coalesce(wind_direction.x, wind_direction.y)) %>%
  select(-c(wind_direction.x, wind_direction.y))

In [86]:
# predicting PM2.5, PM10 and O3 values
# Each call adds a prediction column to ld_future_data before the next call
# receives the data frame.
# NOTE(review): the *_ld_model objects are not defined in this part of the
# file. The feature-selection cells train PM10 with PM2.5 as a predictor and
# O3 with both PM2.5 and PM10; if these models use the same predictors, the
# PM2.5 -> PM10 -> O3 order below is required - confirm before reordering.
ld_future_data$PM2.5 <- predict(PM2.5_ld_model, ld_future_data)
ld_future_data$PM10 <- predict(PM10_ld_model, ld_future_data)
ld_future_data$O3 <- predict(O3_ld_model, ld_future_data)

In [87]:
# Submission key for each row: "<stationId>#<id>".
ld_future_data[["test_id"]] <-
  paste0(ld_future_data[["stationId"]], "#", ld_future_data[["id"]])

In [88]:
# Inspect the forecast data set: structure, per-column summary, time span.
str(ld_future_data)
summary(ld_future_data)
summarise(
  ld_future_data,
  min_date = min(utc_time),
  max_date = max(utc_time)
)

'data.frame':	624 obs. of  22 variables:
 $ stationId     : Factor w/ 24 levels "BL0","BX1","BX9",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ utc_time      : POSIXct, format: "2018-06-01 00:00:00" "2018-06-01 01:00:00" ...
 $ id            : num  0 1 2 3 4 5 6 7 8 9 ...
 $ hour          : int  0 1 2 3 4 5 6 7 8 9 ...
 $ month         : num  6 6 6 6 6 6 6 6 6 6 ...
 $ date          : Date, format: "2018-06-01" "2018-06-01" ...
 $ weekend       : int  0 0 0 0 0 0 0 0 0 0 ...
 $ holiday       : num  0 0 0 0 0 0 0 0 0 0 ...
 $ x             : int  3 3 3 3 3 3 3 3 3 3 ...
 $ stationName   : chr  "london_grid_409" "london_grid_409" "london_grid_409" "london_grid_409" ...
 $ distance      : num  3060 3060 3060 3060 3060 ...
 $ latitude      : num  51.5 51.5 51.5 51.5 51.5 ...
 $ longitude     : num  -0.126 -0.126 -0.126 -0.126 -0.126 ...
 $ pressure      : num  1010 1010 1010 1011 1012 ...
 $ temperature   : num  23.7 23 22 20.9 19.8 ...
 $ humidity      : num  55 58 63 69 74 77 79 81 82 83 ...
 $ wind_sp

   stationId      utc_time                         id             hour      
 BL0    : 48   Min.   :2018-06-01 00:00:00   Min.   : 0.00   Min.   : 0.00  
 CD1    : 48   1st Qu.:2018-06-01 11:45:00   1st Qu.:11.75   1st Qu.: 5.75  
 CD9    : 48   Median :2018-06-01 23:30:00   Median :23.50   Median :11.50  
 GN0    : 48   Mean   :2018-06-01 23:30:00   Mean   :23.50   Mean   :11.50  
 GN3    : 48   3rd Qu.:2018-06-02 11:15:00   3rd Qu.:35.25   3rd Qu.:17.25  
 GR4    : 48   Max.   :2018-06-02 23:00:00   Max.   :47.00   Max.   :23.00  
 (Other):336                                                                
     month        date               weekend       holiday        x        
 Min.   :6   Min.   :2018-06-01   Min.   :0.0   Min.   :0   Min.   : 3.00  
 1st Qu.:6   1st Qu.:2018-06-01   1st Qu.:0.0   1st Qu.:0   1st Qu.: 9.00  
 Median :6   Median :2018-06-01   Median :0.5   Median :0   Median :12.00  
 Mean   :6   Mean   :2018-06-01   Mean   :0.5   Mean   :0   Mean   :13.46  
 3rd

min_date,max_date
2018-06-01,2018-06-02 23:00:00


In [105]:
# Preview the first 20 forecast rows.
head(x = ld_future_data, n = 20)

stationId,utc_time,id,hour,month,date,weekend,holiday,latitude,longitude,PM2.5,PM10,O3,test_id
BL0,2018-05-30 00:00:00,0,0,5,2018-05-30,0,0,51.52229,-0.125848,10.87809,17.87551,43.5395,BL0#0
BL0,2018-05-30 01:00:00,1,1,5,2018-05-30,0,0,51.52229,-0.125848,10.96902,18.01037,44.19121,BL0#1
BL0,2018-05-30 02:00:00,2,2,5,2018-05-30,0,0,51.52229,-0.125848,11.05995,18.14523,44.84292,BL0#2
BL0,2018-05-30 03:00:00,3,3,5,2018-05-30,0,0,51.52229,-0.125848,11.15087,18.2801,45.49463,BL0#3
BL0,2018-05-30 04:00:00,4,4,5,2018-05-30,0,0,51.52229,-0.125848,11.2418,18.41496,46.14634,BL0#4
BL0,2018-05-30 05:00:00,5,5,5,2018-05-30,0,0,51.52229,-0.125848,11.33272,18.54982,46.79805,BL0#5
BL0,2018-05-30 06:00:00,6,6,5,2018-05-30,0,0,51.52229,-0.125848,11.42365,18.68468,47.44976,BL0#6
BL0,2018-05-30 07:00:00,7,7,5,2018-05-30,0,0,51.52229,-0.125848,11.51457,18.81954,48.10147,BL0#7
BL0,2018-05-30 08:00:00,8,8,5,2018-05-30,0,0,51.52229,-0.125848,11.6055,18.95441,48.75318,BL0#8
BL0,2018-05-30 09:00:00,9,9,5,2018-05-30,0,0,51.52229,-0.125848,11.69643,19.08927,49.40489,BL0#9


In [None]:
# Write the submission file: the test_id key plus the three predicted
# pollutant columns, without row names.
# The file name is built with paste0(). paste() with its default " " separator
# produced "ld_submission <date> .csv", with spaces inside the name and before
# the extension.
write.csv(
  ld_future_data[, c("test_id", "PM2.5", "PM10", "O3")],
  file = paste0("ld_submission_", Sys.Date(), ".csv"),
  row.names = FALSE
)

In [None]:
# Remove the TZ environment variable that was set for the forecasting cells.
# NOTE(review): this unsets TZ rather than restoring any value it had before.
Sys.unsetenv(x = "TZ")

In [89]:
# Remove the London working objects from the workspace.
rm(list = c(
  "ld_future_data",
  "ld_lat_long_data",
  "ld_aq_gm_combined_data",
  "ld_future_data_humidity",
  "ld_future_data_temperature",
  "ld_future_data_pressure",
  "ld_future_data_wind_direction",
  "ld_future_data_wind_speed.kph",
  "ld_future_weather_data",
  "ld_closest_stations"
))