### Import and transformations of API data

In [1]:
# Reset the interactive session before the notebook runs.
if (!is.null(dev.list())) {
  dev.off() # Clear plots
}
rm(list = ls()) # Clear objects from memory
cat("\014") # Clear console
# writeClipboard(as.character(x)) # copy data frame to clipboard



In [2]:
# Packages used by the notebook. Every package is attached with library() so
# that a missing dependency stops the run immediately; require() would only
# warn and return FALSE, deferring the failure to the first call that needs it.
library(RCurl)
library(sqldf)
library(digest)
library(dplyr)
library(anytime)
library(geosphere)
library(lubridate)
library(chron)
library(caret)
library(rattle)
library(yardstick)

"package 'RCurl' was built under R version 3.4.4"Loading required package: bitops
Loading required package: gsubfn
Loading required package: proto
Could not load tcltk.  Will use slower R code instead.
Loading required package: RSQLite
"package 'dplyr' was built under R version 3.4.3"
Attaching package: 'dplyr'

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

"package 'lubridate' was built under R version 3.4.3"
Attaching package: 'lubridate'

The following object is masked from 'package:base':

    date

"package 'chron' was built under R version 3.4.3"
Attaching package: 'chron'

The following objects are masked from 'package:lubridate':

    days, hours, minutes, seconds, years

Loading required package: caret
"package 'caret' was built under R version 3.4.4"Loading required package: lattice
"package 'lattice' was built under R version 3.4.3"Loading required packa

In [3]:
# Point the session at the project directory so relative file paths resolve.
setwd(dir = "C:/Users/vanethi/Documents/GitHub/DS420_Factoria")

In [4]:
# Start and end of the date window; both values are pasted into the API URLs.
startDate <- "2017-12-31-0"
endDate <- "2018-06-01-0"

In [5]:
# Pull data for London.
# NOTE(review): TLS host and peer verification are switched off in both
# requests, exactly as before. That leaves the download open to tampering and
# should be re-enabled if the endpoint's certificate validates.

# Acquire air quality data.
ld_aq_url <- paste0(
  "https://biendata.com/competition/airquality/ld/",
  startDate, "/", endDate, "/2k0d1d8"
)
ld_aq_file <- getURL(ld_aq_url, ssl.verifyhost = FALSE, ssl.verifypeer = FALSE)
# read.csv(text = ...) opens and closes its own text connection, so no
# connection is left open as happened with a bare textConnection().
ld_aq_data <- read.csv(text = ld_aq_file, header = TRUE)

# Acquire API grid meteorology data.
ld_gm_url <- paste0(
  "https://biendata.com/competition/meteorology/ld_grid/",
  startDate, "/", endDate, "/2k0d1d8"
)
ld_gm_file <- getURL(ld_gm_url, ssl.verifyhost = FALSE, ssl.verifypeer = FALSE)
ld_gm_data <- read.csv(text = ld_gm_file, header = TRUE)

In [6]:
# Names of all data frames currently held in the global environment.
df.list <- local({
  is_df <- unlist(eapply(.GlobalEnv, is.data.frame))
  names(which(is_df))
})
df.list

In [7]:
# Parse the time columns into date-time values with anytime().
ld_gm_data[["time"]] <- anytime(ld_gm_data[["time"]])
ld_aq_data[["time"]] <- anytime(ld_aq_data[["time"]])

In [8]:
# Print the structure of every data frame named in df.list.
# Iterating over the names directly (instead of 1:length(df.list)) makes the
# loop a no-op when df.list is empty rather than failing on index 1 and 0.
for (df_name in df.list) {
  print(df_name)
  # str() prints its own report and returns NULL invisibly; wrapping it in
  # print() only added a stray "NULL" line after each data frame.
  str(get(df_name))
}

[1] "ld_gm_data"
'data.frame':	1186361 obs. of  9 variables:
 $ id            : int  2000096 2000097 2000098 2000099 2000100 2000101 2000102 2000103 2000104 2000105 ...
 $ station_id    : Factor w/ 861 levels "london_grid_000",..: 1 2 3 4 5 6 7 8 9 10 ...
 $ time          : POSIXct, format: "2018-03-31 07:00:00" "2018-03-31 07:00:00" ...
 $ weather       : Factor w/ 8 levels "CLEAR_DAY","CLEAR_NIGHT",..: 3 3 3 6 6 6 6 6 6 6 ...
 $ temperature   : num  6.14 5.43 4.73 4.25 3.99 3.72 3.75 3.78 3.86 3.98 ...
 $ pressure      : num  995 994 993 990 986 ...
 $ humidity      : num  85 88 91 93 95 97 96 96 96 96 ...
 $ wind_direction: num  304 303 302 303 306 ...
 $ wind_speed    : num  22 18.9 15.8 14.1 13.6 ...
NULL
[1] "ld_aq_data"
'data.frame':	26616 obs. of  9 variables:
 $ id                : int  2941506 2941507 2941508 2941509 2941510 2941511 2941512 2941513 2941514 2941515 ...
 $ station_id        : Factor w/ 19 levels "BL0","BX1","BX9",..: 4 1 10 15 12 9 11 14 8 13 ...
 $ time       

In [9]:
# Report the earliest and latest timestamp of each dataset.
print("ld_gm_data")
summarize(ld_gm_data, min_date = min(time), max_date = max(time))
print("ld_aq_data")
summarize(ld_aq_data, min_date = min(time), max_date = max(time))

[1] "ld_gm_data"


min_date,max_date
2018-03-31 07:00:00,2018-05-28 22:00:00


[1] "ld_aq_data"


min_date,max_date
2018-03-31 07:00:00,2018-05-28 20:00:00


In [10]:
# Drop the columns that are not used downstream.
ld_aq_data <- select(ld_aq_data, -id)
ld_gm_data <- select(ld_gm_data, -id, -weather)

In [11]:
# London lookup of the closest grid for each station.
ld_closest_stations <- read.csv(file = "SL_london_closest_stations.csv")

In [12]:
# Rename columns positionally so the three tables share the same key names.
names(ld_closest_stations) <- c("x", "stationId", "stationName", "distance")
names(ld_aq_data) <- c(
  "stationId", "utc_time", "PM2.5", "PM10", "NO2", "CO", "O3", "SO2"
)
names(ld_gm_data) <- c(
  "stationName", "utc_time", "temperature", "pressure", "humidity",
  "wind_direction", "wind_speed.kph"
)

In [13]:
# Inspect the renamed tables and preview the station-to-grid lookup.
utils::str(ld_closest_stations)
utils::str(ld_gm_data)
utils::str(ld_aq_data)
utils::head(ld_closest_stations)

'data.frame':	24 obs. of  4 variables:
 $ x          : int  1 2 3 4 5 6 7 8 9 10 ...
 $ stationId  : Factor w/ 24 levels "BL0","BX1","BX9",..: 3 2 1 5 4 7 8 6 10 12 ...
 $ stationName: Factor w/ 10 levels "london_grid_346",..: 10 10 6 6 4 6 6 5 8 8 ...
 $ distance   : num  3929 3929 3060 3683 5212 ...
'data.frame':	1186361 obs. of  7 variables:
 $ stationName   : Factor w/ 861 levels "london_grid_000",..: 1 2 3 4 5 6 7 8 9 10 ...
 $ utc_time      : POSIXct, format: "2018-03-31 07:00:00" "2018-03-31 07:00:00" ...
 $ temperature   : num  6.14 5.43 4.73 4.25 3.99 3.72 3.75 3.78 3.86 3.98 ...
 $ pressure      : num  995 994 993 990 986 ...
 $ humidity      : num  85 88 91 93 95 97 96 96 96 96 ...
 $ wind_direction: num  304 303 302 303 306 ...
 $ wind_speed.kph: num  22 18.9 15.8 14.1 13.6 ...
'data.frame':	26616 obs. of  8 variables:
 $ stationId: Factor w/ 19 levels "BL0","BX1","BX9",..: 4 1 10 15 12 9 11 14 8 13 ...
 $ utc_time : POSIXct, format: "2018-03-31 07:00:00" "2018-03-31 07:00:

x,stationId,stationName,distance
1,BX9,london_grid_472,3929.17
2,BX1,london_grid_472,3929.17
3,BL0,london_grid_409,3059.76
4,CD9,london_grid_409,3682.914
5,CD1,london_grid_388,5211.633
6,CT2,london_grid_409,1646.945


In [14]:
# Attach the closest grid to every air quality row (inner join on stationId).
ld_aq_map <- merge(x = ld_aq_data, y = ld_closest_stations, by = "stationId")

In [15]:
# Join air quality rows to the meteorology of their grid at the same hour.
ld_aq_gm_data <- merge(
  x = ld_aq_map,
  y = ld_gm_data,
  by = c("stationName", "utc_time")
)

In [16]:
# Inspect the merged table and its time span.
utils::str(ld_aq_gm_data)
utils::head(ld_aq_gm_data)
summarize(ld_aq_gm_data, min_date = min(utc_time), max_date = max(utc_time))

'data.frame':	26064 obs. of  16 variables:
 $ stationName   : Factor w/ 10 levels "london_grid_346",..: 2 2 2 2 2 2 2 2 2 2 ...
 $ utc_time      : POSIXct, format: "2018-03-31 07:00:00" "2018-03-31 08:00:00" ...
 $ stationId     : Factor w/ 19 levels "BL0","BX1","BX9",..: 18 18 18 18 18 18 18 18 18 18 ...
 $ PM2.5         : num  8.7 6.8 4.5 8.2 11.8 10.2 11.6 10.3 8.8 8.6 ...
 $ PM10          : num  NA NA NA NA NA NA NA NA NA NA ...
 $ NO2           : num  NA NA NA NA NA NA NA NA NA NA ...
 $ CO            : logi  NA NA NA NA NA NA ...
 $ O3            : logi  NA NA NA NA NA NA ...
 $ SO2           : logi  NA NA NA NA NA NA ...
 $ x             : int  21 21 21 21 21 21 21 21 21 21 ...
 $ distance      : num  4235 4235 4235 4235 4235 ...
 $ temperature   : num  4.83 5.42 6.05 6.1 7.05 7.8 8.08 8.44 8.57 8.02 ...
 $ pressure      : num  987 987 988 989 992 ...
 $ humidity      : num  87 84 81 79 74 71 73 71 70 73 ...
 $ wind_direction: num  205 210 214 234 242 ...
 $ wind_speed.kph: num 

stationName,utc_time,stationId,PM2.5,PM10,NO2,CO,O3,SO2,x,distance,temperature,pressure,humidity,wind_direction,wind_speed.kph
london_grid_366,2018-03-31 07:00:00,TD5,8.7,,,,,,21,4234.613,4.83,986.8419,87,205.39,11.3
london_grid_366,2018-03-31 08:00:00,TD5,6.8,,,,,,21,4234.613,5.42,987.2517,84,210.33,11.95
london_grid_366,2018-03-31 09:00:00,TD5,4.5,,,,,,21,4234.613,6.05,987.7612,81,214.35,12.01
london_grid_366,2018-03-31 10:00:00,TD5,8.2,,,,,,21,4234.613,6.1,988.8336,79,233.7,10.18
london_grid_366,2018-03-31 11:00:00,TD5,11.8,,,,,,21,4234.613,7.05,991.6767,74,242.44,7.95
london_grid_366,2018-03-31 12:00:00,TD5,10.2,,,,,,21,4234.613,7.8,992.0664,71,267.43,6.76


min_date,max_date
2018-03-31 07:00:00,2018-05-28 20:00:00


In [17]:
# Release the intermediate tables that are no longer needed.
rm(list = c("ld_gm_data", "ld_aq_data", "ld_aq_map", "ld_closest_stations"))

In [18]:
# Keep only the columns required for modeling, in a fixed order.
ld_aq_gm_data <- select(
  ld_aq_gm_data,
  stationName, utc_time, stationId, PM2.5, PM10, O3,
  temperature, pressure, humidity, wind_direction, wind_speed.kph
)

In [19]:
# Derive calendar features from the timestamp for use as model predictors.
ld_aq_gm_data[["hour"]] <- hour(ld_aq_gm_data[["utc_time"]])
ld_aq_gm_data[["month"]] <- month(ld_aq_gm_data[["utc_time"]])
ld_aq_gm_data[["date"]] <- date(ld_aq_gm_data[["utc_time"]])
ld_aq_gm_data[["weekend"]] <- chron::is.weekend(ld_aq_gm_data[["date"]])

In [20]:
# Inspect the API table after feature derivation.
utils::str(ld_aq_gm_data)

'data.frame':	26064 obs. of  15 variables:
 $ stationName   : Factor w/ 10 levels "london_grid_346",..: 2 2 2 2 2 2 2 2 2 2 ...
 $ utc_time      : POSIXct, format: "2018-03-31 07:00:00" "2018-03-31 08:00:00" ...
 $ stationId     : Factor w/ 19 levels "BL0","BX1","BX9",..: 18 18 18 18 18 18 18 18 18 18 ...
 $ PM2.5         : num  8.7 6.8 4.5 8.2 11.8 10.2 11.6 10.3 8.8 8.6 ...
 $ PM10          : num  NA NA NA NA NA NA NA NA NA NA ...
 $ O3            : logi  NA NA NA NA NA NA ...
 $ temperature   : num  4.83 5.42 6.05 6.1 7.05 7.8 8.08 8.44 8.57 8.02 ...
 $ pressure      : num  987 987 988 989 992 ...
 $ humidity      : num  87 84 81 79 74 71 73 71 70 73 ...
 $ wind_direction: num  205 210 214 234 242 ...
 $ wind_speed.kph: num  11.3 11.95 12.01 10.18 7.95 ...
 $ hour          : int  7 8 9 10 11 12 13 14 15 16 ...
 $ month         : num  3 3 3 3 3 3 3 3 3 3 ...
 $ date          : Date, format: "2018-03-31" "2018-03-31" ...
 $ weekend       : logi  TRUE TRUE TRUE TRUE TRUE TRUE ...


### Import and transformations of Historical data

In [21]:
# Retrieve the historical London file.
ld_aq_gm_hist_file <- "Ready for Modeling/ld_aq_gm_hist_data.csv"
ld_aq_gm_hist_data <- read.csv(
  ld_aq_gm_hist_file,
  header = TRUE,
  sep = ",",
  stringsAsFactors = FALSE
)

In [22]:
# Bring the historical table in line with the API table: drop the X column
# and parse the timestamp and date columns.
ld_aq_gm_hist_data <- select(ld_aq_gm_hist_data, -X)
ld_aq_gm_hist_data[["utc_time"]] <- anytime(ld_aq_gm_hist_data[["utc_time"]])
ld_aq_gm_hist_data[["date"]] <- as.Date(
  ld_aq_gm_hist_data[["date"]],
  format = "%Y-%m-%d"
)

In [23]:
# Rename the historical columns positionally to match the API table.
names(ld_aq_gm_hist_data) <- c(
  "stationName", "utc_time", "stationId", "PM2.5", "PM10", "O3",
  "temperature", "pressure", "humidity", "wind_direction", "wind_speed.kph",
  "hour", "month", "date", "weekend"
)

In [24]:
# Inspect the historical table after renaming.
utils::str(ld_aq_gm_hist_data)

'data.frame':	257936 obs. of  15 variables:
 $ stationName   : chr  "london_grid_346" "london_grid_346" "london_grid_346" "london_grid_346" ...
 $ utc_time      : POSIXct, format: "2017-01-01 08:00:00" "2017-01-01 09:00:00" ...
 $ stationId     : chr  "LH0" "LH0" "LH0" "LH0" ...
 $ PM2.5         : num  18.3 16.3 13.3 9.4 6.1 6.7 2.1 0.9 1.1 1 ...
 $ PM10          : num  21.3 19.5 16.2 11.6 8.5 13.4 4.6 2 2.1 2 ...
 $ O3            : num  41.6 44.1 49.1 45.2 41.4 53.6 11.7 12.1 12 13.5 ...
 $ temperature   : num  6.05 6.05 6.04 6.04 6.25 6.46 6.67 6.91 7.15 7.39 ...
 $ pressure      : num  1019 1018 1017 1016 1015 ...
 $ humidity      : num  90 89.2 88.3 87.4 87.8 ...
 $ wind_direction: num  221 220 218 217 216 ...
 $ wind_speed.kph: num  16.4 17.1 17.7 18.4 18.8 ...
 $ hour          : int  8 9 10 11 12 13 14 15 16 17 ...
 $ month         : int  1 1 1 1 1 1 1 1 1 1 ...
 $ date          : Date, format: "2017-01-01" "2017-01-01" ...
 $ weekend       : logi  TRUE TRUE TRUE TRUE TRUE TRUE .

In [25]:
# Stack the API rows on top of the historical rows.
ld_aq_gm_combined_data <- base::rbind(ld_aq_gm_data, ld_aq_gm_hist_data)

In [26]:
# Inspect the combined table, its time span and per-column summaries.
utils::str(ld_aq_gm_combined_data)
summarize(
  ld_aq_gm_combined_data,
  min_date = min(utc_time),
  max_date = max(utc_time)
)
base::summary(ld_aq_gm_combined_data)

'data.frame':	284000 obs. of  15 variables:
 $ stationName   : Factor w/ 10 levels "london_grid_346",..: 2 2 2 2 2 2 2 2 2 2 ...
 $ utc_time      : POSIXct, format: "2018-03-31 07:00:00" "2018-03-31 08:00:00" ...
 $ stationId     : Factor w/ 24 levels "BL0","BX1","BX9",..: 18 18 18 18 18 18 18 18 18 18 ...
 $ PM2.5         : num  8.7 6.8 4.5 8.2 11.8 10.2 11.6 10.3 8.8 8.6 ...
 $ PM10          : num  NA NA NA NA NA NA NA NA NA NA ...
 $ O3            : num  NA NA NA NA NA NA NA NA NA NA ...
 $ temperature   : num  4.83 5.42 6.05 6.1 7.05 7.8 8.08 8.44 8.57 8.02 ...
 $ pressure      : num  987 987 988 989 992 ...
 $ humidity      : num  87 84 81 79 74 71 73 71 70 73 ...
 $ wind_direction: num  205 210 214 234 242 ...
 $ wind_speed.kph: num  11.3 11.95 12.01 10.18 7.95 ...
 $ hour          : int  7 8 9 10 11 12 13 14 15 16 ...
 $ month         : num  3 3 3 3 3 3 3 3 3 3 ...
 $ date          : Date, format: "2018-03-31" "2018-03-31" ...
 $ weekend       : logi  TRUE TRUE TRUE TRUE TRUE TR

min_date,max_date
2017-01-01 08:00:00,2018-05-28 20:00:00


          stationName       utc_time                     stationId     
 london_grid_451:47340   Min.   :2017-01-01 08:00:00   CD9    : 12186  
 london_grid_388:47336   1st Qu.:2017-05-06 23:45:00   BL0    : 12178  
 london_grid_409:47310   Median :2017-09-07 06:00:00   GN0    : 12178  
 london_grid_430:36533   Mean   :2017-09-08 01:49:36   GN3    : 12178  
 london_grid_472:36532   3rd Qu.:2018-01-08 13:00:00   GR4    : 12178  
 london_grid_408:22983   Max.   :2018-05-28 20:00:00   GR9    : 12178  
 (Other)        :45966                                 (Other):210924  
     PM2.5             PM10              O3          temperature   
 Min.   :-14.00   Min.   :-11.80   Min.   : -8.30   Min.   :-4.57  
 1st Qu.:  6.50   1st Qu.: 11.40   1st Qu.: 19.50   1st Qu.: 5.97  
 Median : 10.20   Median : 17.20   Median : 34.20   Median :10.16  
 Mean   : 13.43   Mean   : 20.58   Mean   : 39.72   Mean   :10.58  
 3rd Qu.: 16.60   3rd Qu.: 26.00   3rd Qu.: 53.80   3rd Qu.:15.03  
 Max.   :313.00 

In [27]:
# Release the two source tables now that they are combined.
rm(list = c("ld_aq_gm_data", "ld_aq_gm_hist_data"))

In [28]:
# Retrieve only the air quality stations that need predictions.
ld_pred_stations <- read.csv(
  "Datasets/London_AirQuality_Stations.csv",
  header = TRUE,
  sep = ",",
  stringsAsFactors = FALSE
)
ld_pred_stations <- select(
  filter(ld_pred_stations, need_prediction == "TRUE"),
  station_id
)
names(ld_pred_stations) <- "stationId"

"package 'bindrcpp' was built under R version 3.4.3"

In [29]:
# Keep only rows whose station appears in the prediction list (inner join).
ld_aq_gm_combined_data <- merge(
  x = ld_aq_gm_combined_data,
  y = ld_pred_stations,
  by = "stationId"
)

In [30]:
# The station list has been applied and is no longer needed.
rm(list = "ld_pred_stations")

In [31]:
# Per-column summary after restricting to the prediction stations.
base::summary(ld_aq_gm_combined_data)

   stationId              stationName       utc_time                  
 CD9    :12186   london_grid_451:36534   Min.   :2017-01-01 08:00:00  
 BL0    :12178   london_grid_430:36533   1st Qu.:2017-05-08 03:00:00  
 GN0    :12178   london_grid_388:36530   Median :2017-09-12 00:00:00  
 GN3    :12178   london_grid_409:24364   Mean   :2017-09-12 11:54:56  
 GR4    :12178   london_grid_472:12178   3rd Qu.:2018-01-16 21:00:00  
 GR9    :12178   london_grid_408:12177   Max.   :2018-05-28 20:00:00  
 (Other):85240   (Other)        :    0                                
     PM2.5             PM10             O3          temperature   
 Min.   :-10.50   Min.   :-11.8   Min.   : -4.70   Min.   :-4.37  
 1st Qu.:  6.50   1st Qu.: 12.0   1st Qu.: 21.60   1st Qu.: 6.08  
 Median : 10.40   Median : 17.9   Median : 37.80   Median :10.28  
 Mean   : 13.59   Mean   : 21.4   Mean   : 43.24   Mean   :10.69  
 3rd Qu.: 16.90   3rd Qu.: 27.0   3rd Qu.: 58.40   3rd Qu.:15.14  
 Max.   :189.70   Max.   :667.

In [1]:
# Replace boxplot outliers with NA, one pollutant at a time.
# The mask is applied to the column vector rather than to a row subset of the
# data frame: assigning through `df[mask, ]$col <- NA` raises an error when
# the mask selects zero rows, i.e. when a pollutant has no outliers.
ld_aq_gm_combined_data$PM2.5[
  ld_aq_gm_combined_data$PM2.5 %in%
    boxplot.stats(ld_aq_gm_combined_data$PM2.5)$out
] <- NA
ld_aq_gm_combined_data$PM10[
  ld_aq_gm_combined_data$PM10 %in%
    boxplot.stats(ld_aq_gm_combined_data$PM10)$out
] <- NA
ld_aq_gm_combined_data$O3[
  ld_aq_gm_combined_data$O3 %in%
    boxplot.stats(ld_aq_gm_combined_data$O3)$out
] <- NA

ERROR: Error in ld_aq_gm_combined_data[ld_aq_gm_combined_data$PM2.5 %in% boxplot.stats(ld_aq_gm_combined_data$PM2.5)$out, : object 'ld_aq_gm_combined_data' not found


In [33]:
# Inspect the table after outliers were blanked out.
utils::str(ld_aq_gm_combined_data)
base::summary(ld_aq_gm_combined_data)

'data.frame':	158316 obs. of  15 variables:
 $ stationId     : Factor w/ 24 levels "BL0","BX1","BX9",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ stationName   : Factor w/ 10 levels "london_grid_346",..: 6 6 6 6 6 6 6 6 6 6 ...
 $ utc_time      : POSIXct, format: "2018-03-02 08:00:00" "2018-04-28 13:00:00" ...
 $ PM2.5         : num  NA 6.2 4.8 28 8.9 NA 8.2 7.1 26.6 12.9 ...
 $ PM10          : num  NA 7.3 8.3 37 4.9 NA 13.9 5.8 41.2 15.9 ...
 $ O3            : num  38.4 NA 17.2 54.7 49.3 7.2 45 16.4 NA 46.1 ...
 $ temperature   : num  -0.45 11.59 13.16 7.62 12.65 ...
 $ pressure      : num  991 1006 1013 996 998 ...
 $ humidity      : num  69.5 66 72.1 77.3 89.6 ...
 $ wind_direction: num  83.8 353.8 243.8 107 196.1 ...
 $ wind_speed.kph: num  28.47 9.02 25.64 6.57 9.34 ...
 $ hour          : int  8 13 5 16 15 3 21 4 13 0 ...
 $ month         : num  3 4 10 3 11 6 4 10 4 9 ...
 $ date          : Date, format: "2018-03-02" "2018-04-28" ...
 $ weekend       : logi  FALSE TRUE FALSE TRUE TRUE FALSE ...

   stationId              stationName       utc_time                  
 CD9    :12186   london_grid_451:36534   Min.   :2017-01-01 08:00:00  
 BL0    :12178   london_grid_430:36533   1st Qu.:2017-05-08 03:00:00  
 GN0    :12178   london_grid_388:36530   Median :2017-09-12 00:00:00  
 GN3    :12178   london_grid_409:24364   Mean   :2017-09-12 11:54:56  
 GR4    :12178   london_grid_472:12178   3rd Qu.:2018-01-16 21:00:00  
 GR9    :12178   london_grid_408:12177   Max.   :2018-05-28 20:00:00  
 (Other):85240   (Other)        :    0                                
     PM2.5            PM10             O3          temperature   
 Min.   :-8.60   Min.   :-8.90   Min.   : -4.70   Min.   :-4.37  
 1st Qu.: 6.30   1st Qu.:11.70   1st Qu.: 21.20   1st Qu.: 6.08  
 Median : 9.90   Median :17.30   Median : 36.90   Median :10.28  
 Mean   :11.35   Mean   :19.23   Mean   : 40.78   Mean   :10.69  
 3rd Qu.:15.10   3rd Qu.:25.20   3rd Qu.: 56.40   3rd Qu.:15.14  
 Max.   :32.50   Max.   :49.50   Max

In [34]:
# Fill missing PM2.5 values with group means, in two passes.
# Pass 1: mean per station, hour, month and weekend flag.
ld_aq_gm_combined_data_PM2.5 <- summarise(
  group_by(ld_aq_gm_combined_data, stationId, hour, month, weekend),
  PM2.5 = mean(PM2.5, na.rm = TRUE)
)
ld_aq_gm_combined_data <- ld_aq_gm_combined_data %>%
  left_join(
    ld_aq_gm_combined_data_PM2.5,
    by = c("stationId", "hour", "month", "weekend")
  ) %>%
  # Keep the observed value (.x) and fall back to the group mean (.y).
  mutate(PM2.5 = coalesce(PM2.5.x, PM2.5.y)) %>%
  select(-c(PM2.5.x, PM2.5.y))

# Pass 2: coarser mean per station, hour and weekend flag for what is left.
ld_aq_gm_combined_data_PM2.5 <- summarise(
  group_by(ld_aq_gm_combined_data, stationId, hour, weekend),
  PM2.5 = mean(PM2.5, na.rm = TRUE)
)
ld_aq_gm_combined_data <- ld_aq_gm_combined_data %>%
  left_join(
    ld_aq_gm_combined_data_PM2.5,
    by = c("stationId", "hour", "weekend")
  ) %>%
  mutate(PM2.5 = coalesce(PM2.5.x, PM2.5.y)) %>%
  select(-c(PM2.5.x, PM2.5.y))

In [35]:
# Fill missing PM10 values with group means, in two passes.
# Pass 1: mean per station, hour, month and weekend flag.
ld_aq_gm_combined_data_PM10 <- summarise(
  group_by(ld_aq_gm_combined_data, stationId, hour, month, weekend),
  PM10 = mean(PM10, na.rm = TRUE)
)
ld_aq_gm_combined_data <- ld_aq_gm_combined_data %>%
  left_join(
    ld_aq_gm_combined_data_PM10,
    by = c("stationId", "hour", "month", "weekend")
  ) %>%
  # Keep the observed value (.x) and fall back to the group mean (.y).
  mutate(PM10 = coalesce(PM10.x, PM10.y)) %>%
  select(-c(PM10.x, PM10.y))

# Pass 2: coarser mean per station, hour and weekend flag for what is left.
ld_aq_gm_combined_data_PM10 <- summarise(
  group_by(ld_aq_gm_combined_data, stationId, hour, weekend),
  PM10 = mean(PM10, na.rm = TRUE)
)
ld_aq_gm_combined_data <- ld_aq_gm_combined_data %>%
  left_join(
    ld_aq_gm_combined_data_PM10,
    by = c("stationId", "hour", "weekend")
  ) %>%
  mutate(PM10 = coalesce(PM10.x, PM10.y)) %>%
  select(-c(PM10.x, PM10.y))

In [36]:
# Fill missing O3 values with group means, in three passes.
# Pass 1: mean per station, hour, month and weekend flag.
ld_aq_gm_combined_data_O3 <- summarise(
  group_by(ld_aq_gm_combined_data, stationId, hour, month, weekend),
  O3 = mean(O3, na.rm = TRUE)
)
ld_aq_gm_combined_data <- ld_aq_gm_combined_data %>%
  left_join(
    ld_aq_gm_combined_data_O3,
    by = c("stationId", "hour", "month", "weekend")
  ) %>%
  # Keep the observed value (.x) and fall back to the group mean (.y).
  mutate(O3 = coalesce(O3.x, O3.y)) %>%
  select(-c(O3.x, O3.y))

# Pass 2: coarser mean per station, hour and weekend flag.
ld_aq_gm_combined_data_O3 <- summarise(
  group_by(ld_aq_gm_combined_data, stationId, hour, weekend),
  O3 = mean(O3, na.rm = TRUE)
)
ld_aq_gm_combined_data <- ld_aq_gm_combined_data %>%
  left_join(
    ld_aq_gm_combined_data_O3,
    by = c("stationId", "hour", "weekend")
  ) %>%
  mutate(O3 = coalesce(O3.x, O3.y)) %>%
  select(-c(O3.x, O3.y))

# Pass 3: mean across all stations per hour, month and weekend flag.
ld_aq_gm_combined_data_O3 <- summarise(
  group_by(ld_aq_gm_combined_data, hour, month, weekend),
  O3 = mean(O3, na.rm = TRUE)
)
ld_aq_gm_combined_data <- ld_aq_gm_combined_data %>%
  left_join(
    ld_aq_gm_combined_data_O3,
    by = c("hour", "month", "weekend")
  ) %>%
  mutate(O3 = coalesce(O3.x, O3.y)) %>%
  select(-c(O3.x, O3.y))

In [37]:
# Release the group-mean lookup tables used for imputation.
rm(list = c(
  "ld_aq_gm_combined_data_PM2.5",
  "ld_aq_gm_combined_data_PM10",
  "ld_aq_gm_combined_data_O3"
))

In [38]:
# Inspect the table after imputation.
utils::str(ld_aq_gm_combined_data)
base::summary(ld_aq_gm_combined_data)

'data.frame':	158316 obs. of  15 variables:
 $ stationId     : Factor w/ 24 levels "BL0","BX1","BX9",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ stationName   : Factor w/ 10 levels "london_grid_346",..: 6 6 6 6 6 6 6 6 6 6 ...
 $ utc_time      : POSIXct, format: "2018-03-02 08:00:00" "2018-04-28 13:00:00" ...
 $ temperature   : num  -0.45 11.59 13.16 7.62 12.65 ...
 $ pressure      : num  991 1006 1013 996 998 ...
 $ humidity      : num  69.5 66 72.1 77.3 89.6 ...
 $ wind_direction: num  83.8 353.8 243.8 107 196.1 ...
 $ wind_speed.kph: num  28.47 9.02 25.64 6.57 9.34 ...
 $ hour          : int  8 13 5 16 15 3 21 4 13 0 ...
 $ month         : num  3 4 10 3 11 6 4 10 4 9 ...
 $ date          : Date, format: "2018-03-02" "2018-04-28" ...
 $ weekend       : logi  FALSE TRUE FALSE TRUE TRUE FALSE ...
 $ PM2.5         : num  10.7 6.2 4.8 28 8.9 ...
 $ PM10          : num  20.2 7.3 8.3 37 4.9 ...
 $ O3            : num  38.4 19.7 17.2 54.7 49.3 ...


   stationId              stationName       utc_time                  
 CD9    :12186   london_grid_451:36534   Min.   :2017-01-01 08:00:00  
 BL0    :12178   london_grid_430:36533   1st Qu.:2017-05-08 03:00:00  
 GN0    :12178   london_grid_388:36530   Median :2017-09-12 00:00:00  
 GN3    :12178   london_grid_409:24364   Mean   :2017-09-12 11:54:56  
 GR4    :12178   london_grid_472:12178   3rd Qu.:2018-01-16 21:00:00  
 GR9    :12178   london_grid_408:12177   Max.   :2018-05-28 20:00:00  
 (Other):85240   (Other)        :    0                                
  temperature       pressure         humidity     wind_direction 
 Min.   :-4.37   Min.   : 964.4   Min.   : 7.00   Min.   :  0.0  
 1st Qu.: 6.08   1st Qu.:1005.0   1st Qu.:66.70   1st Qu.:148.9  
 Median :10.28   Median :1012.2   Median :77.46   Median :228.4  
 Mean   :10.69   Mean   :1010.9   Mean   :75.92   Mean   :207.7  
 3rd Qu.:15.14   3rd Qu.:1018.2   3rd Qu.:87.30   3rd Qu.:272.2  
 Max.   :30.28   Max.   :1035.8   Ma

In [39]:
# Validate that no NAs remain: list every row with at least one missing value.
ld_aq_gm_combined_data[which(!complete.cases(ld_aq_gm_combined_data)), ]

stationId,stationName,utc_time,temperature,pressure,humidity,wind_direction,wind_speed.kph,hour,month,date,weekend,PM2.5,PM10,O3


In [40]:
# Convert the logical weekend flag to integer for modeling.
ld_aq_gm_combined_data[["weekend"]] <- as.integer(
  ld_aq_gm_combined_data[["weekend"]]
)

In [41]:
# Keep only the required columns, restoring a fixed column order.
ld_aq_gm_combined_data <- select(
  ld_aq_gm_combined_data,
  stationName, utc_time, stationId, PM2.5, PM10, O3,
  temperature, pressure, humidity, wind_direction, wind_speed.kph,
  hour, month, date, weekend
)

In [42]:
# Retrieve station latitude and longitude.
ld_lat_long_file <- "Datasets/London_AirQuality_Stations.csv"
ld_lat_long_data <- read.csv(
  ld_lat_long_file,
  header = TRUE,
  sep = ",",
  stringsAsFactors = FALSE
)
ld_lat_long_data <- select(ld_lat_long_data, station_id, Latitude, Longitude)
names(ld_lat_long_data) <- c("stationId", "latitude", "longitude")

In [44]:
# Attach station coordinates to every row (inner join on stationId).
ld_aq_gm_combined_data <- merge(
  x = ld_aq_gm_combined_data,
  y = ld_lat_long_data,
  by = "stationId"
)

In [45]:
# Inspect the final modeling table, its time span and per-column summaries.
utils::str(ld_aq_gm_combined_data)
summarize(
  ld_aq_gm_combined_data,
  min_date = min(utc_time),
  max_date = max(utc_time)
)
base::summary(ld_aq_gm_combined_data)

'data.frame':	158316 obs. of  17 variables:
 $ stationId     : Factor w/ 24 levels "BL0","BX1","BX9",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ stationName   : Factor w/ 10 levels "london_grid_346",..: 6 6 6 6 6 6 6 6 6 6 ...
 $ utc_time      : POSIXct, format: "2018-03-02 08:00:00" "2018-04-28 13:00:00" ...
 $ PM2.5         : num  10.7 6.2 4.8 28 8.9 ...
 $ PM10          : num  20.2 7.3 8.3 37 4.9 ...
 $ O3            : num  38.4 19.7 17.2 54.7 49.3 ...
 $ temperature   : num  -0.45 11.59 13.16 7.62 12.65 ...
 $ pressure      : num  991 1006 1013 996 998 ...
 $ humidity      : num  69.5 66 72.1 77.3 89.6 ...
 $ wind_direction: num  83.8 353.8 243.8 107 196.1 ...
 $ wind_speed.kph: num  28.47 9.02 25.64 6.57 9.34 ...
 $ hour          : int  8 13 5 16 15 3 21 4 13 0 ...
 $ month         : num  3 4 10 3 11 6 4 10 4 9 ...
 $ date          : Date, format: "2018-03-02" "2018-04-28" ...
 $ weekend       : int  0 1 0 1 1 0 0 0 1 0 ...
 $ latitude      : num  51.5 51.5 51.5 51.5 51.5 ...
 $ longitude     

min_date,max_date
2017-01-01 08:00:00,2018-05-28 20:00:00


   stationId              stationName       utc_time                  
 CD9    :12186   london_grid_451:36534   Min.   :2017-01-01 08:00:00  
 BL0    :12178   london_grid_430:36533   1st Qu.:2017-05-08 03:00:00  
 GN0    :12178   london_grid_388:36530   Median :2017-09-12 00:00:00  
 GN3    :12178   london_grid_409:24364   Mean   :2017-09-12 11:54:56  
 GR4    :12178   london_grid_472:12178   3rd Qu.:2018-01-16 21:00:00  
 GR9    :12178   london_grid_408:12177   Max.   :2018-05-28 20:00:00  
 (Other):85240   (Other)        :    0                                
     PM2.5            PM10             O3          temperature   
 Min.   :-8.60   Min.   :-8.90   Min.   : -4.70   Min.   :-4.37  
 1st Qu.: 6.90   1st Qu.:12.30   1st Qu.: 24.90   1st Qu.: 6.08  
 Median :10.38   Median :17.60   Median : 39.17   Median :10.28  
 Mean   :11.37   Mean   :19.16   Mean   : 41.56   Mean   :10.69  
 3rd Qu.:14.60   3rd Qu.:24.40   3rd Qu.: 54.55   3rd Qu.:15.14  
 Max.   :32.50   Max.   :49.50   Max

### Model Training and validation

In [46]:
# Fix the random seed so the split and model resampling are repeatable.
set.seed(seed = 2306)

In [47]:
# Random 80/20 partition into training and test sets.
sample_size <- floor(nrow(ld_aq_gm_combined_data) * 0.8)
train_index <- sample.int(nrow(ld_aq_gm_combined_data), size = sample_size)
train_ld <- ld_aq_gm_combined_data[train_index, ]
# Negative indexing keeps every row that was not drawn for training.
test_ld <- ld_aq_gm_combined_data[-train_index, ]

In [48]:
# Inspect both partitions.
utils::str(train_ld)
utils::str(test_ld)

'data.frame':	126652 obs. of  17 variables:
 $ stationId     : Factor w/ 24 levels "BL0","BX1","BX9",..: 8 12 17 12 9 4 1 10 15 8 ...
 $ stationName   : Factor w/ 10 levels "london_grid_346",..: 8 10 5 10 8 4 6 8 4 8 ...
 $ utc_time      : POSIXct, format: "2017-12-18 06:00:00" "2017-08-03 04:00:00" ...
 $ PM2.5         : num  8.1 4.7 9.82 2.8 9.6 ...
 $ PM10          : num  14.5 12.1 25.8 7.1 12.1 ...
 $ O3            : num  73.8 16.7 31.7 25.8 34.4 ...
 $ temperature   : num  8.03 18.67 15.21 21.15 9.22 ...
 $ pressure      : num  1016 1005 1007 1010 1028 ...
 $ humidity      : num  93.3 84.5 79.6 54.6 89.4 ...
 $ wind_direction: num  288 207 195 263 246 ...
 $ wind_speed.kph: num  14.9 25.2 12.4 15.7 11.3 ...
 $ hour          : int  6 4 12 1 6 0 0 11 1 3 ...
 $ month         : num  12 8 8 8 3 11 3 8 10 9 ...
 $ date          : Date, format: "2017-12-18" "2017-08-03" ...
 $ weekend       : int  0 0 0 0 0 1 0 0 1 0 ...
 $ latitude      : num  51.5 51.5 51.4 51.5 51.5 ...
 $ longitude 

In [49]:
# Predict PM2.5 from longitude, latitude, hour, month and the weekend flag
# with a linear model, then score it on the held-out test set.
PM2.5_ld_formula <- as.formula("PM2.5 ~ longitude + latitude + hour + month + weekend")
PM2.5_ld_model <- train(PM2.5_ld_formula, data = train_ld, method = "lm")
summary(PM2.5_ld_model)
test_ld[["PM2.5_pred"]] <- predict(PM2.5_ld_model, newdata = test_ld)
test_ld %>% metrics(truth = PM2.5, estimate = PM2.5_pred)


Call:
lm(formula = .outcome ~ ., data = dat)

Residuals:
     Min       1Q   Median       3Q      Max 
-18.8229  -4.3904  -0.9288   3.0121  24.4960 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 12.384942  22.485145   0.551    0.582    
longitude   -5.736287   0.144138 -39.797   <2e-16 ***
latitude    -0.021891   0.436670  -0.050    0.960    
hour         0.090201   0.002538  35.543   <2e-16 ***
month       -0.232270   0.005137 -45.216   <2e-16 ***
weekend      0.456488   0.038706  11.794   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 6.243 on 126646 degrees of freedom
Multiple R-squared:  0.03867,	Adjusted R-squared:  0.03863 
F-statistic:  1019 on 5 and 126646 DF,  p-value: < 2.2e-16


rmse,rsq
6.307451,0.0363391


In [53]:
# Predict PM10 from PM2.5, longitude, latitude, hour, month and the weekend
# flag with a linear model, then score it on the held-out test set.
PM10_ld_formula <- as.formula("PM10 ~ PM2.5 + longitude + latitude + hour + month + weekend")
PM10_ld_model <- train(PM10_ld_formula, data = train_ld, method = "lm")
summary(PM10_ld_model)
test_ld[["PM10_pred"]] <- predict(PM10_ld_model, newdata = test_ld)
test_ld %>% metrics(truth = PM10, estimate = PM10_pred)


Call:
lm(formula = .outcome ~ ., data = dat)

Residuals:
    Min      1Q  Median      3Q     Max 
-29.910  -3.912  -1.051   2.622  37.965 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept)  1.483e+03  2.476e+01  59.914   <2e-16 ***
PM2.5        1.016e+00  3.094e-03 328.471   <2e-16 ***
longitude    2.131e-01  1.597e-01   1.335    0.182    
latitude    -2.865e+01  4.808e-01 -59.586   <2e-16 ***
hour         4.112e-02  2.808e-03  14.643   <2e-16 ***
month       -9.382e-02  5.701e-03 -16.456   <2e-16 ***
weekend     -1.428e+00  4.264e-02 -33.480   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 6.874 on 126645 degrees of freedom
Multiple R-squared:  0.483,	Adjusted R-squared:  0.483 
F-statistic: 1.972e+04 on 6 and 126645 DF,  p-value: < 2.2e-16


rmse,rsq
6.877007,0.4919387


In [51]:
# Predict O3 from PM2.5, PM10, longitude, latitude, hour, month and the
# weekend flag with a linear model, then score it on the held-out test set.
O3_ld_formula <- as.formula("O3 ~ PM2.5 + PM10 + longitude + latitude + hour + month + weekend")
O3_ld_model <- train(O3_ld_formula, data = train_ld, method = "lm")
summary(O3_ld_model)
test_ld[["O3_pred"]] <- predict(O3_ld_model, newdata = test_ld)
test_ld %>% metrics(truth = O3, estimate = O3_pred)


Call:
lm(formula = .outcome ~ ., data = dat)

Residuals:
   Min     1Q Median     3Q    Max 
-62.20 -13.23  -2.97  11.00  84.72 

Coefficients:
              Estimate Std. Error  t value Pr(>|t|)    
(Intercept) -8.478e+03  6.832e+01 -124.098  < 2e-16 ***
PM2.5        3.906e-01  1.146e-02   34.092  < 2e-16 ***
PM10         6.149e-01  7.647e-03   80.407  < 2e-16 ***
longitude   -2.794e+01  4.346e-01  -64.293  < 2e-16 ***
latitude     1.650e+02  1.327e+00  124.405  < 2e-16 ***
hour         5.414e-01  7.648e-03   70.793  < 2e-16 ***
month       -7.800e-02  1.553e-02   -5.022 5.12e-07 ***
weekend     -5.299e+00  1.166e-01  -45.462  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 18.71 on 126644 degrees of freedom
Multiple R-squared:  0.2991,	Adjusted R-squared:  0.2991 
F-statistic:  7721 on 7 and 126644 DF,  p-value: < 2.2e-16


rmse,rsq
18.84163,0.2948813


In [54]:
# Release the training and test partitions.
rm(list = c("train_ld", "test_ld"))

### Building the next 2 days dataset and predicting PM2.5, PM10 and O3 values

In [76]:
# Run the session in GMT so the date computed here and the timestamps derived
# from it do not depend on the machine's local timezone.
Sys.setenv(TZ = "GMT")
# First day of the forecast window: the day after the current (GMT) date.
tomorrow <- Sys.Date() + 1

In [77]:
# 48 hourly timestamps starting at midnight of `tomorrow`, tagged as UTC.
ld_time <- with_tz(
  seq(from = as.POSIXct(tomorrow), by = "1 hour", length.out = 48),
  tzone = "UTC"
)
# `id` is the zero-based position of each timestamp (0-47); it is used later
# when the test_id for the final submission file is built.
ld_future_data <- data.frame(utc_time = ld_time, id = seq_along(ld_time) - 1)
# The two frames share no columns, so merge() with by = NULL returns the
# Cartesian product: one row per (timestamp, station) pair.
ld_future_data <- merge(
  ld_future_data,
  data.frame(stationId = unique(ld_aq_gm_combined_data$stationId)),
  by = NULL
)

In [78]:
# Derive calendar features from utc_time for the forecast rows:
# hour of day, month number, calendar date and a weekend flag.
ld_future_data[["hour"]] <- hour(ld_future_data[["utc_time"]])
ld_future_data[["month"]] <- month(ld_future_data[["utc_time"]])
ld_future_data[["date"]] <- date(ld_future_data[["utc_time"]])
# The weekend flag is computed from the date column created just above.
ld_future_data[["weekend"]] <- chron::is.weekend(ld_future_data[["date"]])

In [79]:
# Store the weekend flag as an integer column
# (as.integer maps TRUE/FALSE to 1/0).
ld_future_data[["weekend"]] <- as.integer(ld_future_data[["weekend"]])

In [82]:
# Attach the columns of ld_lat_long_data to each forecast row by stationId.
# This is an inner join (all = FALSE is merge()'s default), so stations with
# no row in ld_lat_long_data are dropped here.
ld_future_data <- merge(
  x = ld_future_data,
  y = ld_lat_long_data,
  by = "stationId",
  all = FALSE
)

In [85]:
# Predict the three pollutants for every forecast row.
# Keep this order: the O3 model's formula uses PM2.5 and PM10 as predictors,
# so those columns must be filled in before O3 is predicted.
# NOTE(review): the PM10 model may also depend on PM2.5 - its formula is not
# visible in this section; confirm before reordering.
ld_future_data[["PM2.5"]] <- predict(PM2.5_ld_model, newdata = ld_future_data)
ld_future_data[["PM10"]] <- predict(PM10_ld_model, newdata = ld_future_data)
ld_future_data[["O3"]] <- predict(O3_ld_model, newdata = ld_future_data)

In [86]:
# Build the submission key for each row as "<stationId>#<id>", e.g. "BL0#0".
ld_future_data[["test_id"]] <- with(
  ld_future_data,
  paste0(stationId, "#", id)
)

In [89]:
# Sanity-check the forecast frame: column types, per-column summaries and
# the first/last timestamp covered.
str(ld_future_data)
summary(ld_future_data)
summarise(
  ld_future_data,
  min_date = min(utc_time),
  max_date = max(utc_time)
)

'data.frame':	624 obs. of  13 variables:
 $ stationId: Factor w/ 24 levels "BL0","BX1","BX9",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ utc_time : POSIXct, format: "2018-05-29 00:00:00" "2018-05-29 01:00:00" ...
 $ id       : num  0 1 2 3 4 5 6 7 8 9 ...
 $ hour     : int  0 1 2 3 4 5 6 7 8 9 ...
 $ month    : int  5 5 5 5 5 5 5 5 5 5 ...
 $ date     : Date, format: "2018-05-29" "2018-05-29" ...
 $ weekend  : int  0 0 0 0 0 0 0 0 0 0 ...
 $ latitude : num  51.5 51.5 51.5 51.5 51.5 ...
 $ longitude: num  -0.126 -0.126 -0.126 -0.126 -0.126 ...
 $ PM2.5    : num  13 13.1 13.2 13.3 13.3 ...
 $ PM10     : num  19.5 19.7 19.8 20 20.1 ...
 $ O3       : num  45.3 46 46.7 47.4 48.1 ...
 $ test_id  : chr  "BL0#0" "BL0#1" "BL0#2" "BL0#3" ...


   stationId      utc_time                         id             hour      
 BL0    : 48   Min.   :2018-05-29 00:00:00   Min.   : 0.00   Min.   : 0.00  
 CD1    : 48   1st Qu.:2018-05-29 11:45:00   1st Qu.:11.75   1st Qu.: 5.75  
 CD9    : 48   Median :2018-05-29 23:30:00   Median :23.50   Median :11.50  
 GN0    : 48   Mean   :2018-05-29 23:30:00   Mean   :23.50   Mean   :11.50  
 GN3    : 48   3rd Qu.:2018-05-30 11:15:00   3rd Qu.:35.25   3rd Qu.:17.25  
 GR4    : 48   Max.   :2018-05-30 23:00:00   Max.   :47.00   Max.   :23.00  
 (Other):336                                                                
     month        date               weekend     latitude    
 Min.   :5   Min.   :2018-05-29   Min.   :0   Min.   :51.39  
 1st Qu.:5   1st Qu.:2018-05-29   1st Qu.:0   1st Qu.:51.47  
 Median :5   Median :2018-05-29   Median :0   Median :51.52  
 Mean   :5   Mean   :2018-05-29   Mean   :0   Mean   :51.49  
 3rd Qu.:5   3rd Qu.:2018-05-30   3rd Qu.:0   3rd Qu.:51.52  
 Max.   :5  

min_date,max_date
2018-05-29,2018-05-30 23:00:00


In [88]:
# Show the first 20 forecast rows.
head(ld_future_data, n = 20)

stationId,utc_time,id,hour,month,date,weekend,latitude,longitude,PM2.5,PM10,O3,test_id
BL0,2018-05-29 00:00:00,0,0,5,2018-05-29,0,51.52229,-0.125848,13.01298,19.52435,45.2782,BL0#0
BL0,2018-05-29 01:00:00,1,1,5,2018-05-29,0,51.52229,-0.125848,13.0956,19.66696,45.99015,BL0#1
BL0,2018-05-29 02:00:00,2,2,5,2018-05-29,0,51.52229,-0.125848,13.17822,19.80957,46.7021,BL0#2
BL0,2018-05-29 03:00:00,3,3,5,2018-05-29,0,51.52229,-0.125848,13.26084,19.95219,47.41405,BL0#3
BL0,2018-05-29 04:00:00,4,4,5,2018-05-29,0,51.52229,-0.125848,13.34346,20.0948,48.126,BL0#4
BL0,2018-05-29 05:00:00,5,5,5,2018-05-29,0,51.52229,-0.125848,13.42608,20.23741,48.83794,BL0#5
BL0,2018-05-29 06:00:00,6,6,5,2018-05-29,0,51.52229,-0.125848,13.5087,20.38002,49.54989,BL0#6
BL0,2018-05-29 07:00:00,7,7,5,2018-05-29,0,51.52229,-0.125848,13.59132,20.52264,50.26184,BL0#7
BL0,2018-05-29 08:00:00,8,8,5,2018-05-29,0,51.52229,-0.125848,13.67394,20.66525,50.97379,BL0#8
BL0,2018-05-29 09:00:00,9,9,5,2018-05-29,0,51.52229,-0.125848,13.75656,20.80786,51.68574,BL0#9


In [93]:
# Write the submission columns to a dated CSV in the working directory.
# NOTE(review): paste() joins its pieces with a space, so the file is named
# "ld_submission <date> .csv" (spaces included). The name is left unchanged
# here in case other code reads it back; switch to paste0() if that is not
# intended.
write.csv(
  x = ld_future_data[, c("test_id", "PM2.5", "PM10", "O3")],
  file = paste("ld_submission", Sys.Date(), ".csv", sep = " "),
  row.names = FALSE
)

In [94]:
# Remove the TZ override set for the forecast step. This unsets the variable
# rather than restoring whatever value TZ had before that step.
Sys.unsetenv(x = "TZ")

In [95]:
# Free the London working objects now that the submission file is written.
rm(ld_future_data, ld_lat_long_data, ld_aq_gm_combined_data)