#### Basic setup:

In [151]:
if(!is.null(dev.list())) dev.off() # Clear Plots
rm(list=ls()) # Clear objects from Memory
cat("\014") # Clear Console
# writeClipboard(as.character(x)) # copy data frame to clipboard



In [152]:
#install.packages("RCurl")
library(RCurl)
#install.packages("digest")
#install.packages("sqldf")
library(sqldf)
library(digest)
library(dplyr)
library(anytime)
library(geosphere)

In [153]:
# set working directory
setwd("C:/Users/vanethi/Documents/GitHub/DS420_Factoria")

#### Getting API Data:

In [154]:
# set city id
cityID <- data.frame(c('bj','ld'))
colnames(cityID) <- c('CityID')

# set start and end date
startDate <- '2017-12-31-0'
endDate <- '2018-06-01-0'

In [155]:
# pull data for each city id
for (i in 1:nrow(cityID)) {

  # acquire air quality data
  assign(paste0(cityID[i,1],"_aq_url"),paste0("https://biendata.com/competition/airquality/",cityID[i,1],"/",startDate,"/",endDate,"/2k0d1d8"))
  assign(paste0(cityID[i,1],"_aq_file"),getURL(eval(as.symbol(paste0(cityID[i,1],"_aq_url"))), ssl.verifyhost=FALSE, ssl.verifypeer=FALSE))
  assign(paste0(cityID[i,1],"_aq_data"),read.csv(textConnection(eval(as.symbol(paste0(cityID[i,1],"_aq_file")))), header=TRUE))
  
  # acquire API grid meteorology data
  assign(paste0(cityID[i,1],"_gm_url"),paste0("https://biendata.com/competition/meteorology/",cityID[i,1],"_grid/",startDate,"/",endDate,"/2k0d1d8"))
  assign(paste0(cityID[i,1],"_gm_file"),getURL(eval(as.symbol(paste0(cityID[i,1],"_gm_url"))), ssl.verifyhost=FALSE, ssl.verifypeer=FALSE))
  assign(paste0(cityID[i,1],"_gm_data"),read.csv(textConnection(eval(as.symbol(paste0(cityID[i,1],"_gm_file")))), header=TRUE))
}



In [156]:
#acquire beijing observed meteorology data
bj_om_url <- paste0("https://biendata.com/competition/meteorology/bj/",startDate,"/",endDate,"/2k0d1d8")
bj_om_file <- getURL(bj_om_url, ssl.verifyhost=FALSE, ssl.verifypeer=FALSE)
bj_om_data <- read.csv(textConnection(bj_om_file), header=TRUE)

In [157]:
rm("cityID")

In [158]:
# list of available data frames
df.list <- names(which(unlist(eapply(.GlobalEnv,is.data.frame))))
df.list

** I have added the code for all datasets to convert time from character to posixCt format datetime. Noticed we use substr to extract the hour, we can switch it out to converting to time and using time functions if we need more functions **

In [159]:
# Converting character to datetime
bj_om_data$time <- anytime(bj_om_data$time)
bj_gm_data$time <- anytime(bj_gm_data$time)
bj_aq_data$time <- anytime(bj_aq_data$time)
ld_gm_data$time <- anytime(ld_gm_data$time)
ld_aq_data$time <- anytime(ld_aq_data$time)

In [160]:
# printing structure of all the datasets
for (i in 1:length(df.list)) {
 print(df.list[i])
  print(str(get(df.list[i])))
}

[1] "ld_aq_data"
'data.frame':	16679 obs. of  9 variables:
 $ id                : int  2941506 2941507 2941508 2941509 2941510 2941511 2941512 2941513 2941514 2941515 ...
 $ station_id        : Factor w/ 19 levels "BL0","BX1","BX9",..: 4 1 10 15 12 9 11 14 8 13 ...
 $ time              : POSIXct, format: "2018-03-31 07:00:00" "2018-03-31 07:00:00" ...
 $ PM25_Concentration: num  9.9 8.5 4.8 9.3 8.6 7.4 3.8 14.4 7.4 5.9 ...
 $ PM10_Concentration: num  14.3 8.8 8.9 10.4 10.5 10.6 6.6 24.1 13.1 6.9 ...
 $ NO2_Concentration : num  70.4 44.8 16.5 NA 35.7 48.9 36.5 43.3 57.5 NA ...
 $ CO_Concentration  : logi  NA NA NA NA NA NA ...
 $ O3_Concentration  : logi  NA NA NA NA NA NA ...
 $ SO2_Concentration : logi  NA NA NA NA NA NA ...
NULL
[1] "bj_gm_data"
'data.frame':	568931 obs. of  9 variables:
 $ id            : int  2000958 2000959 2000960 2000961 2000962 2000963 2000964 2000965 2000966 2000967 ...
 $ station_id    : Factor w/ 651 levels "beijing_grid_000",..: 1 2 3 4 5 6 7 8 9 10 ...
 $ 

In [161]:
# Printing the min and max dates of all datasets
print("bj_om_data")
bj_om_data %>% summarize(min_date = min(time), max_date = max(time))
print("bj_gm_data")
bj_gm_data %>% summarize(min_date = min(time), max_date = max(time))
print("bj_aq_data")
bj_aq_data %>% summarize(min_date = min(time), max_date = max(time))
print("ld_gm_data")
ld_gm_data %>% summarize(min_date = min(time), max_date = max(time))
print("ld_aq_data")
ld_aq_data %>% summarize(min_date = min(time), max_date = max(time))

[1] "bj_om_data"


min_date,max_date
2018-03-31 07:00:00,2018-05-07 02:00:00


[1] "bj_gm_data"


min_date,max_date
2018-03-31 07:00:00,2018-05-07 03:00:00


[1] "bj_aq_data"


min_date,max_date
2018-03-31 07:00:00,2018-05-07 01:00:00


[1] "ld_gm_data"


min_date,max_date
2018-03-31 07:00:00,2018-05-07 03:00:00


[1] "ld_aq_data"


min_date,max_date
2018-03-31 07:00:00,2018-05-07 01:00:00


In [162]:
# printing summary of numerical columns in Air Quality datasets
print("bj_aq_data")
summary(bj_aq_data[,4:9])
print("ld_aq_data")
summary(ld_aq_data[,4:6])

[1] "bj_aq_data"


 PM25_Concentration PM10_Concentration NO2_Concentration CO_Concentration
 Min.   :  3.00     Min.   :   6.0     Min.   :  2.00    Min.   :0.1000  
 1st Qu.: 19.00     1st Qu.:  63.0     1st Qu.: 20.00    1st Qu.:0.3000  
 Median : 41.00     Median : 118.0     Median : 36.00    Median :0.5000  
 Mean   : 61.52     Mean   : 135.9     Mean   : 43.58    Mean   :0.6836  
 3rd Qu.: 90.00     3rd Qu.: 187.0     3rd Qu.: 61.00    3rd Qu.:0.9000  
 Max.   :396.00     Max.   :2030.0     Max.   :249.00    Max.   :4.8000  
 NA's   :1531       NA's   :5345       NA's   :1251      NA's   :1257    
 O3_Concentration SO2_Concentration
 Min.   :  2      Min.   :  2.000  
 1st Qu.: 39      1st Qu.:  2.000  
 Median : 74      Median :  3.000  
 Mean   : 78      Mean   :  6.511  
 3rd Qu.:104      3rd Qu.:  9.000  
 Max.   :342      Max.   :300.000  
 NA's   :1427     NA's   :1248     

[1] "ld_aq_data"


 PM25_Concentration PM10_Concentration NO2_Concentration
 Min.   :-14.00     Min.   : -2.30     Min.   : -9.10   
 1st Qu.:  7.20     1st Qu.: 11.90     1st Qu.: 18.30   
 Median : 12.00     Median : 18.60     Median : 31.00   
 Mean   : 16.46     Mean   : 22.66     Mean   : 37.43   
 3rd Qu.: 22.20     3rd Qu.: 30.20     3rd Qu.: 50.80   
 Max.   :111.00     Max.   :146.10     Max.   :223.90   
 NA's   :3770       NA's   :5067       NA's   :5784     

In [163]:
# printing summary of numerical columns in Geometeorological datasets
print("bj_om_data")
summary(bj_om_data[,5:9])
print("bj_gm_data")
summary(bj_gm_data[,5:9])
print("ld_gm_data")
summary(ld_gm_data[,5:9])

[1] "bj_om_data"


  temperature          pressure           humidity          wind_speed      
 Min.   :    -2.6   Min.   :   944.2   Min.   :     3.0   Min.   :     0.0  
 1st Qu.:    10.9   1st Qu.:   996.4   1st Qu.:    28.0   1st Qu.:     1.1  
 Median :    15.8   Median :  1004.8   Median :    46.0   Median :     1.9  
 Mean   :   730.4   Mean   :  1777.7   Mean   :   763.7   Mean   :   717.2  
 3rd Qu.:    20.8   3rd Qu.:  1010.8   3rd Qu.:    68.0   3rd Qu.:     3.1  
 Max.   :999999.0   Max.   :999999.0   Max.   :999999.0   Max.   :999999.0  
 wind_direction  
 Min.   :     0  
 1st Qu.:    80  
 Median :   186  
 Mean   : 39257  
 3rd Qu.:   267  
 Max.   :999999  

[1] "bj_gm_data"


  temperature        pressure         humidity      wind_direction  
 Min.   :-12.00   Min.   : 829.9   Min.   :  6.00   Min.   :  0.00  
 1st Qu.:  9.60   1st Qu.: 929.9   1st Qu.: 26.00   1st Qu.: 87.75  
 Median : 14.80   Median : 977.0   Median : 38.00   Median :188.94  
 Mean   : 14.45   Mean   : 966.0   Mean   : 41.27   Mean   :182.57  
 3rd Qu.: 20.00   3rd Qu.:1009.2   3rd Qu.: 53.00   3rd Qu.:275.85  
 Max.   : 33.00   Max.   :1030.5   Max.   :100.00   Max.   :360.00  
   wind_speed   
 Min.   : 0.01  
 1st Qu.: 6.46  
 Median :10.12  
 Mean   :11.99  
 3rd Qu.:15.50  
 Max.   :62.38  

[1] "ld_gm_data"


  temperature       pressure         humidity      wind_direction 
 Min.   :-1.47   Min.   : 971.2   Min.   :  7.00   Min.   :  0.0  
 1st Qu.: 7.79   1st Qu.: 998.3   1st Qu.: 78.00   1st Qu.:109.9  
 Median : 9.84   Median :1005.3   Median : 85.00   Median :190.8  
 Mean   :10.12   Mean   :1005.3   Mean   : 83.67   Mean   :177.2  
 3rd Qu.:11.96   3rd Qu.:1012.2   3rd Qu.: 92.00   3rd Qu.:245.8  
 Max.   :28.23   Max.   :1030.0   Max.   :100.00   Max.   :360.0  
   wind_speed   
 Min.   : 0.03  
 1st Qu.:10.87  
 Median :16.53  
 Mean   :18.00  
 3rd Qu.:23.67  
 Max.   :74.25  

**The weather component doesn't have a consistent naming conventions; We have ignored this for now, which is good. If this appears to be an important factor, we can identify a way to match between the different nomenclatures**

In [164]:
print("bj_om_data")
unique(bj_om_data$weather)
print("bj_gm_data")
unique(bj_gm_data$weather)
print("ld_gm_data")
unique(ld_gm_data$weather)

[1] "bj_om_data"


[1] "bj_gm_data"


[1] "ld_gm_data"


### Historical Datasets

#### Beijing - Observed geometeorology dataset for the Air Quality stations

This is similar to the observed geometeorology dataset from the API but just historical information. Currently, we aren't using the observed geometeorology dataset in any of our analysis because of the high number of NAs and error prone values. Once we build a basic model, it appears that there is potential to clean up this dataset and work on integrating this information too. 

**The file is divided into 2 sections - one from Jan 1, 2017 to Jan 31, 2018; another from Feb 1, 2018 to March 31, 2018**

In [165]:
# read.csv Beijing historical GM
bj_om_hist_file <- "C:/Users/vanethi/Documents/GitHub/DS420_Factoria/Datasets/beijing_17_18_meo.csv"
bj_om_hist_data <- read.csv(bj_om_hist_file, header=TRUE, sep=",", stringsAsFactors = FALSE)
bj_om_hist_data$utc_time <- anytime(bj_om_hist_data$utc_time)

In [167]:
print("Beijing Geo Historical data")
str(bj_om_hist_data)
bj_om_hist_data %>% summarize(min_date = min(utc_time), max_date = max(utc_time))
summary(bj_om_hist_data[,5:9])
unique(bj_om_hist_data$weather)

[1] "Beijing Geo Historical data"
'data.frame':	158047 obs. of  10 variables:
 $ station_id    : chr  "shunyi_meo" "shunyi_meo" "shunyi_meo" "shunyi_meo" ...
 $ longitude     : num  117 117 117 117 117 ...
 $ latitude      : num  40.1 40.1 40.1 40.1 40.1 ...
 $ utc_time      : POSIXct, format: "2017-01-30 16:00:00" "2017-01-30 17:00:00" ...
 $ temperature   : num  -1.7 -3.5 -3.7 -3.9 -4.1 -4.2 -4 -4.7 -4.9 -3.6 ...
 $ pressure      : num  1029 1028 1028 1028 1027 ...
 $ humidity      : int  15 24 27 27 26 28 27 29 28 28 ...
 $ wind_direction: num  215 16 32 21 17 68 329 28 17 29 ...
 $ wind_speed    : num  1.6 1 1.1 0.9 1.1 0.9 1.1 0.7 1.2 1.1 ...
 $ weather       : chr  "Sunny/clear" "Haze" "Haze" "Haze" ...


min_date,max_date
2017-01-30 16:00:00,2018-01-31 15:00:00


  temperature          pressure           humidity        wind_direction  
 Min.   :   -21.3   Min.   :   940.0   Min.   :     4.0   Min.   :     0  
 1st Qu.:     2.5   1st Qu.:   994.2   1st Qu.:    27.0   1st Qu.:    78  
 Median :    13.8   Median :  1005.6   Median :    48.0   Median :   190  
 Mean   :    38.2   Mean   :  1026.8   Mean   :   354.3   Mean   : 35488  
 3rd Qu.:    23.2   3rd Qu.:  1016.9   3rd Qu.:    73.0   3rd Qu.:   280  
 Max.   :999999.0   Max.   :999999.0   Max.   :999999.0   Max.   :999999  
                                                          NA's   :234     
   wind_speed      
 Min.   :     0.0  
 1st Qu.:     0.9  
 Median :     1.5  
 Mean   :    96.9  
 3rd Qu.:     2.5  
 Max.   :999999.0  
 NA's   :234       

*Notice the 999,999s in all fields and NAs in wind_direction & wind_speed. I guess if we are planning to integrate, we can substitute these values to their mean/median*

** Feb & March dataset: **

In [58]:
# Beijing recent geometeorolgy data (Jan - Mar 2018)
bj_om_recent_file <- "C:/Users/vanethi/Documents/GitHub/DS420_Factoria/Datasets/beijing_201802_201803_me.csv"
bj_om_recent_data <- read.csv(bj_om_recent_file, header=TRUE, sep=",", stringsAsFactors = FALSE)
bj_om_recent_data$utc_time <- anytime(bj_om_recent_data$utc_time)

In [59]:
print("Beijing Weather Recent data")
str(bj_om_recent_data)
bj_om_recent_data %>% summarize(min_date = min(utc_time), max_date = max(utc_time))
summary(bj_om_recent_data[,4:8])
unique(bj_om_recent_data$weather)

[1] "Beijing Weather Recent data"
'data.frame':	26677 obs. of  8 variables:
 $ station_id    : chr  "beijing_meo" "beijing_meo" "beijing_meo" "beijing_meo" ...
 $ utc_time      : POSIXct, format: "2018-01-31 16:00:00" "2018-01-31 17:00:00" ...
 $ weather       : chr  "Sunny/clear" "Sunny/clear" "Sunny/clear" "Haze" ...
 $ temperature   : num  -4.8 -6.1 -7 -7.7 -6.3 -5.6 -5.8 -7.7 -7.1 -3.5 ...
 $ pressure      : num  1024 1023 1023 1023 1022 ...
 $ humidity      : int  39 42 45 47 44 40 39 46 47 41 ...
 $ wind_speed    : num  0.9 0 0.5 0 2 1.3 1.2 0 0.2 0.7 ...
 $ wind_direction: int  59 999017 178 999017 71 72 70 999017 999017 279 ...


min_date,max_date
2018-01-31 16:00:00,2018-04-01


  temperature         pressure         humidity       wind_speed    
 Min.   :-18.200   Min.   : 944.5   Min.   : 3.00   Min.   : 0.000  
 1st Qu.: -2.600   1st Qu.:1000.3   1st Qu.:21.00   1st Qu.: 1.000  
 Median :  1.900   Median :1013.3   Median :34.00   Median : 1.600  
 Mean   :  3.007   Mean   :1006.3   Mean   :37.29   Mean   : 2.057  
 3rd Qu.:  7.800   3rd Qu.:1020.5   3rd Qu.:50.00   3rd Qu.: 2.700  
 Max.   : 28.200   Max.   :1038.0   Max.   :99.00   Max.   :13.500  
 wind_direction  
 Min.   :     0  
 1st Qu.:    79  
 Median :   182  
 Mean   : 40460  
 3rd Qu.:   276  
 Max.   :999017  

#### Beijing - Geometeorology historical dataset for the grids:

Dataset from Jan 1, 2017 to March 27, 2018

In [168]:
# read.csv Beijing historical Grid GM
bj_gm_hist_file <- "C:/Users/vanethi/Documents/GitHub/DS420_Factoria/Datasets/Beijing_historical_meo_grid.csv"
bj_gm_hist_data <- read.csv(bj_gm_hist_file, header=TRUE, sep=",", stringsAsFactors = FALSE)
bj_gm_hist_data$utc_time <- anytime(bj_gm_hist_data$utc_time)

In [169]:
print("Beijing Geo Historical Grid data")
str(bj_gm_hist_data)
bj_gm_hist_data %>% summarize(min_date = min(utc_time), max_date = max(utc_time))
summary(bj_gm_hist_data[,5:9])

[1] "Beijing Geo Historical Grid data"
'data.frame':	7034706 obs. of  9 variables:
 $ stationName   : chr  "beijing_grid_000" "beijing_grid_001" "beijing_grid_002" "beijing_grid_003" ...
 $ longitude     : num  115 115 115 115 115 115 115 115 115 115 ...
 $ latitude      : num  39 39.1 39.2 39.3 39.4 39.5 39.6 39.7 39.8 39.9 ...
 $ utc_time      : POSIXct, format: "2017-01-01 00:00:00" "2017-01-01 00:00:00" ...
 $ temperature   : num  -5.47 -5.53 -5.7 -5.88 -5.34 -4.81 -4.98 -5.49 -6.17 -7.17 ...
 $ pressure      : num  985 979 963 947 929 ...
 $ humidity      : num  76.6 75.4 71.8 68.2 58.8 ...
 $ wind_direction: num  53.71 43.59 0.97 327.65 317.85 ...
 $ wind_speed.kph: num  3.53 3.11 2.75 3.84 6.14 ...


min_date,max_date
2017-01-01,2018-03-27 05:00:00


  temperature         pressure         humidity      wind_direction 
 Min.   :-25.500   Min.   : 826.4   Min.   :  3.04   Min.   :  0.0  
 1st Qu.: -1.750   1st Qu.: 930.5   1st Qu.: 26.05   1st Qu.:136.4  
 Median :  8.380   Median : 978.0   Median : 39.19   Median :216.5  
 Mean   :  8.899   Mean   : 968.2   Mean   : 43.40   Mean   :209.1  
 3rd Qu.: 19.580   3rd Qu.:1012.3   3rd Qu.: 58.02   3rd Qu.:302.6  
 Max.   : 36.870   Max.   :1040.6   Max.   :100.00   Max.   :360.0  
 wind_speed.kph  
 Min.   : 0.010  
 1st Qu.: 5.120  
 Median : 8.290  
 Mean   : 9.893  
 3rd Qu.:12.760  
 Max.   :68.820  

#### Beijing Historical Air Quality Dataset 

**The file is divided into 2 sections - one from Jan 1, 2017 to Jan 31, 2018; another from Feb 1, 2018 to March 31, 2018**

I am going to be adding the Feb 1, 2018 to March 31, 2018 information to our dataset

In [170]:
# read.csv Beijing historical AQ
bj_aq_hist_file <- "C:/Users/vanethi/Documents/GitHub/DS420_Factoria/Datasets/beijing_17_18_aq.csv"
bj_aq_hist_data <- read.csv(bj_aq_hist_file, header=TRUE, sep=",", stringsAsFactors = FALSE)
bj_aq_hist_data$utc_time <- anytime(bj_aq_hist_data$utc_time)

In [171]:
print("Beijing Air Quality Historical data")
str(bj_aq_hist_data)
bj_aq_hist_data %>% summarize(min_date = min(utc_time), max_date = max(utc_time))
summary(bj_aq_hist_data[,3:8])

[1] "Beijing Air Quality Historical data"
'data.frame':	311010 obs. of  8 variables:
 $ stationId: chr  "aotizhongxin_aq" "aotizhongxin_aq" "aotizhongxin_aq" "aotizhongxin_aq" ...
 $ utc_time : POSIXct, format: "2017-01-01 14:00:00" "2017-01-01 15:00:00" ...
 $ PM2.5    : num  453 417 395 420 453 429 211 116 51 38 ...
 $ PM10     : num  467 443 467 484 520 NA NA NA NA NA ...
 $ NO2      : num  156 143 141 139 157 141 110 87 58 55 ...
 $ CO       : num  7.2 6.8 6.9 7.4 7.6 6.5 3.3 2.2 1.3 1.1 ...
 $ O3       : num  3 2 3 3 4 3 NA 4 26 28 ...
 $ SO2      : num  9 8 8 9 9 9 11 13 14 14 ...


min_date,max_date
2017-01-01 14:00:00,2018-01-31 15:00:00


     PM2.5              PM10              NO2               CO       
 Min.   :   2.00   Min.   :   5.00   Min.   :  1.00   Min.   : 0.10  
 1st Qu.:  16.00   1st Qu.:  37.00   1st Qu.: 20.00   1st Qu.: 0.40  
 Median :  39.00   Median :  70.00   Median : 39.00   Median : 0.70  
 Mean   :  58.79   Mean   :  88.06   Mean   : 45.79   Mean   : 0.96  
 3rd Qu.:  77.00   3rd Qu.: 113.00   3rd Qu.: 66.00   3rd Qu.: 1.20  
 Max.   :1004.00   Max.   :3000.00   Max.   :300.00   Max.   :15.00  
 NA's   :20389     NA's   :83263     NA's   :18651    NA's   :42813  
       O3              SO2         
 Min.   :  1.00   Min.   :  1.000  
 1st Qu.: 12.00   1st Qu.:  2.000  
 Median : 45.00   Median :  5.000  
 Mean   : 55.69   Mean   :  8.981  
 3rd Qu.: 79.00   3rd Qu.: 11.000  
 Max.   :504.00   Max.   :307.000  
 NA's   :20421    NA's   :18548    

In [172]:
# read.csv Beijing recent Air Quality (Jan - Mar 2018)
bj_aq_recent_file <- "C:/Users/vanethi/Documents/GitHub/DS420_Factoria/Datasets/beijing_201802_201803_aq.csv"
bj_aq_recent_data <- read.csv(bj_aq_recent_file, header=TRUE, sep=",", stringsAsFactors = FALSE)
bj_aq_recent_data$utc_time <- anytime(bj_aq_recent_data$utc_time)

In [173]:
print("Beijing Air Quality Recent data")
str(bj_aq_recent_data)
bj_aq_recent_data %>% summarize(min_date = min(utc_time), max_date = max(utc_time))
summary(bj_aq_recent_data[,3:8])

[1] "Beijing Air Quality Recent data"
'data.frame':	49420 obs. of  8 variables:
 $ stationId: chr  "aotizhongxin_aq" "aotizhongxin_aq" "aotizhongxin_aq" "aotizhongxin_aq" ...
 $ utc_time : POSIXct, format: "2018-01-31 16:00:00" "2018-01-31 17:00:00" ...
 $ PM2.5    : num  49 47 46 60 52 38 30 29 26 28 ...
 $ PM10     : num  82 80 91 95 91 80 70 75 79 95 ...
 $ NO2      : num  90 90 91 85 81 72 70 73 73 73 ...
 $ CO       : num  0.9 0.9 1.3 2 1.9 1.2 0.9 0.8 0.9 1.1 ...
 $ O3       : num  6 5 5 6 5 4 3 3 4 7 ...
 $ SO2      : num  10 10 28 38 30 14 8 10 7 10 ...


min_date,max_date
2018-01-31 16:00:00,2018-03-31 15:00:00


     PM2.5              PM10             NO2              CO       
 Min.   :   3.00   Min.   :   6.0   Min.   :  2.0   Min.   :0.100  
 1st Qu.:  21.00   1st Qu.:  45.0   1st Qu.: 23.0   1st Qu.:0.500  
 Median :  54.00   Median :  82.0   Median : 42.0   Median :0.900  
 Mean   :  74.85   Mean   : 111.9   Mean   : 47.5   Mean   :1.012  
 3rd Qu.: 108.00   3rd Qu.: 132.0   3rd Qu.: 67.0   3rd Qu.:1.400  
 Max.   :1574.00   Max.   :3280.0   Max.   :262.0   Max.   :8.800  
 NA's   :3070      NA's   :12912    NA's   :3069    NA's   :3331   
       O3              SO2        
 Min.   :  2.00   Min.   :  2.00  
 1st Qu.: 22.00   1st Qu.:  4.00  
 Median : 52.00   Median :  8.00  
 Mean   : 52.13   Mean   : 10.45  
 3rd Qu.: 76.00   3rd Qu.: 14.00  
 Max.   :288.00   Max.   :297.00  
 NA's   :3311     NA's   :3116    

#### London - Geometeorology historical dataset for the grids:

In [175]:
# read.csv London historical GM
ld_gm_hist_file <- "C:/Users/vanethi/Documents/GitHub/DS420_Factoria/Datasets/London_historical_meo_grid.csv"
ld_gm_hist_data <- read.csv(ld_gm_hist_file, header=TRUE, sep=",", stringsAsFactors = FALSE)
ld_gm_hist_data$utc_time <- anytime(ld_gm_hist_data$utc_time)

In [176]:
print("London Historical weather grid data")
str(ld_gm_hist_data)
ld_gm_hist_data %>% summarize(min_date = min(utc_time), max_date = max(utc_time))
summary(ld_gm_hist_data[,5:9])

[1] "London Historical weather grid data"
'data.frame':	9303966 obs. of  9 variables:
 $ stationName   : chr  "london_grid_000" "london_grid_001" "london_grid_002" "london_grid_003" ...
 $ longitude     : num  -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 ...
 $ latitude      : num  50.5 50.6 50.7 50.8 50.9 51 51.1 51.2 51.3 51.4 ...
 $ utc_time      : POSIXct, format: "2017-01-01 00:00:00" "2017-01-01 00:00:00" ...
 $ temperature   : num  9.36 9.09 8.3 7.5 6.92 6.33 6.13 6.12 6.19 6.43 ...
 $ pressure      : num  1025 1024 1023 1021 1016 ...
 $ humidity      : num  77.9 79 82.1 85.3 88.5 ...
 $ wind_direction: num  251 250 245 236 228 ...
 $ wind_speed.kph: num  23.7 21.8 16.1 10.6 11 ...


min_date,max_date
2017-01-01,2018-03-27 05:00:00


  temperature       pressure         humidity      wind_direction 
 Min.   :-7.65   Min.   : 956.6   Min.   : 24.80   Min.   :  0.0  
 1st Qu.: 5.97   1st Qu.:1001.8   1st Qu.: 74.68   1st Qu.:160.2  
 Median : 9.76   Median :1010.2   Median : 82.96   Median :232.6  
 Mean   :10.02   Mean   :1009.0   Mean   : 81.61   Mean   :212.2  
 3rd Qu.:14.40   3rd Qu.:1017.5   3rd Qu.: 89.90   3rd Qu.:274.1  
 Max.   :31.33   Max.   :1039.9   Max.   :100.01   Max.   :360.0  
 wind_speed.kph 
 Min.   : 0.01  
 1st Qu.:12.12  
 Median :18.32  
 Mean   :20.24  
 3rd Qu.:26.10  
 Max.   :89.09  

#### London - Historical Air Quality dataset

** There are two data files - divided by the air quality stations; These are already combined in our data files ** 

In [177]:
# read.csv London historical AQ
ld_aq_hist_file_a <- "C:/Users/vanethi/Documents/GitHub/DS420_Factoria/Datasets/London_historical_aqi_forecast_stations_20180331.csv"
ld_aq_hist_data_a <- read.csv(ld_aq_hist_file_a, header=TRUE, sep=",", stringsAsFactors = FALSE)
ld_aq_hist_data_a$MeasurementDateGMT <-
as.POSIXct(strptime(ld_aq_hist_data_a$MeasurementDateGMT, tz= "GMT", format = "%Y/%m/%d %H:%M" ))

In [178]:
print("London Historical Air Quaility data - 1st part")
str(ld_aq_hist_data_a)
ld_aq_hist_data_a %>% summarize(min_date = min(MeasurementDateGMT), max_date = max(MeasurementDateGMT))
summary(ld_aq_hist_data_a[,4:6])

[1] "London Historical Air Quaility data - 1st part"
'data.frame':	141661 obs. of  6 variables:
 $ X                 : int  0 1 2 3 4 5 6 7 8 9 ...
 $ MeasurementDateGMT: POSIXct, format: "2017-01-01 00:00:00" "2017-01-01 01:00:00" ...
 $ station_id        : chr  "CD1" "CD1" "CD1" "CD1" ...
 $ PM2.5..ug.m3.     : num  40 31.6 24.7 21.2 24.9 24.6 23.9 22 19 19.9 ...
 $ PM10..ug.m3.      : num  44.4 34.4 28.1 24.5 23 23.9 22 22.9 20.1 24.4 ...
 $ NO2..ug.m3.       : num  36.6 46.2 38.3 32.8 28.1 29.3 28.8 34.6 44.6 55.3 ...


min_date,max_date
2017-01-01,2018-03-31


 PM2.5..ug.m3.     PM10..ug.m3.     NO2..ug.m3.    
 Min.   :-10.50   Min.   :-11.80   Min.   : -4.70  
 1st Qu.:  6.40   1st Qu.: 11.80   1st Qu.: 21.60  
 Median : 10.20   Median : 17.50   Median : 37.80  
 Mean   : 13.23   Mean   : 21.01   Mean   : 43.26  
 3rd Qu.: 16.40   3rd Qu.: 26.30   3rd Qu.: 58.40  
 Max.   :189.70   Max.   :667.10   Max.   :336.10  
 NA's   :18676    NA's   :14553    NA's   :25445   

In [179]:
# read.csv London historical AQ
ld_aq_hist_file_b <- "C:/Users/vanethi/Documents/GitHub/DS420_Factoria/Datasets/London_historical_aqi_other_stations_20180331.csv"
ld_aq_hist_data_b <- read.csv(ld_aq_hist_file_b, header=TRUE, sep=",", stringsAsFactors = FALSE)
ld_aq_hist_data_b$MeasurementDateGMT <- as.POSIXct(strptime(ld_aq_hist_data_b$MeasurementDateGMT, tz= "GMT", format = "%Y/%m/%d %H:%M" ))
ld_aq_hist_data_b <- ld_aq_hist_data_b[, -c(6,7)]

In [180]:
print("London Historical Air Quaility data - 2nd part")
str(ld_aq_hist_data_b)
ld_aq_hist_data_a %>% summarize(min_date = min(MeasurementDateGMT, na.rm = TRUE), max_date = max(MeasurementDateGMT, na.rm = TRUE))
summary(ld_aq_hist_data_b[,2:5])

[1] "London Historical Air Quaility data - 2nd part"
'data.frame':	141633 obs. of  5 variables:
 $ Station_ID        : chr  "LH0" "LH0" "LH0" "LH0" ...
 $ MeasurementDateGMT: POSIXct, format: "2017-01-01 00:00:00" "2017-01-01 01:00:00" ...
 $ PM2.5..ug.m3.     : num  30.2 25.4 24.7 23.6 24.2 22.8 21.6 19.9 18.3 16.3 ...
 $ PM10..ug.m3.      : num  34.6 29.2 28.1 27 27.4 26 24.8 23.1 21.3 19.5 ...
 $ NO2..ug.m3.       : num  15.9 11.8 11.6 13 27.1 22.9 26.8 39.4 41.6 44.1 ...


min_date,max_date
2017-01-01,2018-03-31


 MeasurementDateGMT            PM2.5..ug.m3.     PM10..ug.m3.   
 Min.   :2017-01-01 00:00:00   Min.   :-14.00   Min.   : -5.60  
 1st Qu.:2017-04-28 17:00:00   1st Qu.:  6.20   1st Qu.:  9.60  
 Median :2017-08-19 03:00:00   Median : 10.00   Median : 14.50  
 Mean   :2017-08-18 18:52:03   Mean   : 12.89   Mean   : 17.44  
 3rd Qu.:2017-12-09 12:00:00   3rd Qu.: 16.00   3rd Qu.: 21.80  
 Max.   :2018-04-01 00:00:00   Max.   :313.00   Max.   :635.60  
 NA's   :22958                 NA's   :53621    NA's   :102438  
  NO2..ug.m3.    
 Min.   : -8.30  
 1st Qu.: 16.10  
 Median : 27.30  
 Mean   : 31.51  
 3rd Qu.: 43.20  
 Max.   :209.00  
 NA's   :91703   