In [3]:
# load libraries
library(geosphere)
library(dplyr)

### Step 1: Load and clean location data

Make sure that, in your data:
1. The first column contains the station identifiers, stored as data type 'character'.
2. The location columns are named 'longitude' and 'latitude'.

In [4]:
# Read the Beijing location files and assign column names.
# The station file is read with read.csv's default header = TRUE, so its first
# line is consumed as a header row; the names are then overwritten below.
bj_aq_station <- read.csv('Beijing_AirQuality_Stations.csv')
colnames(bj_aq_station) <- c('Station','longitude','latitude')
# The grid file is read as headerless. FALSE is used instead of F because F is
# an ordinary variable that can be reassigned, while FALSE is a reserved word.
bj_grid <- read.csv('Beijing_grid_weather_station.csv', header = FALSE)
# Note the column order assigned here: latitude comes before longitude.
colnames(bj_grid) <- c('GridID','latitude','longitude')

In [6]:
# Store the identifier columns as character vectors
bj_aq_station[["Station"]] <- as.character(bj_aq_station[["Station"]])
bj_grid[["GridID"]] <- as.character(bj_grid[["GridID"]])

In [7]:
# Browse data: str() prints each data frame's dimensions and column types,
# then head() displays the first rows of each table.
str(bj_aq_station)
str(bj_grid)
head(bj_aq_station)
head(bj_grid)

'data.frame':	35 obs. of  3 variables:
 $ Station  : chr  "dongsi_aq" "tiantan_aq" "guanyuan_aq" "wanshouxigong_aq" ...
 $ longitude: num  116 116 116 116 116 ...
 $ latitude : num  39.9 39.9 39.9 39.9 40 ...
'data.frame':	651 obs. of  3 variables:
 $ GridID   : chr  "beijing_grid_000" "beijing_grid_001" "beijing_grid_002" "beijing_grid_003" ...
 $ latitude : num  39 39.1 39.2 39.3 39.4 39.5 39.6 39.7 39.8 39.9 ...
 $ longitude: num  115 115 115 115 115 115 115 115 115 115 ...


Station,longitude,latitude
dongsi_aq,116.417,39.929
tiantan_aq,116.407,39.886
guanyuan_aq,116.339,39.929
wanshouxigong_aq,116.352,39.878
aotizhongxin_aq,116.397,39.982
nongzhanguan_aq,116.461,39.937


GridID,latitude,longitude
beijing_grid_000,39.0,115
beijing_grid_001,39.1,115
beijing_grid_002,39.2,115
beijing_grid_003,39.3,115
beijing_grid_004,39.4,115
beijing_grid_005,39.5,115


### Step 2: Calculate distances and find the closest ones

In [15]:
#' Find, for every row of df1, the nearest row of df2 by Haversine distance.
#'
#' @param df1 Data frame whose first column holds the identifiers and which has
#'   numeric columns named 'longitude' and 'latitude'. One output row is
#'   produced per row of df1, in the same order.
#' @param df2 Data frame with the same layout; the candidate locations that
#'   each row of df1 is matched against.
#' @return A data frame with columns station1 (identifier from df1), station2
#'   (identifier of the nearest df2 row) and distance (the value returned by
#'   distHaversine for that pair). When several df2 rows are equally close,
#'   the first one in df2 is reported. station2 and distance are NA when no
#'   distance could be computed for a row (df2 is empty or coordinates are
#'   missing).
closest_stations <- function(df1, df2) {
    coord_cols <- c("longitude", "latitude")
    stopifnot(
        all(coord_cols %in% names(df1)),
        all(coord_cols %in% names(df2))
    )

    ids_df1 <- as.character(df1[[1]])
    ids_df2 <- as.character(df2[[1]])
    n_df1 <- length(ids_df1)
    n_df2 <- length(ids_df2)

    # Preallocate the result columns; NA marks "no match found".
    nearest_id <- rep(NA_character_, n_df1)
    nearest_distance <- rep(NA_real_, n_df1)

    # Skip the distance computation entirely for empty inputs so that a
    # zero-row data frame yields a well-formed (possibly empty) result.
    if (n_df1 > 0 && n_df2 > 0) {
        # One call computes the full n_df1 x n_df2 distance matrix instead of
        # growing a data frame one pair at a time.
        dist_matrix <- distm(
            as.matrix(df1[, coord_cols]),
            as.matrix(df2[, coord_cols]),
            fun = distHaversine
        )

        for (i in seq_len(n_df1)) {
            # which.min ignores NA and returns the first minimum, so ties
            # resolve to the earliest df2 row; it is empty if the row is all NA.
            j <- which.min(dist_matrix[i, ])
            if (length(j) == 1L) {
                nearest_id[i] <- ids_df2[j]
                nearest_distance[i] <- dist_matrix[i, j]
            }
        }
    }

    data.frame(
        station1 = ids_df1,
        station2 = nearest_id,
        distance = nearest_distance,
        stringsAsFactors = FALSE
    )
}

#### It is HIGHLY recommended to pass the smaller data frame as df1

In [17]:
# Nearest weather grid point for each Beijing air-quality station
beijing_closest_stations <- closest_stations(
    df1 = bj_aq_station,
    df2 = bj_grid
)
beijing_closest_stations

station1,station2,distance
dongsi_aq,beijing_grid_303,3539.569
tiantan_aq,beijing_grid_303,1669.2151
guanyuan_aq,beijing_grid_282,4637.8885
wanshouxigong_aq,beijing_grid_303,4775.641
aotizhongxin_aq,beijing_grid_304,2020.0203
nongzhanguan_aq,beijing_grid_324,5296.3861
wanliu_aq,beijing_grid_283,1823.0318
beibuxinqu_aq,beijing_grid_263,2478.1752
zhiwuyuan_aq,beijing_grid_262,637.0893
fengtaihuayuan_aq,beijing_grid_282,4492.5205


### London data

In [23]:
# Read the London location files and assign column names.
# The station file is read with read.csv's default header = TRUE.
ld_aq_station <- read.csv('London_AirQuality_Stations.csv')
# Keep only columns 1, 5 and 6; they are renamed below to the station
# identifier, latitude and longitude.
ld_aq_station <- ld_aq_station[,c(1,5,6)]
colnames(ld_aq_station) <- c('Station','latitude','longitude')
# The grid file is read as headerless. FALSE is used instead of F because F is
# an ordinary variable that can be reassigned, while FALSE is a reserved word.
ld_grid <- read.csv('London_grid_weather_station.csv', header = FALSE)
colnames(ld_grid) <- c('GridID','latitude','longitude')

In [25]:
# Store the identifier columns as character vectors
ld_aq_station[["Station"]] <- as.character(ld_aq_station[["Station"]])
ld_grid[["GridID"]] <- as.character(ld_grid[["GridID"]])

In [26]:
# Browse data: str() prints each data frame's dimensions and column types,
# then head() displays the first rows of each table.
str(ld_aq_station)
str(ld_grid)
head(ld_aq_station)
head(ld_grid)

'data.frame':	24 obs. of  3 variables:
 $ Station  : chr  "BX9" "BX1" "BL0" "CD9" ...
 $ latitude : num  51.5 51.5 51.5 51.5 51.5 ...
 $ longitude: num  0.185 0.185 -0.126 -0.129 -0.175 ...
'data.frame':	861 obs. of  3 variables:
 $ GridID   : chr  "london_grid_000" "london_grid_001" "london_grid_002" "london_grid_003" ...
 $ latitude : num  50.5 50.6 50.7 50.8 50.9 51 51.1 51.2 51.3 51.4 ...
 $ longitude: num  -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 ...


Station,latitude,longitude
BX9,51.46598,0.1848771
BX1,51.46598,0.1848771
BL0,51.52229,-0.125848
CD9,51.52771,-0.1290532
CD1,51.54422,-0.175284
CT2,51.51453,-0.1045156


GridID,latitude,longitude
london_grid_000,50.5,-2
london_grid_001,50.6,-2
london_grid_002,50.7,-2
london_grid_003,50.8,-2
london_grid_004,50.9,-2
london_grid_005,51.0,-2


In [27]:
# Nearest weather grid point for each London air-quality station
london_closest_stations <- closest_stations(
    df1 = ld_aq_station,
    df2 = ld_grid
)
london_closest_stations

station1,station2,distance
BX9,london_grid_472,3929.17
BX1,london_grid_472,3929.17
BL0,london_grid_409,3059.76
CD9,london_grid_409,3682.914
CD1,london_grid_388,5211.633
CT2,london_grid_409,1646.945
CT3,london_grid_409,2179.318
CR8,london_grid_408,2214.003
GN0,london_grid_451,2087.362
GR4,london_grid_451,5654.537


### Step 3: Write output to local files

In [None]:
# Uncomment to save the closest-station tables as CSV files. The file names are
# relative paths, and write.csv also writes the row names unless
# row.names = FALSE is passed.
#write.csv(beijing_closest_stations, file = "beijing_closest_stations.csv")
#write.csv(london_closest_stations, file = "london_closest_stations.csv")