In [103]:
library(tidyverse)
library(xml2)
library(rvest)
library(httr)
#install.packages("readxl")
library(readxl)
#install.packages("furrr")
library(furrr)
# Choose the future backend that furrr's future_map_*() calls will run on.
# NOTE(review): per the future package docs, `multicore` relies on forking and
# falls back to sequential execution where forking is unavailable (e.g.
# Windows) -- confirm this is the intended backend for the target environment.
future::plan(multicore)

## Information variables

In [4]:
# Base address of the site; every path below is appended to it
main_url <- "https://www.emi.ea.govt.nz"
# Path of the page listing the generation output (Generation_MD) files
power_url <- "/Wholesale/Datasets/Generation/Generation_MD"
# Path of the page listing the existing generation fleet spreadsheet
stations_url <- "/Wholesale/Datasets/Generation/Generation_fleet/Existing"
# Path (including its query string) of the network supply points report
points_url <- "/Wholesale/Reports/R_NSPL_DR?_si=v|3"

## Retrieve generation output data

### Web Scraping

In [105]:
# Scrape the generation dataset index page, pick the CSV links for the wanted
# years, download them in parallel and keep only the hydro rows.
# Progress is reported with message() so it is emitted as status output
# instead of relying on auto-printing of bare string literals.

# Acquire html code from sub address
source <- paste0(main_url, power_url) %>% read_html()
message("Source checkpoint")

# Collect the href of every anchor inside elements with class "table"
all_links <- source %>%
  html_nodes(".table") %>%
  html_elements("a") %>%
  html_attr("href")
message("Links identification checkpoint")

# Keep unique links located under the generation sub address.
# Anchors without an href yield NA and startsWith() propagates NA, which
# would otherwise leave NA entries in the subset, so they are excluded here.
filtered_links <- all_links[
  !is.na(all_links) &
    startsWith(all_links, power_url) &
    !duplicated(all_links)
]
message("Links filtered checkpoint")

# The four characters following "<power_url>/" are read as the year
start_pos <- nchar(power_url) + 2
dates <- substring(filtered_links, start_pos, start_pos + 3)
message("Date extraction checkpoint")

# Set dates wanted
dates_wanted <- seq(from = 2015, to = 2020)

# Select links whose year is wanted (compare as text, since `dates` is text)
selected_links <- filtered_links[dates %in% as.character(dates_wanted)]
if (length(selected_links) == 0) {
  stop(
    "No generation links found under '", power_url,
    "' for the requested years",
    call. = FALSE
  )
}
message("Links selected checkpoint")

# Download every selected CSV and row-bind the results into one tibble
link <- paste0(main_url, selected_links)
generation_md <- link %>% future_map_dfr(read_csv, show_col_types = FALSE)
message("Generation retrieval checkpoint")

# Filter out non hydro stations
generation_md_hydro <- generation_md %>% filter(Tech_Code == "Hydro")
message("Data filter checkpoint")
message("RETRIEVAL COMPLETE")

## Retrieve hydrostations fleet information

In [6]:
# Download the existing generation fleet spreadsheet and build
# `generation_fleet` (hydro stations) and `generation_fleet_SI` (South Island
# hydro stations, ordered by GroupName, reduced to the descriptive columns).

# Acquire html code from sub address
source <- paste0(main_url, stations_url) %>% read_html()

# Collect the href of every anchor inside elements with class "xls"
all_links <- source %>%
  html_nodes(".xls") %>%
  html_elements("a") %>%
  html_attr("href")

# Keep unique links under the fleet sub address; NA hrefs are excluded
# explicitly because startsWith() propagates NA into the subset.
filtered_links <- all_links[
  !is.na(all_links) &
    startsWith(all_links, stations_url) &
    !duplicated(all_links)
]
selected_link <- filtered_links

# Exactly one spreadsheet link is expected. With zero links the pasted URL
# would collapse to the bare site address and the home page would be saved
# as the "spreadsheet"; with several, GET() cannot take a vector of URLs.
if (length(selected_link) != 1) {
  stop(
    "Expected exactly one fleet spreadsheet link, found ",
    length(selected_link),
    call. = FALSE
  )
}

# Save spreadsheet into a temporary file and the data into a tibble
url <- paste0(main_url, selected_link)
spreadsheet <- tempfile(fileext = ".xls")
output <- GET(url, write_disk(spreadsheet))
# Fail here on an HTTP error rather than later when parsing an error page
stop_for_status(output)

generation_fleet <- read_excel(spreadsheet, sheet = "Generating Stations") %>%
  filter(Generation_Type == "Hydro")

# Ordering by GroupName directly gives the same row order as
# group_by() + arrange(.by_group = TRUE) without leaving the result grouped.
generation_fleet_SI <- generation_fleet %>%
  filter(Island_Name == "SI - South Island") %>%
  arrange(GroupName) %>%
  select(all_of(c(
    "Station_Name",
    "GroupName",
    "Owner_Name",
    "Operators_Name",
    "Node_Name",
    "Region_Name",
    "Island_Name"
  )))

## Retrieve Network supply points

In [11]:
# Locate the download link on the network supply points report page and read
# the CSV behind it into `supply_points` (node code plus NZTM coordinates).
source <- paste0(main_url, points_url) %>% read_html()

# Collect the href of every anchor inside ".emi-btn-no-border" elements
all_links <- source %>%
  html_nodes(".emi-btn-no-border") %>%
  html_elements("a") %>%
  html_attr("href")

# Keep unique site-relative links; NA hrefs are dropped inside the mask so
# they never reach the subset.
filtered_links <- all_links[
  !is.na(all_links) &
    startsWith(all_links, "/Wholesale") &
    !duplicated(all_links)
]
selected_link <- filtered_links

# Exactly one download link is expected. With zero links the pasted URL
# would collapse to the bare site address; read.csv() also cannot take a
# vector of URLs.
if (length(selected_link) != 1) {
  stop(
    "Expected exactly one supply points download link, found ",
    length(selected_link),
    call. = FALSE
  )
}

url <- paste0(main_url, selected_link)

# The first 6 lines of the file are skipped before the header row is read.
# POC.code is renamed to Node_Name so it can be used as the join key.
supply_points <- url %>%
  read.csv(skip = 6) %>%
  select(Node_Name = POC.code, NZTM.easting, NZTM.northing)

## Add NZTM coordinates from supply points to generation fleet

In [12]:
# Attach the NZTM coordinates to each South Island hydro station by matching
# on Node_Name, then keep only the first row found for every Station_Name.
# merge() keeps matching rows only, so a station whose node has no entry in
# supply_points is absent from gen_fleet (the original author noted that one
# hydro station is missing for this reason).
gen_fleet <- generation_fleet_SI %>%
  merge(supply_points, by = "Node_Name") %>%
  subset(!duplicated(Station_Name))

In [13]:
# Display the merged fleet table (stations with their NZTM coordinates)
gen_fleet 

Unnamed: 0_level_0,Node_Name,Station_Name,GroupName,Owner_Name,Operators_Name,Region_Name,Island_Name,NZTM.easting,NZTM.northing
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<int>,<int>
1,ABY0111,Opuha,,Alpine Energy,Contact Energy,SCN - South Canterbury,SI - South Island,1424397,5097843
4,ARA2201,Amethyst,,Westpower,Westpower,WEC - West Coast,SI - South Island,1873657,5721161
6,ARG1101,Argyle,Branch Hydro Scheme,Trustpower,Trustpower,NEL - Nelson/Marlbourough,SI - South Island,1616837,5386748
9,ARG1101,Wairau,Branch Hydro Scheme,Trustpower,Trustpower,NEL - Nelson/Marlbourough,SI - South Island,1616837,5386748
12,ASB0331,Montalto,Rangitata Diversion Race,Trustpower,Trustpower,CAN - Canterbury,SI - South Island,1503871,5133909
14,ASB0661,Highbank,Rangitata Diversion Race,Trustpower,Trustpower,CAN - Canterbury,SI - South Island,1503871,5133909
32,ASB0661,Cleardale,,MainPower,MainPower,CAN - Canterbury,SI - South Island,1503871,5133909
50,AVI2201,Aviemore,Waitaki Hydro Scheme,Meridian Energy,Meridian Energy,SCN - South Canterbury,SI - South Island,1390245,5051586
52,BEN2202,Benmore,Waitaki Hydro Scheme,Meridian Energy,Meridian Energy,SCN - South Canterbury,SI - South Island,1377232,5061385
53,BLN0331,Waihopai,,Trustpower,Trustpower,NEL - Nelson/Marlbourough,SI - South Island,1677835,5405479


ERROR: Error in eval(expr, envir, enclos): object 'gen_fleet' not found
