The purpose of this notebook is to use the APRS World website to download daily summary data 
for the Bergey turbines in our study. This level of data resolution is available prior to June 2018
for most turbines. The notebook uses the BeautifulSoup library to screen-scrape the data from the web
and saves it out as a compressed CSV named 'daily_summaries.csv.bz2' in this directory.

Author: Caleb Phillips (caleb.phillips@nrel.gov)

In [17]:
import pandas as pd
import numpy as np
import requests
import re
from bs4 import BeautifulSoup
from tqdm import tqdm

In [18]:
sites = pd.read_csv("bergey_sites.csv")
sites.head()

Unnamed: 0,APRS ID,AID,Public Site Name,Internal Site Name,State,Latitude,Longitude,Hub Height (m),Lidar Quality,Lidar Collection Year,...,Building Data Quality,Turbine,Periods with Consistent Generation Data,Met Tower,Met Tower Latitude,Met Tower Longitude,Measurement Height (m),Measurement Privacy,Bergey Annual Average Wind Speed (m/s),Bergey Generation (kWh) (0% Loss Assumption)
0,t007,A2744,Fremont,Fremont,MN,43.918622,-91.899498,41,N/A (Legacy Data),,...,"Partial, some missing",Bergey Excel 10,2012/01 - 2013/12 (outage starting mid-2013/11...,mn_prairiestar.PROPRIETARY.pruf.csv,43.673,-92.698,50.0,Proprietary,5.51,16459
1,t024,A2690,Washoe,Thomas Danzinger,NV,39.331088,-119.82023,30,QL1,2020.0,...,Good coverage,Bergey Excel 10,2011/01 - 2022/12,,,,,,5.86,19184
2,t028,A2672,Towamensing,Derr,PA,40.851353,-75.598395,30,QL2,2019.0,...,"Partial, some missing",Bergey Excel 10,2011/01 - 2022/12,,,,,,4.6,9670
3,t034,A3685,Lycoming,Dan Poor,NY,43.524158,-76.37229,31,QL2,2018.0,...,Good coverage,Bergey Excel 10,"2013/01 - 2017/12, 2019/01 - 2021/12",ny_oswego.qc.csv,43.464,-76.511,15.0,Public,6.55,24828
4,t041,A3933,Rockford,Fossil Park,IA,43.047418,-92.981674,37,QL2,2020.0,...,Good coverage,Bergey Excel 10,"2013/01 - 2013/12, 2015/01 - 2022/12",,,,,,5.94,19829


In [19]:
def get_months(aid):
    url = "http://mybergey.aprsworld.com/data/ps2/historical.php?station_id="+aid
    html = requests.get(url)
    soup = BeautifulSoup(html.content, "html.parser")
    input_elements = soup.find_all("input")
    months = []
    for i in input_elements:
        if i.has_attr('name') and (i['name'] != "months[]"):
            continue
        m = re.match(r'\d{4}-\d{2}',i['value'])
        if m is None:
            continue
        months.append(i['value'])
    
    return months

In [20]:
def get_data(aid,months):
    url = "http://mybergey.aprsworld.com/data/ps2/historicalMonthS.php?months%5B%5D="+"&months%5B%5D=".join(months)+"&station_id="+aid
    html = requests.get(url)
    soup = BeautifulSoup(html.content, "html.parser")
    row_elements = soup.find_all("tr")
    rows = []
    for r in row_elements:
        td = r.find_all("td")
        # skip rows that don't look like data rows
        if len(td) == 0 or td[0] is None or td[0].text.strip == "":
            continue
        m = re.match(r'\d{4}-\d{2}-\d{2}',td[0].text.strip())
        if m is None:
            continue
        date = td[0].text.strip()
        n = td[1].text.strip()
        energy_kwh = td[2].text.strip()
        power_min = td[3].text.strip()
        power_max = td[4].text.strip()
        power_avg = td[5].text.strip()
        soft_grid = td[6].text.strip()
        faults = td[7].text.strip()
        rows.append([date,n,energy_kwh,power_min,power_max,power_avg,soft_grid,faults])
    
    return rows
    
def format_data(rows):
    df = pd.DataFrame(rows,columns=["date","n","energy_kwh","power_min_w","power_max_w","power_avg_w","soft_grid","faults"])
    df["n"] = df["n"].str.replace(',','').astype('int64')
    df["date"] = df["date"].str.replace('(Today)',"",regex=False)
    df["power_max_w"] = df["power_max_w"].str.replace(',','')
    df["power_min_w"] = df["power_min_w"].str.replace(',','')
    df["power_avg_w"] = df["power_avg_w"].str.replace(',','')
    df["energy_kwh"] = df["energy_kwh"].str.replace(',','').replace("—","")
    df = df.replace("",np.nan)
    return df

In [21]:
dfs = []
for aid in tqdm(sites['AID']):
    months = get_months(aid)
    rows = get_data(aid,months)
    df = format_data(rows)
    df["AID"] = aid
    dfs.append(df)

combined_df = pd.concat(dfs)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 19/19 [03:06<00:00,  9.79s/it]


In [22]:
combined_df

Unnamed: 0,date,n,energy_kwh,power_min_w,power_max_w,power_avg_w,soft_grid,faults,AID
0,2023-05-16,8616,18,-56,5906,840,No,,A2744
1,2023-05-16,8616,18,-56,5906,840,No,,A2744
2,2023-05-15,8642,6,-59,4719,275,No,,A2744
3,2023-05-14,8637,59,13,12435,2531,No,,A2744
4,2023-05-13,8637,56,9,11224,2452,No,,A2744
...,...,...,...,...,...,...,...,...,...
3329,2011-02-11,8715,10,-63,6530,424,No,,A2671
3330,2011-02-10,8717,8,-56,7727,393,No,,A2671
3331,2011-02-09,8705,18,-38,6038,845,No,,A2671
3332,2011-02-08,7136,13,-59,7383,709,No,,A2671


In [23]:
combined_df.to_csv("daily_summaries.csv.bz2",index=False)