In [1]:
import os
import zipfile
import pandas as pd
from tqdm import tqdm

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException
from selenium.common import TimeoutException

In [2]:
# Weather station IDs for Schleswig-Holstein. They are kept as zero-padded strings
# because they are compared against parts of the archive file names.
schleswig_holstein_weather_station_ids = [
    '05516',
    '02564',
    '04466',
    '03032',
    '02115',
]

In [3]:
# Extract the wind data archives of the Schleswig-Holstein weather stations.
# The existence of the target directory is used as the "already done" marker:
# once it is present, the whole extraction step is skipped.
if os.path.exists('wind_data_historic/schleswig_holstein'):
    print('Data have already been extracted. Aborting.')
else:
    for file in os.listdir('wind_data_historic/historical'):
        if not file.endswith('.zip'):
            continue

        # The station ID is taken from the third underscore-separated part of the
        # archive name. An archive with fewer parts cannot match any wanted station,
        # so it is skipped instead of aborting the loop with an IndexError.
        name_parts = file.split('_')
        if len(name_parts) < 3:
            continue
        weather_station_id = name_parts[2]

        if weather_station_id not in schleswig_holstein_weather_station_ids:
            continue

        # All matching archives are unpacked into the same target directory.
        with zipfile.ZipFile(f'wind_data_historic/historical/{file}', 'r') as zip_ref:
            zip_ref.extractall('wind_data_historic/schleswig_holstein')

Data have already been extracted. Aborting.


Before continuing with the pre-processing and feature engineering, I will take a detour and look at the types of wind turbines in the data and their cut-in and cut-out speeds (i.e. their operational limits with respect to wind speed).

In [4]:
# Load the semicolon-separated file with wind turbine data.
# on_bad_lines='warn' makes pandas warn about a malformed row and skip it instead of raising.
wind_turbine_data = pd.read_csv('wind_turbine_data/2023-10-30.csv', sep=';', on_bad_lines='warn')

In [5]:
# Print a summary of the loaded frame: columns, non-null counts and dtypes.
wind_turbine_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3944 entries, 0 to 3943
Data columns (total 19 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   KREIS                 3944 non-null   object
 1   GEMEINDE              3944 non-null   object
 2   TYP                   3931 non-null   object
 3   HERSTELLER            3944 non-null   object
 4   NABENHOEHE            3916 non-null   object
 5   ROTORDURCHMESSER      3915 non-null   object
 6   SCHALLLEISTUNGSPEGEL  3725 non-null   object
 7   LEISTUNG              3944 non-null   int64 
 8   LEISTUNGSBEZUG        3944 non-null   object
 9   OSTWERT               3944 non-null   int64 
 10  NORDWERT              3944 non-null   int64 
 11  GENEHMIGT_AM          3553 non-null   object
 12  INBETRIEBNAHME        3157 non-null   object
 13  STATUS                3944 non-null   object
 14  BST_NR                3944 non-null   int64 
 15  ANL_NR                3944 non-null   

Pre-process the wind turbine type column (`TYP`) so that different spellings of the same type are consolidated into a single entry.

In [6]:
# Remove every character that is not an ASCII letter, digit or space, and drop the
# literal tokens "wka" / "WKA" from the type names. The pattern previously ended in an
# empty alternative ("|)"), which matched the empty string at every position; it had no
# effect on the result and has been removed.
wind_turbine_data['TYP'] = wind_turbine_data['TYP'].str.replace(pat='([^a-zA-Z0-9 ]|wka|WKA)', repl='', regex=True)

In [7]:
# Lower-case the type names so that entries differing only in capitalisation become identical.
wind_turbine_data['TYP'] = wind_turbine_data['TYP'].str.lower()

In [8]:
# Trim leading and trailing whitespace from the type names.
wind_turbine_data['TYP'] = wind_turbine_data['TYP'].str.strip()

In [9]:
# Remove one leading token from the type names: either the word "typ" or a run of digits,
# in both cases together with the single space that follows it.
# NOTE(review): the pattern is anchored at the start of the string, so re-running this cell
# strips a further leading token from values such as "typ 123 ..." — run it once per load.
wind_turbine_data['TYP'] = wind_turbine_data['TYP'].str.replace(pat='^(typ|[0-9]+) ', repl='', regex=True)

In [10]:
# Scrape the product specification of every wind turbine type in the dataset from
# wind-turbine-models.com using a headless Firefox session.
options = webdriver.FirefoxOptions()
options.add_argument('-headless')

driver = webdriver.Firefox(options=options)

# Exceptions the explicit waits keep polling through instead of failing immediately
ignored_exceptions = (NoSuchElementException, StaleElementReferenceException)
wait = WebDriverWait(driver, 10, ignored_exceptions=ignored_exceptions)

# One entry per scraped type: {type name: [{label: value}, ...]}.
# The specification is a list of single-key dicts because labels such as "Type" and
# "Manufacturer" occur several times on a detail page.
data = []

# The browser is closed in `finally` so that an interrupted or failed run does not
# leave a headless Firefox process behind. `data` keeps whatever was scraped so far.
try:
    driver.get("https://en.wind-turbine-models.com/search?q=enercon+e70+e4+2.000")

    # Reject cookies on first visit. The button is awaited explicitly rather than looked
    # up right after the page load, so a late-rendering banner does not raise
    # NoSuchElementException.
    reject_cookies_button = wait.until(
        expected_conditions.element_to_be_clickable((By.CLASS_NAME, 'cc-decline-all'))
    )
    reject_cookies_button.click()

    for wind_turbine_type in tqdm(wind_turbine_data['TYP'].value_counts().index, desc='Scraping product specifications for wind turbine types in our dataset:'):
        search_input = driver.find_element(By.CSS_SELECTOR, 'input.searchbar')
        search_input.clear()
        search_input.send_keys(wind_turbine_type)
        search_input.submit()

        try:
            # Navigate to wind turbine results category
            turbine_category = wait.until(
                expected_conditions.visibility_of_element_located((By.XPATH, '//a[@href="#turbines"]'))
            )
            turbine_category.click()

            # Click into detailed view of the first wind turbine result
            wind_turbine_link = wait.until(
                expected_conditions.visibility_of_element_located((By.XPATH, '//div[@id="turbines"]/div/div/div[@class="section-body"]/div[1]'))
            )
            wind_turbine_link.click()
        except TimeoutException:
            # No turbine category or no result became visible within the timeout:
            # skip this type instead of aborting the whole scraping run.
            continue

        # Collect every label/value row of the specification tables on the detail page
        wind_turbine_specification = []
        for row in driver.find_elements(By.CSS_SELECTOR, '.tabbox-table > .row'):
            label = row.find_element(By.CSS_SELECTOR, '.col-left').text.replace(':', '')
            content = row.find_element(By.CSS_SELECTOR, '.col-right').text
            wind_turbine_specification.append({label: content})

        data.append({wind_turbine_type: wind_turbine_specification})
finally:
    driver.quit()

Scraping product specifications for wind turbine types in our dataset::   0%|          | 3/801 [00:27<2:02:27,  9.21s/it]


KeyboardInterrupt: 

In [11]:
# Show the scraped specifications: one {type name: [{label: value}, ...]} dict per scraped type.
data

[{'enercon e70 e4': [{'Rated power': '2,000.0 kW'},
   {'Flexible power ratings': '-'},
   {'Cut-in wind speed': '2.5 m/s'},
   {'Rated wind speed': '13.5 m/s'},
   {'Cut-out wind speed': '34.0 m/s'},
   {'Survival wind speed': '-'},
   {'Wind zone (DIBt)': '-'},
   {'Wind class (IEC)': '-'},
   {'Diameter': '71.0 m'},
   {'Swept area': '3,959.0 m²'},
   {'Number of blades': '3'},
   {'Rotor speed, max': '21.5 U/min'},
   {'Tipspeed': '80 m/s'},
   {'Type': '-'},
   {'Material': 'GFK'},
   {'Manufacturer': 'Enercon'},
   {'Power density 1': '505.2 W/m²'},
   {'Power density 2': '2.0 m²/kW'},
   {'Type': 'with out. direct drive'},
   {'Stages': '-'},
   {'Ratio': '-'},
   {'Manufacturer': '-'},
   {'Type': 'Synchronous'},
   {'Number': '1'},
   {'Speed, max': '21.5 U/min'},
   {'Voltage': '690.0 V'},
   {'Grid connection': 'IGBT'},
   {'Grid frequency': '50 Hz'},
   {'Manufacturer': 'Enercon'},
   {'Hub height': '64/85/98/99/113 m'},
   {'Type': 'Steel tube/ concrete'},
   {'Shape': 'co