In [99]:
import os
import zipfile
import pandas as pd
from tqdm import tqdm

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException

In [3]:
schleswig_holstein_weather_station_ids = ['05516', '02564', '04466', '03032', '02115']

In [4]:
# Extract schleswig holstein wind data
if os.path.exists('wind_data_historic/schleswig_holstein'):
    print('Data have already been extracted. Aborting.')
else:
    for file in os.listdir('wind_data_historic/historical'):
        if not file.endswith('.zip'):
            continue
        
        weather_station_id = file.split('_')[2]
        
        if not (weather_station_id in schleswig_holstein_weather_station_ids):
            continue
        
        with zipfile.ZipFile(f'wind_data_historic/historical/{file}', 'r') as zip_ref:
            zip_ref.extractall(f'wind_data_historic/schleswig_holstein')

Data have already been extracted. Aborting.


Before continuing with the pre-processing and feature engineering, I will take a detour and look at the types of wind turbines in the data and their cut-in and cut-out speeds (ie. operational limits wrt wind speed).

In [81]:
# Load tab separated file with wind turbine data
wind_turbine_data = pd.read_csv('wind_turbine_data/2023-10-30.csv', sep=';', on_bad_lines='warn')

In [82]:
wind_turbine_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3944 entries, 0 to 3943
Data columns (total 19 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   KREIS                 3944 non-null   object
 1   GEMEINDE              3944 non-null   object
 2   TYP                   3931 non-null   object
 3   HERSTELLER            3944 non-null   object
 4   NABENHOEHE            3916 non-null   object
 5   ROTORDURCHMESSER      3915 non-null   object
 6   SCHALLLEISTUNGSPEGEL  3725 non-null   object
 7   LEISTUNG              3944 non-null   int64 
 8   LEISTUNGSBEZUG        3944 non-null   object
 9   OSTWERT               3944 non-null   int64 
 10  NORDWERT              3944 non-null   int64 
 11  GENEHMIGT_AM          3553 non-null   object
 12  INBETRIEBNAHME        3157 non-null   object
 13  STATUS                3944 non-null   object
 14  BST_NR                3944 non-null   int64 
 15  ANL_NR                3944 non-null   

Preprocess wind power plant type data to consolidate different data entries of the same type.

In [83]:
wind_turbine_data['TYP'] = wind_turbine_data['TYP'].str.replace(pat='([^a-zA-Z0-9 ]|wka|WKA|)', repl='', regex=True)

In [84]:
wind_turbine_data['TYP'] = wind_turbine_data['TYP'].str.lower()

In [85]:
wind_turbine_data['TYP'] = wind_turbine_data['TYP'].str.strip()

In [86]:
wind_turbine_data['TYP'] = wind_turbine_data['TYP'].str.replace(pat='^(typ|[0-9]+) ', repl='', regex=True)

In [101]:
driver = webdriver.Firefox()
driver.get("https://en.wind-turbine-models.com/search?q=enercon+e70+e4+2.000")

ignored_exceptions = (NoSuchElementException, StaleElementReferenceException)

# Reject cookies on first visit
reject_cookies_button = driver.find_element(By.CLASS_NAME, 'cc-decline-all')

wait = WebDriverWait(driver, 10)
wait.until(lambda driver: reject_cookies_button.is_displayed() and reject_cookies_button.is_enabled())

reject_cookies_button.click()

for wind_turbine_type in tqdm(wind_turbine_data['TYP'].value_counts().index, desc='Scraping product specifications for wind turbine types in our dataset:'):
    search_input = driver.find_element(By.CSS_SELECTOR, 'input.searchbar')
    search_input.clear()
    search_input.send_keys(wind_turbine_type)
    search_input.submit()
    
    # Navigate to wind turbine results category
    # TODO: Might be unavailable if no result is found    
    turbine_category = WebDriverWait(driver, 10, ignored_exceptions=ignored_exceptions)\
                        .until(expected_conditions.presence_of_element_located((By.XPATH, '//a[@href="#turbines"]')))
    
    turbine_category.click()
    
    # Click into detailed view of wind turbine
    # TODO: Might be unavailable if no result is found
    wind_turbine_link = driver.find_element(By.XPATH, '//div[@id="turbines"]/div/div/div[@class="section-body"]/div[1]')
    
    wait = WebDriverWait(driver, 10)
    wait.until(lambda driver: wind_turbine_link.is_displayed())
    
    wind_turbine_link.click()
    
    wind_turbine_specification_rows = driver.find_elements(By.CSS_SELECTOR, '.tabbox-table > .row')
    
    data = []
    for row in wind_turbine_specification_rows:
        label = row.find_element(By.CSS_SELECTOR, '.col-left').text
        content = row.find_element(By.CSS_SELECTOR, '.col-right').text
        
        label = label.replace(':', '')
        
        data.append({wind_turbine_type: {label: content}})

Scraping product specifications for wind turbine types in our dataset::   0%|          | 0/801 [00:00<?, ?it/s]


ElementNotInteractableException: Message: Element <a class="btn btn-default" href="#turbines"> could not be scrolled into view
Stacktrace:
RemoteError@chrome://remote/content/shared/RemoteError.sys.mjs:8:8
WebDriverError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:193:5
ElementNotInteractableError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:353:5
webdriverClickElement@chrome://remote/content/marionette/interaction.sys.mjs:166:11
interaction.clickElement@chrome://remote/content/marionette/interaction.sys.mjs:135:11
clickElement@chrome://remote/content/marionette/actors/MarionetteCommandsChild.sys.mjs:204:29
receiveMessage@chrome://remote/content/marionette/actors/MarionetteCommandsChild.sys.mjs:84:31
