Preparing environment with necessary imports

In [1]:
import pandas as pd
import numpy as np
from pysolar.radiation import *
from pysolar.solar import *

Reading in the already prepared CSV file with data from https://wind-erleben.de <br>
Additionally, converting the data to the correct types and removing columns that are not used in the model

In [2]:
# Load the prepared measurement export; the file is tab-separated despite its
# .csv extension.
df = pd.read_csv('2018.csv', sep='\t')

# Parse the timestamp column and attach the Europe/Berlin time zone.
# NOTE(review): ambiguous=True is forwarded to tz_localize as-is, while the
# weather timestamps in the next cell use ambiguous='infer' -- confirm that the
# two different DST-disambiguation settings are intended.
df['measurement_date'] = (
    pd.to_datetime(df['measurement_date'])
      .dt.tz_localize('Europe/Berlin', ambiguous=True)
)

# Columns that are not used by the model; they are dropped right after loading.
unused_columns = [
    "pitch_degrees",
    "rotor_speed_rpm",
    "wind_direction_degrees",
    "rotation_gondola_degrees",
    "charging_station_w",
    "battery_drain_or_load_w",
    "state_of_charge_percent",
    "battery_voltage_v",
    "rlm_solar_kw",
    "slp_solar_kw",
]
df = df.drop(columns=unused_columns)

Reading in the weather data and merging with main csv

In [3]:
# Load the weather table (tab-separated) and make its timestamps timezone-aware
# in Europe/Berlin, the same zone that was attached to df.measurement_date.
weather = pd.read_csv('full_weather_data.csv', sep='\t')
weather.time = pd.to_datetime(weather.time)
weather.time = weather.time.dt.tz_localize('Europe/Berlin', ambiguous='infer')

# As-of join: with merge_asof's default direction ('backward') every measurement
# row receives the most recent weather row whose time is <= measurement_date.
# merge_asof requires BOTH inputs to be sorted on their key and raises otherwise,
# so the weather frame is sorted here as well instead of relying on the order of
# the file. A stable sort (mergesort) keeps rows with identical timestamps in
# their original relative order, so already-sorted input is left unchanged.
df = pd.merge_asof(df.sort_values('measurement_date'),
                   weather.sort_values('time', kind='mergesort'),
                   left_on=['measurement_date'], right_on=['time'])

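Tidying up the data: removing rows whose values are inconsistent with each other (production components that do not add up to the reported total, or consumption that exceeds production plus purchase), rows with zero power use, rows with out-of-range CHP or purchase values, and rows with missing values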

In [4]:
# Row-level sanity checks. A row is kept only if it passes every check below;
# all tolerances and limits are taken over unchanged.
component_sum = df.solar_generation_kw + df.wind_generation_kw + df.chp_kw
supply_minus_use = (df.total_production_kw + df.electricity_purchase_kw) - df.power_use_kw

plausible_rows = (
    # solar + wind + CHP must match the reported total to within 1
    ((component_sum - df.total_production_kw).abs() < 1)
    # zero power use is discarded (it is also the divisor of green_energy later)
    & (df.power_use_kw != 0)
    # production plus purchase may fall short of the power use by less than 1
    & (supply_minus_use > -1)
    # upper limits on the CHP and purchase readings
    & (df.chp_kw < 1000)
    & (df.electricity_purchase_kw < 10000)
)

# Apply the combined mask, then discard every row that still has a missing value.
df = df[plausible_rows].dropna()

Important constants for feature generation

In [5]:
# Weather description strings that are kept as their own category. In the
# visible cells this list is referenced only by the commented-out
# `filtered_weather` feature.
important_weather = [
    'Passing clouds.',
    'Partly sunny.',
    'Scattered clouds.',
    'Broken clouds.',
    'Fog.',
    'Cool.',
    'Partly cloudy.',
    'Mild.',
    'Overcast.',
    'Mostly cloudy.',
]

# Latitude / longitude passed to pysolar's get_altitude during feature generation
# (presumably decimal degrees for Uffenheim -- confirm).
uffenheim_lat, uffenheim_lng = 49.5450, 10.2338

Feature generation

In [6]:
timestamps = df['measurement_date']

# Sun altitude for every measurement timestamp as returned by pysolar's
# get_altitude, floored at 0.
df['angle_of_sun'] = [
    max(get_altitude(uffenheim_lat, uffenheim_lng, moment), 0)
    for moment in timestamps
]

# Direct radiation from pysolar for the same timestamp / floored altitude pair.
df['solar_radiation'] = [
    get_radiation_direct(moment, altitude)
    for moment, altitude in zip(timestamps, df['angle_of_sun'].values)
]

# 1 when the CHP reading is above 10, otherwise 0.
df['chp_online'] = df['chp_kw'].gt(10).astype(int)

# Raw calendar components of the timestamp.
df['weekday'] = timestamps.dt.weekday
df['time_of_day'] = timestamps.dt.hour
df['month'] = timestamps.dt.month

# Kept for reference but disabled: it did not help the model much.
# df['filtered_weather'] = df.apply(lambda x: x.weather if x.weather in important_weather else 'Other', axis=1)

# Cyclic encoding: each calendar component is mapped onto the unit circle
# (sine -> *_x, cosine -> *_y) using its period as the divisor.
cyclic_components = (
    ('time_of_day', 'time_of_day_x', 'time_of_day_y', 24.),
    ('weekday', 'weekday_x', 'weekday_y', 7.),
    ('month', 'month_x', 'month_y', 12.),
)
for source_col, sin_col, cos_col, period in cyclic_components:
    phase = 2. * np.pi * df[source_col] / period
    df[sin_col] = np.sin(phase)
    df[cos_col] = np.cos(phase)

# "Cheat mode" switch: uncomment to replace the wind speed from the weather data
# with wind_speed_m_s, the value reported by the wind turbine itself.
# df.wind_speed = df.wind_speed_m_s

# NOTE(review): shift(-1) copies the wind speed of the NEXT row into the current
# row, i.e. a lead rather than a lag -- confirm this matches the intent behind
# the name 'lagged_wind_speed'.
df['lagged_wind_speed'] = df['wind_speed'].shift(-1)

# Ratio of total production to power use.
df['green_energy'] = df['total_production_kw'].div(df['power_use_kw'])

# Discard rows with missing values, including the row left without a shifted
# wind speed.
df = df.dropna()

Creation of data used in model and outputting as csv

In [7]:
# Columns that can serve as the prediction target, and the one currently chosen.
potential_labels = [
    'solar_generation_kw',
    'power_use_kw',
    'wind_generation_kw',
    'green_energy',
]
label = 'green_energy'

# Columns written to the feature file, grouped by kind. The first group lists
# the same four columns as potential_labels; the export order is unchanged.
target_columns = ['solar_generation_kw', 'power_use_kw', 'wind_generation_kw', 'green_energy']
calendar_columns = ['time_of_day_x', 'time_of_day_y', 'weekday_x', 'weekday_y', 'month_x', 'month_y']
wind_and_chp_columns = ['lagged_wind_speed', 'chp_online', 'wind_speed']
sun_and_air_columns = ['angle_of_sun', 'solar_radiation', 'humidity', 'pressure']

export_columns = target_columns + calendar_columns + wind_and_chp_columns + sun_and_air_columns

# Independent copy, so later changes to `features` do not touch `df`.
features = df.loc[:, export_columns].copy()

# One-hot encoded weather categories did not work out well; the line is kept in
# case their use is ever reconsidered.
# features = pd.merge(features, pd.get_dummies(df.filtered_weather), left_index=True, right_index=True)

# Write the model input (index included) to disk.
features.to_csv('features.csv')