Preparing environment with necessary imports

In [1]:
import pandas as pd
import numpy as np
from pysolar.radiation import *
from pysolar.solar import *
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import explained_variance_score
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn import preprocessing
%matplotlib inline

Reading in an already prepared CSV file with data from https://wind-erleben.de <br>
Additionally, converting the data to the correct types and removing columns that are not used in the model

In [2]:
# Load the tab-separated export for 2018.
df = pd.read_csv('2018.csv', sep='\t')

# Parse the timestamp column and attach the Europe/Berlin time zone.
# ambiguous=True is forwarded to tz_localize to resolve ambiguous local times.
# NOTE(review): per the pandas docs True designates DST for such times --
# confirm this matches how the source recorded the autumn clock change.
df['measurement_date'] = (
    pd.to_datetime(df['measurement_date'])
    .dt.tz_localize('Europe/Berlin', ambiguous=True)
)

# Columns that none of the later cells read.
unused_columns = [
    "pitch_degrees",
    "rotor_speed_rpm",
    "wind_direction_degrees",
    "rotation_gondola_degrees",
    "charging_station_w",
    "battery_drain_or_load_w",
    "state_of_charge_percent",
    "battery_voltage_v",
    "rlm_solar_kw",
    "slp_solar_kw",
]
df = df.drop(columns=unused_columns)

Tidying up the data by removing rows whose readings are inconsistent or implausible

In [3]:
# Build one boolean mask from all row-level sanity checks and apply it once.
# Every check is evaluated per row, so combining them with & keeps exactly the
# rows that survive the checks applied one after another.

# Sum of the individual generation columns, compared against the reported total.
component_sum = df.solar_generation_kw + df.wind_generation_kw + df.chp_kw
# Production plus purchased electricity minus consumption.
supply_minus_use = (df.total_production_kw + df.electricity_purchase_kw) - df.power_use_kw

plausible_rows = (
    ((component_sum - df.total_production_kw).abs() < 1)  # components match the total to within 1
    & (df.power_use_kw != 0)                              # drop rows with zero consumption
    & (supply_minus_use > -1)                             # consumption may exceed supply by less than 1
    & (df.chp_kw < 1000)                                  # upper bound on the CHP reading
    & (df.electricity_purchase_kw < 10000)                # upper bound on the purchase reading
)
df = df[plausible_rows]

Feature generation

In [4]:
# Coordinates handed to pysolar's get_altitude as its first two arguments.
# NOTE(review): presumably the latitude/longitude of the plant -- confirm.
site_latitude, site_longitude = 49.5450, 10.2338

# Solar altitude for every timestamp, in row order.
df['angle'] = [
    get_altitude(site_latitude, site_longitude, when)
    for when in df['measurement_date']
]

# Direct radiation computed from each timestamp and its altitude angle.
df['radiation'] = [
    get_radiation_direct(when, altitude)
    for when, altitude in zip(df['measurement_date'], df['angle'])
]

# 1 when total production exceeds consumption, otherwise 0.
df['green_energy'] = df['total_production_kw'].gt(df['power_use_kw']).astype(int)

# 1 when the CHP reading is above 10, otherwise 0.
df['chp_online'] = df['chp_kw'].gt(10).astype(int)

# Calendar features taken from the localized timestamp.
timestamp_parts = df['measurement_date'].dt
df['weekday'] = timestamp_parts.weekday
df['time_of_day'] = timestamp_parts.hour
df['month'] = timestamp_parts.month

Creation of data used in model

In [5]:
# Target column and predictor columns; the predictor order is kept fixed so the
# feature importances printed later line up with these names.
target_column = 'green_energy'
predictor_columns = [
    "time_of_day",
    "weekday",
    "wind_speed_m_s",
    "chp_online",
    "angle",
    "radiation",
]

# Independent copy of the target so it does not share data with df.
labels = df[target_column].copy()

# New frame holding only the predictor columns.
features = df[predictor_columns]

Separating data into training dataset and testing dataset

In [6]:
# Hold out 25% of the rows for testing; random_state makes the split repeatable.
# The result is unpacked directly instead of being stored in `_` first, because
# `_` is IPython's "last output" variable and assigning to it shadows that
# feature for the rest of the session.
train_features, test_features, train_labels, test_labels = train_test_split(
    features, labels, test_size=0.25, random_state=1337
)

Model generation and prediction creation

In [7]:
# Hyperparameters for the forest: 1000 trees, with a fixed seed so repeated
# runs build the same model.
forest_params = {"n_estimators": 1000, "random_state": 1337}

# fit() returns the estimator itself, so construction and training can be chained.
rf = RandomForestRegressor(**forest_params).fit(train_features, train_labels)

# Predictions for the held-out rows, scored in the results cell.
predictions = rf.predict(test_features)

<h1>Final results</h1>

In [8]:
# Each metric is paired with its pre-padded caption so the printed values line up.
metric_table = (
    ('explained_variance_score:', explained_variance_score),
    ('r2_score:                ', r2_score),
    ('mean_absolute_error:     ', mean_absolute_error),
    ('mean_squared_error:      ', mean_squared_error),
)
for caption, metric in metric_table:
    # Score the held-out predictions and show six decimal places.
    print(caption, round(metric(test_labels, predictions), 6))
print()

# Pair every predictor name with its importance from the fitted forest,
# rounded to two decimals, then list them from most to least important.
importances = list(rf.feature_importances_)
feature_importances = sorted(
    ((name, round(score, 2)) for name, score in zip(features.columns, importances)),
    key=lambda pair: pair[1],
    reverse=True,
)
for pair in feature_importances:
    print('Variable: {:20} Importance: {}'.format(*pair))

explained_variance_score: 0.623502
r2_score:                 0.623341
mean_absolute_error:      0.144735
mean_squared_error:       0.075548

Variable: wind_speed_m_s       Importance: 0.38
Variable: radiation            Importance: 0.37
Variable: angle                Importance: 0.11
Variable: weekday              Importance: 0.08
Variable: time_of_day          Importance: 0.05
Variable: chp_online           Importance: 0.01
