In [39]:
# imports 
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# 1. Data Manipulation for supervised learning problems


I would like to start with the most basic model imaginable, which is ordinary least squares regression. Since we have time series data, a necessary step before fitting the model is to process the data so that we have a supervised learning problem. We do that by shifting the whole data set by one time step. Repeating this five times yields a data set that contains the information of the last five time steps, which can then be used to predict the energy output of the current time step.

In [40]:
# Getting familiar with the pandas shift function on a small synthetic frame.

# Seeded generator so the demo table is reproducible on Restart & Run All.
SEED = 42
rng = np.random.default_rng(SEED)

# 1000 daily rows of standard-normal noise in four columns A..D
df = pd.DataFrame(
    rng.standard_normal((1000, 4)),
    index=pd.date_range('1/1/2000', periods=1000),
    columns=list('ABCD'),
)

# For every original column, add one new column per lag; shift(k) moves the
# values k rows down, so the first k rows of a lag-k column are NaN.
n_shifts = [1, 2, 3, 4, 5]

# Snapshot the base columns first so the loop only ever sees A..D,
# not the lag columns it is adding.
base_columns = list(df.columns)

for column in base_columns:
    for lag in n_shifts:
        df[f'{column}_shift_{lag}'] = df[column].shift(lag)

df.head()

Unnamed: 0,A,B,C,D,A_shift_1,A_shift_2,A_shift_3,A_shift_4,A_shift_5,B_shift_1,...,C_shift_1,C_shift_2,C_shift_3,C_shift_4,C_shift_5,D_shift_1,D_shift_2,D_shift_3,D_shift_4,D_shift_5
2000-01-01,0.653174,0.084911,-0.454548,1.115888,,,,,,,...,,,,,,,,,,
2000-01-02,0.560634,0.522353,-0.881733,-1.093044,0.653174,,,,,0.084911,...,-0.454548,,,,,1.115888,,,,
2000-01-03,0.307025,0.549904,-0.510371,-0.87738,0.560634,0.653174,,,,0.522353,...,-0.881733,-0.454548,,,,-1.093044,1.115888,,,
2000-01-04,-0.957227,0.706703,-0.373004,0.370379,0.307025,0.560634,0.653174,,,0.549904,...,-0.510371,-0.881733,-0.454548,,,-0.87738,-1.093044,1.115888,,
2000-01-05,-0.180647,0.477552,0.529715,-0.409701,-0.957227,0.307025,0.560634,0.653174,,0.706703,...,-0.373004,-0.510371,-0.881733,-0.454548,,0.370379,-0.87738,-1.093044,1.115888,


In [41]:
# Read the cleaned turbine-two measurements from disk and
# index the rows by their 'Date' column in a single chained step.
turbine_two = pd.read_csv('../data/cleaned/turbine_two.csv').set_index('Date')

In [42]:
# Add lagged copies of every column: for each original column and each lag k,
# a new column '<column> (time -k)' holds the value from k rows earlier.
# NOTE(review): shift works on row position, so "k rows earlier" equals
# "k time steps earlier" only if the rows are evenly spaced without gaps --
# confirm this holds for the cleaned data.
n_shifts = list(range(1, 6))

# Column-major order (all lags of the first column, then the second, ...)
lagged_columns = {
    f'{column} (time {-i})': turbine_two[column].shift(i)
    for column in turbine_two.columns
    for i in n_shifts
}

turbine_two = turbine_two.assign(**lagged_columns)

In [43]:
# Discard every row that contains at least one missing value. The lag columns
# are NaN in the leading rows (no earlier observation exists there), and any
# row with a gap in another column is removed as well.
turbine_two = turbine_two.dropna()

# Preview the first rows of the resulting frame
turbine_two.head()

Unnamed: 0_level_0,Wind speed (m/s),Wind direction (°),Nacelle position (°),Energy Export (kWh),Power (kW),Nacelle ambient temperature (°C),Rotor speed (RPM),Wind speed (m/s) (time -1),Wind speed (m/s) (time -2),Wind speed (m/s) (time -3),...,Nacelle ambient temperature (°C) (time -1),Nacelle ambient temperature (°C) (time -2),Nacelle ambient temperature (°C) (time -3),Nacelle ambient temperature (°C) (time -4),Nacelle ambient temperature (°C) (time -5),Rotor speed (RPM) (time -1),Rotor speed (RPM) (time -2),Rotor speed (RPM) (time -3),Rotor speed (RPM) (time -4),Rotor speed (RPM) (time -5)
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016-05-03 10:40:00,5.657807,271.447266,274.899811,50.0,394.898346,11.5,10.659193,5.006634,4.905113,5.723492,...,11.955,11.7,11.655,11.305,10.835,9.273199,9.650973,11.043722,10.031982,12.381912
2016-05-03 10:50:00,4.163436,276.309357,259.268005,49.0,173.334015,12.405,8.994538,5.657807,5.006634,4.905113,...,11.5,11.955,11.7,11.655,11.305,10.659193,9.273199,9.650973,11.043722,10.031982
2016-05-03 11:00:00,4.43115,279.650452,279.763702,26.0,149.254593,11.905,8.744345,4.163436,5.657807,5.006634,...,12.405,11.5,11.955,11.7,11.655,8.994538,10.659193,9.273199,9.650973,11.043722
2016-05-03 11:10:00,5.683527,292.04129,292.092682,48.0,318.48291,11.845,10.005468,4.43115,4.163436,5.657807,...,11.905,12.405,11.5,11.955,11.7,8.744345,8.994538,10.659193,9.273199,9.650973
2016-05-03 11:20:00,5.770829,278.363129,294.390991,42.0,356.606689,11.985,10.284972,5.683527,4.43115,4.163436,...,11.845,11.905,12.405,11.5,11.955,10.005468,8.744345,8.994538,10.659193,9.273199
